diff --git a/.ccls b/.ccls new file mode 100644 index 0000000000..2ee6701806 --- /dev/null +++ b/.ccls @@ -0,0 +1,3 @@ +%compile_commands.json +%h -x +%h c++-header diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000000..7a87b31eb7 --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +--- +BasedOnStyle: Google +ColumnLimit: 150 +DerivePointerAlignment: false +PointerAlignment: Right +--- diff --git a/CMakeLists.txt b/CMakeLists.txt index 006107a7d2..77621d1df4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,7 @@ include(FeatureSummary) include(ExternalProject) option(SKIP_TESTS "Skips building all tests." OFF) +option(ENABLE_BENCHMARKING "Enables building of benchmark suites." OFF) option(PORTABLE "Instructs the compiler to remove architecture specific optimizations" ON) @@ -376,6 +377,7 @@ add_subdirectory(thirdparty/yaml-cpp-yaml-cpp-20171024) include_directories(thirdparty/concurrentqueue) include_directories(thirdparty/yaml-cpp-yaml-cpp-20171024/include) include_directories(thirdparty/rapidjson-1.1.0/include) +include_directories(thirdparty/mio/include) ## Expression language extensions option(DISABLE_EXPRESSION_LANGUAGE "Disables the scripting extensions." OFF) @@ -409,6 +411,7 @@ if (WIN32 OR NOT USE_SYSTEM_ZLIB) add_dependencies(minifi zlib-external) endif(WIN32 OR NOT USE_SYSTEM_ZLIB) + createExtension(STANDARD-PROCESSORS "STANDARD PROCESSORS" "Provides standard processors" "extensions/standard-processors" "extensions/standard-processors/tests/") @@ -428,6 +431,7 @@ endif() ## Add the rocks DB extension if (NOT ROCKSDB_FOUND OR BUILD_ROCKSDB) + set(USE_RTTI "TRUE") set(BUILD_RD "TRUE") endif() @@ -706,6 +710,49 @@ include(CPack) if (NOT SKIP_TESTS) include(BuildTests) + + # BENCHMARKING depends on test support code + if (ENABLE_BENCHMARKING) + set(BENCHMARK_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}/thirdparty/google-benchmark-install/lib64") + set(BENCHMARK_BYPRODUCT "${BENCHMARK_LIB_DIR}/libbenchmark.a") + set(BENCHMARK_MAIN_BYPRODUCT "${BENCHMARK_LIB_DIR}/libbenchmark_main.a") + # GIT_REPOSITORY "https://github.com/google/benchmark.git" + # GIT_TAG "090faecb454fbd6e6e17a75ef8146acb037118d4" # Version 1.5.0 + ExternalProject_Add( + google-benchmark-external + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/benchmark-1.5.0" + CMAKE_ARGS ${PASSTHROUGH_CMAKE_ARGS} + "-DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/thirdparty/google-benchmark-install" + "-DBENCHMARK_ENABLE_TESTING=OFF" + BUILD_BYPRODUCTS ${BENCHMARK_BYPRODUCT} ${BENCHMARK_MAIN_BYPRODUCT} + ) + add_library(benchmark STATIC IMPORTED) + set_target_properties(benchmark PROPERTIES IMPORTED_LOCATION ${BENCHMARK_BYPRODUCT}) + add_dependencies(benchmark google-benchmark-external) + add_library(benchmark_main STATIC IMPORTED) + set_target_properties(benchmark_main PROPERTIES IMPORTED_LOCATION ${BENCHMARK_MAIN_BYPRODUCT}) + add_dependencies(benchmark_main google-benchmark-external) + file(GLOB LIBMINIFI_BENCHMARKS "libminifi/benchmark/*.cpp") + set(ALL_BENCHMARKS "${LIBMINIFI_BENCHMARKS}") + + set(BENCHMARK_COUNT 0) + foreach(benchmarkfile ${ALL_BENCHMARKS}) + get_filename_component(benchmarkfilename "${benchmarkfile}" NAME_WE) + add_executable("${benchmarkfilename}" "${benchmarkfile}") + target_link_libraries("${benchmarkfilename}" benchmark benchmark_main "${CMAKE_THREAD_LIBS_INIT}") + target_link_libraries ("${benchmarkfilename}" -Wl,--whole-archive core-minifi minifi -Wl,--no-whole-archive) + appendIncludes("${benchmarkfilename}") + if (DISABLE_ROCKSDB STREQUAL "OFF" OR NOT DISABLE_ROCKSDB) +
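# benchmarks that exercise the RocksDB repositories additionally need the extension headers and a whole-archive link of minifi-rocksdb-repos +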
target_include_directories("${benchmarkfilename}" BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/rocksdb/include") + target_include_directories("${benchmarkfilename}" BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/extensions/rocksdb-repos") + target_link_libraries ("${benchmarkfilename}" -Wl,--whole-archive minifi-rocksdb-repos -Wl,--no-whole-archive) + add_definitions(-DENABLE_ROCKSDB_BENCHMARKS=1) + endif() + target_include_directories("${benchmarkfilename}" BEFORE PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/benchmark-1.5.0/include") + math(EXPR BENCHMARK_COUNT "${BENCHMARK_COUNT}+1") + endforeach() + message("-- Finished building ${BENCHMARK_COUNT} benchmark file(s)...") + endif() endif() include(BuildDocs) diff --git a/extensions/rocksdb-repos/DatabaseContentRepository.cpp b/extensions/rocksdb-repos/DatabaseContentRepository.cpp index bdcc3e57c9..118d5ec03a 100644 --- a/extensions/rocksdb-repos/DatabaseContentRepository.cpp +++ b/extensions/rocksdb-repos/DatabaseContentRepository.cpp @@ -20,6 +20,7 @@ #include #include #include "RocksDbStream.h" +#include "io/DatabaseMemoryMap.h" #include "rocksdb/merge_operator.h" namespace org { @@ -38,8 +39,8 @@ bool DatabaseContentRepository::initialize(const std::shared_ptr(); options.error_if_exists = false; options.max_successive_merges = 0; @@ -48,7 +49,7 @@ bool DatabaseContentRepository::initialize(const std::shared_ptrlog_debug("NiFi Content DB Repository database open %s success", directory_); is_valid_ = true; } else { - logger_->log_error("NiFi Content DB Repository database open %s fail", directory_); + logger_->log_error("NiFi Content DB Repository database open %s fail due to %s", directory_, status.ToString()); is_valid_ = false; } return is_valid_; @@ -62,19 +63,40 @@ void DatabaseContentRepository::stop() { } std::shared_ptr DatabaseContentRepository::write(const std::shared_ptr &claim, bool append) { - // the traditional approach with these has been to return -1 from the stream; however, since we have the ability here - // we can simply return a nullptr, which is also valid from the API when this stream is not valid. - if (nullptr == claim || !is_valid_ || !db_) - return nullptr; + // the traditional approach with these has been to return -1 from the stream; + // however, since we have the ability here we can simply return a nullptr, + // which is also valid from the API when this stream is not valid. + if (nullptr == claim || !is_valid_ || !db_) return nullptr; // append is already supported in all modes return std::make_shared(claim->getContentFullPath(), db_, true); } +std::shared_ptr DatabaseContentRepository::mmap(const std::shared_ptr &claim, size_t map_size, + bool read_only) { + /** + * Because the underlying does not support direct mapping of the value to memory, we read the entire value in to memory, then write (iff not + * readOnly) it back to the db upon closure of the MemoryMap + */ + + auto mm = std::make_shared(claim, map_size, [this](const std::shared_ptr &claim) { + remove(claim); + return write(claim); + }, read_only); + + auto rs = read(claim); + + if (rs != nullptr) { + rs->readData(reinterpret_cast(mm->getData()), map_size); + } + + return mm; +} + std::shared_ptr DatabaseContentRepository::read(const std::shared_ptr &claim) { - // the traditional approach with these has been to return -1 from the stream; however, since we have the ability here - // we can simply return a nullptr, which is also valid from the API when this stream is not valid. 
- if (nullptr == claim || !is_valid_ || !db_) - return nullptr; + // the traditional approach with these has been to return -1 from the stream; + // however, since we have the ability here we can simply return a nullptr, + // which is also valid from the API when this stream is not valid. + if (nullptr == claim || !is_valid_ || !db_) return nullptr; return std::make_shared(claim->getContentFullPath(), db_, false); } @@ -92,8 +114,7 @@ bool DatabaseContentRepository::exists(const std::shared_ptr &claim) { - if (nullptr == claim || !is_valid_ || !db_) - return false; + if (nullptr == claim || !is_valid_ || !db_) return false; rocksdb::Status status; status = db_->Delete(rocksdb::WriteOptions(), claim->getContentFullPath()); if (status.ok()) { diff --git a/extensions/rocksdb-repos/DatabaseContentRepository.h b/extensions/rocksdb-repos/DatabaseContentRepository.h index 6d12460c5a..d5b69e4c39 100644 --- a/extensions/rocksdb-repos/DatabaseContentRepository.h +++ b/extensions/rocksdb-repos/DatabaseContentRepository.h @@ -18,13 +18,13 @@ #ifndef LIBMINIFI_INCLUDE_CORE_REPOSITORY_DatabaseContentRepository_H_ #define LIBMINIFI_INCLUDE_CORE_REPOSITORY_DatabaseContentRepository_H_ -#include "rocksdb/db.h" -#include "rocksdb/merge_operator.h" -#include "core/Core.h" #include "core/Connectable.h" #include "core/ContentRepository.h" -#include "properties/Configure.h" +#include "core/Core.h" #include "core/logging/LoggerConfiguration.h" +#include "properties/Configure.h" +#include "rocksdb/db.h" +#include "rocksdb/merge_operator.h" namespace org { namespace apache { namespace nifi { @@ -35,15 +35,15 @@ namespace repository { class StringAppender : public rocksdb::AssociativeMergeOperator { public: // Constructor: specify delimiter - explicit StringAppender() { - - } + explicit StringAppender() {} - virtual bool Merge(const rocksdb::Slice& key, const rocksdb::Slice* existing_value, const rocksdb::Slice& value, std::string* new_value, rocksdb::Logger* logger) const { + virtual bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value, const rocksdb::Slice &value, std::string *new_value, + rocksdb::Logger *logger) const { // Clear the *new_value for writing. if (nullptr == new_value) { return false; } + new_value->clear(); if (!existing_value) { @@ -58,29 +58,20 @@ class StringAppender : public rocksdb::AssociativeMergeOperator { return true; } - virtual const char* Name() const { - return "StringAppender"; - } + virtual const char *Name() const { return "StringAppender"; } private: - }; /** - * DatabaseContentRepository is a content repository that stores data onto the local file system. + * DatabaseContentRepository is a content repository that stores data onto the + * local file system. 
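+ * Content is stored in RocksDB, keyed by each claim's content path.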
*/ class DatabaseContentRepository : public core::ContentRepository, public core::Connectable { public: - DatabaseContentRepository(std::string name = getClassName<DatabaseContentRepository>(), utils::Identifier uuid = utils::Identifier()) - : core::Connectable(name, uuid), - is_valid_(false), - db_(nullptr), - logger_(logging::LoggerFactory<DatabaseContentRepository>::getLogger()) { - } - virtual ~DatabaseContentRepository() { - stop(); - } + DatabaseContentRepository(std::string name = getClassName<DatabaseContentRepository>(), utils::Identifier uuid = utils::Identifier()) + : core::Connectable(name, uuid), is_valid_(false), db_(nullptr), logger_(logging::LoggerFactory<DatabaseContentRepository>::getLogger()) {} + virtual ~DatabaseContentRepository() { stop(); } virtual bool initialize(const std::shared_ptr<minifi::Configure> &configuration); @@ -88,38 +79,32 @@ class DatabaseContentRepository : public core::ContentRepository, public core::C virtual std::shared_ptr<io::BaseStream> write(const std::shared_ptr<minifi::ResourceClaim> &claim, bool append = false); + virtual std::shared_ptr<io::BaseMemoryMap> mmap(const std::shared_ptr<minifi::ResourceClaim> &claim, size_t mapSize, bool readOnly); + virtual std::shared_ptr<io::BaseStream> read(const std::shared_ptr<minifi::ResourceClaim> &claim); - virtual bool close(const std::shared_ptr<minifi::ResourceClaim> &claim) { - return remove(claim); - } + virtual bool close(const std::shared_ptr<minifi::ResourceClaim> &claim) { return remove(claim); } virtual bool remove(const std::shared_ptr<minifi::ResourceClaim> &claim); virtual bool exists(const std::shared_ptr<minifi::ResourceClaim> &streamId); - virtual void yield() { - - } + virtual void yield() {} /** * Determines if we are connected and operating */ - virtual bool isRunning() { - return true; - } + virtual bool isRunning() { return true; } /** * Determines if work is available by this connectable * @return boolean if work is available. */ - virtual bool isWorkAvailable() { - return true; - } + virtual bool isWorkAvailable() { return true; } private: bool is_valid_; - rocksdb::DB* db_; + rocksdb::DB *db_; std::shared_ptr<logging::Logger> logger_; }; diff --git a/libminifi/benchmark/MemoryMapBenchmarks.cpp b/libminifi/benchmark/MemoryMapBenchmarks.cpp new file mode 100644 index 0000000000..3e82a2f65d --- /dev/null +++ b/libminifi/benchmark/MemoryMapBenchmarks.cpp @@ -0,0 +1,494 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include "../test/TestBase.h" +#include "ResourceClaim.h" +#include "core/Core.h" +#include "repository/FileSystemRepository.h" +#include "repository/VolatileContentRepository.h" +#include "properties/Configure.h" + +#ifdef ENABLE_ROCKSDB_BENCHMARKS +#include "DatabaseContentRepository.h" +#endif // ENABLE_ROCKSDB_BENCHMARKS + +template +class MemoryMapBMFixture : public benchmark::Fixture { + public: + void SetUp(const ::benchmark::State &state) { + test_controller_ = std::make_shared(); + repo_ = std::make_shared(); + conf_ = std::make_shared(); + char format[] = "/tmp/testRepo.XXXXXX"; + dir_ = std::string(test_controller_->createTempDirectory(format)); + test_file_ = dir_ + "/testfile"; + claim_ = std::make_shared(test_file_, repo_); + } + + void TearDown(const ::benchmark::State &state) { + } + + void init_db_repo() { + conf_->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir_); + init_repo(); + } + + void init_repo() { + repo_->initialize(conf_); + } + + void set_test_input(size_t size, char c) { + test_string_ = ""; + test_string_.resize(size, c); + auto mm = repo_->mmap(claim_, test_string_.length(), false); + mm->resize(test_string_.length()); + memcpy(mm->getData(), &test_string_[0], test_string_.length()); + } + + void set_test_expected_output(size_t size, char c) { + expected_string_ = ""; + expected_string_.resize(size, c); + } + + void validate_string(const char *read_string) { + if (strncmp(read_string, expected_string_.c_str(), expected_string_.length()) != 0) { + throw std::runtime_error("string read failed"); + } + } + + void validate_byte(size_t pos, const char b) { + if (b != expected_string_[pos]) { + throw std::runtime_error("byte read failed"); + } + } + + /** + * Get deterministic random points to access. Alternates between positions relative to start & end of file so as to not be sequential. 
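+ * Note: only the leading positions advance (in steps of 1% of the length); the trailing position is always the final byte.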
+ * @return set of random points + */ + std::vector random_points() { + std::vector p; + + for (size_t i = 0; i < test_string_.length() / 2; i += test_string_.length() / 100) { + p.push_back(i); + p.push_back(test_string_.length() - 1); + } + + return p; + } + + std::shared_ptr conf_; + std::shared_ptr test_controller_; + std::shared_ptr repo_; + std::shared_ptr claim_; + std::string test_file_; + std::string test_string_; + std::string expected_string_; + std::string dir_; +}; + +typedef MemoryMapBMFixture FSMemoryMapBMFixture; +typedef MemoryMapBMFixture VolatileMemoryMapBMFixture; + +#ifdef ENABLE_ROCKSDB_BENCHMARKS +typedef MemoryMapBMFixture DatabaseMemoryMapBMFixture; +#endif // ENABLE_ROCKSDB_BENCHMARKS + +template +void mmap_read(T *fixture, benchmark::State &st) { + for (auto _ : st) { + auto mm = fixture->repo_->mmap(fixture->claim_, fixture->test_string_.length(), true); + fixture->validate_string(reinterpret_cast(mm->getData())); + } + + fixture->repo_->stop(); +} + +template +void mmap_read_random(T *fixture, benchmark::State &st) { + auto r = fixture->random_points(); + auto mm = fixture->repo_->mmap(fixture->claim_, fixture->test_string_.length(), true); + for (auto _ : st) { + auto data = reinterpret_cast(mm->getData()); + for (size_t p : r) { + fixture->validate_byte(p, data[p]); + } + } + + fixture->repo_->stop(); +} + +template +void mmap_write_read(T *fixture, benchmark::State &st) { + for (auto _ : st) { + fixture->repo_->remove(fixture->claim_); + auto mm = fixture->repo_->mmap(fixture->claim_, fixture->test_string_.length(), false); + memcpy(mm->getData(), &(fixture->expected_string_[0]), fixture->test_string_.length()); + fixture->validate_string(reinterpret_cast(mm->getData())); + } + + fixture->repo_->stop(); +} + +template +void cb_read(T *fixture, benchmark::State &st) { + for (auto _ : st) { + auto rs = fixture->repo_->read(fixture->claim_); + std::vector buf; + rs->readData(buf, fixture->test_string_.length() + 1); + fixture->validate_string(reinterpret_cast(&buf[0])); + } + + fixture->repo_->stop(); +} + +template +void cb_read_random(T *fixture, benchmark::State &st) { + auto r = fixture->random_points(); + auto rs = fixture->repo_->read(fixture->claim_); + for (auto _ : st) { + for (size_t p : r) { + rs->seek(p); + char b; + rs->read(b); + fixture->validate_byte(p, b); + } + } + + fixture->repo_->stop(); +} + +template +void cb_write_read(T *fixture, benchmark::State &st) { + for (auto _ : st) { + { + fixture->repo_->remove(fixture->claim_); + auto ws = fixture->repo_->write(fixture->claim_, false); + ws->write(reinterpret_cast(&(fixture->expected_string_[0])), fixture->test_string_.length()); + } + + auto rs = fixture->repo_->read(fixture->claim_); + std::vector buf; + rs->readData(buf, fixture->test_string_.length() + 1); + fixture->validate_string(reinterpret_cast(&buf[0])); + } + + + fixture->repo_->stop(); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_Read_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_Read_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_WriteRead_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + mmap_write_read(this, st); +} + 
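The Tiny/Small/Large registrations that follow differ only in payload size. A hedged sketch of how the pinned google-benchmark 1.5.0 API could collapse each scenario into a single parameterized registration (the names here are illustrative, not part of the patch):

BENCHMARK_DEFINE_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_Read)(benchmark::State &st) {
  init_repo();
  set_test_input(st.range(0), 'x');            // payload size arrives as a benchmark argument
  set_test_expected_output(st.range(0), 'x');
  mmap_read(this, st);
}
// 10 B, 128 KiB and 32 MiB mirror the Tiny/Small/Large cases spelled out below.
BENCHMARK_REGISTER_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_Read)->Arg(10)->Arg(131072)->Arg(33554432);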
+BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_WriteRead_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_Read_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_Read_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_WriteRead_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_WriteRead_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_Read_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_Read_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_WriteRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_WriteRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, MemoryMap_FileSystemRepository_RandomRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read_random(this, st); +} + +BENCHMARK_F(FSMemoryMapBMFixture, Callback_FileSystemRepository_RandomRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read_random(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_Read_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_Read_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_WriteRead_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_WriteRead_Tiny)(benchmark::State &st) { + init_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_Read_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + mmap_read(this, st); +} + 
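For readers who want to exercise the new repository-level API outside the benchmark harness, a minimal stand-alone sketch mirroring what the fixtures do (path, size, and namespace abbreviations are illustrative):

#include <cstring>
#include <memory>
#include "ResourceClaim.h"
#include "core/repository/FileSystemRepository.h"
#include "properties/Configure.h"

namespace minifi = org::apache::nifi::minifi;

int main() {
  auto repo = std::make_shared<minifi::core::repository::FileSystemRepository>();
  repo->initialize(std::make_shared<minifi::Configure>());
  auto claim = std::make_shared<minifi::ResourceClaim>("/tmp/example_claim", repo);
  auto map = repo->mmap(claim, 1024, false);         // writable map, sized up front
  map->resize(1024);                                 // mirror the fixtures: ensure the backing size
  std::memset(map->getData(), 'x', map->getSize());  // mutate content in place
  map->unmap();                                      // buffered implementations flush back here
  return 0;
}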
+BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_Read_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_WriteRead_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_WriteRead_Small)(benchmark::State &st) { + init_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_Read_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_Read_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_WriteRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_WriteRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, MemoryMap_VolatileRepository_RandomRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read_random(this, st); +} + +BENCHMARK_F(VolatileMemoryMapBMFixture, Callback_VolatileRepository_RandomRead_Large)(benchmark::State &st) { + init_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read_random(this, st); +} + +#ifdef ENABLE_ROCKSDB_BENCHMARKS + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_Read_Tiny)(benchmark::State &st) { + init_db_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_Read_Tiny)(benchmark::State &st) { + init_db_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_WriteRead_Tiny)(benchmark::State &st) { + init_db_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_WriteRead_Tiny)(benchmark::State &st) { + init_db_repo(); + set_test_input(10, 'x'); + set_test_expected_output(10, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_Read_Small)(benchmark::State &st) { + init_db_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_Read_Small)(benchmark::State &st) { + init_db_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_WriteRead_Small)(benchmark::State &st) { + init_db_repo(); 
+ set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_WriteRead_Small)(benchmark::State &st) { + init_db_repo(); + set_test_input(131072, 'x'); + set_test_expected_output(131072, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_Read_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_Read_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_WriteRead_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + mmap_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_WriteRead_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'y'); + cb_write_read(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, MemoryMap_DatabaseRepository_RandomRead_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + mmap_read_random(this, st); +} + +BENCHMARK_F(DatabaseMemoryMapBMFixture, Callback_DatabaseRepository_RandomRead_Large)(benchmark::State &st) { + init_db_repo(); + set_test_input(33554432, 'x'); + set_test_expected_output(33554432, 'x'); + cb_read_random(this, st); +} + +#endif // ENABLE_ROCKSDB_BENCHMARKS + +BENCHMARK_MAIN(); diff --git a/libminifi/include/FlowFileRecord.h b/libminifi/include/FlowFileRecord.h index 795003fd01..3b8cd67468 100644 --- a/libminifi/include/FlowFileRecord.h +++ b/libminifi/include/FlowFileRecord.h @@ -99,6 +99,11 @@ class OutputStreamCallback { virtual int64_t process(std::shared_ptr stream) = 0; }; +class MemoryMapCallback { +public: + virtual ~MemoryMapCallback() {} + virtual bool process(std::shared_ptr map) = 0; +}; class FlowFileRecord : public core::FlowFile, public io::Serializable { public: diff --git a/libminifi/include/core/ContentRepository.h b/libminifi/include/core/ContentRepository.h index f42400af34..3410fe5d77 100644 --- a/libminifi/include/core/ContentRepository.h +++ b/libminifi/include/core/ContentRepository.h @@ -18,12 +18,14 @@ #ifndef LIBMINIFI_INCLUDE_CORE_CONTENTREPOSITORY_H_ #define LIBMINIFI_INCLUDE_CORE_CONTENTREPOSITORY_H_ -#include "properties/Configure.h" +#include "MemoryMapManager.h" #include "ResourceClaim.h" -#include "io/DataStream.h" -#include "io/BaseStream.h" #include "StreamManager.h" #include "core/Connectable.h" +#include "io/BaseMemoryMap.h" +#include "io/BaseStream.h" +#include "io/DataStream.h" +#include "properties/Configure.h" namespace org { namespace apache { @@ -34,21 +36,16 @@ namespace core { /** * Content repository definition that extends StreamManager. */ -class ContentRepository : public StreamManager { +class ContentRepository : public StreamManager, public MemoryMapManager { public: - - virtual ~ContentRepository() { - - } + virtual ~ContentRepository() {} /** * initialize this content repository using the provided configuration. 
*/ virtual bool initialize(const std::shared_ptr &configure) = 0; - virtual std::string getStoragePath() { - return directory_; - } + virtual std::string getStoragePath() { return directory_; } /** * Stops this repository. @@ -109,7 +106,6 @@ class ContentRepository : public StreamManager { } protected: - std::string directory_; std::mutex count_map_mutex_; diff --git a/libminifi/include/core/MemoryMapManager.h b/libminifi/include/core/MemoryMapManager.h new file mode 100644 index 0000000000..c8a87533c0 --- /dev/null +++ b/libminifi/include/core/MemoryMapManager.h @@ -0,0 +1,55 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef LIBMINIFI_INCLUDE_CORE_MEMORYMAPMANAGER_H_ +#define LIBMINIFI_INCLUDE_CORE_MEMORYMAPMANAGER_H_ + +#include "io/BaseMemoryMap.h" + +#include + +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace core { + +/** + * Purpose: Provides a base for all memory-mapping managers. The goal here is to + * provide a small set of interfaces that provide a small set of operations to + * provide state management of memory maps. + */ +template +class MemoryMapManager { + public: + virtual ~MemoryMapManager() {} + + /** + * Create a memory map to the object. 
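+ * Depending on the implementation, the returned map may be a direct view of the stored bytes or a private buffer that is written back when the map is released.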
+ * @param map_obj the object to map + * @return result of operation (true of succeeded) + */ + virtual std::shared_ptr mmap(const std::shared_ptr &mapObj, size_t mapSize, bool readOnly) = 0; +}; + +} /* namespace core */ +} /* namespace minifi */ +} /* namespace nifi */ +} /* namespace apache */ +} /* namespace org */ + +#endif /* LIBMINIFI_INCLUDE_CORE_MEMORYMAPMANAGER_H_ */ diff --git a/libminifi/include/core/ProcessSession.h b/libminifi/include/core/ProcessSession.h index 7bcd7f5403..daf9513374 100644 --- a/libminifi/include/core/ProcessSession.h +++ b/libminifi/include/core/ProcessSession.h @@ -19,21 +19,22 @@ #define __PROCESS_SESSION_H__ #include -#include -#include +#include +#include #include #include -#include -#include +#include #include +#include -#include "ProcessContext.h" -#include "FlowFileRecord.h" #include "Exception.h" #include "core/logging/LoggerConfiguration.h" #include "core/Deprecated.h" #include "FlowFile.h" +#include "FlowFileRecord.h" +#include "ProcessContext.h" #include "WeakReference.h" +#include "core/logging/LoggerConfiguration.h" #include "provenance/Provenance.h" namespace org { @@ -50,41 +51,48 @@ class ProcessSession : public ReferenceContainer { * Create a new process session */ ProcessSession(std::shared_ptr processContext = nullptr) - : process_context_(processContext), - logger_(logging::LoggerFactory::getLogger()) { + : process_context_(processContext), logger_(logging::LoggerFactory::getLogger()) { logger_->log_trace("ProcessSession created for %s", process_context_->getProcessorNode()->getName()); auto repo = processContext->getProvenanceRepository(); - //provenance_report_ = new provenance::ProvenanceReporter(repo, process_context_->getProcessorNode()->getName(), process_context_->getProcessorNode()->getName()); - provenance_report_ = std::make_shared(repo, process_context_->getProcessorNode()->getName(), process_context_->getProcessorNode()->getName()); + // provenance_report_ = new provenance::ProvenanceReporter(repo, + // process_context_->getProcessorNode()->getName(), + // process_context_->getProcessorNode()->getName()); + provenance_report_ = std::make_shared(repo, process_context_->getProcessorNode()->getName(), + process_context_->getProcessorNode()->getName()); } -// Destructor + // Destructor virtual ~ProcessSession(); -// Commit the session + // Commit the session void commit(); // Roll Back the session void rollback(); // Get Provenance Report - std::shared_ptr getProvenanceReporter() { - return provenance_report_; - } + std::shared_ptr getProvenanceReporter() { return provenance_report_; } // // Get the FlowFile from the highest priority queue virtual std::shared_ptr get(); - // Create a new UUID FlowFile with no content resource claim and without parent + // Create a new UUID FlowFile with no content resource claim and without + // parent std::shared_ptr create(); - // Create a new UUID FlowFile with no content resource claim and inherit all attributes from parent - //std::shared_ptr create(std::shared_ptr &&parent); - // Create a new UUID FlowFile with no content resource claim and inherit all attributes from parent + // Create a new UUID FlowFile with no content resource claim and inherit all + // attributes from parent + // std::shared_ptr create(std::shared_ptr + // &&parent); + // Create a new UUID FlowFile with no content resource claim and inherit all + // attributes from parent std::shared_ptr create(const std::shared_ptr &parent); // Add a FlowFile to the session virtual void add(const std::shared_ptr &flow); -// Clone a 
new UUID FlowFile from parent both for content resource claim and attributes + // Clone a new UUID FlowFile from parent both for content resource claim and + // attributes std::shared_ptr clone(const std::shared_ptr &parent); - // Clone a new UUID FlowFile from parent for attributes and sub set of parent content resource claim + // Clone a new UUID FlowFile from parent for attributes and sub set of parent + // content resource claim std::shared_ptr clone(const std::shared_ptr &parent, int64_t offset, int64_t size); - // Duplicate a FlowFile with the same UUID and all attributes and content resource claim for the roll back of the session + // Duplicate a FlowFile with the same UUID and all attributes and content + // resource claim for the roll back of the session std::shared_ptr duplicate(const std::shared_ptr &original); // Transfer the FlowFile to the relationship virtual void transfer(const std::shared_ptr &flow, Relationship relationship); @@ -98,6 +106,8 @@ class ProcessSession : public ReferenceContainer { void read(const std::shared_ptr &flow, InputStreamCallback *callback); // Execute the given write callback against the content void write(const std::shared_ptr &flow, OutputStreamCallback *callback); + // Execute the given mmap callback against the content + void mmap(const std::shared_ptr &flow, MemoryMapCallback *callback, size_t map_size, bool read_only); // Execute the given write/append callback against the content void append(const std::shared_ptr &flow, OutputStreamCallback *callback); // Penalize the flow @@ -105,7 +115,8 @@ class ProcessSession : public ReferenceContainer { /** * Imports a file from the data stream - * @param stream incoming data stream that contains the data to store into a file + * @param stream incoming data stream that contains the data to store into a + * file * @param flow flow file */ void importFrom(io::DataStream &stream, const std::shared_ptr &flow); @@ -120,38 +131,38 @@ class ProcessSession : public ReferenceContainer { * @param flow flow file * @param bool whether or not to keep the content in the flow file */ - bool exportContent(const std::string &destination, const std::shared_ptr &flow, - bool keepContent); + bool exportContent(const std::string &destination, const std::shared_ptr &flow, bool keepContent); - bool exportContent(const std::string &destination, const std::string &tmpFileName, const std::shared_ptr &flow, - bool keepContent); + bool exportContent(const std::string &destination, const std::string &tmpFileName, const std::shared_ptr &flow, bool keepContent); // Stash the content to a key void stash(const std::string &key, const std::shared_ptr &flow); // Restore content previously stashed to a key void restore(const std::string &key, const std::shared_ptr &flow); -// Prevent default copy constructor and assignment operation -// Only support pass by reference or pointer + // Prevent default copy constructor and assignment operation + // Only support pass by reference or pointer ProcessSession(const ProcessSession &parent) = delete; ProcessSession &operator=(const ProcessSession &parent) = delete; protected: -// FlowFiles being modified by current process session - std::map > _updatedFlowFiles; - // Copy of the original FlowFiles being modified by current process session as above - std::map > _originalFlowFiles; + // FlowFiles being modified by current process session + std::map> _updatedFlowFiles; + // Copy of the original FlowFiles being modified by current process session as + // above + std::map> _originalFlowFiles; // 
FlowFiles being added by current process session - std::map<std::string, std::shared_ptr<core::FlowFile> > _addedFlowFiles; + std::map<std::string, std::shared_ptr<core::FlowFile>> _addedFlowFiles; // FlowFiles being deleted by current process session - std::map<std::string, std::shared_ptr<core::FlowFile> > _deletedFlowFiles; + std::map<std::string, std::shared_ptr<core::FlowFile>> _deletedFlowFiles; // FlowFiles being transfered to the relationship std::map<std::string, Relationship> _transferRelationship; // FlowFiles being cloned for multiple connections per relationship - std::map<std::string, std::shared_ptr<core::FlowFile> > _clonedFlowFiles; + std::map<std::string, std::shared_ptr<core::FlowFile>> _clonedFlowFiles; private: -// Clone the flow file during transfer to multiple connections for a relationship + // Clone the flow file during transfer to multiple connections for a + // relationship std::shared_ptr<core::FlowFile> cloneDuringTransfer(std::shared_ptr<core::FlowFile> &parent); // ProcessContext std::shared_ptr<ProcessContext> process_context_; diff --git a/libminifi/include/core/repository/AtomicRepoEntries.h b/libminifi/include/core/repository/AtomicRepoEntries.h index 4235a23092..975152ea41 100644 --- a/libminifi/include/core/repository/AtomicRepoEntries.h +++ b/libminifi/include/core/repository/AtomicRepoEntries.h @@ -18,15 +18,15 @@ #ifndef LIBMINIFI_INCLUDE_CORE_REPOSITORY_ATOMICREPOENTRIES_H_ #define LIBMINIFI_INCLUDE_CORE_REPOSITORY_ATOMICREPOENTRIES_H_ -#include -#include -#include +#include #include +#include +#include #include -#include -#include -#include +#include #include +#include +#include namespace org { namespace apache { @@ -41,12 +41,10 @@ namespace repository { * Justification: Since AtomicEntry is a static entry that does not move or change, the underlying * RepoValue can be changed to support atomic operations. */ -template<typename T> +template <typename T> class RepoValue { public: - - explicit RepoValue() { - } + explicit RepoValue() {} /** * Constructor that populates the item allowing for a custom key comparator. * @param key key * @param size size buffer * @param comparator custom comparator. */ - explicit RepoValue(T key, const uint8_t *ptr, size_t size, std::function<bool(T, T)> comparator = nullptr) - : key_(key), - comparator_(comparator) { + explicit RepoValue(T key, const uint8_t *ptr, size_t size, std::function<bool(T, T)> comparator = nullptr) : key_(key), comparator_(comparator) { if (nullptr == ptr) { size = 0; } @@ -70,124 +66,97 @@ class RepoValue { /** * RepoValue that moves the other object into this. */ - explicit RepoValue(RepoValue<T> &&other) -noexcept : key_(std::move(other.key_)), - buffer_(std::move(other.buffer_)), - comparator_(std::move(other.comparator_)) { - } + explicit RepoValue(RepoValue<T> &&other) noexcept + : key_(std::move(other.key_)), buffer_(std::move(other.buffer_)), comparator_(std::move(other.comparator_)) {} - ~RepoValue() - { - } + ~RepoValue() {} - T &getKey() { - return key_; - } + T &getKey() { return key_; } - /** - * Sets the key, relacing the custom comparator if needed. - */ - void setKey(const T key, std::function<bool(T, T)> comparator = nullptr) { - key_ = key; - comparator_ = comparator; - } + /** + * Sets the key, replacing the custom comparator if needed. + */ + void setKey(const T key, std::function<bool(T, T)> comparator = nullptr) { + key_ = key; + comparator_ = comparator; + } - /** - * Determines if the key is the same using the custom comparator - * @param other object to compare against - * @return result of the comparison - */ - inline bool isEqual(RepoValue<T> *other) - { - return comparator_ == nullptr ? 
key_ == other->key_ : comparator_(key_,other->key_); - } + /** + * Determines if the key is the same using the custom comparator + * @param other object to compare against + * @return result of the comparison + */ + inline bool isEqual(RepoValue *other) { return comparator_ == nullptr ? key_ == other->key_ : comparator_(key_, other->key_); } - /** - * Determines if the key is the same using the custom comparator - * @param other object to compare against - * @return result of the comparison - */ - inline bool isKey(T other) - { - return comparator_ == nullptr ? key_ == other : comparator_(key_,other); - } + /** + * Determines if the key is the same using the custom comparator + * @param other object to compare against + * @return result of the comparison + */ + inline bool isKey(T other) { return comparator_ == nullptr ? key_ == other : comparator_(key_, other); } - /** - * Clears the buffer. - */ - void clearBuffer() { - buffer_.resize(0); - buffer_.clear(); - } + /** + * Clears the buffer. + */ + void clearBuffer() { + buffer_.resize(0); + buffer_.clear(); + } - /** - * Return the size of the memory within the key - * buffer, the size of timestamp, and the general - * system word size - */ - uint64_t size() { - return buffer_.size(); - } + /** + * Return the size of the memory within the key + * buffer, the size of timestamp, and the general + * system word size + */ + uint64_t size() { return buffer_.size(); } - size_t getBufferSize() { - return buffer_.size(); - } + size_t getBufferSize() { return buffer_.size(); } - const uint8_t *getBuffer() - { - return buffer_.data(); - } + uint8_t *getBuffer() { return &buffer_[0]; } - /** - * Places the contents of buffer into str - * @param strnig into which we are placing the memory contained in buffer. - */ - void emplace(std::string &str) { - str.insert(0, reinterpret_cast(buffer_.data()), buffer_.size()); - } + /** + * Places the contents of buffer into str + * @param strnig into which we are placing the memory contained in buffer. + */ + void emplace(std::string &str) { str.insert(0, reinterpret_cast(buffer_.data()), buffer_.size()); } - /** - * Appends ptr to the end of buffer. - * @param ptr pointer containing data to add to buffer_ - */ - void append(uint8_t *ptr, size_t size) - { - buffer_.insert(buffer_.end(), ptr, ptr + size); - } + /** + * Appends ptr to the end of buffer. + * @param ptr pointer containing data to add to buffer_ + */ + void append(uint8_t *ptr, size_t size) { buffer_.insert(buffer_.end(), ptr, ptr + size); } - RepoValue &operator=(RepoValue &&other) noexcept { - key_ = std::move(other.key_); - buffer_ = std::move(other.buffer_); - return *this; - } + /** + * Resizes buffer to the specified size. + */ + void resize(size_t size) { buffer_.resize(size); } - private: - T key_; - std::function comparator_; - std::vector buffer_; - }; - - /** - * Purpose: Atomic Entry allows us to create a statically - * sized ring buffer, with the ability to create - * - **/ -template -class AtomicEntry { + RepoValue &operator=(RepoValue &&other) noexcept { + key_ = std::move(other.key_); + buffer_ = std::move(other.buffer_); + return *this; + } + private: + T key_; + std::function comparator_; + std::vector buffer_; +}; + +/** + * Purpose: Atomic Entry allows us to create a statically + * sized ring buffer, with the ability to create + * + **/ +template +class AtomicEntry { public: /** * Constructor that accepts a max size and an atomic counter for the total * size allowd by this and other atomic entries. 
*/ explicit AtomicEntry(std::atomic *total_size, size_t *max_size) - : accumulated_repo_size_(total_size), - max_repo_size_(max_size), - write_pending_(false), - has_value_(false), - ref_count_(0), - free_required(false) { - } + : accumulated_repo_size_(total_size), max_repo_size_(max_size), write_pending_(false), has_value_(false), ref_count_(0), free_required(false) {} /** * Sets the repo value, moving the old value into old_value. @@ -214,8 +183,7 @@ class AtomicEntry { AtomicEntry *takeOwnership() { bool lock = false; - if (!write_pending_.compare_exchange_weak(lock, true)) - return nullptr; + if (!write_pending_.compare_exchange_weak(lock, true)) return nullptr; ref_count_++; @@ -229,11 +197,11 @@ class AtomicEntry { * with said object. * A custom comparator can be provided to augment the key being added into value_ */ - bool testAndSetKey(const T str, std::function releaseTest = nullptr, std::function reclaimer = nullptr, std::function comparator = nullptr) { + bool testAndSetKey(const T str, std::function releaseTest = nullptr, std::function reclaimer = nullptr, + std::function comparator = nullptr) { bool lock = false; - if (!write_pending_.compare_exchange_weak(lock, true)) - return false; + if (!write_pending_.compare_exchange_weak(lock, true)) return false; if (has_value_) { // we either don't have a release test or we cannot release this @@ -252,7 +220,6 @@ class AtomicEntry { try_unlock(); return false; } - } ref_count_ = 1; value_.setKey(str, comparator); @@ -365,7 +332,6 @@ class AtomicEntry { if (accumulated_repo_size_ != nullptr) { *accumulated_repo_size_ -= bufferSize; } - } try_unlock(); return ref; @@ -377,7 +343,6 @@ class AtomicEntry { size = value_.getBufferSize(); try_unlock(); return size; - } /** @@ -440,7 +405,6 @@ class AtomicEntry { } private: - /** * Spin lock to unlock the current atomic entry. */ diff --git a/libminifi/include/core/repository/FileSystemRepository.h b/libminifi/include/core/repository/FileSystemRepository.h index 56a103c7cc..4dbdb41dff 100644 --- a/libminifi/include/core/repository/FileSystemRepository.h +++ b/libminifi/include/core/repository/FileSystemRepository.h @@ -18,10 +18,10 @@ #ifndef LIBMINIFI_INCLUDE_CORE_REPOSITORY_FileSystemRepository_H_ #define LIBMINIFI_INCLUDE_CORE_REPOSITORY_FileSystemRepository_H_ -#include "core/Core.h" #include "../ContentRepository.h" -#include "properties/Configure.h" +#include "core/Core.h" #include "core/logging/LoggerConfiguration.h" +#include "properties/Configure.h" namespace org { namespace apache { namespace nifi { @@ -30,18 +30,14 @@ namespace core { namespace repository { /** - * FileSystemRepository is a content repository that stores data onto the local file system. + * FileSystemRepository is a content repository that stores data onto the local + * file system. 
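+ * Claim content can also be memory-mapped via mmap(), declared below.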
*/ class FileSystemRepository : public core::ContentRepository, public core::CoreComponent { public: FileSystemRepository(std::string name = getClassName()) - : core::CoreComponent(name), - logger_(logging::LoggerFactory::getLogger()) { - - } - virtual ~FileSystemRepository() { - - } + : core::CoreComponent(name), logger_(logging::LoggerFactory::getLogger()) {} + virtual ~FileSystemRepository() {} virtual bool initialize(const std::shared_ptr &configuration); @@ -53,14 +49,12 @@ class FileSystemRepository : public core::ContentRepository, public core::CoreCo virtual std::shared_ptr read(const std::shared_ptr &claim); - virtual bool close(const std::shared_ptr &claim) { - return remove(claim); - } + virtual std::shared_ptr mmap(const std::shared_ptr &claim, size_t mapSize, bool readOnly); + virtual bool close(const std::shared_ptr &claim) { return remove(claim); } virtual bool remove(const std::shared_ptr &claim); private: - std::shared_ptr logger_; }; diff --git a/libminifi/include/core/repository/VolatileContentRepository.h b/libminifi/include/core/repository/VolatileContentRepository.h index d79232966e..bcdcf7d3db 100644 --- a/libminifi/include/core/repository/VolatileContentRepository.h +++ b/libminifi/include/core/repository/VolatileContentRepository.h @@ -18,14 +18,15 @@ #ifndef LIBMINIFI_INCLUDE_CORE_REPOSITORY_VolatileContentRepository_H_ #define LIBMINIFI_INCLUDE_CORE_REPOSITORY_VolatileContentRepository_H_ -#include "core/Core.h" -#include "AtomicRepoEntries.h" -#include "io/AtomicEntryStream.h" #include "../ContentRepository.h" -#include "core/repository/VolatileRepository.h" -#include "properties/Configure.h" +#include "AtomicRepoEntries.h" #include "core/Connectable.h" +#include "core/Core.h" #include "core/logging/LoggerConfiguration.h" +#include "core/repository/VolatileRepository.h" +#include "io/AtomicEntryStream.h" +#include "io/AtomicEntryMemoryMap.h" +#include "properties/Configure.h" namespace org { namespace apache { namespace nifi { @@ -34,12 +35,13 @@ namespace core { namespace repository { /** - * Purpose: Stages content into a volatile area of memory. Note that when the maximum number - * of entries is consumed we will rollback a session to wait for others to be freed. + * Purpose: Stages content into a volatile area of memory. Note that when the + * maximum number of entries is consumed we will rollback a session to wait for + * others to be freed. */ -class VolatileContentRepository : public core::ContentRepository, public virtual core::repository::VolatileRepository> { +class VolatileContentRepository : public core::ContentRepository, + public virtual core::repository::VolatileRepository> { public: - static const char *minimal_locking; explicit VolatileContentRepository(std::string name = getClassName()) @@ -58,7 +60,6 @@ class VolatileContentRepository : public core::ContentRepository, public virtual } master_list_.clear(); } - } /** @@ -72,17 +73,32 @@ class VolatileContentRepository : public core::ContentRepository, public virtual */ virtual void stop(); + /** + * Generic operation which mutates the state of an object in the repo. + */ + template class U, typename... V> + std::shared_ptr mutate(const std::shared_ptr &claim, V... v); + /** * Creates writable stream. * @param claim resource claim - * @return BaseStream shared pointer that represents the stream the consumer will write to. + * @return BaseStream shared pointer that represents the stream the consumer + * will write to. 
*/ virtual std::shared_ptr write(const std::shared_ptr &claim, bool append); + /** + * Create a passthrough memory map to the memory. + * @param map_obj the object to map + * @return BaseMemoryMap shared pointer mapped directly to the memory + */ + virtual std::shared_ptr mmap(const std::shared_ptr &claim, size_t mapSize, bool readOnly); + /** * Creates readable stream. * @param claim resource claim - * @return BaseStream shared pointer that represents the stream from which the consumer will read.. + * @return BaseStream shared pointer that represents the stream from which the + * consumer will read.. */ virtual std::shared_ptr read(const std::shared_ptr &claim); @@ -90,31 +106,29 @@ class VolatileContentRepository : public core::ContentRepository, public virtual /** * Closes the claim. - * @return whether or not the claim is associated with content stored in volatile memory. + * @return whether or not the claim is associated with content stored in + * volatile memory. */ - virtual bool close(const std::shared_ptr &claim) { - return remove(claim); - } + virtual bool close(const std::shared_ptr &claim) { return remove(claim); } /** * Closes the claim. - * @return whether or not the claim is associated with content stored in volatile memory. + * @return whether or not the claim is associated with content stored in + * volatile memory. */ virtual bool remove(const std::shared_ptr &claim); protected: - virtual void start(); virtual void run(); - template + template std::shared_ptr shared_from_parent() { return std::dynamic_pointer_cast(shared_from_this()); } private: - bool minimize_locking_; // function pointers that are associated with the claims. @@ -122,11 +136,13 @@ class VolatileContentRepository : public core::ContentRepository, public virtual std::function)> resource_claim_check_; std::function)> claim_reclaimer_; - // mutex and master list that represent a cache of Atomic entries. this exists so that we don't have to walk the atomic entry list. - // The idea is to reduce the computational complexity while keeping access as maximally lock free as we can. + // mutex and master list that represent a cache of Atomic entries. this exists + // so that we don't have to walk the atomic entry list. The idea is to reduce + // the computational complexity while keeping access as maximally lock free as + // we can. std::mutex map_mutex_; - std::map>*> master_list_; + std::map> *> master_list_; // logger std::shared_ptr logger_; diff --git a/libminifi/include/io/AtomicEntryMemoryMap.h b/libminifi/include/io/AtomicEntryMemoryMap.h new file mode 100644 index 0000000000..404ed2a43d --- /dev/null +++ b/libminifi/include/io/AtomicEntryMemoryMap.h @@ -0,0 +1,92 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef LIBMINIFI_INCLUDE_IO_ATOMICENTRYMEMORYMAP_H_ +#define LIBMINIFI_INCLUDE_IO_ATOMICENTRYMEMORYMAP_H_ + +#include +#include +#include "BaseMemoryMap.h" +#include "Exception.h" +#include "core/logging/LoggerConfiguration.h" +#include "core/repository/AtomicRepoEntries.h" +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace io { + +template <typename T> +class AtomicEntryMemoryMap : public BaseMemoryMap { + public: + AtomicEntryMemoryMap(const T key, core::repository::AtomicEntry<T> *entry, size_t map_size) + : key_(key), entry_(entry), logger_(logging::LoggerFactory<AtomicEntryMemoryMap<T>>::getLogger()) { + if (entry_->getValue(key, &value_)) { + value_->resize(map_size); + entry_->decrementOwnership(); + invalid_stream_ = false; + } else { + invalid_stream_ = true; + } + } + + virtual ~AtomicEntryMemoryMap() { entry_->decrementOwnership(); } + + virtual void unmap() {} + + virtual size_t getSize() { + if (invalid_stream_) { + return -1; + } + + return value_->getBufferSize(); + } + + virtual void *getData() { + if (invalid_stream_) { + return nullptr; + } + + return reinterpret_cast<void *>(value_->getBuffer()); + } + + /** + * Resize the underlying buffer. + * @return pointer to the remapped data + */ + virtual void *resize(size_t new_size) { + value_->resize(new_size); + return reinterpret_cast<void *>(value_->getBuffer()); + } + + protected: + T key_; + core::repository::AtomicEntry<T> *entry_; + core::repository::RepoValue<T> *value_; + std::atomic<bool> invalid_stream_; + + // Logger + std::shared_ptr<logging::Logger> logger_; +}; + +} /* namespace io */ +} /* namespace minifi */ +} /* namespace nifi */ +} /* namespace apache */ +} /* namespace org */ + +#endif /* LIBMINIFI_INCLUDE_IO_ATOMICENTRYMEMORYMAP_H_ */ diff --git a/libminifi/include/io/BaseMemoryMap.h b/libminifi/include/io/BaseMemoryMap.h new file mode 100644 index 0000000000..0907434ff9 --- /dev/null +++ b/libminifi/include/io/BaseMemoryMap.h @@ -0,0 +1,72 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBMINIFI_INCLUDE_IO_BASEMEMORYMAP_H_ +#define LIBMINIFI_INCLUDE_IO_BASEMEMORYMAP_H_ +#include +#include +#include "Serializable.h" + +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace io { + +/** + * BaseMemoryMap is a generic interface to a chunk of memory mapped to an object (e.g. repository data). + * + * ** Not intended to be thread safe as it is not intended to be shared** + * + * Extensions may be thread safe and thus shareable, but that is up to the implementation. + */ +class BaseMemoryMap { + public: + virtual ~BaseMemoryMap() {} + + /** + * Gets the address of the mapped data. + * @return pointer to the mapped data, or nullptr if not mapped + **/ + virtual void *getData() = 0; + + /** + * Gets the size of the memory map. 
+ + /** + * Gets the size of the memory map. + * @return size of memory map + */ + virtual size_t getSize() = 0; + + /** + * Resize the underlying object. + * @return pointer to the remapped data + */ + virtual void *resize(size_t newSize) = 0; + + /** + * Explicitly unmap the memory. Memory will otherwise be unmapped at destruction. + * After this is called, getData will return nullptr. + */ + virtual void unmap() = 0; +}; + +} /* namespace io */ +} /* namespace minifi */ +} /* namespace nifi */ +} /* namespace apache */ +} /* namespace org */ +#endif /* LIBMINIFI_INCLUDE_IO_BASEMEMORYMAP_H_ */ diff --git a/libminifi/include/io/DatabaseMemoryMap.h b/libminifi/include/io/DatabaseMemoryMap.h new file mode 100644 index 0000000000..66ac6700bd --- /dev/null +++ b/libminifi/include/io/DatabaseMemoryMap.h @@ -0,0 +1,99 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LIBMINIFI_INCLUDE_IO_DATABASEMEMORYMAP_H_ +#define LIBMINIFI_INCLUDE_IO_DATABASEMEMORYMAP_H_ +#include +#include +#include +#include "BaseMemoryMap.h" +#include "Serializable.h" + +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace io { + +/** + * DatabaseMemoryMap buffers the mapped content in memory and commits it back + * to the underlying database when unmapped. + */ +class DatabaseMemoryMap : public BaseMemoryMap { + public: + DatabaseMemoryMap(const std::shared_ptr &claim, size_t map_size, std::function(const std::shared_ptr &)> write_fn, bool read_only) : claim_(claim), write_fn_(write_fn), read_only_(read_only) { + buf.resize(map_size); + } + + virtual ~DatabaseMemoryMap() { unmap(); } + + /** + * Gets the address of the mapped data. + * @return pointer to the mapped data, or nullptr if not mapped + **/ + virtual void *getData() { + return reinterpret_cast(&buf[0]); + } + + /** + * Gets the size of the memory map. + * @return size of memory map + */ + virtual size_t getSize() { return buf.size(); } + + /** + * Resize the underlying buffer. + * @return pointer to the remapped data + */ + virtual void *resize(size_t new_size) { + buf.resize(new_size); + return reinterpret_cast(&buf[0]); + } + + /** + * Explicitly unmap the memory; for writable maps this commits the buffer to + * the underlying DB. Also invoked at destruction. + */ + virtual void unmap() { + if (!read_only_) { + commit(); + } + } + + /** + * Commits the changes in memory to the underlying DB. + */ + void commit() { + auto ws = write_fn_(claim_); + if (ws->writeData(&buf[0], getSize()) != 0) { + throw std::runtime_error("Failed to write memory map data to db: " + claim_->getContentFullPath()); + } + } + + protected: + std::vector buf; + std::shared_ptr claim_; + std::function(const std::shared_ptr &)> write_fn_; + bool read_only_; +}; + +} /* namespace io */ +} /* namespace minifi */ +} /* namespace nifi */ +} /* namespace apache */ +} /* namespace org */ +#endif /* LIBMINIFI_INCLUDE_IO_DATABASEMEMORYMAP_H_ */ diff --git a/libminifi/include/io/FileMemoryMap.h b/libminifi/include/io/FileMemoryMap.h new file mode 100644 index 0000000000..1022a3a980 --- /dev/null +++ b/libminifi/include/io/FileMemoryMap.h @@ -0,0 +1,103 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef LIBMINIFI_INCLUDE_IO_FILEMEMORYMAP_H_ +#define LIBMINIFI_INCLUDE_IO_FILEMEMORYMAP_H_ + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include "BaseMemoryMap.h" +#include "core/logging/LoggerConfiguration.h" + +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace io { + +/** + * Purpose: File Memory Map extension. + * + * Design: Memory maps the file on construction, unmaps on destruction (RAII). + * Provides an interface to get the memory address and size. + */ +class FileMemoryMap : public io::BaseMemoryMap { + public: + /** + * File Memory Map constructor that opens and maps the given file with the + * given size. + */ + FileMemoryMap(const std::string &path, size_t map_size, bool read_only); + + virtual ~FileMemoryMap() { unmap(); } + + /** + * Gets the address of the mapped data. + * @return pointer to the mapped data, or nullptr if not mapped + **/ + virtual void *getData(); + + /** + * Gets the size of the memory map. + * @return size of memory map + */ + virtual size_t getSize(); + + /** + * Resize the underlying file. + * @return pointer to the remapped data + */ + virtual void *resize(size_t new_size); + + /** + * Explicitly unmap the memory. Memory will otherwise be unmapped at + * destruction. After this is called, getData will return nullptr. + */ + virtual void unmap(); + + protected: + + void map(const std::string &path, size_t map_size, bool read_only); + +#if defined(_POSIX_VERSION) + int unix_fd_; +#endif + size_t length_; + mio::mmap_sink rw_mmap_; + mio::mmap_source ro_mmap_; + std::string path_; + bool read_only_; + + private: + std::shared_ptr logger_; +}; + +} /* namespace io */ +} /* namespace minifi */ +} /* namespace nifi */ +} /* namespace apache */ +} /* namespace org */ + +#endif /* LIBMINIFI_INCLUDE_IO_FILEMEMORYMAP_H_ */ diff --git a/libminifi/src/core/ProcessSession.cpp b/libminifi/src/core/ProcessSession.cpp index 42271b802a..814f9b67b4 100644 --- a/libminifi/src/core/ProcessSession.cpp +++ b/libminifi/src/core/ProcessSession.cpp @@ -18,26 +18,27 @@ * limitations under the License. */ #include "core/ProcessSession.h" -#include "core/ProcessSessionReadCallback.h" #include -#include -#include +#include +#include +#include #include #include -#include +#include #include -#include +#include #include -#include -#include +#include +#include "core/ProcessSessionReadCallback.h" +#include "io/BaseMemoryMap.h" /* This implementation is only for native Windows systems. */ #if (defined _WIN32 || defined __WIN32__) && !defined __CYGWIN__ #define _WINSOCKAPI_ #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif -#include #include +#include #include #pragma comment(lib, "Ws2_32.lib") #include @@ -58,16 +59,15 @@ namespace core { std::shared_ptr ProcessSession::id_generator_ = utils::IdGenerator::getIdGenerator(); -ProcessSession::~ProcessSession() { - removeReferences(); -} +ProcessSession::~ProcessSession() { removeReferences(); } std::shared_ptr ProcessSession::create() { std::map empty; auto flow_version = process_context_->getProcessorNode()->getFlowIdentifier(); - std::shared_ptr record = std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); + std::shared_ptr record = + std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); record->setSize(0); if (flow_version != nullptr) { auto flow_id = flow_version->getFlowId(); @@ -84,13 +84,12 @@ std::shared_ptr ProcessSession::create() { return record; } -void ProcessSession::add(const std::shared_ptr &record) { - _addedFlowFiles[record->getUUIDStr()] = record; -} +void ProcessSession::add(const std::shared_ptr &record) { _addedFlowFiles[record->getUUIDStr()] = record; } std::shared_ptr ProcessSession::create(const std::shared_ptr &parent) { std::map empty; - std::shared_ptr record = std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); + std::shared_ptr record = + std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); if (record) { record->setSize(0); auto flow_version = process_context_->getProcessorNode()->getFlowIdentifier(); @@ -139,7 +138,8 @@ std::shared_ptr ProcessSession::clone(const std::shared_ptr ProcessSession::cloneDuringTransfer(std::shared_ptr &parent) { std::map empty; - std::shared_ptr record = std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); + std::shared_ptr record = + std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); if (record) { auto flow_version = process_context_->getProcessorNode()->getFlowIdentifier(); @@ -183,13 +183,12 @@ std::shared_ptr ProcessSession::clone(const std::shared_ptrlog_debug("Cloned
parent flow files %s to %s, with %u:%u", parent->getUUIDStr(), record->getUUIDStr(), offset, size); if (parent->getResourceClaim()) { - if ((uint64_t) (offset + size) > parent->getSize()) { + if ((uint64_t)(offset + size) > parent->getSize()) { // Set offset and size logger_->log_error("clone offset %ll and size %ll exceed parent size %llu", offset, size, parent->getSize()); // Remove the Add FlowFile for the session - std::map >::iterator it = this->_addedFlowFiles.find(record->getUUIDStr()); - if (it != this->_addedFlowFiles.end()) - this->_addedFlowFiles.erase(record->getUUIDStr()); + std::map>::iterator it = this->_addedFlowFiles.find(record->getUUIDStr()); + if (it != this->_addedFlowFiles.end()) this->_addedFlowFiles.erase(record->getUUIDStr()); return nullptr; } record->setOffset(parent->getOffset() + offset); @@ -210,7 +209,8 @@ void ProcessSession::remove(const std::shared_ptr &flow) { flow->setDeleted(true); if (flow->getResourceClaim() != nullptr) { flow->getResourceClaim()->decreaseFlowFileRecordOwnedCount(); - logger_->log_debug("Auto terminated %s %llu %s", flow->getResourceClaim()->getContentFullPath(), flow->getResourceClaim()->getFlowFileRecordOwnedCount(), flow->getUUIDStr()); + logger_->log_debug("Auto terminated %s %llu %s", flow->getResourceClaim()->getContentFullPath(), + flow->getResourceClaim()->getFlowFileRecordOwnedCount(), flow->getUUIDStr()); } else { logger_->log_debug("Flow does not contain content. no resource claim to decrement."); } @@ -236,12 +236,14 @@ void ProcessSession::removeAttribute(const std::shared_ptr &flow void ProcessSession::penalize(const std::shared_ptr &flow) { uint64_t penalization_period = process_context_->getProcessorNode()->getPenalizationPeriodMsec(); - logging::LOG_INFO(logger_) << "Penalizing " << flow->getUUIDStr() << " for " << penalization_period << "ms at " << process_context_->getProcessorNode()->getName(); + logging::LOG_INFO(logger_) << "Penalizing " << flow->getUUIDStr() << " for " << penalization_period << "ms at " + << process_context_->getProcessorNode()->getName(); flow->setPenaltyExpiration(getTimeMillis() + penalization_period); } void ProcessSession::transfer(const std::shared_ptr &flow, Relationship relationship) { - logging::LOG_INFO(logger_) << "Transferring " << flow->getUUIDStr() << " from " << process_context_->getProcessorNode()->getName() << " to relationship " << relationship.getName(); + logging::LOG_INFO(logger_) << "Transferring " << flow->getUUIDStr() << " from " << process_context_->getProcessorNode()->getName() + << " to relationship " << relationship.getName(); _transferRelationship[flow->getUUIDStr()] = relationship; } @@ -296,6 +298,57 @@ void ProcessSession::write(const std::shared_ptr &flow, OutputSt } } +void ProcessSession::mmap(const std::shared_ptr &flow, MemoryMapCallback *callback, size_t map_size, bool read_only) { + std::shared_ptr claim = std::make_shared(process_context_->getContentRepository()); + + try { + uint64_t start_time = getTimeMillis(); + claim->increaseFlowFileRecordOwnedCount(); + std::shared_ptr map = process_context_->getContentRepository()->mmap(claim, map_size, read_only); + // Call the callback to map the content + if (nullptr == map) { + claim->decreaseFlowFileRecordOwnedCount(); + rollback(); + return; + } + if (!callback->process(map)) { + claim->decreaseFlowFileRecordOwnedCount(); + rollback(); + return; + } + + flow->setSize(map->getSize()); + flow->setOffset(0); + std::shared_ptr flow_claim = flow->getResourceClaim(); + if (flow_claim != nullptr) { + // Remove the 
old claim + flow_claim->decreaseFlowFileRecordOwnedCount(); + flow->clearResourceClaim(); + } + flow->setResourceClaim(claim); + + map->unmap(); + std::stringstream details; + details << process_context_->getProcessorNode()->getName() << " modify flow record content " << flow->getUUIDStr(); + uint64_t end_time = getTimeMillis(); + provenance_report_->modifyContent(flow, details.str(), end_time - start_time); + } catch (std::exception &exception) { + if (flow && flow->getResourceClaim() == claim) { + flow->getResourceClaim()->decreaseFlowFileRecordOwnedCount(); + flow->clearResourceClaim(); + } + logger_->log_debug("Caught Exception %s", exception.what()); + throw; + } catch (...) { + if (flow && flow->getResourceClaim() == claim) { + flow->getResourceClaim()->decreaseFlowFileRecordOwnedCount(); + flow->clearResourceClaim(); + } + logger_->log_debug("Caught Exception during process session mmap"); + throw; + } +} + void ProcessSession::append(const std::shared_ptr &flow, OutputStreamCallback *callback) { std::shared_ptr claim = nullptr; if (flow->getResourceClaim() == nullptr) { @@ -316,8 +369,7 @@ void ProcessSession::append(const std::shared_ptr &flow, OutputS size_t oldPos = stream->getSize(); // this prevents an issue if we write, above, with zero length. - if (oldPos > 0) - stream->seek(oldPos + 1); + if (oldPos > 0) stream->seek(oldPos + 1); if (callback->process(stream) < 0) { rollback(); return; @@ -377,7 +429,8 @@ void ProcessSession::read(const std::shared_ptr &flow, InputStre /** * Imports a file from the data stream - * @param stream incoming data stream that contains the data to store into a file + * @param stream incoming data stream that contains the data to store into a + * file * @param flow flow file * */ @@ -423,7 +476,8 @@ void ProcessSession::importFrom(io::DataStream &stream, const std::shared_ptrsetResourceClaim(claim); - logger_->log_debug("Import offset %llu length %llu into content %s for FlowFile UUID %s", flow->getOffset(), flow->getSize(), flow->getResourceClaim()->getContentFullPath(), flow->getUUIDStr()); + logger_->log_debug("Import offset %llu length %llu into content %s for FlowFile UUID %s", flow->getOffset(), flow->getSize(), + flow->getResourceClaim()->getContentFullPath(), flow->getUUIDStr()); content_stream->closeStream(); std::stringstream details; @@ -469,19 +523,22 @@ void ProcessSession::import(std::string source, const std::shared_ptrlog_error("Seeking to %d failed for file %s (does file/filesystem support seeking?)", offset, source); + logger_->log_error( + "Seeking to %d failed for file %s (does file/filesystem support " + "seeking?)", + offset, source); invalidWrite = true; } } while (input.good()) { - input.read(reinterpret_cast(charBuffer.data()), size); + input.read(reinterpret_cast(charBuffer.data()), size); if (input) { if (stream->write(charBuffer.data(), size) < 0) { invalidWrite = true; break; } } else { - if (stream->write(reinterpret_cast(charBuffer.data()), input.gcount()) < 0) { + if (stream->write(reinterpret_cast(charBuffer.data()), input.gcount()) < 0) { invalidWrite = true; break; } @@ -498,13 +555,14 @@ void ProcessSession::import(std::string source, const std::shared_ptrsetResourceClaim(claim); - logger_->log_debug("Import offset %llu length %llu into content %s for FlowFile UUID %s", flow->getOffset(), flow->getSize(), flow->getResourceClaim()->getContentFullPath(), - flow->getUUIDStr()); + logger_->log_debug( + "Import offset %llu length %llu into content %s for FlowFile UUID " + "%s", + flow->getOffset(), flow->getSize(),
flow->getResourceClaim()->getContentFullPath(), flow->getUUIDStr()); stream->closeStream(); input.close(); - if (!keepSource) - std::remove(source.c_str()); + if (!keepSource) std::remove(source.c_str()); std::stringstream details; details << process_context_->getProcessorNode()->getName() << " modify flow record content " << flow->getUUIDStr(); auto endTime = getTimeMillis(); @@ -615,8 +673,8 @@ void ProcessSession::import(const std::string& source, std::vectorsetResourceClaim(claim); claim->increaseFlowFileRecordOwnedCount(); - logger_->log_debug("Import offset %u length %u into content %s for FlowFile UUID %s", flowFile->getOffset(), flowFile->getSize(), flowFile->getResourceClaim()->getContentFullPath(), - flowFile->getUUIDStr()); + logger_->log_debug("Import offset %u length %u into content %s for FlowFile UUID %s", flowFile->getOffset(), flowFile->getSize(), + flowFile->getResourceClaim()->getContentFullPath(), flowFile->getUUIDStr()); stream->closeStream(); std::string details = process_context_->getProcessorNode()->getName() + " modify flow record content " + flowFile->getUUIDStr(); uint64_t endTime = getTimeMillis(); @@ -687,38 +745,40 @@ void ProcessSession::stash(const std::string &key, const std::shared_ptrlog_debug("Stashing content from %s to key %s", flow->getUUIDStr(), key); if (!flow->getResourceClaim()) { - logger_->log_warn("Attempted to stash content of record %s when " - "there is no resource claim", - flow->getUUIDStr()); + logger_->log_warn( + "Attempted to stash content of record %s when " + "there is no resource claim", + flow->getUUIDStr()); return; } -// Stash the claim + // Stash the claim auto claim = flow->getResourceClaim(); flow->setStashClaim(key, claim); -// Clear current claim + // Clear current claim flow->clearResourceClaim(); } void ProcessSession::restore(const std::string &key, const std::shared_ptr &flow) { logger_->log_info("Restoring content to %s from key %s", flow->getUUIDStr(), key); -// Restore the claim + // Restore the claim if (!flow->hasStashClaim(key)) { logger_->log_warn("Requested restore to record %s from unknown key %s", flow->getUUIDStr(), key); return; } -// Disown current claim if existing + // Disown current claim if existing if (flow->getResourceClaim()) { - logger_->log_warn("Restoring stashed content of record %s from key %s when there is " - "existing content; existing content will be overwritten", - flow->getUUIDStr(), key); + logger_->log_warn( + "Restoring stashed content of record %s from key %s when there is " + "existing content; existing content will be overwritten", + flow->getUUIDStr(), key); flow->releaseClaim(flow->getResourceClaim()); } -// Restore the claim + // Restore the claim auto stashClaim = flow->getStashClaim(key); flow->setResourceClaim(stashClaim); flow->clearStashClaim(key); @@ -726,15 +786,16 @@ void ProcessSession::restore(const std::string &key, const std::shared_ptr record = it.second; - if (record->isDeleted()) - continue; + if (record->isDeleted()) continue; std::map::iterator itRelationship = this->_transferRelationship.find(record->getUUIDStr()); if (itRelationship != _transferRelationship.end()) { Relationship relationship = itRelationship->second; - // Find the relationship, we need to find the connections for that relationship + // Find the relationship, we need to find the connections for that + // relationship std::set> connections = process_context_->getProcessorNode()->getOutGoingConnections(relationship.getName()); if (connections.empty()) { // No connection @@ -747,8 +808,10 @@ void 
ProcessSession::commit() { remove(record); } } else { - // We connections, clone the flow and assign the connection accordingly - for (std::set>::iterator itConnection = connections.begin(); itConnection != connections.end(); ++itConnection) { + // We have connections; clone the flow and assign the connection + // accordingly + for (std::set>::iterator itConnection = connections.begin(); itConnection != connections.end(); + ++itConnection) { std::shared_ptr connection = *itConnection; if (itConnection == connections.begin()) { // First connection which the flow need be routed to @@ -773,12 +836,12 @@ void ProcessSession::commit() { // Do the same thing for added flow file for (const auto it : _addedFlowFiles) { std::shared_ptr record = it.second; - if (record->isDeleted()) - continue; + if (record->isDeleted()) continue; std::map::iterator itRelationship = this->_transferRelationship.find(record->getUUIDStr()); if (itRelationship != _transferRelationship.end()) { Relationship relationship = itRelationship->second; - // Find the relationship, we need to find the connections for that relationship + // Find the relationship, we need to find the connections for that + // relationship std::set> connections = process_context_->getProcessorNode()->getOutGoingConnections(relationship.getName()); if (connections.empty()) { // No connection @@ -792,8 +855,10 @@ void ProcessSession::commit() { remove(record); } } else { - // We connections, clone the flow and assign the connection accordingly - for (std::set>::iterator itConnection = connections.begin(); itConnection != connections.end(); ++itConnection) { + // We have connections; clone the flow and assign the connection + // accordingly + for (std::set>::iterator itConnection = connections.begin(); itConnection != connections.end(); + ++itConnection) { std::shared_ptr connection(*itConnection); if (itConnection == connections.begin()) { // First connection which the flow need be routed to @@ -816,7 +881,8 @@ void ProcessSession::commit() { } std::shared_ptr connection = nullptr; - // Complete process the added and update flow files for the session, send the flow file to its queue + // Complete processing the added and updated flow files for the session; send + // the flow file to its queue for (const auto &it : _updatedFlowFiles) { std::shared_ptr record = it.second; logger_->log_trace("See %s in %s", record->getUUIDStr(), "_updatedFlowFiles"); @@ -825,8 +891,7 @@ void ProcessSession::commit() { } connection = std::static_pointer_cast(record->getConnection()); - if ((connection) != nullptr) - connection->put(record); + if ((connection) != nullptr) connection->put(record); } for (const auto &it : _addedFlowFiles) { std::shared_ptr record = it.second; @@ -835,8 +900,7 @@ void ProcessSession::commit() { continue; } connection = std::static_pointer_cast(record->getConnection()); - if ((connection) != nullptr) - connection->put(record); + if ((connection) != nullptr) connection->put(record); } // Process the clone flow files for (const auto &it : _clonedFlowFiles) { @@ -846,8 +910,7 @@ void ProcessSession::commit() { continue; } connection = std::static_pointer_cast(record->getConnection()); - if ((connection) != nullptr) - connection->put(record); + if ((connection) != nullptr) connection->put(record); } // All done @@ -878,7 +941,8 @@ void ProcessSession::rollback() { if ((connection) != nullptr) { std::shared_ptr flowf = std::static_pointer_cast(record); flowf->setSnapShot(false); - logger_->log_debug("ProcessSession rollback for %s, record %s, to connection %s",
process_context_->getProcessorNode()->getName(), record->getUUIDStr(), connection->getName()); + logger_->log_debug("ProcessSession rollback for %s, record %s, to connection %s", process_context_->getProcessorNode()->getName(), + record->getUUIDStr(), connection->getName()); connection->put(record); } } @@ -909,11 +973,11 @@ std::shared_ptr ProcessSession::get() { std::shared_ptr current = std::static_pointer_cast(first); do { - std::set > expired; + std::set> expired; std::shared_ptr ret = current->poll(expired); if (expired.size() > 0) { // Remove expired flow record - for (std::set >::iterator it = expired.begin(); it != expired.end(); ++it) { + for (std::set>::iterator it = expired.begin(); it != expired.end(); ++it) { std::shared_ptr record = *it; std::stringstream details; details << process_context_->getProcessorNode()->getName() << " expire flow record " << record->getUUIDStr(); @@ -925,7 +989,8 @@ std::shared_ptr ProcessSession::get() { ret->setDeleted(false); _updatedFlowFiles[ret->getUUIDStr()] = ret; std::map empty; - std::shared_ptr snapshot = std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); + std::shared_ptr snapshot = + std::make_shared(process_context_->getFlowFileRepository(), process_context_->getContentRepository(), empty); auto flow_version = process_context_->getProcessorNode()->getFlowIdentifier(); if (flow_version != nullptr) { auto flow_id = flow_version->getFlowId(); diff --git a/libminifi/src/core/repository/FileSystemRepository.cpp b/libminifi/src/core/repository/FileSystemRepository.cpp index 4607d74138..6fe4f09d90 100644 --- a/libminifi/src/core/repository/FileSystemRepository.cpp +++ b/libminifi/src/core/repository/FileSystemRepository.cpp @@ -19,6 +19,7 @@ #include "core/repository/FileSystemRepository.h" #include #include +#include "io/FileMemoryMap.h" #include "io/FileStream.h" #include "utils/file/FileUtils.h" @@ -39,13 +40,16 @@ bool FileSystemRepository::initialize(const std::shared_ptr & utils::file::FileUtils::create_dir(directory_); return true; } -void FileSystemRepository::stop() { -} +void FileSystemRepository::stop() {} std::shared_ptr FileSystemRepository::write(const std::shared_ptr &claim, bool append) { return std::make_shared(claim->getContentFullPath(), append); } +std::shared_ptr FileSystemRepository::mmap(const std::shared_ptr &claim, size_t mapSize, bool readOnly) { + return std::make_shared(claim->getContentFullPath(), mapSize, readOnly); +} + bool FileSystemRepository::exists(const std::shared_ptr &streamId) { std::ifstream file(streamId->getContentFullPath()); return file.good(); diff --git a/libminifi/src/core/repository/VolatileContentRepository.cpp b/libminifi/src/core/repository/VolatileContentRepository.cpp index 9b5f9be920..7b9569831b 100644 --- a/libminifi/src/core/repository/VolatileContentRepository.cpp +++ b/libminifi/src/core/repository/VolatileContentRepository.cpp @@ -17,13 +17,14 @@ */ #include "core/repository/VolatileContentRepository.h" -#include "core/expect.h" #include -#include #include +#include #include -#include "utils/StringUtils.h" +#include "core/expect.h" +#include "io/AtomicEntryMemoryMap.h" #include "io/FileStream.h" +#include "utils/StringUtils.h" namespace org { namespace apache { @@ -40,10 +41,11 @@ bool VolatileContentRepository::initialize(const std::shared_ptr &con if (lhsPtr == nullptr || rhsPtr == nullptr) { return false; } - return lhsPtr->getContentFullPath() == rhsPtr->getContentFullPath();}; - resource_claim_check_ = [](std::shared_ptr 
claim) { - return claim->getFlowFileRecordOwnedCount() <= 0;}; - claim_reclaimer_ = [&](std::shared_ptr claim) {if (claim->getFlowFileRecordOwnedCount() <= 0) { + return lhsPtr->getContentFullPath() == rhsPtr->getContentFullPath(); + }; + resource_claim_check_ = [](std::shared_ptr claim) { return claim->getFlowFileRecordOwnedCount() <= 0; }; + claim_reclaimer_ = [&](std::shared_ptr claim) { + if (claim->getFlowFileRecordOwnedCount() <= 0) { remove(claim); } }; @@ -69,26 +71,22 @@ bool VolatileContentRepository::initialize(const std::shared_ptr &con return true; } -void VolatileContentRepository::stop() { - running_ = false; -} +void VolatileContentRepository::stop() { running_ = false; } -void VolatileContentRepository::run() { -} +void VolatileContentRepository::run() {} void VolatileContentRepository::start() { - if (this->purge_period_ <= 0) - return; - if (running_) - return; + if (this->purge_period_ <= 0) return; + if (running_) return; thread_ = std::thread(&VolatileContentRepository::run, shared_from_parent()); thread_.detach(); running_ = true; logger_->log_info("%s Repository Monitor Thread Start", getName()); } -std::shared_ptr VolatileContentRepository::write(const std::shared_ptr &claim, bool append) { - logger_->log_info("enter write for %s", claim->getContentFullPath()); +template class U, typename... V> +std::shared_ptr VolatileContentRepository::mutate(const std::shared_ptr &claim, V... v) { + logger_->log_info("enter mutate for %s", claim->getContentFullPath()); { std::lock_guard lock(map_mutex_); auto claim_check = master_list_.find(claim->getContentFullPath()); @@ -98,7 +96,7 @@ std::shared_ptr VolatileContentRepository::write(const std::shar if (ent == nullptr) { return nullptr; } - return std::make_shared>>(claim, ent); + return std::make_shared>>(claim, ent, v...); } } @@ -109,7 +107,7 @@ std::shared_ptr VolatileContentRepository::write(const std::shar std::lock_guard lock(map_mutex_); master_list_[claim->getContentFullPath()] = ent; logger_->log_info("Minimize locking, return stream for %s", claim->getContentFullPath()); - return std::make_shared>>(claim, ent); + return std::make_shared>>(claim, ent, v...); } size++; } @@ -117,19 +115,31 @@ std::shared_ptr VolatileContentRepository::write(const std::shar std::lock_guard lock(map_mutex_); auto claim_check = master_list_.find(claim->getContentFullPath()); if (claim_check != master_list_.end()) { - return std::make_shared>>(claim, claim_check->second); + return std::make_shared>>(claim, claim_check->second, v...); } else { AtomicEntry> *ent = new AtomicEntry>(¤t_size_, &max_size_); if (ent->testAndSetKey(claim, nullptr, nullptr, resource_claim_comparator_)) { master_list_[claim->getContentFullPath()] = ent; - return std::make_shared>>(claim, ent); + return std::make_shared>>(claim, ent, v...); } } } - logger_->log_info("Cannot write %s %d, returning nullptr to roll back session. Repo is either full or locked", claim->getContentFullPath(), size); + logger_->log_info( + "Cannot mutate %s %d, returning nullptr to roll back " + "session. 
Repo is either full or locked", + claim->getContentFullPath(), size); return nullptr; } +std::shared_ptr VolatileContentRepository::write(const std::shared_ptr &claim, bool append) { + return mutate(claim); +} + +std::shared_ptr VolatileContentRepository::mmap(const std::shared_ptr &claim, size_t mapSize, + bool readOnly) { + return mutate(claim, mapSize); +} + bool VolatileContentRepository::exists(const std::shared_ptr &claim) { std::lock_guard lock(map_mutex_); auto claim_check = master_list_.find(claim->getContentFullPath()); diff --git a/libminifi/src/io/FileMemoryMap.cpp b/libminifi/src/io/FileMemoryMap.cpp new file mode 100644 index 0000000000..44c78eef03 --- /dev/null +++ b/libminifi/src/io/FileMemoryMap.cpp @@ -0,0 +1,176 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/FileMemoryMap.h" + +#include +#include +#include +#include +#include +#include + +namespace org { +namespace apache { +namespace nifi { +namespace minifi { +namespace io { + +FileMemoryMap::FileMemoryMap(const std::string &path, size_t map_size, bool read_only) + : path_(path), length_(map_size), read_only_(read_only), logger_(logging::LoggerFactory::getLogger()) { + map(path, map_size, read_only); +} + +void FileMemoryMap::map(const std::string &path, size_t map_size, bool read_only) { + std::error_code error; + + /** + * Below, we use two different methods to ensure that the file exists and it is as big + * as the requested mapping (rw mode), or error if it isn't (ro mode). Then UNIX version is + * faster as it creates and efficiently uses only one native file handle, which is passed + * to mio for mapping. 
+ +#if defined(_POSIX_VERSION) + // open the file (UNIX-optimized version, faster than generic version) + if (!read_only) { + unix_fd_ = open(path.c_str(), O_RDWR | O_CREAT, 0600); + } else { + unix_fd_ = open(path.c_str(), O_RDONLY | O_CREAT, 0600); + } + + if (unix_fd_ < 0) { + throw std::runtime_error("Failed to open for memory mapping: " + path); + } + + // ensure file is at least as big as requested map size (UNIX-optimized version, faster than generic version) + off_t cur_size = lseek(unix_fd_, 0, SEEK_END); + + if (cur_size < 0) { + close(unix_fd_); + unix_fd_ = -1; + throw std::runtime_error("Failed to seek end of file for mapping: " + path); + } + + if ((size_t)cur_size < map_size) { + if (!read_only) { + if (lseek(unix_fd_, map_size - 1, SEEK_SET) < 0) { + close(unix_fd_); + unix_fd_ = -1; + throw std::runtime_error("Failed to seek " + std::to_string(map_size - 1) + " bytes for mapping: " + path); + } + + if (write(unix_fd_, "", 1) < 0) { + close(unix_fd_); + unix_fd_ = -1; + throw std::runtime_error("Failed to write 0 byte at end of file to expand file: " + path); + } + } else { + close(unix_fd_); + unix_fd_ = -1; + throw std::runtime_error("File is smaller than map size and read-only mode is set: " + path); + } + } + + // memory map the file + if (read_only) { + ro_mmap_ = mio::make_mmap_source(unix_fd_, error); + } else { + rw_mmap_ = mio::make_mmap_sink(unix_fd_, error); + } + +#else + { + // ensure file is at least as big as requested map size (generic version) + std::ifstream ifs(path, std::ifstream::in | std::ifstream::ate | std::ifstream::binary); + std::ifstream::pos_type file_size = ifs.tellg(); + ifs.close(); + + if (file_size < 0 || (size_t)file_size < map_size) { + if (!read_only) { + logger_->log_info("Resizing file '%s' to '%d' bytes", path, map_size); + // open without truncation so existing content is preserved; create the file if needed + std::fstream ofs(path, std::fstream::in | std::fstream::out | std::fstream::binary); + if (!ofs.is_open()) { + ofs.open(path, std::fstream::out | std::fstream::binary); + } + ofs.seekp(map_size - 1, std::ios::beg); + ofs << '\0'; + } else { + throw std::runtime_error("File is smaller than map size and read-only mode is set: " + path); + } + } + } + + // memory map the file + if (read_only) { + ro_mmap_ = mio::make_mmap_source(path, error); + } else { + rw_mmap_ = mio::make_mmap_sink(path, error); + } +#endif + + if (error) { + throw std::runtime_error("Failed to memory map file '" + path + "' due to: " + error.message()); + } +} + +void FileMemoryMap::unmap() { + if (read_only_) { + ro_mmap_.unmap(); + } else { + std::error_code error; + rw_mmap_.sync(error); + + if (error) { + throw std::runtime_error("Failed to unmap memory-mapped file due to: " + error.message()); + } + + rw_mmap_.unmap(); + } + +#if defined(_POSIX_VERSION) + if (unix_fd_ >= 0) { + close(unix_fd_); + } + + unix_fd_ = -1; +#endif +} + +void *FileMemoryMap::getData() { + if (read_only_) { + return const_cast<char *>(&ro_mmap_[0]); + } else { + return &rw_mmap_[0]; + } +} + +size_t FileMemoryMap::getSize() { + return length_; +} + +void *FileMemoryMap::resize(size_t new_size) { + if (read_only_) { + throw std::runtime_error("Cannot resize read-only mmap"); + } + + unmap(); + map(path_, new_size, false); + length_ = new_size; + + return &rw_mmap_[0]; +} + +} // namespace io +} // namespace minifi +} // namespace nifi +} // namespace apache +} // namespace org diff --git a/libminifi/test/rocksdb-tests/DBContentRepositoryTests.cpp b/libminifi/test/rocksdb-tests/DBContentRepositoryTests.cpp index 15fb81dac2..a1ef7d7fb2 100644 --- a/libminifi/test/rocksdb-tests/DBContentRepositoryTests.cpp +++ b/libminifi/test/rocksdb-tests/DBContentRepositoryTests.cpp @@ -15,15 +15,15 @@ * See the License for the specific language governing permissions
and * limitations under the License. */ -#include "../TestBase.h" #include #include +#include "../TestBase.h" #include "../unit/ProvenanceTestHelper.h" -#include "provenance/Provenance.h" +#include "DatabaseContentRepository.h" #include "FlowFileRecord.h" #include "core/Core.h" -#include "DatabaseContentRepository.h" #include "properties/Configure.h" +#include "provenance/Provenance.h" TEST_CASE("Write Claim", "[TestDBCR1]") { TestController testController; @@ -35,7 +35,6 @@ TEST_CASE("Write Claim", "[TestDBCR1]") { configuration->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir); REQUIRE(true == content_repo->initialize(configuration)); - auto claim = std::make_shared(content_repo); auto stream = content_repo->write(claim); @@ -76,7 +75,6 @@ TEST_CASE("Delete Claim", "[TestDBCR2]") { configuration->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir); REQUIRE(true == content_repo->initialize(configuration)); - auto claim = std::make_shared(content_repo); auto stream = content_repo->write(claim); @@ -151,7 +149,6 @@ TEST_CASE("Test Null Claim", "[TestDBCR4]") { configuration->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir); REQUIRE(true == content_repo->initialize(configuration)); - auto claim = std::make_shared(content_repo); auto stream = content_repo->write(nullptr); diff --git a/libminifi/test/rocksdb-tests/RocksMMTests.cpp b/libminifi/test/rocksdb-tests/RocksMMTests.cpp new file mode 100644 index 0000000000..5aa2bf814b --- /dev/null +++ b/libminifi/test/rocksdb-tests/RocksMMTests.cpp @@ -0,0 +1,107 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
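One detail the RocksDB-backed tests below depend on: a writable DatabaseMemoryMap buffers its bytes in memory and only commits them to the database when unmap() runs, which the destructor triggers at the end of the enclosing block. The pattern, condensed (dbr and claim as in the tests; payload/payload_len stand in for the test string):

{
  auto mm = dbr->mmap(claim, 1024, false);  // writable map over an in-memory buffer
  std::memcpy(mm->getData(), payload, payload_len);
}  // ~DatabaseMemoryMap -> unmap() -> commit() writes the buffer to the DB
auto ro = dbr->mmap(claim, 1024, true);     // a read-only map now sees the committed bytes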
+ */ +#include +#include +#include +#include +#include +#include "../TestBase.h" +#include "../unit/ProvenanceTestHelper.h" +#include "DatabaseContentRepository.h" +#include "FlowFileRecord.h" +#include "FlowFileRepository.h" +#include "core/Core.h" +#include "core/RepositoryFactory.h" +#include "properties/Configure.h" +#include "provenance/Provenance.h" + +TEST_CASE("DatabaseContentRepository Write/Read", "[RocksMMWriteRead]") { + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + + auto configuration = std::make_shared(); + configuration->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir); + + auto dbr = std::make_shared(); + REQUIRE(true == dbr->initialize(configuration)); + + std::string write_test_string("test read val"); + + auto claim = std::make_shared(test_file, dbr); + + { + auto mm = dbr->mmap(claim, 1024, false); + REQUIRE(mm != nullptr); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length()); + } + + { + auto mm = dbr->mmap(claim, 1024, true); + REQUIRE(mm != nullptr); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == "test read val"); + } +} + +TEST_CASE("DatabaseContentRepository Resize", "[RocksMMResize]") { + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + + auto configuration = std::make_shared(); + configuration->set(minifi::Configure::nifi_dbcontent_repository_directory_default, dir); + + std::string write_test_string("write test"); + std::string write_test_string_resized("write testtset etirw"); + + { + auto dbr = std::make_shared(); + REQUIRE(true == dbr->initialize(configuration)); + auto claim = std::make_shared(test_file, dbr); + auto mm = dbr->mmap(claim, 11, false); + REQUIRE(mm != nullptr); + REQUIRE(mm->getSize() == 11); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length() + 1); + } + + { + auto dbr = std::make_shared(); + REQUIRE(true == dbr->initialize(configuration)); + auto claim = std::make_shared(test_file, dbr); + auto mm = dbr->mmap(claim, 11, false); + REQUIRE(mm != nullptr); + REQUIRE(mm->getSize() == 11); + mm->resize(21); + REQUIRE(mm->getSize() == 21); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string_resized.c_str(), write_test_string_resized.length() + 1); + } + + { + auto dbr = std::make_shared(); + REQUIRE(true == dbr->initialize(configuration)); + auto claim = std::make_shared(test_file, dbr); + auto mm = dbr->mmap(claim, 21, true); + REQUIRE(mm != nullptr); + REQUIRE(mm->getSize() == 21); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == write_test_string_resized); + } +} diff --git a/libminifi/test/unit/MemoryMapTests.cpp b/libminifi/test/unit/MemoryMapTests.cpp new file mode 100644 index 0000000000..8136a8fd36 --- /dev/null +++ b/libminifi/test/unit/MemoryMapTests.cpp @@ -0,0 +1,181 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
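The volatile-repository cases in MemoryMapTests.cpp further below lean on a related property: AtomicEntryMemoryMap::resize() delegates to the underlying buffer's resize, so bytes written before the call survive it, vector-style. In brief (vr and claim as in those tests):

{
  auto mm = vr->mmap(claim, 11, false);
  std::memcpy(mm->getData(), "write test", 11);  // 10 chars plus the terminating '\0'
}
{
  auto mm = vr->mmap(claim, 11, false);
  mm->resize(21);  // grows the backing buffer; the first 11 bytes are preserved
}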
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "../TestBase.h" +#include "ResourceClaim.h" +#include "core/Core.h" +#include "io/FileMemoryMap.h" +#include "properties/Configure.h" + +TEST_CASE("MemoryMap Test Test", "[MemoryMapTest1]") { REQUIRE(true); } + +TEST_CASE("MemoryMap FileSystemRepository Read", "[MemoryMapTestFSRead]") { + auto fsr = std::make_shared(); + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + + { + std::ofstream os(test_file); + os << "hello"; + } + + auto claim = std::make_shared(test_file, fsr); + auto mm = fsr->mmap(claim, 1024, false); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == "hello"); +} + +TEST_CASE("MemoryMap FileSystemRepository RO Read", "[MemoryMapTestFSRORead]") { + auto fsr = std::make_shared(); + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + + { + std::ofstream os(test_file); + os << "hello"; + } + + auto claim = std::make_shared(test_file, fsr); + auto mm = fsr->mmap(claim, 5, true); + std::string read_string(reinterpret_cast(mm->getData()), 5); + REQUIRE(read_string == "hello"); +} + +TEST_CASE("MemoryMap FileSystemRepository Resize", "[MemoryMapTestFSResize]") { + auto fsr = std::make_shared(); + TestController testController; + LogTestController::getInstance().setTrace(); + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + auto claim = std::make_shared(test_file, fsr); + auto mm = fsr->mmap(claim, 11, false); + std::string write_test_string("write test"); + REQUIRE(mm->getSize() == 11); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length()); + std::string read_string(reinterpret_cast(mm->getData())); + std::stringstream iss; + { + std::ifstream is(test_file); + iss << is.rdbuf(); + } + REQUIRE(read_string == "write test"); + + mm->resize(21); + REQUIRE(mm->getSize() == 21); + std::string write_test_string_resized("write testtset etirw"); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string_resized.c_str(), write_test_string_resized.length()); + + std::string read_string_resized(reinterpret_cast(mm->getData())); + std::stringstream iss_resized; + { + std::ifstream is(test_file); + iss_resized << is.rdbuf(); + } + REQUIRE(read_string_resized == write_test_string_resized); +} + +TEST_CASE("MemoryMap VolatileContentRepository Write/Read", "[MemoryMapTestVWriteRead]") { + auto vr = std::make_shared(); + auto c = std::make_shared(); + vr->initialize(c); + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + std::string write_test_string("test read val"); + + auto claim = std::make_shared(test_file, vr); + + { + auto mm = vr->mmap(claim, 
1024, false); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length()); + } + + { + auto mm = vr->mmap(claim, 1024, false); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == "test read val"); + } +} + +TEST_CASE("MemoryMap VolatileContentRepository Resize", "[MemoryMapTestVResize]") { + auto vr = std::make_shared(); + auto c = std::make_shared(); + vr->initialize(c); + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + std::string write_test_string("write test"); + std::string write_test_string_resized("write testtset etirw"); + + auto claim = std::make_shared(test_file, vr); + + { + auto mm = vr->mmap(claim, 11, false); + REQUIRE(mm->getSize() == 11); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length()); + } + + { + auto mm = vr->mmap(claim, 11, false); + REQUIRE(mm->getSize() == 11); + mm->resize(21); + REQUIRE(mm->getSize() == 21); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string_resized.c_str(), write_test_string_resized.length()); + } + + { + auto mm = vr->mmap(claim, 21, false); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == write_test_string_resized); + } +} + +TEST_CASE("MemoryMap VolatileContentRepository RO Write/Read", "[MemoryMapTestVROWriteRead]") { + auto vr = std::make_shared(); + auto c = std::make_shared(); + vr->initialize(c); + TestController testController; + char format[] = "/tmp/testRepo.XXXXXX"; + auto dir = std::string(testController.createTempDirectory(format)); + auto test_file = dir + "/testfile"; + std::string write_test_string("test read val"); + + auto claim = std::make_shared(test_file, vr); + + { + auto mm = vr->mmap(claim, 1024, true); + std::memcpy(reinterpret_cast(mm->getData()), write_test_string.c_str(), write_test_string.length()); + } + + { + auto mm = vr->mmap(claim, 1024, true); + std::string read_string(reinterpret_cast(mm->getData())); + REQUIRE(read_string == "test read val"); + } +} diff --git a/thirdparty/benchmark-1.5.0/.clang-format b/thirdparty/benchmark-1.5.0/.clang-format new file mode 100644 index 0000000000..e7d00feaa0 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +PointerAlignment: Left +... diff --git a/thirdparty/benchmark-1.5.0/.gitignore b/thirdparty/benchmark-1.5.0/.gitignore new file mode 100644 index 0000000000..806d04c6b3 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/.gitignore @@ -0,0 +1,61 @@ +*.a +*.so +*.so.?* +*.dll +*.exe +*.dylib +*.cmake +!/cmake/*.cmake +!/test/AssemblyTests.cmake +*~ +*.pyc +__pycache__ + +# lcov +*.lcov +/lcov + +# cmake files. +/Testing +CMakeCache.txt +CMakeFiles/ +cmake_install.cmake + +# makefiles. +Makefile + +# in-source build. +bin/ +lib/ +/test/*_test + +# exuberant ctags. +tags + +# YouCompleteMe configuration. +.ycm_extra_conf.pyc + +# ninja generated files. +.ninja_deps +.ninja_log +build.ninja +install_manifest.txt +rules.ninja + +# bazel output symlinks. +bazel-* + +# out-of-source build top-level folders. 
+build/ +_build/ +build*/ + +# in-source dependencies +/googletest/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +CMakeSettings.json + +# Visual Studio Code cache/options directory +.vscode/ diff --git a/thirdparty/benchmark-1.5.0/.travis-libcxx-setup.sh b/thirdparty/benchmark-1.5.0/.travis-libcxx-setup.sh new file mode 100644 index 0000000000..a591743c6a --- /dev/null +++ b/thirdparty/benchmark-1.5.0/.travis-libcxx-setup.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Install a newer CMake version +curl -sSL https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh -o install-cmake.sh +chmod +x install-cmake.sh +sudo ./install-cmake.sh --prefix=/usr/local --skip-license + +# Checkout LLVM sources +git clone --depth=1 https://github.com/llvm-mirror/llvm.git llvm-source +git clone --depth=1 https://github.com/llvm-mirror/libcxx.git llvm-source/projects/libcxx +git clone --depth=1 https://github.com/llvm-mirror/libcxxabi.git llvm-source/projects/libcxxabi + +# Setup libc++ options +if [ -z "$BUILD_32_BITS" ]; then + export BUILD_32_BITS=OFF && echo disabling 32 bit build +fi + +# Build and install libc++ (Use unstable ABI for better sanitizer coverage) +mkdir llvm-build && cd llvm-build +cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_INSTALL_PREFIX=/usr \ + -DLIBCXX_ABI_UNSTABLE=ON \ + -DLLVM_USE_SANITIZER=${LIBCXX_SANITIZER} \ + -DLLVM_BUILD_32_BITS=${BUILD_32_BITS} \ + ../llvm-source +make cxx -j2 +sudo make install-cxxabi install-cxx +cd ../ diff --git a/thirdparty/benchmark-1.5.0/.travis.yml b/thirdparty/benchmark-1.5.0/.travis.yml new file mode 100644 index 0000000000..6b6cfc7046 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/.travis.yml @@ -0,0 +1,235 @@ +sudo: required +dist: trusty +language: cpp + +env: + global: + - /usr/local/bin:$PATH + +matrix: + include: + - compiler: gcc + addons: + apt: + packages: + - lcov + env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Coverage + - compiler: gcc + env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Debug + - compiler: gcc + env: COMPILER=g++ C_COMPILER=gcc BUILD_TYPE=Release + - compiler: gcc + addons: + apt: + packages: + - g++-multilib + - libc6:i386 + env: + - COMPILER=g++ + - C_COMPILER=gcc + - BUILD_TYPE=Debug + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" + - compiler: gcc + addons: + apt: + packages: + - g++-multilib + - libc6:i386 + env: + - COMPILER=g++ + - C_COMPILER=gcc + - BUILD_TYPE=Release + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" + - compiler: gcc + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=g++-6 C_COMPILER=gcc-6 BUILD_TYPE=Debug + - ENABLE_SANITIZER=1 + - EXTRA_FLAGS="-fno-omit-frame-pointer -g -O2 -fsanitize=undefined,address -fuse-ld=gold" + - compiler: clang + env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Debug + - compiler: clang + env: COMPILER=clang++ C_COMPILER=clang BUILD_TYPE=Release + # Clang w/ libc++ + - compiler: clang + dist: xenial + addons: + apt: + packages: + clang-3.8 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug + - LIBCXX_BUILD=1 + - EXTRA_CXX_FLAGS="-stdlib=libc++" + - compiler: clang + dist: xenial + addons: + apt: + packages: + clang-3.8 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release + - LIBCXX_BUILD=1 + - EXTRA_CXX_FLAGS="-stdlib=libc++" + # Clang w/ 32bit libc++ + - compiler: clang + dist: xenial + addons: + apt: + packages: + - clang-3.8 + - g++-multilib + - libc6:i386 + env: + - INSTALL_GCC6_FROM_PPA=1 + - 
COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug + - LIBCXX_BUILD=1 + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" + - EXTRA_CXX_FLAGS="-stdlib=libc++" + # Clang w/ 32bit libc++ + - compiler: clang + dist: xenial + addons: + apt: + packages: + - clang-3.8 + - g++-multilib + - libc6:i386 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Release + - LIBCXX_BUILD=1 + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" + - EXTRA_CXX_FLAGS="-stdlib=libc++" + # Clang w/ libc++, ASAN, UBSAN + - compiler: clang + dist: xenial + addons: + apt: + packages: + clang-3.8 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug + - LIBCXX_BUILD=1 LIBCXX_SANITIZER="Undefined;Address" + - ENABLE_SANITIZER=1 + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=undefined,address -fno-sanitize-recover=all" + - EXTRA_CXX_FLAGS="-stdlib=libc++" + - UBSAN_OPTIONS=print_stacktrace=1 + # Clang w/ libc++ and MSAN + - compiler: clang + dist: xenial + addons: + apt: + packages: + clang-3.8 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=Debug + - LIBCXX_BUILD=1 LIBCXX_SANITIZER=MemoryWithOrigins + - ENABLE_SANITIZER=1 + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=memory -fsanitize-memory-track-origins" + - EXTRA_CXX_FLAGS="-stdlib=libc++" + # Clang w/ libc++ and MSAN + - compiler: clang + dist: xenial + addons: + apt: + packages: + clang-3.8 + env: + - INSTALL_GCC6_FROM_PPA=1 + - COMPILER=clang++-3.8 C_COMPILER=clang-3.8 BUILD_TYPE=RelWithDebInfo + - LIBCXX_BUILD=1 LIBCXX_SANITIZER=Thread + - ENABLE_SANITIZER=1 + - EXTRA_FLAGS="-g -O2 -fno-omit-frame-pointer -fsanitize=thread -fno-sanitize-recover=all" + - EXTRA_CXX_FLAGS="-stdlib=libc++" + - os: osx + osx_image: xcode8.3 + compiler: clang + env: + - COMPILER=clang++ BUILD_TYPE=Debug + - os: osx + osx_image: xcode8.3 + compiler: clang + env: + - COMPILER=clang++ BUILD_TYPE=Release + - os: osx + osx_image: xcode8.3 + compiler: clang + env: + - COMPILER=clang++ + - BUILD_TYPE=Release + - BUILD_32_BITS=ON + - EXTRA_FLAGS="-m32" + - os: osx + osx_image: xcode8.3 + compiler: gcc + env: + - COMPILER=g++-7 C_COMPILER=gcc-7 BUILD_TYPE=Debug + +before_script: + - if [ -n "${LIBCXX_BUILD}" ]; then + source .travis-libcxx-setup.sh; + fi + - if [ -n "${ENABLE_SANITIZER}" ]; then + export EXTRA_OPTIONS="-DBENCHMARK_ENABLE_ASSEMBLY_TESTS=OFF"; + else + export EXTRA_OPTIONS=""; + fi + - mkdir -p build && cd build + +before_install: + - if [ -z "$BUILD_32_BITS" ]; then + export BUILD_32_BITS=OFF && echo disabling 32 bit build; + fi + - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then + sudo add-apt-repository -y "ppa:ubuntu-toolchain-r/test"; + sudo apt-get update --option Acquire::Retries=100 --option Acquire::http::Timeout="60"; + fi + +install: + - if [ -n "${INSTALL_GCC6_FROM_PPA}" ]; then + travis_wait sudo -E apt-get -yq --no-install-suggests --no-install-recommends install g++-6; + fi + - if [ "${TRAVIS_OS_NAME}" == "linux" -a "${BUILD_32_BITS}" == "OFF" ]; then + travis_wait sudo -E apt-get -y --no-install-suggests --no-install-recommends install llvm-3.9-tools; + sudo cp /usr/lib/llvm-3.9/bin/FileCheck /usr/local/bin/; + fi + - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then + PATH=~/.local/bin:${PATH}; + pip install --user --upgrade pip; + travis_wait pip install --user cpp-coveralls; + fi + - if [ "${C_COMPILER}" == "gcc-7" -a "${TRAVIS_OS_NAME}" == "osx" ]; then + rm -f /usr/local/include/c++; + brew 
update; + travis_wait brew install gcc@7; + fi + - if [ "${TRAVIS_OS_NAME}" == "linux" ]; then + sudo apt-get update -qq; + sudo apt-get install -qq unzip cmake3; + wget https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-linux-x86_64.sh --output-document bazel-installer.sh; + travis_wait sudo bash bazel-installer.sh; + fi + - if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + curl -L -o bazel-installer.sh https://github.com/bazelbuild/bazel/releases/download/0.10.1/bazel-0.10.1-installer-darwin-x86_64.sh; + travis_wait sudo bash bazel-installer.sh; + fi + +script: + - cmake -DCMAKE_C_COMPILER=${C_COMPILER} -DCMAKE_CXX_COMPILER=${COMPILER} -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DCMAKE_C_FLAGS="${EXTRA_FLAGS}" -DCMAKE_CXX_FLAGS="${EXTRA_FLAGS} ${EXTRA_CXX_FLAGS}" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON -DBENCHMARK_BUILD_32_BITS=${BUILD_32_BITS} ${EXTRA_OPTIONS} .. + - make + - ctest -C ${BUILD_TYPE} --output-on-failure + - bazel test -c dbg --define google_benchmark.have_regex=posix --announce_rc --verbose_failures --test_output=errors --keep_going //test/... + +after_success: + - if [ "${BUILD_TYPE}" == "Coverage" -a "${TRAVIS_OS_NAME}" == "linux" ]; then + coveralls --include src --include include --gcov-options '\-lp' --root .. --build-root .; + fi diff --git a/thirdparty/benchmark-1.5.0/.ycm_extra_conf.py b/thirdparty/benchmark-1.5.0/.ycm_extra_conf.py new file mode 100644 index 0000000000..5649ddcc74 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/.ycm_extra_conf.py @@ -0,0 +1,115 @@ +import os +import ycm_core + +# These are the compilation flags that will be used in case there's no +# compilation database set (by default, one is not set). +# CHANGE THIS LIST OF FLAGS. YES, THIS IS THE DROID YOU HAVE BEEN LOOKING FOR. +flags = [ +'-Wall', +'-Werror', +'-pedantic-errors', +'-std=c++0x', +'-fno-strict-aliasing', +'-O3', +'-DNDEBUG', +# ...and the same thing goes for the magic -x option which specifies the +# language that the files to be compiled are written in. This is mostly +# relevant for c++ headers. +# For a C project, you would set this to 'c' instead of 'c++'. +'-x', 'c++', +'-I', 'include', +'-isystem', '/usr/include', +'-isystem', '/usr/local/include', +] + + +# Set this to the absolute path to the folder (NOT the file!) containing the +# compile_commands.json file to use that instead of 'flags'. See here for +# more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html +# +# Most projects will NOT need to set this to anything; you can just change the +# 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
+compilation_database_folder = '' + +if os.path.exists( compilation_database_folder ): + database = ycm_core.CompilationDatabase( compilation_database_folder ) +else: + database = None + +SOURCE_EXTENSIONS = [ '.cc' ] + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag + os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def IsHeaderFile( filename ): + extension = os.path.splitext( filename )[ 1 ] + return extension in [ '.h', '.hxx', '.hpp', '.hh' ] + + +def GetCompilationInfoForFile( filename ): + # The compilation_commands.json file generated by CMake does not have entries + # for header files. So we do our best by asking the db for flags for a + # corresponding source file, if any. If one exists, the flags for that file + # should be good enough. + if IsHeaderFile( filename ): + basename = os.path.splitext( filename )[ 0 ] + for extension in SOURCE_EXTENSIONS: + replacement_file = basename + extension + if os.path.exists( replacement_file ): + compilation_info = database.GetCompilationInfoForFile( + replacement_file ) + if compilation_info.compiler_flags_: + return compilation_info + return None + return database.GetCompilationInfoForFile( filename ) + + +def FlagsForFile( filename, **kwargs ): + if database: + # Bear in mind that compilation_info.compiler_flags_ does NOT return a + # python list, but a "list-like" StringVec object + compilation_info = GetCompilationInfoForFile( filename ) + if not compilation_info: + return None + + final_flags = MakeRelativePathsInFlagsAbsolute( + compilation_info.compiler_flags_, + compilation_info.compiler_working_dir_ ) + else: + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/thirdparty/benchmark-1.5.0/AUTHORS b/thirdparty/benchmark-1.5.0/AUTHORS new file mode 100644 index 0000000000..912cbbc13c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/AUTHORS @@ -0,0 +1,51 @@ +# This is the official list of benchmark authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. +# +# Names should be added to this file as: +# Name or Organization +# The email address is not required for organizations. +# +# Please keep the list sorted. + +Albert Pretorius +Alex Steele +Andriy Berestovskyy +Arne Beer +Carto +Christopher Seymour +Daniel Harvey +David Coeurjolly +Deniz Evrenci +Dirac Research +Dominik Czarnota +Eric Fiselier +Eugene Zhuk +Evgeny Safronov +Federico Ficarelli +Felix Homann +Google Inc. +International Business Machines Corporation +Ismael Jimenez Martinez +Jern-Kuan Leong +JianXiong Zhou +Joao Paulo Magalhaes +Jussi Knuuttila +Kaito Udagawa +Kishan Kumar +Lei Xu +Matt Clarkson +Maxim Vafin +MongoDB Inc. 
+Nick Hutchinson +Oleksandr Sochka +Ori Livneh +Paul Redmond +Radoslav Yovchev +Roman Lebedev +Shuo Chen +Steinar H. Gunderson +Stripe, Inc. +Yixuan Qiu +Yusuke Suzuki +Zbigniew Skowron diff --git a/thirdparty/benchmark-1.5.0/BUILD.bazel b/thirdparty/benchmark-1.5.0/BUILD.bazel new file mode 100644 index 0000000000..6ee69f2907 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/BUILD.bazel @@ -0,0 +1,42 @@ +licenses(["notice"]) + +config_setting( + name = "windows", + values = { + "cpu": "x64_windows", + }, + visibility = [":__subpackages__"], +) + +cc_library( + name = "benchmark", + srcs = glob( + [ + "src/*.cc", + "src/*.h", + ], + exclude = ["src/benchmark_main.cc"], + ), + hdrs = ["include/benchmark/benchmark.h"], + linkopts = select({ + ":windows": ["-DEFAULTLIB:shlwapi.lib"], + "//conditions:default": ["-pthread"], + }), + strip_include_prefix = "include", + visibility = ["//visibility:public"], +) + +cc_library( + name = "benchmark_main", + srcs = ["src/benchmark_main.cc"], + hdrs = ["include/benchmark/benchmark.h"], + strip_include_prefix = "include", + visibility = ["//visibility:public"], + deps = [":benchmark"], +) + +cc_library( + name = "benchmark_internal_headers", + hdrs = glob(["src/*.h"]), + visibility = ["//test:__pkg__"], +) diff --git a/thirdparty/benchmark-1.5.0/CMakeLists.txt b/thirdparty/benchmark-1.5.0/CMakeLists.txt new file mode 100644 index 0000000000..9db1361212 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/CMakeLists.txt @@ -0,0 +1,276 @@ +cmake_minimum_required (VERSION 3.5.1) + +foreach(p + CMP0048 # OK to clear PROJECT_VERSION on project() + CMP0054 # CMake 3.1 + CMP0056 # export EXE_LINKER_FLAGS to try_run + CMP0057 # Support no if() IN_LIST operator + CMP0063 # Honor visibility properties for all targets + ) + if(POLICY ${p}) + cmake_policy(SET ${p} NEW) + endif() +endforeach() + +project (benchmark CXX) + +option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON) +option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON) +option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF) +option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF) +if(NOT MSVC) + option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF) +else() + set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE) +endif() +option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON) + +# Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which +# may require downloading the source code. +option(BENCHMARK_DOWNLOAD_DEPENDENCIES "Allow the downloading and in-tree building of unmet dependencies" OFF) + +# This option can be used to disable building and running unit tests which depend on gtest +# in cases where it is not possible to build or find a valid version of gtest. +option(BENCHMARK_ENABLE_GTEST_TESTS "Enable building the unit tests which depend on gtest" ON) + +set(ENABLE_ASSEMBLY_TESTS_DEFAULT OFF) +function(should_enable_assembly_tests) + if(CMAKE_BUILD_TYPE) + string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER) + if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage") + # FIXME: The --coverage flag needs to be removed when building assembly + # tests for this to work. 
+ return() + endif() + endif() + if (MSVC) + return() + elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + return() + elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + # FIXME: Make these work on 32 bit builds + return() + elseif(BENCHMARK_BUILD_32_BITS) + # FIXME: Make these work on 32 bit builds + return() + endif() + find_program(LLVM_FILECHECK_EXE FileCheck) + if (LLVM_FILECHECK_EXE) + set(LLVM_FILECHECK_EXE "${LLVM_FILECHECK_EXE}" CACHE PATH "llvm filecheck" FORCE) + message(STATUS "LLVM FileCheck Found: ${LLVM_FILECHECK_EXE}") + else() + message(STATUS "Failed to find LLVM FileCheck") + return() + endif() + set(ENABLE_ASSEMBLY_TESTS_DEFAULT ON PARENT_SCOPE) +endfunction() +should_enable_assembly_tests() + +# This option disables the building and running of the assembly verification tests +option(BENCHMARK_ENABLE_ASSEMBLY_TESTS "Enable building and running the assembly tests" + ${ENABLE_ASSEMBLY_TESTS_DEFAULT}) + +# Make sure we can import out CMake functions +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + + +# Read the git tags to determine the project version +include(GetGitVersion) +get_git_version(GIT_VERSION) + +# Tell the user what versions we are using +string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" VERSION ${GIT_VERSION}) +message(STATUS "Version: ${VERSION}") + +# The version of the libraries +set(GENERIC_LIB_VERSION ${VERSION}) +string(SUBSTRING ${VERSION} 0 1 GENERIC_LIB_SOVERSION) + +# Import our CMake modules +include(CheckCXXCompilerFlag) +include(AddCXXCompilerFlag) +include(CXXFeatureCheck) + +if (BENCHMARK_BUILD_32_BITS) + add_required_cxx_compiler_flag(-m32) +endif() + +if (MSVC) + # Turn compiler warnings up to 11 + string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") + add_definitions(-D_CRT_SECURE_NO_WARNINGS) + + if (NOT BENCHMARK_ENABLE_EXCEPTIONS) + add_cxx_compiler_flag(-EHs-) + add_cxx_compiler_flag(-EHa-) + add_definitions(-D_HAS_EXCEPTIONS=0) + endif() + # Link time optimisation + if (BENCHMARK_ENABLE_LTO) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GL") + set(CMAKE_STATIC_LINKER_FLAGS_RELEASE "${CMAKE_STATIC_LINKER_FLAGS_RELEASE} /LTCG") + set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /LTCG") + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /LTCG") + + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} /GL") + string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO}") + set(CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") + string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO}") + set(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") + string(REGEX REPLACE "[-/]INCREMENTAL" "/INCREMENTAL:NO" CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO}") + set(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} /LTCG") + + set(CMAKE_CXX_FLAGS_MINSIZEREL "${CMAKE_CXX_FLAGS_MINSIZEREL} /GL") + set(CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL "${CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL} /LTCG") + set(CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL "${CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL} /LTCG") + 
set(CMAKE_EXE_LINKER_FLAGS_MINSIZEREL "${CMAKE_EXE_LINKER_FLAGS_MINSIZEREL} /LTCG") + endif() +else() + # Try and enable C++11. Don't use C++14 because it doesn't work in some + # configurations. + add_cxx_compiler_flag(-std=c++11) + if (NOT HAVE_CXX_FLAG_STD_CXX11) + add_cxx_compiler_flag(-std=c++0x) + endif() + + # Turn compiler warnings up to 11 + add_cxx_compiler_flag(-Wall) + add_cxx_compiler_flag(-Wextra) + add_cxx_compiler_flag(-Wshadow) + add_cxx_compiler_flag(-Werror RELEASE) + add_cxx_compiler_flag(-Werror RELWITHDEBINFO) + add_cxx_compiler_flag(-Werror MINSIZEREL) + add_cxx_compiler_flag(-pedantic) + add_cxx_compiler_flag(-pedantic-errors) + add_cxx_compiler_flag(-Wshorten-64-to-32) + add_cxx_compiler_flag(-fstrict-aliasing) + # Disable warnings regarding deprecated parts of the library while building + # and testing those parts of the library. + add_cxx_compiler_flag(-Wno-deprecated-declarations) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + # Intel silently ignores '-Wno-deprecated-declarations', + # warning no. 1786 must be explicitly disabled. + # See #631 for rationale. + add_cxx_compiler_flag(-wd1786) + endif() + # Disable deprecation warnings for release builds (when -Werror is enabled). + add_cxx_compiler_flag(-Wno-deprecated RELEASE) + add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO) + add_cxx_compiler_flag(-Wno-deprecated MINSIZEREL) + if (NOT BENCHMARK_ENABLE_EXCEPTIONS) + add_cxx_compiler_flag(-fno-exceptions) + endif() + + if (HAVE_CXX_FLAG_FSTRICT_ALIASING) + if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel") #ICC17u2: Many false positives for Wstrict-aliasing + add_cxx_compiler_flag(-Wstrict-aliasing) + endif() + endif() + # ICC17u2: overloaded virtual function "benchmark::Fixture::SetUp" is only partially overridden + # (because of deprecated overload) + add_cxx_compiler_flag(-wd654) + add_cxx_compiler_flag(-Wthread-safety) + if (HAVE_CXX_FLAG_WTHREAD_SAFETY) + cxx_feature_check(THREAD_SAFETY_ATTRIBUTES) + endif() + + # On most UNIX like platforms g++ and clang++ define _GNU_SOURCE as a + # predefined macro, which turns on all of the wonderful libc extensions. + # However g++ doesn't do this in Cygwin so we have to define it ourselfs + # since we depend on GNU/POSIX/BSD extensions. + if (CYGWIN) + add_definitions(-D_GNU_SOURCE=1) + endif() + + if (QNXNTO) + add_definitions(-D_QNX_SOURCE) + endif() + + # Link time optimisation + if (BENCHMARK_ENABLE_LTO) + add_cxx_compiler_flag(-flto) + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + find_program(GCC_AR gcc-ar) + if (GCC_AR) + set(CMAKE_AR ${GCC_AR}) + endif() + find_program(GCC_RANLIB gcc-ranlib) + if (GCC_RANLIB) + set(CMAKE_RANLIB ${GCC_RANLIB}) + endif() + elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + include(llvm-toolchain) + endif() + endif() + + # Coverage build type + set(BENCHMARK_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG}" + CACHE STRING "Flags used by the C++ compiler during coverage builds." + FORCE) + set(BENCHMARK_EXE_LINKER_FLAGS_COVERAGE "${CMAKE_EXE_LINKER_FLAGS_DEBUG}" + CACHE STRING "Flags used for linking binaries during coverage builds." + FORCE) + set(BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE "${CMAKE_SHARED_LINKER_FLAGS_DEBUG}" + CACHE STRING "Flags used by the shared libraries linker during coverage builds." 
+ FORCE) + mark_as_advanced( + BENCHMARK_CXX_FLAGS_COVERAGE + BENCHMARK_EXE_LINKER_FLAGS_COVERAGE + BENCHMARK_SHARED_LINKER_FLAGS_COVERAGE) + set(CMAKE_BUILD_TYPE "${CMAKE_BUILD_TYPE}" CACHE STRING + "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel Coverage.") + add_cxx_compiler_flag(--coverage COVERAGE) +endif() + +if (BENCHMARK_USE_LIBCXX) + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + add_cxx_compiler_flag(-stdlib=libc++) + elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR + "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") + add_cxx_compiler_flag(-nostdinc++) + message(WARNING "libc++ header path must be manually specified using CMAKE_CXX_FLAGS") + # Adding -nodefaultlibs directly to CMAKE__LINKER_FLAGS will break + # configuration checks such as 'find_package(Threads)' + list(APPEND BENCHMARK_CXX_LINKER_FLAGS -nodefaultlibs) + # -lc++ cannot be added directly to CMAKE__LINKER_FLAGS because + # linker flags appear before all linker inputs and -lc++ must appear after. + list(APPEND BENCHMARK_CXX_LIBRARIES c++) + else() + message(FATAL_ERROR "-DBENCHMARK_USE_LIBCXX:BOOL=ON is not supported for compiler") + endif() +endif(BENCHMARK_USE_LIBCXX) + +# C++ feature checks +# Determine the correct regular expression engine to use +cxx_feature_check(STD_REGEX) +cxx_feature_check(GNU_POSIX_REGEX) +cxx_feature_check(POSIX_REGEX) +if(NOT HAVE_STD_REGEX AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) + message(FATAL_ERROR "Failed to determine the source files for the regular expression backend") +endif() +if (NOT BENCHMARK_ENABLE_EXCEPTIONS AND HAVE_STD_REGEX + AND NOT HAVE_GNU_POSIX_REGEX AND NOT HAVE_POSIX_REGEX) + message(WARNING "Using std::regex with exceptions disabled is not fully supported") +endif() +cxx_feature_check(STEADY_CLOCK) +# Ensure we have pthreads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) + +# Set up directories +include_directories(${PROJECT_SOURCE_DIR}/include) + +# Build the targets +add_subdirectory(src) + +if (BENCHMARK_ENABLE_TESTING) + enable_testing() + if (BENCHMARK_ENABLE_GTEST_TESTS AND + NOT (TARGET gtest AND TARGET gtest_main AND + TARGET gmock AND TARGET gmock_main)) + include(GoogleTest) + endif() + add_subdirectory(test) +endif() diff --git a/thirdparty/benchmark-1.5.0/CONTRIBUTING.md b/thirdparty/benchmark-1.5.0/CONTRIBUTING.md new file mode 100644 index 0000000000..43de4c9d47 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/CONTRIBUTING.md @@ -0,0 +1,58 @@ +# How to contribute # + +We'd love to accept your patches and contributions to this project. There are +a just a few small guidelines you need to follow. + + +## Contributor License Agreement ## + +Contributions to any Google project must be accompanied by a Contributor +License Agreement. This is not a copyright **assignment**, it simply gives +Google permission to use and redistribute your contributions as part of the +project. + + * If you are an individual writing original source code and you're sure you + own the intellectual property, then you'll need to sign an [individual + CLA][]. + + * If you work for a company that wants to allow you to contribute your work, + then you'll need to sign a [corporate CLA][]. + +You generally only need to submit a CLA once, so if you've already submitted +one (even if it was for a different project), you probably don't need to do it +again. 
+ +[individual CLA]: https://developers.google.com/open-source/cla/individual +[corporate CLA]: https://developers.google.com/open-source/cla/corporate + +Once your CLA is submitted (or if you already submitted one for +another Google project), make a commit adding yourself to the +[AUTHORS][] and [CONTRIBUTORS][] files. This commit can be part +of your first [pull request][]. + +[AUTHORS]: AUTHORS +[CONTRIBUTORS]: CONTRIBUTORS + + +## Submitting a patch ## + + 1. It's generally best to start by opening a new issue describing the bug or + feature you're intending to fix. Even if you think it's relatively minor, + it's helpful to know what people are working on. Mention in the initial + issue that you are planning to work on that bug or feature so that it can + be assigned to you. + + 1. Follow the normal process of [forking][] the project, and setup a new + branch to work in. It's important that each group of changes be done in + separate branches in order to ensure that a pull request only includes the + commits related to that bug or feature. + + 1. Do your best to have [well-formed commit messages][] for each change. + This provides consistency throughout the project, and ensures that commit + messages are able to be formatted properly by various git tools. + + 1. Finally, push the commits to your fork and submit a [pull request][]. + +[forking]: https://help.github.com/articles/fork-a-repo +[well-formed commit messages]: http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html +[pull request]: https://help.github.com/articles/creating-a-pull-request diff --git a/thirdparty/benchmark-1.5.0/CONTRIBUTORS b/thirdparty/benchmark-1.5.0/CONTRIBUTORS new file mode 100644 index 0000000000..b680efc8c4 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/CONTRIBUTORS @@ -0,0 +1,72 @@ +# People who have agreed to one of the CLAs and can contribute patches. +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS, because Google holds the copyright. +# +# Names should be added to this file only after verifying that +# the individual or the individual's organization has agreed to +# the appropriate Contributor License Agreement, found here: +# +# https://developers.google.com/open-source/cla/individual +# https://developers.google.com/open-source/cla/corporate +# +# The agreement for individuals can be filled out on the web. +# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file, depending on whether the +# individual or corporate CLA was used. +# +# Names should be added to this file as: +# Name +# +# Please keep the list sorted. 
+ +Albert Pretorius +Alex Steele +Andriy Berestovskyy +Arne Beer +Billy Robert O'Neal III +Chris Kennelly +Christopher Seymour +Cyrille Faucheux +Daniel Harvey +David Coeurjolly +Deniz Evrenci +Dominic Hamon +Dominik Czarnota +Eric Fiselier +Eugene Zhuk +Evgeny Safronov +Federico Ficarelli +Felix Homann +Hannes Hauswedell +Ismael Jimenez Martinez +Jern-Kuan Leong +JianXiong Zhou +Joao Paulo Magalhaes +John Millikin +Jussi Knuuttila +Kai Wolf +Kishan Kumar +Kaito Udagawa +Lei Xu +Matt Clarkson +Maxim Vafin +Nick Hutchinson +Oleksandr Sochka +Ori Livneh +Pascal Leroy +Paul Redmond +Pierre Phaneuf +Radoslav Yovchev +Raul Marin +Ray Glover +Robert Guo +Roman Lebedev +Shuo Chen +Tobias Ulvgård +Tom Madams +Yixuan Qiu +Yusuke Suzuki +Zbigniew Skowron diff --git a/thirdparty/benchmark-1.5.0/LICENSE b/thirdparty/benchmark-1.5.0/LICENSE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/thirdparty/benchmark-1.5.0/README.md b/thirdparty/benchmark-1.5.0/README.md new file mode 100644 index 0000000000..45e4158843 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/README.md @@ -0,0 +1,1179 @@ +# Benchmark +[![Build Status](https://travis-ci.org/google/benchmark.svg?branch=master)](https://travis-ci.org/google/benchmark) +[![Build status](https://ci.appveyor.com/api/projects/status/u0qsyp7t1tk7cpxs/branch/master?svg=true)](https://ci.appveyor.com/project/google/benchmark/branch/master) +[![Coverage Status](https://coveralls.io/repos/google/benchmark/badge.svg)](https://coveralls.io/r/google/benchmark) +[![slackin](https://slackin-iqtfqnpzxd.now.sh/badge.svg)](https://slackin-iqtfqnpzxd.now.sh/) + + +A library to benchmark code snippets, similar to unit tests. Example: + +```c++ +#include <benchmark/benchmark.h> + +static void BM_SomeFunction(benchmark::State& state) { + // Perform setup here + for (auto _ : state) { + // This code gets timed + SomeFunction(); + } +} +// Register the function as a benchmark +BENCHMARK(BM_SomeFunction); +// Run the benchmark +BENCHMARK_MAIN(); +``` + +To get started, see [Requirements](#requirements) and +[Installation](#installation). See [Usage](#usage) for a full example and the +[User Guide](#user-guide) for a more comprehensive feature overview. + +It may also help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) +as some of the structural aspects of the APIs are similar. + +### Resources + +[Discussion group](https://groups.google.com/d/forum/benchmark-discuss) + +IRC channel: [freenode](https://freenode.net) #googlebenchmark + +[Additional Tooling Documentation](docs/tools.md) + +[Assembly Testing Documentation](docs/AssemblyTests.md) + +## Requirements + +The library can be used with C++03. However, it requires C++11 to build, +including compiler and standard library support. + +The following minimum versions are required to build the library: + +* GCC 4.8 +* Clang 3.4 +* Visual Studio 2013 +* Intel 2015 Update 1 + +## Installation + +This describes the installation process using cmake. As pre-requisites, you'll +need git and cmake installed. + +_See [dependencies.md](dependencies.md) for more details regarding supported +versions of build tools._ + +```bash +# Check out the library. 
+$ git clone https://github.com/google/benchmark.git +# Benchmark requires Google Test as a dependency. Add the source tree as a subdirectory. +$ git clone https://github.com/google/googletest.git benchmark/googletest +# Make a build directory to place the build output. +$ mkdir build && cd build +# Generate a Makefile with cmake. +# Use cmake -G <generator> to generate a different file type. +$ cmake ../benchmark +# Build the library. +$ make +``` +This builds the `benchmark` and `benchmark_main` libraries and tests. +On a unix system, the build directory should now look something like this: + +``` +/benchmark +/build + /src + /libbenchmark.a + /libbenchmark_main.a + /test + ... +``` + +Next, you can run the tests to check the build. + +```bash +$ make test +``` + +If you want to install the library globally, also run: + +``` +sudo make install +``` + +Note that Google Benchmark requires Google Test to build and run the tests. This +dependency can be provided two ways: + +* Check out the Google Test sources into `benchmark/googletest` as above. +* Otherwise, if `-DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON` is specified during + configuration, the library will automatically download and build any required + dependencies. + +If you do not wish to build and run the tests, add `-DBENCHMARK_ENABLE_GTEST_TESTS=OFF` +to `CMAKE_ARGS`. + +### Debug vs Release + +By default, benchmark builds as a debug library. You will see a warning in the +output when this is the case. To build it as a release library instead, use: + +``` +cmake -DCMAKE_BUILD_TYPE=Release +``` + +To enable link-time optimisation, use + +``` +cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true +``` + +If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake +cache variables, if autodetection fails. + +If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, +`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables. + + +### Stable and Experimental Library Versions + +The main branch contains the latest stable version of the benchmarking library; +the API of which can be considered largely stable, with source breaking changes +being made only upon the release of a new major version. + +Newer, experimental, features are implemented and tested on the +[`v2` branch](https://github.com/google/benchmark/tree/v2). Users who wish +to use, test, and provide feedback on the new features are encouraged to try +this branch. However, this branch provides no stability guarantees and reserves +the right to change and break the API at any time. + +## Usage +### Basic usage +Define a function that executes the code to measure, register it as a benchmark +function using the `BENCHMARK` macro, and ensure an appropriate `main` function +is available: + +```c++ +#include <benchmark/benchmark.h> + +static void BM_StringCreation(benchmark::State& state) { + for (auto _ : state) + std::string empty_string; +} +// Register the function as a benchmark +BENCHMARK(BM_StringCreation); + +// Define another benchmark +static void BM_StringCopy(benchmark::State& state) { + std::string x = "hello"; + for (auto _ : state) + std::string copy(x); +} +BENCHMARK(BM_StringCopy); + +BENCHMARK_MAIN(); +``` + +To run the benchmark, compile and link against the `benchmark` library +(libbenchmark.a/.so). If you followed the build steps above, this +library will be under the build directory you created. + +```bash +# Example on linux after running the build steps above. Assumes the +# `benchmark` and `build` directories are under the current directory. 
+$ g++ -std=c++11 -isystem benchmark/include mybenchmark.cc \ + -Lbuild/src -lbenchmark -lpthread -o mybenchmark +``` + +Alternatively, link against the `benchmark_main` library and remove +`BENCHMARK_MAIN();` above to get the same behavior. + +The compiled executable will run all benchmarks by default. Pass the `--help` +flag for option information or see the guide below. + +### Platform-specific instructions + +When the library is built using GCC it is necessary to link with the pthread +library due to how GCC implements `std::thread`. Failing to link to pthread will +lead to runtime exceptions (unless you're using libc++), not linker errors. See +[issue #67](https://github.com/google/benchmark/issues/67) for more details. You +can link to pthread by adding `-pthread` to your linker command. Note, you can +also use `-lpthread`, but there are potential issues with ordering of command +line parameters if you use that. + +If you're running benchmarks on Windows, the shlwapi library (`-lshlwapi`) is +also required. + +If you're running benchmarks on Solaris, you'll want the kstat library linked in +too (`-lkstat`). + +## User Guide + +### Command Line +[Output Formats](#output-formats) + +[Output Files](#output-files) + +[Running a Subset of Benchmarks](#running-a-subset-of-benchmarks) + +[Result Comparison](#result-comparison) + +### Library +[Runtime and Reporting Considerations](#runtime-and-reporting-considerations) + +[Passing Arguments](#passing-arguments) + +[Calculating Asymptotic Complexity](#asymptotic-complexity) + +[Templated Benchmarks](#templated-benchmarks) + +[Fixtures](#fixtures) + +[Custom Counters](#custom-counters) + +[Multithreaded Benchmarks](#multithreaded-benchmarks) + +[CPU Timers](#cpu-timers) + +[Manual Timing](#manual-timing) + +[Setting the Time Unit](#setting-the-time-unit) + +[Preventing Optimization](#preventing-optimization) + +[Reporting Statistics](#reporting-statistics) + +[Custom Statistics](#custom-statistics) + +[Using RegisterBenchmark](#using-register-benchmark) + +[Exiting with an Error](#exiting-with-an-error) + +[A Faster KeepRunning Loop](#a-faster-keep-running-loop) + +[Disabling CPU Frequency Scaling](#disabling-cpu-frequency-scaling) + +<a name="output-formats" /> + +### Output Formats + +The library supports multiple output formats. Use the +`--benchmark_format=<console|json|csv>` flag to set the format type. `console` +is the default format. + +The Console format is intended to be a human readable format. By default +the format generates color output. Context is output on stderr and the +tabular data on stdout. Example tabular output looks like: +``` +Benchmark Time(ns) CPU(ns) Iterations +---------------------------------------------------------------------- +BM_SetInsert/1024/1 28928 29349 23853 133.097kB/s 33.2742k items/s +BM_SetInsert/1024/8 32065 32913 21375 949.487kB/s 237.372k items/s +BM_SetInsert/1024/10 33157 33648 21431 1.13369MB/s 290.225k items/s +``` + +The JSON format outputs human readable json split into two top level attributes. +The `context` attribute contains information about the run in general, including +information about the CPU and the date. +The `benchmarks` attribute contains a list of every benchmark run. 
Example json +output looks like: +```json +{ + "context": { + "date": "2015/03/17-18:40:25", + "num_cpus": 40, + "mhz_per_cpu": 2801, + "cpu_scaling_enabled": false, + "build_type": "debug" + }, + "benchmarks": [ + { + "name": "BM_SetInsert/1024/1", + "iterations": 94877, + "real_time": 29275, + "cpu_time": 29836, + "bytes_per_second": 134066, + "items_per_second": 33516 + }, + { + "name": "BM_SetInsert/1024/8", + "iterations": 21609, + "real_time": 32317, + "cpu_time": 32429, + "bytes_per_second": 986770, + "items_per_second": 246693 + }, + { + "name": "BM_SetInsert/1024/10", + "iterations": 21393, + "real_time": 32724, + "cpu_time": 33355, + "bytes_per_second": 1199226, + "items_per_second": 299807 + } + ] +} +``` + +The CSV format outputs comma-separated values. The `context` is output on stderr +and the CSV itself on stdout. Example CSV output looks like: +``` +name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label +"BM_SetInsert/1024/1",65465,17890.7,8407.45,475768,118942, +"BM_SetInsert/1024/8",116606,18810.1,9766.64,3.27646e+06,819115, +"BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06, +``` + +<a name="output-files" /> + +### Output Files + +Write benchmark results to a file with the `--benchmark_out=<filename>` option. +Specify the output format with `--benchmark_out_format={json|console|csv}`. Note that specifying +`--benchmark_out` does not suppress the console output. + +<a name="running-a-subset-of-benchmarks" /> + +### Running a Subset of Benchmarks + +The `--benchmark_filter=<regex>` option can be used to only run the benchmarks +which match the specified `<regex>`. For example: + +```bash +$ ./run_benchmarks.x --benchmark_filter=BM_memcpy/32 +Run on (1 X 2300 MHz CPU ) +2016-06-25 19:34:24 +Benchmark Time CPU Iterations +---------------------------------------------------- +BM_memcpy/32 11 ns 11 ns 79545455 +BM_memcpy/32k 2181 ns 2185 ns 324074 +BM_memcpy/32 12 ns 12 ns 54687500 +BM_memcpy/32k 1834 ns 1837 ns 357143 +``` + +<a name="result-comparison" /> + +### Result comparison + +It is possible to compare the benchmarking results. See [Additional Tooling Documentation](docs/tools.md) + +<a name="runtime-and-reporting-considerations" /> + +### Runtime and Reporting Considerations + +When the benchmark binary is executed, each benchmark function is run serially. +The number of iterations to run is determined dynamically by running the +benchmark a few times and measuring the time taken and ensuring that the +ultimate result will be statistically stable. As such, faster benchmark +functions will be run for more iterations than slower benchmark functions, and +the number of iterations is thus reported. + +In all cases, the number of iterations for which the benchmark is run is +governed by the amount of time the benchmark takes. Concretely, the number of +iterations is at least one, not more than 1e9, until CPU time is greater than +the minimum time, or the wallclock time is 5x minimum time. The minimum time is +set per benchmark by calling `MinTime` on the registered benchmark object. + +Average timings are then reported over the iterations run. If multiple +repetitions are requested using the `--benchmark_repetitions` command-line +option, or at registration time, the benchmark function will be run several +times and statistical results across these repetitions will also be reported. + +As well as the per-benchmark entries, a preamble in the report will include +information about the machine on which the benchmarks are run. 
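+ +For example, a minimal sketch of tuning these knobs (the benchmark name and +workload here are hypothetical, not part of the library): + +```c++ +#include <benchmark/benchmark.h> +#include <cmath> + +// Hypothetical workload; the registration line below is the point. +static void BM_Sqrt(benchmark::State& state) { + for (auto _ : state) { + double v = std::sqrt(1234.5678); + benchmark::DoNotOptimize(v); // keep the result alive so the loop is not elided + } +} +// Accumulate at least 2 seconds of measurement per run, and repeat the whole +// measurement 4 times so that mean, median and stddev are also reported. +BENCHMARK(BM_Sqrt)->MinTime(2.0)->Repetitions(4); +BENCHMARK_MAIN(); +``` + +The same repetition count can also be requested at run time with the +`--benchmark_repetitions=4` command-line flag instead of the `Repetitions(4)` call.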
+ +<a name="passing-arguments" /> + +### Passing Arguments + +Sometimes a family of benchmarks can be implemented with just one routine that +takes an extra argument to specify which one of the family of benchmarks to +run. For example, the following code defines a family of benchmarks for +measuring the speed of `memcpy()` calls of different lengths: + +```c++ +static void BM_memcpy(benchmark::State& state) { + char* src = new char[state.range(0)]; + char* dst = new char[state.range(0)]; + memset(src, 'x', state.range(0)); + for (auto _ : state) + memcpy(dst, src, state.range(0)); + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(state.range(0))); + delete[] src; + delete[] dst; +} +BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); +``` + +The preceding code is quite repetitive, and can be replaced with the following +short-hand. The following invocation will pick a few appropriate arguments in +the specified range and will generate a benchmark for each such argument. + +```c++ +BENCHMARK(BM_memcpy)->Range(8, 8<<10); +``` + +By default the arguments in the range are generated in multiples of eight and +the command above selects [ 8, 64, 512, 4k, 8k ]. In the following code the +range multiplier is changed to multiples of two. + +```c++ +BENCHMARK(BM_memcpy)->RangeMultiplier(2)->Range(8, 8<<10); +``` +Now arguments generated are [ 8, 16, 32, 64, 128, 256, 512, 1024, 2k, 4k, 8k ]. + +You might have a benchmark that depends on two or more inputs. For example, the +following code defines a family of benchmarks for measuring the speed of set +insertion. + +```c++ +static void BM_SetInsert(benchmark::State& state) { + std::set<int> data; + for (auto _ : state) { + state.PauseTiming(); + data = ConstructRandomSet(state.range(0)); + state.ResumeTiming(); + for (int j = 0; j < state.range(1); ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert) + ->Args({1<<10, 128}) + ->Args({2<<10, 128}) + ->Args({4<<10, 128}) + ->Args({8<<10, 128}) + ->Args({1<<10, 512}) + ->Args({2<<10, 512}) + ->Args({4<<10, 512}) + ->Args({8<<10, 512}); +``` + +The preceding code is quite repetitive, and can be replaced with the following +short-hand. The following macro will pick a few appropriate arguments in the +product of the two specified ranges and will generate a benchmark for each such +pair. + +```c++ +BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}}); +``` + +For more complex patterns of inputs, passing a custom function to `Apply` allows +programmatic specification of an arbitrary set of arguments on which to run the +benchmark. The following example enumerates a dense range on one parameter, +and a sparse range on the second. + +```c++ +static void CustomArguments(benchmark::internal::Benchmark* b) { + for (int i = 0; i <= 10; ++i) + for (int j = 32; j <= 1024*1024; j *= 8) + b->Args({i, j}); +} +BENCHMARK(BM_SetInsert)->Apply(CustomArguments); +``` + +#### Passing Arbitrary Arguments to a Benchmark + +In C++11 it is possible to define a benchmark that takes an arbitrary number +of extra arguments. The `BENCHMARK_CAPTURE(func, test_case_name, ...args)` +macro creates a benchmark that invokes `func` with the `benchmark::State` as +the first argument followed by the specified `args...`. +The `test_case_name` is appended to the name of the benchmark and +should describe the values passed. + +```c++ +template <class ...ExtraArgs> +void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) { + [...] 
+} +// Registers a benchmark named "BM_takes_args/int_string_test" that passes +// the specified values to `extra_args`. +BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc")); +``` +Note that elements of `...args` may refer to global variables. Users should +avoid modifying global state inside of a benchmark. + +<a name="asymptotic-complexity" /> + +### Calculating Asymptotic Complexity (Big O) + +Asymptotic complexity might be calculated for a family of benchmarks. The +following code will calculate the coefficient for the high-order term in the +running time and the normalized root-mean square error of string comparison. + +```c++ +static void BM_StringCompare(benchmark::State& state) { + std::string s1(state.range(0), '-'); + std::string s2(state.range(0), '-'); + for (auto _ : state) { + benchmark::DoNotOptimize(s1.compare(s2)); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_StringCompare) + ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(benchmark::oN); +``` + +As shown in the following invocation, asymptotic complexity might also be +calculated automatically. + +```c++ +BENCHMARK(BM_StringCompare) + ->RangeMultiplier(2)->Range(1<<10, 1<<18)->Complexity(); +``` + +The following code will specify asymptotic complexity with a lambda function, +that might be used to customize high-order term calculation. + +```c++ +BENCHMARK(BM_StringCompare)->RangeMultiplier(2) + ->Range(1<<10, 1<<18)->Complexity([](int64_t n)->double{return n; }); +``` + +<a name="templated-benchmarks" /> + +### Templated Benchmarks + +This example produces and consumes messages of size `sizeof(v)` `range_x` +times. It also outputs throughput in the absence of multiprogramming. + +```c++ +template <class Q> void BM_Sequential(benchmark::State& state) { + Q q; + typename Q::value_type v; + for (auto _ : state) { + for (int i = state.range(0); i--; ) + q.push(v); + for (int e = state.range(0); e--; ) + q.Wait(&v); + } + // actually messages, not bytes: + state.SetBytesProcessed( + static_cast<int64_t>(state.iterations())*state.range(0)); +} +BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10); +``` + +Three macros are provided for adding benchmark templates. + +```c++ +#ifdef BENCHMARK_HAS_CXX11 +#define BENCHMARK_TEMPLATE(func, ...) // Takes any number of parameters. +#else // C++ < C++11 +#define BENCHMARK_TEMPLATE(func, arg1) +#endif +#define BENCHMARK_TEMPLATE1(func, arg1) +#define BENCHMARK_TEMPLATE2(func, arg1, arg2) +``` + +<a name="fixtures" /> + +### Fixtures + +Fixture tests are created by first defining a type that derives from +`::benchmark::Fixture` and then creating/registering the tests using the +following macros: + +* `BENCHMARK_F(ClassName, Method)` +* `BENCHMARK_DEFINE_F(ClassName, Method)` +* `BENCHMARK_REGISTER_F(ClassName, Method)` + +For example: + +```c++ +class MyFixture : public benchmark::Fixture { +public: + void SetUp(const ::benchmark::State& state) { + } + + void TearDown(const ::benchmark::State& state) { + } +}; + +BENCHMARK_F(MyFixture, FooTest)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_DEFINE_F(MyFixture, BarTest)(benchmark::State& st) { + for (auto _ : st) { + ... 
+ } +} +/* BarTest is NOT registered */ +BENCHMARK_REGISTER_F(MyFixture, BarTest)->Threads(2); +/* BarTest is now registered */ +``` + +#### Templated Fixtures + +You can also create templated fixtures by using the following macros: + +* `BENCHMARK_TEMPLATE_F(ClassName, Method, ...)` +* `BENCHMARK_TEMPLATE_DEFINE_F(ClassName, Method, ...)` + +For example: +```c++ +template <typename T> +class MyFixture : public benchmark::Fixture {}; + +BENCHMARK_TEMPLATE_F(MyFixture, IntTest, int)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, DoubleTest, double)(benchmark::State& st) { + for (auto _ : st) { + ... + } +} + +BENCHMARK_REGISTER_F(MyFixture, DoubleTest)->Threads(2); +``` + +<a name="custom-counters" /> + +### Custom Counters + +You can add your own counters with user-defined names. The example below +will add columns "Foo", "Bar" and "Baz" in its output: + +```c++ +static void UserCountersExample1(benchmark::State& state) { + double numFoos = 0, numBars = 0, numBazs = 0; + for (auto _ : state) { + // ... count Foo,Bar,Baz events + } + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; +} +``` + +The `state.counters` object is a `std::map` with `std::string` keys +and `Counter` values. The latter is a `double`-like class, via an implicit +conversion to `double&`. Thus you can use all of the standard arithmetic +assignment operators (`=,+=,-=,*=,/=`) to change the value of each counter. + +In multithreaded benchmarks, each counter is set on the calling thread only. +When the benchmark finishes, the counters from each thread will be summed; +the resulting sum is the value which will be shown for the benchmark. + +The `Counter` constructor accepts three parameters: the value as a `double`; +a bit flag which allows you to show counters as rates, and/or as per-thread +iteration, and/or as per-thread averages, and/or iteration invariants; +and a flag specifying the 'unit' - i.e. is 1k a 1000 (default, +`benchmark::Counter::OneK::kIs1000`), or 1024 +(`benchmark::Counter::OneK::kIs1024`)? + +```c++ + // sets a simple counter + state.counters["Foo"] = numFoos; + + // Set the counter as a rate. It will be presented divided + // by the duration of the benchmark. + state.counters["FooRate"] = Counter(numFoos, benchmark::Counter::kIsRate); + + // Set the counter as a thread-average quantity. It will + // be presented divided by the number of threads. + state.counters["FooAvg"] = Counter(numFoos, benchmark::Counter::kAvgThreads); + + // There's also a combined flag: + state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate); + + // This says that we process with the rate of state.range(0) bytes every iteration: + state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); +``` + +When you're compiling in C++11 mode or later you can use `insert()` with +`std::initializer_list`: + +```c++ + // With C++11, this can be done: + state.counters.insert({{"Foo", numFoos}, {"Bar", numBars}, {"Baz", numBazs}}); + // ... instead of: + state.counters["Foo"] = numFoos; + state.counters["Bar"] = numBars; + state.counters["Baz"] = numBazs; +``` + +#### Counter Reporting + +When using the console reporter, by default, user counters are printed at +the end after the table, the same way as ``bytes_processed`` and +``items_processed``. This is best for cases in which there are few counters, +or where there are only a couple of lines per benchmark. 
Here's an example of +the default output: + +``` +------------------------------------------------------------------------------ +Benchmark Time CPU Iterations UserCounters... +------------------------------------------------------------------------------ +BM_UserCounter/threads:8 2248 ns 10277 ns 68808 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:1 9797 ns 9788 ns 71523 Bar=2 Bat=5 Baz=3 Foo=1024m +BM_UserCounter/threads:2 4924 ns 9842 ns 71036 Bar=4 Bat=10 Baz=6 Foo=2 +BM_UserCounter/threads:4 2589 ns 10284 ns 68012 Bar=8 Bat=20 Baz=12 Foo=4 +BM_UserCounter/threads:8 2212 ns 10287 ns 68040 Bar=16 Bat=40 Baz=24 Foo=8 +BM_UserCounter/threads:16 1782 ns 10278 ns 68144 Bar=32 Bat=80 Baz=48 Foo=16 +BM_UserCounter/threads:32 1291 ns 10296 ns 68256 Bar=64 Bat=160 Baz=96 Foo=32 +BM_UserCounter/threads:4 2615 ns 10307 ns 68040 Bar=8 Bat=20 Baz=12 Foo=4 +BM_Factorial 26 ns 26 ns 26608979 40320 +BM_Factorial/real_time 26 ns 26 ns 26587936 40320 +BM_CalculatePiRange/1 16 ns 16 ns 45704255 0 +BM_CalculatePiRange/8 73 ns 73 ns 9520927 3.28374 +BM_CalculatePiRange/64 609 ns 609 ns 1140647 3.15746 +BM_CalculatePiRange/512 4900 ns 4901 ns 142696 3.14355 +``` + +If this doesn't suit you, you can print each counter as a table column by +passing the flag `--benchmark_counters_tabular=true` to the benchmark +application. This is best for cases in which there are a lot of counters, or +a lot of lines per individual benchmark. Note that this will trigger a +reprinting of the table header any time the counter set changes between +individual benchmarks. Here's an example of corresponding output when +`--benchmark_counters_tabular=true` is passed: + +``` +--------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations Bar Bat Baz Foo +--------------------------------------------------------------------------------------- +BM_UserCounter/threads:8 2198 ns 9953 ns 70688 16 40 24 8 +BM_UserCounter/threads:1 9504 ns 9504 ns 73787 2 5 3 1 +BM_UserCounter/threads:2 4775 ns 9550 ns 72606 4 10 6 2 +BM_UserCounter/threads:4 2508 ns 9951 ns 70332 8 20 12 4 +BM_UserCounter/threads:8 2055 ns 9933 ns 70344 16 40 24 8 +BM_UserCounter/threads:16 1610 ns 9946 ns 70720 32 80 48 16 +BM_UserCounter/threads:32 1192 ns 9948 ns 70496 64 160 96 32 +BM_UserCounter/threads:4 2506 ns 9949 ns 70332 8 20 12 4 +-------------------------------------------------------------- +Benchmark Time CPU Iterations +-------------------------------------------------------------- +BM_Factorial 26 ns 26 ns 26392245 40320 +BM_Factorial/real_time 26 ns 26 ns 26494107 40320 +BM_CalculatePiRange/1 15 ns 15 ns 45571597 0 +BM_CalculatePiRange/8 74 ns 74 ns 9450212 3.28374 +BM_CalculatePiRange/64 595 ns 595 ns 1173901 3.15746 +BM_CalculatePiRange/512 4752 ns 4752 ns 147380 3.14355 +BM_CalculatePiRange/4k 37970 ns 37972 ns 18453 3.14184 +BM_CalculatePiRange/32k 303733 ns 303744 ns 2305 3.14162 +BM_CalculatePiRange/256k 2434095 ns 2434186 ns 288 3.1416 +BM_CalculatePiRange/1024k 9721140 ns 9721413 ns 71 3.14159 +BM_CalculatePi/threads:8 2255 ns 9943 ns 70936 +``` +Note above the additional header printed when the benchmark changes from +``BM_UserCounter`` to ``BM_Factorial``. This is because ``BM_Factorial`` does +not have the same counter set as ``BM_UserCounter``. 
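+ +As a compact, self-contained sketch of the counter API described above (the +benchmark name and workload are hypothetical, not part of the library): + +```c++ +#include <benchmark/benchmark.h> + +// Hypothetical benchmark: report processed items both as a plain total and as +// a rate (the rate is divided by the elapsed benchmark time when reported). +static void BM_CounterDemo(benchmark::State& state) { + double items = 0; + for (auto _ : state) { + benchmark::DoNotOptimize(items += 1); // stand-in for real per-iteration work + } + state.counters["Items"] = items; + state.counters["ItemRate"] = benchmark::Counter(items, benchmark::Counter::kIsRate); +} +BENCHMARK(BM_CounterDemo); +BENCHMARK_MAIN(); +``` + +Run with `--benchmark_counters_tabular=true`, `Items` and `ItemRate` would then +appear as table columns instead of a trailing `UserCounters...` field.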
+
+
+
+### Multithreaded Benchmarks
+
+In a multithreaded test (benchmark invoked by multiple threads simultaneously),
+it is guaranteed that none of the threads will start until all have reached
+the start of the benchmark loop, and all will have finished before any thread
+exits the benchmark loop. (This behavior is also provided by the `KeepRunning()`
+API.) As such, any global setup or teardown can be wrapped in a check against
+the thread index:
+
+```c++
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  for (auto _ : state) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(2);
+```
+
+If the benchmarked code itself uses threads and you want to compare it to
+single-threaded code, you may want to use real-time ("wallclock") measurements
+for latency comparisons:
+
+```c++
+BENCHMARK(BM_test)->Range(8, 8<<10)->UseRealTime();
+```
+
+Without `UseRealTime`, CPU time is used by default.
+
+
+
+### CPU Timers
+
+By default, the CPU timer only measures the time spent by the main thread.
+If the benchmark itself uses threads internally, this measurement may not
+be what you are looking for. Instead, there is a way to measure the total
+CPU usage of the process, by all the threads.
+
+```c++
+void callee(int i);
+
+static void MyMain(int size) {
+#pragma omp parallel for
+  for(int i = 0; i < size; i++)
+    callee(i);
+}
+
+static void BM_OpenMP(benchmark::State& state) {
+  for (auto _ : state)
+    MyMain(state.range(0));
+}
+
+// Measure the time spent by the main thread, use it to decide for how long to
+// run the benchmark loop. Depending on the internal implementation details,
+// this may measure anywhere from near-zero (the overhead spent before/after
+// work handoff to worker thread[s]) to the whole single-thread time.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10);
+
+// Measure the user-visible time, the wall clock (literally, the time that
+// has passed on the clock on the wall), use it to decide for how long to
+// run the benchmark loop. This will always be meaningful, and will match the
+// time spent by the main thread in the single-threaded case, in general
+// decreasing with the number of internal threads doing the work.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->UseRealTime();
+
+// Measure the total CPU consumption, use it to decide for how long to
+// run the benchmark loop. This will always measure to no less than the
+// time spent by the main thread in the single-threaded case.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime();
+
+// A mixture of the last two. Measure the total CPU consumption, but use the
+// wall clock to decide for how long to run the benchmark loop.
+BENCHMARK(BM_OpenMP)->Range(8, 8<<10)->MeasureProcessCPUTime()->UseRealTime();
+```
+
+#### Controlling Timers
+
+Normally, the entire duration of the work loop (`for (auto _ : state) {}`)
+is measured. But sometimes it is necessary to do some work inside that
+loop, every iteration, without counting that time towards the benchmark time.
+That is possible, although it is not recommended, since it has high overhead.
+
+```c++
+static void BM_SetInsert_With_Timer_Control(benchmark::State& state) {
+  std::set<int> data;
+  for (auto _ : state) {
+    state.PauseTiming(); // Stop timers. They will not count until they are resumed.
+    data = ConstructRandomSet(state.range(0)); // Do something that should not be measured.
+    state.ResumeTiming(); // And resume timers. They are now counting again.
+    // The rest will be measured.
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert_With_Timer_Control)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```
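+If the per-iteration `PauseTiming()`/`ResumeTiming()` overhead proves too
+costly, one alternative (a sketch, not from the library documentation; it
+reuses the illustrative `ConstructRandomSet()` and `RandomNumber()` helpers
+from the example above) is to hoist the untimed setup out of the loop where
+the workload allows it:
+
+```c++
+static void BM_SetInsert_PreBuilt(benchmark::State& state) {
+  // Construct the base set once. Each iteration copies it, so the copy cost
+  // is included in the measurement -- acceptable when it is small relative
+  // to the insertions, and it avoids the timer-control overhead on every
+  // iteration.
+  const std::set<int> base = ConstructRandomSet(state.range(0));
+  for (auto _ : state) {
+    std::set<int> data = base;
+    for (int j = 0; j < state.range(1); ++j)
+      data.insert(RandomNumber());
+  }
+}
+BENCHMARK(BM_SetInsert_PreBuilt)->Ranges({{1<<10, 8<<10}, {128, 512}});
+```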
+
+
+
+### Manual Timing
+
+For benchmarking something for which neither CPU time nor real-time is
+accurate enough, completely manual timing is supported using
+the `UseManualTime` function.
+
+When `UseManualTime` is used, the benchmarked code must call
+`SetIterationTime` once per iteration of the benchmark loop to
+report the manually measured time.
+
+An example use case for this is benchmarking GPU execution (e.g. OpenCL
+or CUDA kernels, OpenGL or Vulkan or Direct3D draw calls), which cannot
+be accurately measured using CPU time or real-time. Instead, they can be
+measured accurately using a dedicated API, and these measurement results
+can be reported back with `SetIterationTime`.
+
+```c++
+static void BM_ManualTiming(benchmark::State& state) {
+  int microseconds = state.range(0);
+  std::chrono::duration<double, std::micro> sleep_duration {
+    static_cast<double>(microseconds)
+  };
+
+  for (auto _ : state) {
+    auto start = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(sleep_duration);
+    auto end = std::chrono::high_resolution_clock::now();
+
+    auto elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::duration<double>>(
+        end - start);
+
+    state.SetIterationTime(elapsed_seconds.count());
+  }
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1<<17)->UseManualTime();
+```
+
+
+
+### Setting the Time Unit
+
+If a benchmark runs for a few milliseconds, it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+To make the comparison easier, you can set the time unit manually:
+
+```c++
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+```
+
+
+
+### Preventing Optimization
+
+To prevent a value or expression from being optimized away by the compiler,
+the `benchmark::DoNotOptimize(...)` and `benchmark::ClobberMemory()`
+functions can be used.
+
+```c++
+static void BM_test(benchmark::State& state) {
+  for (auto _ : state) {
+    int x = 0;
+    for (int i=0; i < 64; ++i) {
+      benchmark::DoNotOptimize(x += i);
+    }
+  }
+}
+```
+
+`DoNotOptimize(<expr>)` forces the *result* of `<expr>` to be stored in either
+memory or a register. For GNU based compilers it acts as a read/write barrier
+for global memory. More specifically it forces the compiler to flush pending
+writes to memory and reload any other values as necessary.
+
+Note that `DoNotOptimize(<expr>)` does not prevent optimizations on `<expr>`
+in any way. `<expr>` may even be removed entirely when the result is already
+known. For example:
+
+```c++
+  /* Example 1: `<expr>` is removed entirely. */
+  int foo(int x) { return x + 42; }
+  while (...) DoNotOptimize(foo(0)); // Optimized to DoNotOptimize(42);
+
+  /* Example 2: Result of `<expr>` is only reused */
+  int bar(int) __attribute__((const));
+  while (...) DoNotOptimize(bar(0)); // Optimized to:
+  // int __result__ = bar(0);
+  // while (...) DoNotOptimize(__result__);
+```
+
+The second tool for preventing optimizations is `ClobberMemory()`. In essence
+`ClobberMemory()` forces the compiler to perform all pending writes to global
+memory. Memory managed by block scope objects must be "escaped" using
+`DoNotOptimize(...)` before it can be clobbered. In the below example
+`ClobberMemory()` prevents the call to `v.push_back(42)` from being optimized
+away.
+
+```c++
+static void BM_vector_push_back(benchmark::State& state) {
+  for (auto _ : state) {
+    std::vector<int> v;
+    v.reserve(1);
+    benchmark::DoNotOptimize(v.data()); // Allow v.data() to be clobbered.
+    v.push_back(42);
+    benchmark::ClobberMemory(); // Force 42 to be written to memory.
+  }
+}
+```
+
+Note that `ClobberMemory()` is only available for GNU or MSVC based compilers.
+
+
+
+### Statistics: Reporting the Mean, Median and Standard Deviation of Repeated Benchmarks
+
+By default each benchmark is run once and that single result is reported.
+However, benchmarks are often noisy and a single result may not be
+representative of the overall behavior. For this reason it's possible to
+repeatedly rerun the benchmark.
+
+The number of runs of each benchmark is specified globally by the
+`--benchmark_repetitions` flag or on a per-benchmark basis by calling
+`Repetitions` on the registered benchmark object. When a benchmark is run more
+than once, the mean, median and standard deviation of the runs will be reported.
+
+Additionally the `--benchmark_report_aggregates_only={true|false}` and
+`--benchmark_display_aggregates_only={true|false}` flags, or the
+`ReportAggregatesOnly(bool)` and `DisplayAggregatesOnly(bool)` functions, can be
+used to change how repeated tests are reported. By default the result of each
+repeated run is reported. When the `report aggregates only` option is `true`,
+only the aggregates (i.e. mean, median and standard deviation, plus complexity
+measurements if they were requested) of the runs are reported, to both
+reporters: standard output (console) and the file.
+However, when only the `display aggregates only` option is `true`,
+only the aggregates are displayed in the standard output, while the file
+output still contains everything.
+Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
+registered benchmark object overrides the value of the appropriate flag for that
+benchmark.
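+For example, the per-benchmark form looks like this (a brief sketch; `BM_test`
+is the same placeholder name used elsewhere in this document):
+
+```c++
+// Run 9 repetitions of BM_test and report only the mean, median and
+// standard deviation, suppressing the individual runs.
+BENCHMARK(BM_test)->Repetitions(9)->ReportAggregatesOnly(true);
+```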
+
+
+
+### Custom Statistics
+
+While having the mean, median and standard deviation is nice, this may not be
+enough for everyone. For example, you may want to know what the largest
+observation is, e.g. because you have some real-time constraints. This is easy.
+The following code will specify a custom statistic to be calculated, defined
+by a lambda function.
+
+```c++
+void BM_spin_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    for (int x = 0; x < state.range(0); ++x) {
+      benchmark::DoNotOptimize(x);
+    }
+  }
+}
+
+BENCHMARK(BM_spin_empty)
+  ->ComputeStatistics("max", [](const std::vector<double>& v) -> double {
+    return *(std::max_element(std::begin(v), std::end(v)));
+  })
+  ->Arg(512);
+```
+
+
+
+### Using RegisterBenchmark(name, fn, args...)
+
+The `RegisterBenchmark(name, func, args...)` function provides an alternative
+way to create and register benchmarks.
+`RegisterBenchmark(name, func, args...)` creates, registers, and returns a
+pointer to a new benchmark with the specified `name` that invokes
+`func(st, args...)` where `st` is a `benchmark::State` object.
+
+Unlike the `BENCHMARK` registration macros, which can only be used at global
+scope, `RegisterBenchmark` can be called anywhere. This allows for
+benchmark tests to be registered programmatically.
+
+Additionally, `RegisterBenchmark` allows any callable object to be registered
+as a benchmark, including capturing lambdas and function objects.
+
+For example:
+```c++
+auto BM_test = [](benchmark::State& st, auto Inputs) { /* ... */ };
+
+int main(int argc, char** argv) {
+  for (auto& test_input : { /* ... */ })
+    benchmark::RegisterBenchmark(test_input.name(), BM_test, test_input);
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
+```
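+Because `RegisterBenchmark` accepts arbitrary callables, it also combines
+naturally with configuration that is only known at startup. A minimal,
+self-contained sketch (the `BM_FillVector` name and the sizes are illustrative,
+not part of this change):
+
+```c++
+#include <benchmark/benchmark.h>
+
+#include <string>
+#include <vector>
+
+int main(int argc, char** argv) {
+  // Register one benchmark per size discovered at runtime.
+  const std::vector<int> sizes = {8, 64, 512};
+  for (int size : sizes) {
+    benchmark::RegisterBenchmark(
+        ("BM_FillVector/" + std::to_string(size)).c_str(),
+        [size](benchmark::State& state) {
+          for (auto _ : state) {
+            std::vector<int> v(size, 42);
+            benchmark::DoNotOptimize(v.data());
+          }
+        });
+  }
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+}
+```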
+
+
+
+### Exiting with an Error
+
+When errors caused by external influences, such as file I/O and network
+communication, occur within a benchmark, the
+`State::SkipWithError(const char* msg)` function can be used to skip that run
+of the benchmark and report the error. Note that only future iterations of
+`KeepRunning()` are skipped. For the ranged-for version of the benchmark loop,
+users must explicitly exit the loop, otherwise all iterations will be performed.
+Users may explicitly return to exit the benchmark immediately.
+
+The `SkipWithError(...)` function may be used at any point within the benchmark,
+including before and after the benchmark loop.
+
+For example:
+
+```c++
+static void BM_test(benchmark::State& state) {
+  auto resource = GetResource();
+  if (!resource.good()) {
+    state.SkipWithError("Resource is not good!");
+    // KeepRunning() loop will not be entered.
+  }
+  for (state.KeepRunning()) {
+    auto data = resource.read_data();
+    if (!resource.good()) {
+      state.SkipWithError("Failed to read data!");
+      break; // Needed to skip the rest of the iteration.
+    }
+    do_stuff(data);
+  }
+}
+
+static void BM_test_ranged_for(benchmark::State& state) {
+  state.SkipWithError("test will not be entered");
+  for (auto _ : state) {
+    state.SkipWithError("Failed!");
+    break; // REQUIRED to prevent all further iterations.
+  }
+}
+```
+
+
+### A Faster KeepRunning Loop
+
+In C++11 mode, a range-based for loop should be used in preference to
+the `KeepRunning` loop for running the benchmarks. For example:
+
+```c++
+static void BM_Fast(benchmark::State &state) {
+  for (auto _ : state) {
+    FastOperation();
+  }
+}
+BENCHMARK(BM_Fast);
+```
+
+The ranged-for loop is faster than the `KeepRunning` loop because
+`KeepRunning` requires a memory load and store of the iteration count
+every iteration, whereas the ranged-for variant is able to keep the iteration
+count in a register.
+
+For example, an empty inner loop using the range-based for method looks like:
+
+```asm
+# Loop Init
+  mov rbx, qword ptr [r14 + 104]
+  call benchmark::State::StartKeepRunning()
+  test rbx, rbx
+  je .LoopEnd
+.LoopHeader: # =>This Inner Loop Header: Depth=1
+  add rbx, -1
+  jne .LoopHeader
+.LoopEnd:
+```
+
+Compared to an empty `KeepRunning` loop, which looks like:
+
+```asm
+.LoopHeader: # in Loop: Header=BB0_3 Depth=1
+  cmp byte ptr [rbx], 1
+  jne .LoopInit
+.LoopBody: # =>This Inner Loop Header: Depth=1
+  mov rax, qword ptr [rbx + 8]
+  lea rcx, [rax + 1]
+  mov qword ptr [rbx + 8], rcx
+  cmp rax, qword ptr [rbx + 104]
+  jb .LoopHeader
+  jmp .LoopEnd
+.LoopInit:
+  mov rdi, rbx
+  call benchmark::State::StartKeepRunning()
+  jmp .LoopBody
+.LoopEnd:
+```
+
+Unless C++03 compatibility is required, the ranged-for variant of writing
+the benchmark loop should be preferred.
+
+
+
+### Disabling CPU Frequency Scaling
+If you see this error:
+```
+***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
+``` +you might want to disable the CPU frequency scaling while running the benchmark: +```bash +sudo cpupower frequency-set --governor performance +./mybench +sudo cpupower frequency-set --governor powersave +``` diff --git a/thirdparty/benchmark-1.5.0/WORKSPACE b/thirdparty/benchmark-1.5.0/WORKSPACE new file mode 100644 index 0000000000..9a75f968d9 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/WORKSPACE @@ -0,0 +1,9 @@ +workspace(name = "com_github_google_benchmark") + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +http_archive( + name = "com_google_googletest", + urls = ["https://github.com/google/googletest/archive/3f0cf6b62ad1eb50d8736538363d3580dd640c3e.zip"], + strip_prefix = "googletest-3f0cf6b62ad1eb50d8736538363d3580dd640c3e", +) diff --git a/thirdparty/benchmark-1.5.0/_config.yml b/thirdparty/benchmark-1.5.0/_config.yml new file mode 100644 index 0000000000..18854876c6 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/_config.yml @@ -0,0 +1 @@ +theme: jekyll-theme-midnight \ No newline at end of file diff --git a/thirdparty/benchmark-1.5.0/appveyor.yml b/thirdparty/benchmark-1.5.0/appveyor.yml new file mode 100644 index 0000000000..cf240190be --- /dev/null +++ b/thirdparty/benchmark-1.5.0/appveyor.yml @@ -0,0 +1,50 @@ +version: '{build}' + +image: Visual Studio 2017 + +configuration: + - Debug + - Release + +environment: + matrix: + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017" + + - compiler: msvc-15-seh + generator: "Visual Studio 15 2017 Win64" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015" + + - compiler: msvc-14-seh + generator: "Visual Studio 14 2015 Win64" + + - compiler: gcc-5.3.0-posix + generator: "MinGW Makefiles" + cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin' + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + +matrix: + fast_finish: true + +install: + # git bash conflicts with MinGW makefiles + - if "%generator%"=="MinGW Makefiles" (set "PATH=%PATH:C:\Program Files\Git\usr\bin;=%") + - if not "%cxx_path%"=="" (set "PATH=%PATH%;%cxx_path%") + +build_script: + - md _build -Force + - cd _build + - echo %configuration% + - cmake -G "%generator%" "-DCMAKE_BUILD_TYPE=%configuration%" -DBENCHMARK_DOWNLOAD_DEPENDENCIES=ON .. + - cmake --build . 
--config %configuration% + +test_script: + - ctest -c %configuration% --timeout 300 --output-on-failure + +artifacts: + - path: '_build/CMakeFiles/*.log' + name: logs + - path: '_build/Testing/**/*.xml' + name: test_results diff --git a/thirdparty/benchmark-1.5.0/cmake/AddCXXCompilerFlag.cmake b/thirdparty/benchmark-1.5.0/cmake/AddCXXCompilerFlag.cmake new file mode 100644 index 0000000000..d0d2099814 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/AddCXXCompilerFlag.cmake @@ -0,0 +1,74 @@ +# - Adds a compiler flag if it is supported by the compiler +# +# This function checks that the supplied compiler flag is supported and then +# adds it to the corresponding compiler flags +# +# add_cxx_compiler_flag( []) +# +# - Example +# +# include(AddCXXCompilerFlag) +# add_cxx_compiler_flag(-Wall) +# add_cxx_compiler_flag(-no-strict-aliasing RELEASE) +# Requires CMake 2.6+ + +if(__add_cxx_compiler_flag) + return() +endif() +set(__add_cxx_compiler_flag INCLUDED) + +include(CheckCXXCompilerFlag) + +function(mangle_compiler_flag FLAG OUTPUT) + string(TOUPPER "HAVE_CXX_FLAG_${FLAG}" SANITIZED_FLAG) + string(REPLACE "+" "X" SANITIZED_FLAG ${SANITIZED_FLAG}) + string(REGEX REPLACE "[^A-Za-z_0-9]" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) + string(REGEX REPLACE "_+" "_" SANITIZED_FLAG ${SANITIZED_FLAG}) + set(${OUTPUT} "${SANITIZED_FLAG}" PARENT_SCOPE) +endfunction(mangle_compiler_flag) + +function(add_cxx_compiler_flag FLAG) + mangle_compiler_flag("${FLAG}" MANGLED_FLAG) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") + check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") + if(${MANGLED_FLAG}) + set(VARIANT ${ARGV1}) + if(ARGV1) + string(TOUPPER "_${VARIANT}" VARIANT) + endif() + set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${BENCHMARK_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) + endif() +endfunction() + +function(add_required_cxx_compiler_flag FLAG) + mangle_compiler_flag("${FLAG}" MANGLED_FLAG) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}") + check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") + if(${MANGLED_FLAG}) + set(VARIANT ${ARGV1}) + if(ARGV1) + string(TOUPPER "_${VARIANT}" VARIANT) + endif() + set(CMAKE_CXX_FLAGS${VARIANT} "${CMAKE_CXX_FLAGS${VARIANT}} ${FLAG}" PARENT_SCOPE) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) + set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${FLAG}" PARENT_SCOPE) + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${FLAG}" PARENT_SCOPE) + else() + message(FATAL_ERROR "Required flag '${FLAG}' is not supported by the compiler") + endif() +endfunction() + +function(check_cxx_warning_flag FLAG) + mangle_compiler_flag("${FLAG}" MANGLED_FLAG) + set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") + # Add -Werror to ensure the compiler generates an error if the warning flag + # doesn't exist. 
+ set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror ${FLAG}") + check_cxx_compiler_flag("${FLAG}" ${MANGLED_FLAG}) + set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}") +endfunction() diff --git a/thirdparty/benchmark-1.5.0/cmake/CXXFeatureCheck.cmake b/thirdparty/benchmark-1.5.0/cmake/CXXFeatureCheck.cmake new file mode 100644 index 0000000000..99b56dd623 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/CXXFeatureCheck.cmake @@ -0,0 +1,64 @@ +# - Compile and run code to check for C++ features +# +# This functions compiles a source file under the `cmake` folder +# and adds the corresponding `HAVE_[FILENAME]` flag to the CMake +# environment +# +# cxx_feature_check( []) +# +# - Example +# +# include(CXXFeatureCheck) +# cxx_feature_check(STD_REGEX) +# Requires CMake 2.8.12+ + +if(__cxx_feature_check) + return() +endif() +set(__cxx_feature_check INCLUDED) + +function(cxx_feature_check FILE) + string(TOLOWER ${FILE} FILE) + string(TOUPPER ${FILE} VAR) + string(TOUPPER "HAVE_${VAR}" FEATURE) + if (DEFINED HAVE_${VAR}) + set(HAVE_${VAR} 1 PARENT_SCOPE) + add_definitions(-DHAVE_${VAR}) + return() + endif() + + if (NOT DEFINED COMPILE_${FEATURE}) + message(STATUS "Performing Test ${FEATURE}") + if(CMAKE_CROSSCOMPILING) + try_compile(COMPILE_${FEATURE} + ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp + CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} + LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) + if(COMPILE_${FEATURE}) + message(WARNING + "If you see build failures due to cross compilation, try setting HAVE_${VAR} to 0") + set(RUN_${FEATURE} 0) + else() + set(RUN_${FEATURE} 1) + endif() + else() + message(STATUS "Performing Test ${FEATURE}") + try_run(RUN_${FEATURE} COMPILE_${FEATURE} + ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/${FILE}.cpp + CMAKE_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS} + LINK_LIBRARIES ${BENCHMARK_CXX_LIBRARIES}) + endif() + endif() + + if(RUN_${FEATURE} EQUAL 0) + message(STATUS "Performing Test ${FEATURE} -- success") + set(HAVE_${VAR} 1 PARENT_SCOPE) + add_definitions(-DHAVE_${VAR}) + else() + if(NOT COMPILE_${FEATURE}) + message(STATUS "Performing Test ${FEATURE} -- failed to compile") + else() + message(STATUS "Performing Test ${FEATURE} -- compiled but failed to run") + endif() + endif() +endfunction() diff --git a/thirdparty/benchmark-1.5.0/cmake/Config.cmake.in b/thirdparty/benchmark-1.5.0/cmake/Config.cmake.in new file mode 100644 index 0000000000..6e9256eea8 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/Config.cmake.in @@ -0,0 +1 @@ +include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake") diff --git a/thirdparty/benchmark-1.5.0/cmake/GetGitVersion.cmake b/thirdparty/benchmark-1.5.0/cmake/GetGitVersion.cmake new file mode 100644 index 0000000000..4f10f226d7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/GetGitVersion.cmake @@ -0,0 +1,54 @@ +# - Returns a version string from Git tags +# +# This function inspects the annotated git tags for the project and returns a string +# into a CMake variable +# +# get_git_version() +# +# - Example +# +# include(GetGitVersion) +# get_git_version(GIT_VERSION) +# +# Requires CMake 2.8.11+ +find_package(Git) + +if(__get_git_version) + return() +endif() +set(__get_git_version INCLUDED) + +function(get_git_version var) + if(GIT_EXECUTABLE) + execute_process(COMMAND ${GIT_EXECUTABLE} describe --match "v[0-9]*.[0-9]*.[0-9]*" --abbrev=8 + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + RESULT_VARIABLE status + OUTPUT_VARIABLE GIT_VERSION + ERROR_QUIET) + if(${status}) + set(GIT_VERSION 
"v0.0.0") + else() + string(STRIP ${GIT_VERSION} GIT_VERSION) + string(REGEX REPLACE "-[0-9]+-g" "-" GIT_VERSION ${GIT_VERSION}) + endif() + + # Work out if the repository is dirty + execute_process(COMMAND ${GIT_EXECUTABLE} update-index -q --refresh + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_QUIET + ERROR_QUIET) + execute_process(COMMAND ${GIT_EXECUTABLE} diff-index --name-only HEAD -- + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + OUTPUT_VARIABLE GIT_DIFF_INDEX + ERROR_QUIET) + string(COMPARE NOTEQUAL "${GIT_DIFF_INDEX}" "" GIT_DIRTY) + if (${GIT_DIRTY}) + set(GIT_VERSION "${GIT_VERSION}-dirty") + endif() + else() + set(GIT_VERSION "v0.0.0") + endif() + + message(STATUS "git Version: ${GIT_VERSION}") + set(${var} ${GIT_VERSION} PARENT_SCOPE) +endfunction() diff --git a/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake b/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake new file mode 100644 index 0000000000..fb7c6be25e --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake @@ -0,0 +1,41 @@ +# Download and unpack googletest at configure time +set(GOOGLETEST_PREFIX "${benchmark_BINARY_DIR}/third_party/googletest") +configure_file(${benchmark_SOURCE_DIR}/cmake/GoogleTest.cmake.in ${GOOGLETEST_PREFIX}/CMakeLists.txt @ONLY) + +set(GOOGLETEST_PATH "${CMAKE_CURRENT_SOURCE_DIR}/googletest") # Mind the quotes +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" + -DALLOW_DOWNLOADING_GOOGLETEST=${BENCHMARK_DOWNLOAD_DEPENDENCIES} -DGOOGLETEST_PATH:PATH=${GOOGLETEST_PATH} . + RESULT_VARIABLE result + WORKING_DIRECTORY ${GOOGLETEST_PREFIX} +) + +if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") +endif() + +execute_process( + COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${GOOGLETEST_PREFIX} +) + +if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") +endif() + +# Prevent overriding the parent project's compiler/linker +# settings on Windows +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + +include(${GOOGLETEST_PREFIX}/googletest-paths.cmake) + +# Add googletest directly to our build. This defines +# the gtest and gtest_main targets. +add_subdirectory(${GOOGLETEST_SOURCE_DIR} + ${GOOGLETEST_BINARY_DIR} + EXCLUDE_FROM_ALL) + +set_target_properties(gtest PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) +set_target_properties(gtest_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) +set_target_properties(gmock PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) +set_target_properties(gmock_main PROPERTIES INTERFACE_SYSTEM_INCLUDE_DIRECTORIES $) diff --git a/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake.in b/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake.in new file mode 100644 index 0000000000..28818ee293 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/GoogleTest.cmake.in @@ -0,0 +1,58 @@ +cmake_minimum_required(VERSION 2.8.12) + +project(googletest-download NONE) + +# Enable ExternalProject CMake module +include(ExternalProject) + +option(ALLOW_DOWNLOADING_GOOGLETEST "If googletest src tree is not found in location specified by GOOGLETEST_PATH, do fetch the archive from internet" OFF) +set(GOOGLETEST_PATH "/usr/src/googletest" CACHE PATH + "Path to the googletest root tree. Should contain googletest and googlemock subdirs. 
And CMakeLists.txt in root, and in both of these subdirs") + +# Download and install GoogleTest + +message(STATUS "Looking for Google Test sources") +message(STATUS "Looking for Google Test sources in ${GOOGLETEST_PATH}") +if(EXISTS "${GOOGLETEST_PATH}" AND IS_DIRECTORY "${GOOGLETEST_PATH}" AND EXISTS "${GOOGLETEST_PATH}/CMakeLists.txt" AND + EXISTS "${GOOGLETEST_PATH}/googletest" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googletest" AND EXISTS "${GOOGLETEST_PATH}/googletest/CMakeLists.txt" AND + EXISTS "${GOOGLETEST_PATH}/googlemock" AND IS_DIRECTORY "${GOOGLETEST_PATH}/googlemock" AND EXISTS "${GOOGLETEST_PATH}/googlemock/CMakeLists.txt") + message(STATUS "Found Google Test in ${GOOGLETEST_PATH}") + + ExternalProject_Add( + googletest + PREFIX "${CMAKE_BINARY_DIR}" + DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download" + SOURCE_DIR "${GOOGLETEST_PATH}" # use existing src dir. + BINARY_DIR "${CMAKE_BINARY_DIR}/build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +else() + if(NOT ALLOW_DOWNLOADING_GOOGLETEST) + message(SEND_ERROR "Did not find Google Test sources! Either pass correct path in GOOGLETEST_PATH, or enable ALLOW_DOWNLOADING_GOOGLETEST, or disable BENCHMARK_ENABLE_GTEST_TESTS / BENCHMARK_ENABLE_TESTING.") + else() + message(WARNING "Did not find Google Test sources! Fetching from web...") + ExternalProject_Add( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + PREFIX "${CMAKE_BINARY_DIR}" + STAMP_DIR "${CMAKE_BINARY_DIR}/stamp" + DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/download" + SOURCE_DIR "${CMAKE_BINARY_DIR}/src" + BINARY_DIR "${CMAKE_BINARY_DIR}/build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + endif() +endif() + +ExternalProject_Get_Property(googletest SOURCE_DIR BINARY_DIR) +file(WRITE googletest-paths.cmake +"set(GOOGLETEST_SOURCE_DIR \"${SOURCE_DIR}\") +set(GOOGLETEST_BINARY_DIR \"${BINARY_DIR}\") +") diff --git a/thirdparty/benchmark-1.5.0/cmake/benchmark.pc.in b/thirdparty/benchmark-1.5.0/cmake/benchmark.pc.in new file mode 100644 index 0000000000..43ca8f91d7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/benchmark.pc.in @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +libdir=${prefix}/lib +includedir=${prefix}/include + +Name: @PROJECT_NAME@ +Description: Google microbenchmark framework +Version: @VERSION@ + +Libs: -L${libdir} -lbenchmark +Libs.private: -lpthread +Cflags: -I${includedir} diff --git a/thirdparty/benchmark-1.5.0/cmake/gnu_posix_regex.cpp b/thirdparty/benchmark-1.5.0/cmake/gnu_posix_regex.cpp new file mode 100644 index 0000000000..b5b91cdab7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/gnu_posix_regex.cpp @@ -0,0 +1,12 @@ +#include +#include +int main() { + std::string str = "test0159"; + regex_t re; + int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + return ec; + } + return regexec(&re, str.c_str(), 0, nullptr, 0) ? 
-1 : 0; +} + diff --git a/thirdparty/benchmark-1.5.0/cmake/llvm-toolchain.cmake b/thirdparty/benchmark-1.5.0/cmake/llvm-toolchain.cmake new file mode 100644 index 0000000000..fc119e52fd --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/llvm-toolchain.cmake @@ -0,0 +1,8 @@ +find_package(LLVMAr REQUIRED) +set(CMAKE_AR "${LLVMAR_EXECUTABLE}" CACHE FILEPATH "" FORCE) + +find_package(LLVMNm REQUIRED) +set(CMAKE_NM "${LLVMNM_EXECUTABLE}" CACHE FILEPATH "" FORCE) + +find_package(LLVMRanLib REQUIRED) +set(CMAKE_RANLIB "${LLVMRANLIB_EXECUTABLE}" CACHE FILEPATH "" FORCE) diff --git a/thirdparty/benchmark-1.5.0/cmake/posix_regex.cpp b/thirdparty/benchmark-1.5.0/cmake/posix_regex.cpp new file mode 100644 index 0000000000..466dc62560 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/posix_regex.cpp @@ -0,0 +1,14 @@ +#include +#include +int main() { + std::string str = "test0159"; + regex_t re; + int ec = regcomp(&re, "^[a-z]+[0-9]+$", REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + return ec; + } + int ret = regexec(&re, str.c_str(), 0, nullptr, 0) ? -1 : 0; + regfree(&re); + return ret; +} + diff --git a/thirdparty/benchmark-1.5.0/cmake/split_list.cmake b/thirdparty/benchmark-1.5.0/cmake/split_list.cmake new file mode 100644 index 0000000000..67aed3fdc8 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/split_list.cmake @@ -0,0 +1,3 @@ +macro(split_list listname) + string(REPLACE ";" " " ${listname} "${${listname}}") +endmacro() diff --git a/thirdparty/benchmark-1.5.0/cmake/std_regex.cpp b/thirdparty/benchmark-1.5.0/cmake/std_regex.cpp new file mode 100644 index 0000000000..696f2a26bc --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/std_regex.cpp @@ -0,0 +1,10 @@ +#include +#include +int main() { + const std::string str = "test0159"; + std::regex re; + re = std::regex("^[a-z]+[0-9]+$", + std::regex_constants::extended | std::regex_constants::nosubs); + return std::regex_search(str, re) ? 
0 : -1; +} + diff --git a/thirdparty/benchmark-1.5.0/cmake/steady_clock.cpp b/thirdparty/benchmark-1.5.0/cmake/steady_clock.cpp new file mode 100644 index 0000000000..66d50d17e9 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/steady_clock.cpp @@ -0,0 +1,7 @@ +#include + +int main() { + typedef std::chrono::steady_clock Clock; + Clock::time_point tp = Clock::now(); + ((void)tp); +} diff --git a/thirdparty/benchmark-1.5.0/cmake/thread_safety_attributes.cpp b/thirdparty/benchmark-1.5.0/cmake/thread_safety_attributes.cpp new file mode 100644 index 0000000000..46161babdb --- /dev/null +++ b/thirdparty/benchmark-1.5.0/cmake/thread_safety_attributes.cpp @@ -0,0 +1,4 @@ +#define HAVE_THREAD_SAFETY_ATTRIBUTES +#include "../src/mutex.h" + +int main() {} diff --git a/thirdparty/benchmark-1.5.0/conan/CMakeLists.txt b/thirdparty/benchmark-1.5.0/conan/CMakeLists.txt new file mode 100644 index 0000000000..15b92ca91a --- /dev/null +++ b/thirdparty/benchmark-1.5.0/conan/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 2.8.11) +project(cmake_wrapper) + +include(conanbuildinfo.cmake) +conan_basic_setup() + +include(${CMAKE_SOURCE_DIR}/CMakeListsOriginal.txt) diff --git a/thirdparty/benchmark-1.5.0/conan/test_package/CMakeLists.txt b/thirdparty/benchmark-1.5.0/conan/test_package/CMakeLists.txt new file mode 100644 index 0000000000..089a6c729d --- /dev/null +++ b/thirdparty/benchmark-1.5.0/conan/test_package/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 2.8.11) +project(test_package) + +set(CMAKE_VERBOSE_MAKEFILE TRUE) + +include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake) +conan_basic_setup() + +add_executable(${PROJECT_NAME} test_package.cpp) +target_link_libraries(${PROJECT_NAME} ${CONAN_LIBS}) diff --git a/thirdparty/benchmark-1.5.0/conan/test_package/conanfile.py b/thirdparty/benchmark-1.5.0/conan/test_package/conanfile.py new file mode 100644 index 0000000000..d63f4088c9 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/conan/test_package/conanfile.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from conans import ConanFile, CMake +import os + + +class TestPackageConan(ConanFile): + settings = "os", "compiler", "build_type", "arch" + generators = "cmake" + + def build(self): + cmake = CMake(self) + cmake.configure() + cmake.build() + + def test(self): + bin_path = os.path.join("bin", "test_package") + self.run(bin_path, run_environment=True) diff --git a/thirdparty/benchmark-1.5.0/conan/test_package/test_package.cpp b/thirdparty/benchmark-1.5.0/conan/test_package/test_package.cpp new file mode 100644 index 0000000000..4fa7ec0bf9 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/conan/test_package/test_package.cpp @@ -0,0 +1,18 @@ +#include "benchmark/benchmark.h" + +void BM_StringCreation(benchmark::State& state) { + while (state.KeepRunning()) + std::string empty_string; +} + +BENCHMARK(BM_StringCreation); + +void BM_StringCopy(benchmark::State& state) { + std::string x = "hello"; + while (state.KeepRunning()) + std::string copy(x); +} + +BENCHMARK(BM_StringCopy); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/conanfile.py b/thirdparty/benchmark-1.5.0/conanfile.py new file mode 100644 index 0000000000..e31fc5268a --- /dev/null +++ b/thirdparty/benchmark-1.5.0/conanfile.py @@ -0,0 +1,79 @@ +from conans import ConanFile, CMake, tools +from conans.errors import ConanInvalidConfiguration +import shutil +import os + + +class GoogleBenchmarkConan(ConanFile): + name = "benchmark" + description = "A microbenchmark support library." 
+ topics = ("conan", "benchmark", "google", "microbenchmark") + url = "https://github.com/google/benchmark" + homepage = "https://github.com/google/benchmark" + author = "Google Inc." + license = "Apache-2.0" + exports_sources = ["*"] + generators = "cmake" + + settings = "arch", "build_type", "compiler", "os" + options = { + "shared": [True, False], + "fPIC": [True, False], + "enable_lto": [True, False], + "enable_exceptions": [True, False] + } + default_options = {"shared": False, "fPIC": True, "enable_lto": False, "enable_exceptions": True} + + _build_subfolder = "." + + def source(self): + # Wrap the original CMake file to call conan_basic_setup + shutil.move("CMakeLists.txt", "CMakeListsOriginal.txt") + shutil.move(os.path.join("conan", "CMakeLists.txt"), "CMakeLists.txt") + + def config_options(self): + if self.settings.os == "Windows": + if self.settings.compiler == "Visual Studio" and float(self.settings.compiler.version.value) <= 12: + raise ConanInvalidConfiguration("{} {} does not support Visual Studio <= 12".format(self.name, self.version)) + del self.options.fPIC + + def configure(self): + if self.settings.os == "Windows" and self.options.shared: + raise ConanInvalidConfiguration("Windows shared builds are not supported right now, see issue #639") + + def _configure_cmake(self): + cmake = CMake(self) + + cmake.definitions["BENCHMARK_ENABLE_TESTING"] = "OFF" + cmake.definitions["BENCHMARK_ENABLE_GTEST_TESTS"] = "OFF" + cmake.definitions["BENCHMARK_ENABLE_LTO"] = "ON" if self.options.enable_lto else "OFF" + cmake.definitions["BENCHMARK_ENABLE_EXCEPTIONS"] = "ON" if self.options.enable_exceptions else "OFF" + + # See https://github.com/google/benchmark/pull/638 for Windows 32 build explanation + if self.settings.os != "Windows": + cmake.definitions["BENCHMARK_BUILD_32_BITS"] = "ON" if "64" not in str(self.settings.arch) else "OFF" + cmake.definitions["BENCHMARK_USE_LIBCXX"] = "ON" if (str(self.settings.compiler.libcxx) == "libc++") else "OFF" + else: + cmake.definitions["BENCHMARK_USE_LIBCXX"] = "OFF" + + cmake.configure(build_folder=self._build_subfolder) + return cmake + + def build(self): + cmake = self._configure_cmake() + cmake.build() + + def package(self): + cmake = self._configure_cmake() + cmake.install() + + self.copy(pattern="LICENSE", dst="licenses") + + def package_info(self): + self.cpp_info.libs = tools.collect_libs(self) + if self.settings.os == "Linux": + self.cpp_info.libs.extend(["pthread", "rt"]) + elif self.settings.os == "Windows": + self.cpp_info.libs.append("shlwapi") + elif self.settings.os == "SunOS": + self.cpp_info.libs.append("kstat") diff --git a/thirdparty/benchmark-1.5.0/dependencies.md b/thirdparty/benchmark-1.5.0/dependencies.md new file mode 100644 index 0000000000..6289b4e354 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/dependencies.md @@ -0,0 +1,18 @@ +# Build tool dependency policy + +To ensure the broadest compatibility when building the benchmark library, but +still allow forward progress, we require any build tooling to be available for: + +* Debian stable AND +* The last two Ubuntu LTS releases AND + +Currently, this means using build tool versions that are available for Ubuntu +16.04 (Xenial), Ubuntu 18.04 (Bionic), and Debian stretch. + +_Note, [travis](.travis.yml) runs under Ubuntu 14.04 (Trusty) for linux builds._ + +## cmake +The current supported version is cmake 3.5.1 as of 2018-06-06. 
+
+_Note, this version is also available for Ubuntu 14.04, the previous Ubuntu LTS
+release, as `cmake3`._
diff --git a/thirdparty/benchmark-1.5.0/docs/AssemblyTests.md b/thirdparty/benchmark-1.5.0/docs/AssemblyTests.md
new file mode 100644
index 0000000000..1fbdc269b5
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/docs/AssemblyTests.md
@@ -0,0 +1,147 @@
+# Assembly Tests
+
+The Benchmark library provides a number of functions whose primary
+purpose is to affect assembly generation, including `DoNotOptimize`
+and `ClobberMemory`. In addition there are other functions,
+such as `KeepRunning`, for which generating good assembly is paramount.
+
+For these functions it's important to have tests that verify the
+correctness and quality of the implementation. This requires testing
+the code generated by the compiler.
+
+This document describes how the Benchmark library tests compiler output,
+as well as how to properly write new tests.
+
+
+## Anatomy of a Test
+
+Writing a test has two steps:
+
+* Write the code you want to generate assembly for.
+* Add `// CHECK` lines to match against the verified assembly.
+
+Example:
+```c++
+
+// CHECK-LABEL: test_add:
+extern "C" int test_add() {
+  extern int ExternInt;
+  return ExternInt + 1;
+
+  // CHECK: movl ExternInt(%rip), %eax
+  // CHECK: addl %eax
+  // CHECK: ret
+}
+
+```
+
+#### LLVM Filecheck
+
+[LLVM's Filecheck](https://llvm.org/docs/CommandGuide/FileCheck.html)
+is used to test the generated assembly against the `// CHECK` lines
+specified in the test's source file. Please see the documentation
+linked above for information on how to write `CHECK` directives.
+
+#### Tips and Tricks:
+
+* Tests should match the minimal amount of output required to establish
+correctness. `CHECK` directives don't have to match on the exact next line
+after the previous match, so tests should omit checks for unimportant
+bits of assembly. ([`CHECK-NEXT`](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-next-directive)
+can be used to ensure a match occurs exactly after the previous match).
+
+* The tests are compiled with `-O3 -g0`, so we're only testing the
+optimized output.
+
+* The assembly output is further cleaned up using `tools/strip_asm.py`.
+This removes comments, assembler directives, and unused labels before
+the test is run.
+
+* The generated and stripped assembly file for a test is output under
+`<build dir>/test/<test name>.s`
+
+* Filecheck supports using [`CHECK` prefixes](https://llvm.org/docs/CommandGuide/FileCheck.html#cmdoption-check-prefixes)
+to specify lines that should only match in certain situations.
+The Benchmark tests use `CHECK-CLANG` and `CHECK-GNU` for lines that
+are only expected to match Clang's or GCC's output respectively. Normal
+`CHECK` lines match against all compilers. (Note: `CHECK-NOT` and
+`CHECK-LABEL` are NOT prefixes. They are versions of non-prefixed
+`CHECK` lines.)
+
+* Use `extern "C"` to disable name mangling for specific functions. This
+makes them easier to name in the `CHECK` lines.
+
+
+## Problems Writing Portable Tests
+
+Writing tests which check the code generated by a compiler is
+inherently non-portable. Different compilers and even different compiler
+versions may generate entirely different code. The Benchmark tests
+must tolerate this.
+
+LLVM Filecheck provides a number of mechanisms to help write
+"more portable" tests, including [matching using regular expressions](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-pattern-matching-syntax),
+allowing the creation of [named variables](https://llvm.org/docs/CommandGuide/FileCheck.html#filecheck-variables)
+for later matching, and [checking non-sequential matches](https://llvm.org/docs/CommandGuide/FileCheck.html#the-check-dag-directive).
+
+#### Capturing Variables
+
+For example, say GCC stores a variable in a register but Clang stores
+it in memory. To write a test that tolerates both cases we "capture"
+the destination of the store, and then use the captured expression
+to write the remainder of the test.
+
+```c++
+// CHECK-LABEL: test_div_no_op_into_shr:
+extern "C" int test_div_no_op_into_shr(int value) {
+  int divisor = 2;
+  benchmark::DoNotOptimize(divisor); // hide the value from the optimizer
+  return value / divisor;
+
+  // CHECK: movl $2, [[DEST:.*]]
+  // CHECK: idivl [[DEST]]
+  // CHECK: ret
+}
+```
+
+#### Using Regular Expressions to Match Differing Output
+
+Often tests require testing assembly lines which may subtly differ
+between compilers or compiler versions. A common example of this
+is matching stack frame addresses. In this case regular expressions
+can be used to match the differing bits of output. For example:
+
+```c++
+int ExternInt;
+struct Point { int x, y, z; };
+
+// CHECK-LABEL: test_store_point:
+extern "C" void test_store_point() {
+  Point p{ExternInt, ExternInt, ExternInt};
+  benchmark::DoNotOptimize(p);
+
+  // CHECK: movl ExternInt(%rip), %eax
+  // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+  // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+  // CHECK: movl %eax, -{{[0-9]+}}(%rsp)
+  // CHECK: ret
+}
+```
+
+## Current Requirements and Limitations
+
+The tests require Filecheck to be installed along the `PATH` of the
+build machine. Otherwise the tests will be disabled.
+
+Additionally, as mentioned in the previous section, codegen tests are
+inherently non-portable. Currently the tests are limited to:
+
+* x86_64 targets.
+* Compiled with GCC or Clang.
+
+Further work could be done, at least on a limited basis, to extend the
+tests to other architectures and compilers (using `CHECK` prefixes).
+
+Furthermore, the tests fail for builds which specify additional flags
+that modify code generation, including `--coverage` or `-fsanitize=`.
+
diff --git a/thirdparty/benchmark-1.5.0/docs/_config.yml b/thirdparty/benchmark-1.5.0/docs/_config.yml
new file mode 100644
index 0000000000..18854876c6
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/docs/_config.yml
@@ -0,0 +1 @@
+theme: jekyll-theme-midnight
\ No newline at end of file
diff --git a/thirdparty/benchmark-1.5.0/docs/tools.md b/thirdparty/benchmark-1.5.0/docs/tools.md
new file mode 100644
index 0000000000..4a3b2e9bd2
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/docs/tools.md
@@ -0,0 +1,199 @@
+# Benchmark Tools
+
+## compare.py
+
+The `compare.py` utility can be used to compare the results of benchmarks.
+
+**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
+
+### Displaying aggregates only
+
+The switch `-a` / `--display_aggregates_only` can be used to control the
+display of the normal iterations vs. the aggregates. When passed, it will
+be passed through to the benchmark binaries to be run, and will be accounted
+for in the tool itself; only the aggregates will be displayed, not the normal
+runs. It only affects the display; the separate runs will still be used to
+calculate the U test.
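+For instance, assuming two illustrative binaries `./bench_old` and
+`./bench_new` (the names, and the exact flag placement, are a sketch rather
+than a verbatim transcript of the tool's help output):
+
+```bash
+# Run both binaries with 9 repetitions each, but display only the
+# mean/median/stddev aggregates of every benchmark.
+compare.py -a benchmarks ./bench_old ./bench_new --benchmark_repetitions=9
+```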
+
+### Modes of operation
+
+There are three modes of operation:
+
+1. Just compare two benchmarks
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarks <benchmark_baseline> <benchmark_contender> [benchmark options]...
+```
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarks ./a.out ./a.out
+RUNNING: ./a.out --benchmark_out=/tmp/tmprBT5nW
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:44
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19101577   211.669MB/s
+BM_memcpy/64           76 ns         76 ns    9412571   800.199MB/s
+BM_memcpy/512          84 ns         84 ns    8249070   5.64771GB/s
+BM_memcpy/1024        116 ns        116 ns    6181763   8.19505GB/s
+BM_memcpy/8192        643 ns        643 ns    1062855   11.8636GB/s
+BM_copy/8             222 ns        222 ns    3137987   34.3772MB/s
+BM_copy/64           1608 ns       1608 ns     432758   37.9501MB/s
+BM_copy/512         12589 ns      12589 ns      54806   38.7867MB/s
+BM_copy/1024        25169 ns      25169 ns      27713   38.8003MB/s
+BM_copy/8192       201165 ns     201112 ns       3486   38.8466MB/s
+RUNNING: ./a.out --benchmark_out=/tmp/tmpt1wwG_
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:16:53
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   19397903   211.255MB/s
+BM_memcpy/64           73 ns         73 ns    9691174   839.635MB/s
+BM_memcpy/512          85 ns         85 ns    8312329   5.60101GB/s
+BM_memcpy/1024        118 ns        118 ns    6438774   8.11608GB/s
+BM_memcpy/8192        656 ns        656 ns    1068644   11.6277GB/s
+BM_copy/8             223 ns        223 ns    3146977   34.2338MB/s
+BM_copy/64           1611 ns       1611 ns     435340   37.8751MB/s
+BM_copy/512         12622 ns      12622 ns      54818   38.6844MB/s
+BM_copy/1024        25257 ns      25239 ns      27779   38.6927MB/s
+BM_copy/8192       205013 ns     205010 ns       3479    38.108MB/s
+Comparing ./a.out to ./a.out
+Benchmark                 Time             CPU      Time Old      Time New       CPU Old       CPU New
+------------------------------------------------------------------------------------------------------
+BM_memcpy/8            +0.0020         +0.0020            36            36            36            36
+BM_memcpy/64           -0.0468         -0.0470            76            73            76            73
+BM_memcpy/512          +0.0081         +0.0083            84            85            84            85
+BM_memcpy/1024         +0.0098         +0.0097           116           118           116           118
+BM_memcpy/8192         +0.0200         +0.0203           643           656           643           656
+BM_copy/8              +0.0046         +0.0042           222           223           222           223
+BM_copy/64             +0.0020         +0.0020          1608          1611          1608          1611
+BM_copy/512            +0.0027         +0.0026         12589         12622         12589         12622
+BM_copy/1024           +0.0035         +0.0028         25169         25257         25169         25239
+BM_copy/8192           +0.0191         +0.0194        201165        205013        201112        205010
+```
+
+For every benchmark from the first run, the tool looks for the benchmark with exactly the same name in the second run, and then compares the results. If the names differ, the benchmark is omitted from the diff.
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
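+To make that formula concrete, take the `BM_memcpy/64` row above:
+
+```
+Time Old = 76 ns, Time New = 73 ns
+(new - old) / |old| = (73 - 76) / 76 ≈ -0.039
+```
+
+The reported value of -0.0468 differs slightly because the printed times are
+rounded to whole nanoseconds, while the tool computes the ratio from the
+unrounded measurements.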
+
+2. Compare two different filters of one benchmark
+The program is invoked like:
+
+``` bash
+$ compare.py filters <benchmark> <filter_baseline> <filter_contender> [benchmark options]...
+```
+Where `<benchmark>` either specifies a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py filters ./a.out BM_memcpy BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmpBWKk0k
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:28
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            36 ns         36 ns   17891491   211.215MB/s
+BM_memcpy/64           74 ns         74 ns    9400999   825.646MB/s
+BM_memcpy/512          87 ns         87 ns    8027453   5.46126GB/s
+BM_memcpy/1024        111 ns        111 ns    6116853    8.5648GB/s
+BM_memcpy/8192        657 ns        656 ns    1064679   11.6247GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpAvWcOM
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:37:33
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           227 ns        227 ns    3038700   33.6264MB/s
+BM_copy/64         1640 ns       1640 ns     426893   37.2154MB/s
+BM_copy/512       12804 ns      12801 ns      55417   38.1444MB/s
+BM_copy/1024      25409 ns      25407 ns      27516   38.4365MB/s
+BM_copy/8192     202986 ns     202990 ns       3454   38.4871MB/s
+Comparing BM_memcpy to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.2829         +5.2812            36           227            36           227
+[BM_memcpy vs. BM_copy]/64          +21.1719        +21.1856            74          1640            74          1640
+[BM_memcpy vs. BM_copy]/512        +145.6487       +145.6097            87         12804            87         12801
+[BM_memcpy vs. BM_copy]/1024       +227.1860       +227.1776           111         25409           111         25407
+[BM_memcpy vs. BM_copy]/8192       +308.1664       +308.2898           657        202986           656        202990
+```
+
+As you can see, the tool applies the filters to the benchmarks, both when running the benchmark and before doing the diff; to make the diff work, the filter matches are replaced with a common string. Thus, you can compare two different benchmark families within one benchmark binary.
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+3. Compare filter one from benchmark one to filter two from benchmark two:
+The program is invoked like:
+
+``` bash
+$ compare.py benchmarksfiltered <benchmark_baseline> <filter_baseline> <benchmark_contender> <filter_contender> [benchmark options]...
+```
+
+Where `<benchmark_baseline>` and `<benchmark_contender>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
+
+Where `<filter_baseline>` and `<filter_contender>` are the same regex filters that you would pass to the `[--benchmark_filter=<regex>]` parameter of the benchmark binary.
+
+`[benchmark options]` will be passed to the benchmark invocations. They can be anything that the binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
+
+Example output:
+```
+$ ./compare.py benchmarksfiltered ./a.out BM_memcpy ./a.out BM_copy
+RUNNING: ./a.out --benchmark_filter=BM_memcpy --benchmark_out=/tmp/tmp_FvbYg
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:27
+------------------------------------------------------
+Benchmark               Time           CPU Iterations
+------------------------------------------------------
+BM_memcpy/8            37 ns         37 ns   18953482   204.118MB/s
+BM_memcpy/64           74 ns         74 ns    9206578   828.245MB/s
+BM_memcpy/512          91 ns         91 ns    8086195   5.25476GB/s
+BM_memcpy/1024        120 ns        120 ns    5804513   7.95662GB/s
+BM_memcpy/8192        664 ns        664 ns    1028363   11.4948GB/s
+RUNNING: ./a.out --benchmark_filter=BM_copy --benchmark_out=/tmp/tmpDfL5iE
+Run on (8 X 4000 MHz CPU s)
+2017-11-07 21:38:32
+----------------------------------------------------
+Benchmark             Time           CPU Iterations
+----------------------------------------------------
+BM_copy/8           230 ns        230 ns    2985909   33.1161MB/s
+BM_copy/64         1654 ns       1653 ns     419408   36.9137MB/s
+BM_copy/512       13122 ns      13120 ns      53403   37.2156MB/s
+BM_copy/1024      26679 ns      26666 ns      26575   36.6218MB/s
+BM_copy/8192     215068 ns     215053 ns       3221   36.3283MB/s
+Comparing BM_memcpy (from ./a.out) to BM_copy (from ./a.out)
+Benchmark                               Time             CPU      Time Old      Time New       CPU Old       CPU New
+--------------------------------------------------------------------------------------------------------------------
+[BM_memcpy vs. BM_copy]/8            +5.1649         +5.1637            37           230            37           230
+[BM_memcpy vs. BM_copy]/64          +21.4352        +21.4374            74          1654            74          1653
+[BM_memcpy vs. BM_copy]/512        +143.6022       +143.5865            91         13122            91         13120
+[BM_memcpy vs. BM_copy]/1024       +221.5903       +221.4790           120         26679           120         26666
+[BM_memcpy vs. BM_copy]/8192       +322.9059       +323.0096           664        215068           664        215053
+```
+This is a mix of the previous two modes: two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
+As you can see, the values in the `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+### U test
+
+If there is a sufficient repetition count of the benchmarks, the tool can do
+a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test) of the
+null hypothesis that it is equally likely that a randomly selected value from
+one sample will be less than or greater than a randomly selected value from a
+second sample.
+
+If the calculated p-value is lower than the significance level alpha, then the
+result is said to be statistically significant and the null hypothesis is
+rejected; in other words, the two benchmarks aren't identical.
+
+**WARNING**: requires a **LARGE** (no less than 9) number of repetitions to be
+meaningful!
diff --git a/thirdparty/benchmark-1.5.0/include/benchmark/benchmark.h b/thirdparty/benchmark-1.5.0/include/benchmark/benchmark.h
new file mode 100644
index 0000000000..6cb96f546d
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/include/benchmark/benchmark.h
@@ -0,0 +1,1583 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +// Support for registering benchmarks for functions. + +/* Example usage: +// Define a function that executes the code to be measured a +// specified number of times: +static void BM_StringCreation(benchmark::State& state) { + for (auto _ : state) + std::string empty_string; +} + +// Register the function as a benchmark +BENCHMARK(BM_StringCreation); + +// Define another benchmark +static void BM_StringCopy(benchmark::State& state) { + std::string x = "hello"; + for (auto _ : state) + std::string copy(x); +} +BENCHMARK(BM_StringCopy); + +// Augment the main() program to invoke benchmarks if specified +// via the --benchmarks command line flag. E.g., +// my_unittest --benchmark_filter=all +// my_unittest --benchmark_filter=BM_StringCreation +// my_unittest --benchmark_filter=String +// my_unittest --benchmark_filter='Copy|Creation' +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} + +// Sometimes a family of microbenchmarks can be implemented with +// just one routine that takes an extra argument to specify which +// one of the family of benchmarks to run. For example, the following +// code defines a family of microbenchmarks for measuring the speed +// of memcpy() calls of different lengths: + +static void BM_memcpy(benchmark::State& state) { + char* src = new char[state.range(0)]; char* dst = new char[state.range(0)]; + memset(src, 'x', state.range(0)); + for (auto _ : state) + memcpy(dst, src, state.range(0)); + state.SetBytesProcessed(state.iterations() * state.range(0)); + delete[] src; delete[] dst; +} +BENCHMARK(BM_memcpy)->Arg(8)->Arg(64)->Arg(512)->Arg(1<<10)->Arg(8<<10); + +// The preceding code is quite repetitive, and can be replaced with the +// following short-hand. The following invocation will pick a few +// appropriate arguments in the specified range and will generate a +// microbenchmark for each such argument. +BENCHMARK(BM_memcpy)->Range(8, 8<<10); + +// You might have a microbenchmark that depends on two inputs. For +// example, the following code defines a family of microbenchmarks for +// measuring the speed of set insertion. +static void BM_SetInsert(benchmark::State& state) { + set data; + for (auto _ : state) { + state.PauseTiming(); + data = ConstructRandomSet(state.range(0)); + state.ResumeTiming(); + for (int j = 0; j < state.range(1); ++j) + data.insert(RandomNumber()); + } +} +BENCHMARK(BM_SetInsert) + ->Args({1<<10, 128}) + ->Args({2<<10, 128}) + ->Args({4<<10, 128}) + ->Args({8<<10, 128}) + ->Args({1<<10, 512}) + ->Args({2<<10, 512}) + ->Args({4<<10, 512}) + ->Args({8<<10, 512}); + +// The preceding code is quite repetitive, and can be replaced with +// the following short-hand. The following macro will pick a few +// appropriate arguments in the product of the two specified ranges +// and will generate a microbenchmark for each such pair. +BENCHMARK(BM_SetInsert)->Ranges({{1<<10, 8<<10}, {128, 512}}); + +// For more complex patterns of inputs, passing a custom function +// to Apply allows programmatic specification of an +// arbitrary set of arguments to run the microbenchmark on. +// The following example enumerates a dense range on +// one parameter, and a sparse range on the second. 
+static void CustomArguments(benchmark::internal::Benchmark* b) {
+  for (int i = 0; i <= 10; ++i)
+    for (int j = 32; j <= 1024*1024; j *= 8)
+      b->Args({i, j});
+}
+BENCHMARK(BM_SetInsert)->Apply(CustomArguments);
+
+// Templated microbenchmarks work the same way:
+// Produce then consume 'size' messages 'iters' times
+// Measures throughput in the absence of multiprogramming.
+template <class Q> int BM_Sequential(benchmark::State& state) {
+  Q q;
+  typename Q::value_type v;
+  for (auto _ : state) {
+    for (int i = state.range(0); i--; )
+      q.push(v);
+    for (int e = state.range(0); e--; )
+      q.Wait(&v);
+  }
+  // actually messages, not bytes:
+  state.SetBytesProcessed(state.iterations() * state.range(0));
+}
+BENCHMARK_TEMPLATE(BM_Sequential, WaitQueue<int>)->Range(1<<0, 1<<10);
+
+Use `Benchmark::MinTime(double t)` to set the minimum time used to run the
+benchmark. This option overrides the `benchmark_min_time` flag.
+
+void BM_test(benchmark::State& state) {
+ ... body ...
+}
+BENCHMARK(BM_test)->MinTime(2.0); // Run for at least 2 seconds.
+
+In a multithreaded test, it is guaranteed that none of the threads will start
+until all have reached the loop start, and all will have finished before any
+thread exits the loop body. As such, any global setup or teardown you want to
+do can be wrapped in a check against the thread index:
+
+static void BM_MultiThreaded(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // Setup code here.
+  }
+  for (auto _ : state) {
+    // Run the test as normal.
+  }
+  if (state.thread_index == 0) {
+    // Teardown code here.
+  }
+}
+BENCHMARK(BM_MultiThreaded)->Threads(4);
+
+
+If a benchmark runs for a few milliseconds, it may be hard to visually compare
+the measured times, since the output data is given in nanoseconds by default.
+In order to change the time unit, you can specify it explicitly:
+
+BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
+*/
+
+#ifndef BENCHMARK_BENCHMARK_H_
+#define BENCHMARK_BENCHMARK_H_
+
+// The _MSVC_LANG check should detect Visual Studio 2015 Update 3 and newer.
+#if __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
+#define BENCHMARK_HAS_CXX11
+#endif
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iosfwd>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#if defined(BENCHMARK_HAS_CXX11)
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+#endif
+
+#if defined(_MSC_VER)
+#include <intrin.h>  // for _ReadWriteBarrier
+#endif
+
+#ifndef BENCHMARK_HAS_CXX11
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                         \
+  TypeName& operator=(const TypeName&)
+#else
+#define BENCHMARK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&) = delete;                \
+  TypeName& operator=(const TypeName&) = delete
+#endif
+
+#if defined(__GNUC__)
+#define BENCHMARK_UNUSED __attribute__((unused))
+#define BENCHMARK_ALWAYS_INLINE __attribute__((always_inline))
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#elif defined(_MSC_VER) && !defined(__clang__)
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE __forceinline
+#if _MSC_VER >= 1900
+#define BENCHMARK_NOEXCEPT noexcept
+#define BENCHMARK_NOEXCEPT_OP(x) noexcept(x)
+#else
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+#define __func__ __FUNCTION__
+#else
+#define BENCHMARK_UNUSED
+#define BENCHMARK_ALWAYS_INLINE
+#define BENCHMARK_NOEXCEPT
+#define BENCHMARK_NOEXCEPT_OP(x)
+#endif
+
+#define BENCHMARK_INTERNAL_TOSTRING2(x) #x
+#define BENCHMARK_INTERNAL_TOSTRING(x) BENCHMARK_INTERNAL_TOSTRING2(x)
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BENCHMARK_BUILTIN_EXPECT(x, y) __builtin_expect(x, y)
+#define BENCHMARK_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#else
+#define BENCHMARK_BUILTIN_EXPECT(x, y) x
+#define BENCHMARK_DEPRECATED_MSG(msg)
+#define BENCHMARK_WARNING_MSG(msg)                           \
+  __pragma(message(__FILE__ "(" BENCHMARK_INTERNAL_TOSTRING( \
+      __LINE__) ") : warning note: " msg))
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(__GNUC__) || __has_builtin(__builtin_unreachable)
+#define BENCHMARK_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define BENCHMARK_UNREACHABLE() __assume(false)
+#else
+#define BENCHMARK_UNREACHABLE() ((void)0)
+#endif
+
+namespace benchmark {
+class BenchmarkReporter;
+class MemoryManager;
+
+void Initialize(int* argc, char** argv);
+
+// Report to stdout all arguments in 'argv' as unrecognized except the first.
+// Returns true if there is at least one unrecognized argument (i.e. 'argc' > 1).
+bool ReportUnrecognizedArguments(int argc, char** argv);
+
+// Generate a list of benchmarks matching the specified --benchmark_filter flag
+// and if --benchmark_list_tests is specified return after printing the name
+// of each matching benchmark. Otherwise run each matching benchmark and
+// report the results.
+//
+// The second and third overload use the specified 'display_reporter' and
+// 'file_reporter' respectively. 'file_reporter' will write to the file
+// specified by '--benchmark_out'. If '--benchmark_out' is not given the
+// 'file_reporter' is ignored.
+//
+// RETURNS: The number of matching benchmarks.
+size_t RunSpecifiedBenchmarks();
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
+                              BenchmarkReporter* file_reporter);
+
+// Register a MemoryManager instance that will be used to collect and report
+// allocation measurements for benchmark runs.
+void RegisterMemoryManager(MemoryManager* memory_manager);
+
+namespace internal {
+class Benchmark;
+class BenchmarkImp;
+class BenchmarkFamilies;
+
+void UseCharPointer(char const volatile*);
+
+// Take ownership of the pointer and register the benchmark. Return the
+// registered benchmark.
+Benchmark* RegisterBenchmarkInternal(Benchmark*);
+
+// Ensure that the standard streams are properly initialized in every TU.
+int InitializeStreams();
+BENCHMARK_UNUSED static int stream_init_anchor = InitializeStreams();
+
+}  // namespace internal
+
+#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__pnacl__) || \
+    defined(__EMSCRIPTEN__)
+#define BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+#endif
+
+// The DoNotOptimize(...) function can be used to prevent a value or
+// expression from being optimized away by the compiler. This function is
+// intended to add little to no overhead.
+// See: https://youtu.be/nXaxk27zwlk?t=2441
+#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  asm volatile("" : : "r,m"(value) : "memory");
+}
+
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
+#if defined(__clang__)
+  asm volatile("" : "+r,m"(value) : : "memory");
+#else
+  asm volatile("" : "+m,r"(value) : : "memory");
+#endif
+}
+
+// Force the compiler to flush pending writes to global memory. Acts as an
+// effective read/write barrier
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
+  asm volatile("" : : : "memory");
+}
+#elif defined(_MSC_VER)
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+  _ReadWriteBarrier();
+}
+
+inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
+#else
+template <class Tp>
+inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
+  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
+}
+// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
+#endif
+
+// This class is used for user-defined counters.
+class Counter {
+ public:
+  enum Flags {
+    kDefaults = 0,
+    // Mark the counter as a rate. It will be presented divided
+    // by the duration of the benchmark.
+    kIsRate = 1U << 0U,
+    // Mark the counter as a thread-average quantity. It will be
+    // presented divided by the number of threads.
+    kAvgThreads = 1U << 1U,
+    // Mark the counter as a thread-average rate. See above.
+    kAvgThreadsRate = kIsRate | kAvgThreads,
+    // Mark the counter as a constant value, valid/same for *every* iteration.
+    // When reporting, it will be *multiplied* by the iteration count.
+    kIsIterationInvariant = 1U << 2U,
+    // Mark the counter as a constant rate.
+    // When reporting, it will be *multiplied* by the iteration count
+    // and then divided by the duration of the benchmark.
+    kIsIterationInvariantRate = kIsRate | kIsIterationInvariant,
+    // Mark the counter as an iteration-average quantity.
+    // It will be presented divided by the number of iterations.
+    kAvgIterations = 1U << 3U,
+    // Mark the counter as an iteration-average rate. See above.
+    kAvgIterationsRate = kIsRate | kAvgIterations
+  };
+
+  enum OneK {
+    // 1'000 items per 1k
+    kIs1000 = 1000,
+    // 1'024 items per 1k
+    kIs1024 = 1024
+  };
+
+  double value;
+  Flags flags;
+  OneK oneK;
+
+  BENCHMARK_ALWAYS_INLINE
+  Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
+      : value(v), flags(f), oneK(k) {}
+
+  BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
+  BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
+};
+
+// A helper for user code to create unforeseen combinations of Flags, without
+// having to do this cast manually each time, or providing this operator.
+Counter::Flags inline operator|(const Counter::Flags& LHS,
+                                const Counter::Flags& RHS) {
+  return static_cast<Counter::Flags>(static_cast<int>(LHS) |
+                                     static_cast<int>(RHS));
+}
+
+// This is the container for the user-defined counters.
+typedef std::map<std::string, Counter> UserCounters;
+
+// TimeUnit is passed to a benchmark in order to specify the order of magnitude
+// for the measured time.
+enum TimeUnit { kNanosecond, kMicrosecond, kMillisecond };
+
+// BigO is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark. In case oAuto is selected,
+// complexity will be calculated automatically to the best fit.
+enum BigO { oNone, o1, oN, oNSquared, oNCubed, oLogN, oNLogN, oAuto, oLambda };
+
+typedef uint64_t IterationCount;
+
+// BigOFunc is passed to a benchmark in order to specify the asymptotic
+// computational complexity for the benchmark.
+typedef double(BigOFunc)(IterationCount);
+
+// StatisticsFunc is passed to a benchmark in order to compute some descriptive
+// statistics over all the measurements of some type
+typedef double(StatisticsFunc)(const std::vector<double>&);
+
+namespace internal {
+struct Statistics {
+  std::string name_;
+  StatisticsFunc* compute_;
+
+  Statistics(const std::string& name, StatisticsFunc* compute)
+      : name_(name), compute_(compute) {}
+};
+
+struct BenchmarkInstance;
+class ThreadTimer;
+class ThreadManager;
+
+enum AggregationReportMode
+#if defined(BENCHMARK_HAS_CXX11)
+    : unsigned
+#else
+#endif
+{
+  // The mode has not been manually specified
+  ARM_Unspecified = 0,
+  // The mode is user-specified.
+  // This may or may not be set when the following bit-flags are set.
+  ARM_Default = 1U << 0U,
+  // File reporter should only output aggregates.
+  ARM_FileReportAggregatesOnly = 1U << 1U,
+  // Display reporter should only output aggregates
+  ARM_DisplayReportAggregatesOnly = 1U << 2U,
+  // Both reporters should only display aggregates.
+  ARM_ReportAggregatesOnly =
+      ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
+};
+
+}  // namespace internal
+
+// State is passed to a running Benchmark and contains state for the
+// benchmark to use.
+class State {
+ public:
+  struct StateIterator;
+  friend struct StateIterator;
+
+  // Returns iterators used to run each iteration of a benchmark using a
+  // C++11 range-based for loop. These functions should not be called directly.
+  //
+  // REQUIRES: The benchmark has not started running yet. Neither begin nor end
+  // have been called previously.
+  //
+  // NOTE: KeepRunning may not be used after calling either of these functions.
+  BENCHMARK_ALWAYS_INLINE StateIterator begin();
+  BENCHMARK_ALWAYS_INLINE StateIterator end();
+
+  // Returns true if the benchmark should continue through another iteration.
+  // NOTE: A benchmark may not return from the test until KeepRunning() has
+  // returned false.
+  bool KeepRunning();
+
+  // Returns true iff the benchmark should run n more iterations.
+  // REQUIRES: 'n' > 0.
+  // NOTE: A benchmark must not return from the test until KeepRunningBatch()
+  // has returned false.
+  // NOTE: KeepRunningBatch() may overshoot by up to 'n' iterations.
+  //
+  // Intended usage:
+  //   while (state.KeepRunningBatch(1000)) {
+  //     // process 1000 elements
+  //   }
+  bool KeepRunningBatch(IterationCount n);
+
+  // REQUIRES: timer is running and 'SkipWithError(...)' has not been called
+  // by the current thread.
+  // Stop the benchmark timer. If not called, the timer will be
+  // automatically stopped after the last iteration of the benchmark loop.
+  //
+  // For threaded benchmarks the PauseTiming() function only pauses the timing
+  // for the current thread.
+  //
+  // NOTE: The "real time" measurement is per-thread. If different threads
+  // report different measurements the largest one is reported.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void PauseTiming();
+
+  // REQUIRES: timer is not running and 'SkipWithError(...)' has not been
+  // called by the current thread.
+  // Start the benchmark timer. The timer is NOT running on entrance to the
+  // benchmark function. It begins running after control flow enters the
+  // benchmark loop.
+  //
+  // NOTE: PauseTiming()/ResumeTiming() are relatively
+  // heavyweight, and so their use should generally be avoided
+  // within each benchmark iteration, if possible.
+  void ResumeTiming();
+
+  // REQUIRES: 'SkipWithError(...)' has not been called previously by the
+  // current thread.
+  // Report the benchmark as resulting in an error with the specified 'msg'.
+  // After this call the user may explicitly 'return' from the benchmark.
+  //
+  // If the ranged-for style of benchmark loop is used, the user must explicitly
+  // break from the loop, otherwise all future iterations will be run.
+  // If the 'KeepRunning()' loop is used the current thread will automatically
+  // exit the loop at the end of the current iteration.
+  //
+  // For threaded benchmarks only the current thread stops executing and future
+  // calls to `KeepRunning()` will block until all threads have completed
+  // the `KeepRunning()` loop. If multiple threads report an error only the
+  // first error message is used.
+  //
+  // NOTE: Calling 'SkipWithError(...)' does not cause the benchmark to exit
+  // the current scope immediately. If the function is called from within
+  // the 'KeepRunning()' loop the current iteration will finish. It is the
+  // user's responsibility to exit the scope as needed.
+  void SkipWithError(const char* msg);
+
+  // REQUIRES: called exactly once per iteration of the benchmarking loop.
+  // Set the manually measured time for this benchmark iteration, which
+  // is used instead of automatically measured time if UseManualTime() was
+  // specified.
+  //
+  // For threaded benchmarks the final value will be set to the largest
+  // reported value.
+  void SetIterationTime(double seconds);
+
+  // Set the number of bytes processed by the current benchmark
+  // execution. This routine is typically called once at the end of a
+  // throughput oriented benchmark.
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetBytesProcessed(int64_t bytes) {
+    counters["bytes_per_second"] =
+        Counter(static_cast<double>(bytes), Counter::kIsRate, Counter::kIs1024);
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t bytes_processed() const {
+    if (counters.find("bytes_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("bytes_per_second"));
+    return 0;
+  }
+
+  // If this routine is called with complexity_n > 0 and a complexity report is
+  // requested for the benchmark family, then the current benchmark will be
+  // part of the computation and complexity_n will represent the length of N.
+  BENCHMARK_ALWAYS_INLINE
+  void SetComplexityN(int64_t complexity_n) { complexity_n_ = complexity_n; }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t complexity_length_n() { return complexity_n_; }
+
+  // If this routine is called with items > 0, then an items/s
+  // label is printed on the benchmark report line for the currently
+  // executing benchmark. It is typically called at the end of a processing
+  // benchmark where a processing items/second output is desired.
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  BENCHMARK_ALWAYS_INLINE
+  void SetItemsProcessed(int64_t items) {
+    counters["items_per_second"] =
+        Counter(static_cast<double>(items), benchmark::Counter::kIsRate);
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  int64_t items_processed() const {
+    if (counters.find("items_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("items_per_second"));
+    return 0;
+  }
+
+  // If this routine is called, the specified label is printed at the
+  // end of the benchmark report line for the currently executing
+  // benchmark. Example:
+  //  static void BM_Compress(benchmark::State& state) {
+  //    ...
+  //    double compression = input_size / output_size;
+  //    state.SetLabel(StrFormat("compress:%.1f%%", 100.0*compression));
+  //  }
+  // Produces output that looks like:
+  //  BM_Compress   50   50   14115038   compress:27.3%
+  //
+  // REQUIRES: a benchmark has exited its benchmarking loop.
+  void SetLabel(const char* label);
+
+  void BENCHMARK_ALWAYS_INLINE SetLabel(const std::string& str) {
+    this->SetLabel(str.c_str());
+  }
+
+  // Range arguments for this run. CHECKs if the argument has been set.
+  BENCHMARK_ALWAYS_INLINE
+  int64_t range(std::size_t pos = 0) const {
+    assert(range_.size() > pos);
+    return range_[pos];
+  }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(0)' instead")
+  int64_t range_x() const { return range(0); }
+
+  BENCHMARK_DEPRECATED_MSG("use 'range(1)' instead")
+  int64_t range_y() const { return range(1); }
+
+  BENCHMARK_ALWAYS_INLINE
+  IterationCount iterations() const {
+    if (BENCHMARK_BUILTIN_EXPECT(!started_, false)) {
+      return 0;
+    }
+    return max_iterations - total_iterations_ + batch_leftover_;
+  }
+
+ private:  // items we expect on the first cache line (ie 64 bytes of the struct)
+  // When total_iterations_ is 0, KeepRunning() and friends will return false.
+  // May be larger than max_iterations.
+  IterationCount total_iterations_;
+
+  // When using KeepRunningBatch(), batch_leftover_ holds the number of
+  // iterations beyond max_iters that were run. Used to track
+  // completed_iterations_ accurately.
+  IterationCount batch_leftover_;
+
+ public:
+  const IterationCount max_iterations;
+
+ private:
+  bool started_;
+  bool finished_;
+  bool error_occurred_;
+
+ private:  // items we don't need on the first cache line
+  std::vector<int64_t> range_;
+
+  int64_t complexity_n_;
+
+ public:
+  // Container for user-defined counters.
+  UserCounters counters;
+  // Index of the executing thread. Values from [0, threads).
+  const int thread_index;
+  // Number of threads concurrently executing the benchmark.
+  const int threads;
+
+ private:
+  State(IterationCount max_iters, const std::vector<int64_t>& ranges,
+        int thread_i, int n_threads, internal::ThreadTimer* timer,
+        internal::ThreadManager* manager);
+
+  void StartKeepRunning();
+  // Implementation of KeepRunning() and KeepRunningBatch().
+  // is_batch must be true unless n is 1.
+  bool KeepRunningInternal(IterationCount n, bool is_batch);
+  void FinishKeepRunning();
+  internal::ThreadTimer* timer_;
+  internal::ThreadManager* manager_;
+
+  friend struct internal::BenchmarkInstance;
+};
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
+  return KeepRunningInternal(1, /*is_batch=*/false);
+}
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningBatch(IterationCount n) {
+  return KeepRunningInternal(n, /*is_batch=*/true);
+}
+
+inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunningInternal(IterationCount n,
+                                                               bool is_batch) {
+  // total_iterations_ is set to 0 by the constructor, and always set to a
+  // nonzero value by StartKeepRunning().
+  assert(n > 0);
+  // n must be 1 unless is_batch is true.
+  assert(is_batch || n == 1);
+  if (BENCHMARK_BUILTIN_EXPECT(total_iterations_ >= n, true)) {
+    total_iterations_ -= n;
+    return true;
+  }
+  if (!started_) {
+    StartKeepRunning();
+    if (!error_occurred_ && total_iterations_ >= n) {
+      total_iterations_ -= n;
+      return true;
+    }
+  }
+  // For non-batch runs, total_iterations_ must be 0 by now.
+  if (is_batch && total_iterations_ != 0) {
+    batch_leftover_ = n - total_iterations_;
+    total_iterations_ = 0;
+    return true;
+  }
+  FinishKeepRunning();
+  return false;
+}
+
+struct State::StateIterator {
+  struct BENCHMARK_UNUSED Value {};
+  typedef std::forward_iterator_tag iterator_category;
+  typedef Value value_type;
+  typedef Value reference;
+  typedef Value pointer;
+  typedef std::ptrdiff_t difference_type;
+
+ private:
+  friend class State;
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator() : cached_(0), parent_() {}
+
+  BENCHMARK_ALWAYS_INLINE
+  explicit StateIterator(State* st)
+      : cached_(st->error_occurred_ ? 0 : st->max_iterations), parent_(st) {}
+
+ public:
+  BENCHMARK_ALWAYS_INLINE
+  Value operator*() const { return Value(); }
+
+  BENCHMARK_ALWAYS_INLINE
+  StateIterator& operator++() {
+    assert(cached_ > 0);
+    --cached_;
+    return *this;
+  }
+
+  BENCHMARK_ALWAYS_INLINE
+  bool operator!=(StateIterator const&) const {
+    if (BENCHMARK_BUILTIN_EXPECT(cached_ != 0, true)) return true;
+    parent_->FinishKeepRunning();
+    return false;
+  }
+
+ private:
+  IterationCount cached_;
+  State* const parent_;
+};
+
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::begin() {
+  return StateIterator(this);
+}
+inline BENCHMARK_ALWAYS_INLINE State::StateIterator State::end() {
+  StartKeepRunning();
+  return StateIterator();
+}
+
+namespace internal {
+
+typedef void(Function)(State&);
+
+// ------------------------------------------------------
+// Benchmark registration object. The BENCHMARK() macro expands
+// into an internal::Benchmark* object. Various methods can
+// be called on this object to change the properties of the benchmark.
+// Each method returns "this" so that multiple method calls can be
+// chained into one expression.
+class Benchmark {
+ public:
+  virtual ~Benchmark();
+
+  // Note: the following methods all return "this" so that multiple
+  // method calls can be chained together in one expression.
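+  //
+  // A hypothetical chained registration, shown here for illustration only
+  // (not part of the upstream header; BM_memcpy stands in for any benchmark
+  // function, and each method below is documented in turn):
+  //
+  //   BENCHMARK(BM_memcpy)->Range(8, 8 << 10)->Threads(2)->UseRealTime();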
+
+  // Run this benchmark once with "x" as the extra argument passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Arg(int64_t x);
+
+  // Run this benchmark with the given time unit for the generated output report
+  Benchmark* Unit(TimeUnit unit);
+
+  // Run this benchmark once for a number of values picked from the
+  // range [start..limit]. (start and limit are always picked.)
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* Range(int64_t start, int64_t limit);
+
+  // Run this benchmark once for all values in the range [start..limit] with
+  // a specific step.
+  // REQUIRES: The function passed to the constructor must accept an arg1.
+  Benchmark* DenseRange(int64_t start, int64_t limit, int step = 1);
+
+  // Run this benchmark once with "args" as the extra arguments passed
+  // to the function.
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Args(const std::vector<int64_t>& args);
+
+  // Equivalent to Args({x, y})
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Args'.
+  Benchmark* ArgPair(int64_t x, int64_t y) {
+    std::vector<int64_t> args;
+    args.push_back(x);
+    args.push_back(y);
+    return Args(args);
+  }
+
+  // Run this benchmark once for a number of values picked from the
+  // ranges [start..limit]. (starts and limits are always picked.)
+  // REQUIRES: The function passed to the constructor must accept arg1, arg2 ...
+  Benchmark* Ranges(const std::vector<std::pair<int64_t, int64_t> >& ranges);
+
+  // Equivalent to ArgNames({name})
+  Benchmark* ArgName(const std::string& name);
+
+  // Set the argument names to display in the benchmark name. If not called,
+  // only argument values will be shown.
+  Benchmark* ArgNames(const std::vector<std::string>& names);
+
+  // Equivalent to Ranges({{lo1, hi1}, {lo2, hi2}}).
+  // NOTE: This is a legacy C++03 interface provided for compatibility only.
+  //   New code should use 'Ranges'.
+  Benchmark* RangePair(int64_t lo1, int64_t hi1, int64_t lo2, int64_t hi2) {
+    std::vector<std::pair<int64_t, int64_t> > ranges;
+    ranges.push_back(std::make_pair(lo1, hi1));
+    ranges.push_back(std::make_pair(lo2, hi2));
+    return Ranges(ranges);
+  }
+
+  // Pass this benchmark object to *func, which can customize
+  // the benchmark by calling various methods like Arg, Args,
+  // Threads, etc.
+  Benchmark* Apply(void (*func)(Benchmark* benchmark));
+
+  // Set the range multiplier for non-dense range. If not called, the range
+  // multiplier kRangeMultiplier will be used.
+  Benchmark* RangeMultiplier(int multiplier);
+
+  // Set the minimum amount of time to use when running this benchmark. This
+  // option overrides the `benchmark_min_time` flag.
+  // REQUIRES: `t > 0` and `Iterations` has not been called on this benchmark.
+  Benchmark* MinTime(double t);
+
+  // Specify the number of iterations that should be run by this benchmark.
+  // REQUIRES: 'n > 0' and `MinTime` has not been called on this benchmark.
+  //
+  // NOTE: This function should only be used when *exact* iteration control is
+  //   needed and never to control or limit how long a benchmark runs, where
+  //   `--benchmark_min_time=N` or `MinTime(...)` should be used instead.
+  Benchmark* Iterations(IterationCount n);
+
+  // Specify the number of times to repeat this benchmark. This option
+  // overrides the `benchmark_repetitions` flag.
+  // REQUIRES: `n > 0`
+  Benchmark* Repetitions(int n);
+
+  // Specify if each repetition of the benchmark should be reported separately
+  // or if only the final statistics should be reported. If the benchmark
+  // is not repeated then the single result is always reported.
+  // Applies to *ALL* reporters (display and file).
+  Benchmark* ReportAggregatesOnly(bool value = true);
+
+  // Same as ReportAggregatesOnly(), but applies to display reporter only.
+  Benchmark* DisplayAggregatesOnly(bool value = true);
+
+  // By default, the CPU time is measured only for the main thread, which may
+  // be unrepresentative if the benchmark uses threads internally. If called,
+  // the total CPU time spent by all the threads will be measured instead.
+  Benchmark* MeasureProcessCPUTime();
+
+  // If a particular benchmark should use the Wall clock instead of the CPU time
+  // (be it either the CPU time of the main thread only (default), or the
+  // total CPU usage of the benchmark), call this method. If called, the elapsed
+  // (wall) time will be used to control how many iterations are run, and in the
+  // printing of items/second or MB/second values.
+  // If not called, the CPU time used by the benchmark will be used.
+  Benchmark* UseRealTime();
+
+  // If a benchmark must measure time manually (e.g. if GPU execution time is
+  // being measured), call this method. If called, each benchmark iteration
+  // should call SetIterationTime(seconds) to report the measured time, which
+  // will be used to control how many iterations are run, and in the printing
+  // of items/second or MB/second values.
+  Benchmark* UseManualTime();
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigO complexity = benchmark::oAuto);
+
+  // Set the asymptotic computational complexity for the benchmark. If called
+  // the asymptotic computational complexity will be shown on the output.
+  Benchmark* Complexity(BigOFunc* complexity);
+
+  // Add a statistic to be computed over all the measurements of the benchmark
+  // run.
+  Benchmark* ComputeStatistics(std::string name, StatisticsFunc* statistics);
+
+  // Support for running multiple copies of the same benchmark concurrently
+  // in multiple threads. This may be useful when measuring the scaling
+  // of some piece of code.
+
+  // Run one instance of this benchmark concurrently in t threads.
+  Benchmark* Threads(int t);
+
+  // Pick a set of values T from [min_threads,max_threads].
+  // min_threads and max_threads are always included in T. Run this
+  // benchmark once for each value in T. The benchmark run for a
+  // particular value t consists of t threads running the benchmark
+  // function concurrently. For example, consider:
+  //    BENCHMARK(Foo)->ThreadRange(1,16);
+  // This will run the following benchmarks:
+  //    Foo in 1 thread
+  //    Foo in 2 threads
+  //    Foo in 4 threads
+  //    Foo in 8 threads
+  //    Foo in 16 threads
+  Benchmark* ThreadRange(int min_threads, int max_threads);
+
+  // For each value n in the range, run this benchmark once using n threads.
+  // min_threads and max_threads are always included in the range.
+  // stride specifies the increment. E.g. DenseThreadRange(1, 8, 3) starts
+  // a benchmark with 1, 4, 7 and 8 threads.
+  Benchmark* DenseThreadRange(int min_threads, int max_threads, int stride = 1);
+
+  // Equivalent to ThreadRange(NumCPUs(), NumCPUs())
+  Benchmark* ThreadPerCpu();
+
+  virtual void Run(State& state) = 0;
+
+ protected:
+  explicit Benchmark(const char* name);
+  Benchmark(Benchmark const&);
+  void SetName(const char* name);
+
+  int ArgsCnt() const;
+
+ private:
+  friend class BenchmarkFamilies;
+
+  std::string name_;
+  AggregationReportMode aggregation_report_mode_;
+  std::vector<std::string> arg_names_;       // Args for all benchmark runs
+  std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
+  TimeUnit time_unit_;
+  int range_multiplier_;
+  double min_time_;
+  IterationCount iterations_;
+  int repetitions_;
+  bool measure_process_cpu_time_;
+  bool use_real_time_;
+  bool use_manual_time_;
+  BigO complexity_;
+  BigOFunc* complexity_lambda_;
+  std::vector<Statistics> statistics_;
+  std::vector<int> thread_counts_;
+
+  Benchmark& operator=(Benchmark const&);
+};
+
+}  // namespace internal
+
+// Create and register a benchmark with the specified 'name' that invokes
+// the specified functor 'fn'.
+//
+// RETURNS: A pointer to the registered benchmark.
+internal::Benchmark* RegisterBenchmark(const char* name,
+                                       internal::Function* fn);
+
+#if defined(BENCHMARK_HAS_CXX11)
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn);
+#endif
+
+// Remove all registered benchmarks. All pointers to previously registered
+// benchmarks are invalidated.
+void ClearRegisteredBenchmarks();
+
+namespace internal {
+// The class used to hold all Benchmarks created from static functions
+// (ie those created using the BENCHMARK(...) macros).
+class FunctionBenchmark : public Benchmark {
+ public:
+  FunctionBenchmark(const char* name, Function* func)
+      : Benchmark(name), func_(func) {}
+
+  virtual void Run(State& st);
+
+ private:
+  Function* func_;
+};
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+class LambdaBenchmark : public Benchmark {
+ public:
+  virtual void Run(State& st) { lambda_(st); }
+
+ private:
+  template <class OLambda>
+  LambdaBenchmark(const char* name, OLambda&& lam)
+      : Benchmark(name), lambda_(std::forward<OLambda>(lam)) {}
+
+  LambdaBenchmark(LambdaBenchmark const&) = delete;
+
+ private:
+  template <class Lam>
+  friend Benchmark* ::benchmark::RegisterBenchmark(const char*, Lam&&);
+
+  Lambda lambda_;
+};
+#endif
+
+}  // namespace internal
+
+inline internal::Benchmark* RegisterBenchmark(const char* name,
+                                              internal::Function* fn) {
+  return internal::RegisterBenchmarkInternal(
+      ::new internal::FunctionBenchmark(name, fn));
+}
+
+#ifdef BENCHMARK_HAS_CXX11
+template <class Lambda>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn) {
+  using BenchType =
+      internal::LambdaBenchmark<typename std::decay<Lambda>::type>;
+  return internal::RegisterBenchmarkInternal(
+      ::new BenchType(name, std::forward<Lambda>(fn)));
+}
+#endif
+
+#if defined(BENCHMARK_HAS_CXX11) && \
+    (!defined(BENCHMARK_GCC_VERSION) || BENCHMARK_GCC_VERSION >= 409)
+template <class Lambda, class... Args>
+internal::Benchmark* RegisterBenchmark(const char* name, Lambda&& fn,
+                                       Args&&... args) {
+  return benchmark::RegisterBenchmark(
+      name, [=](benchmark::State& st) { fn(st, args...); });
+}
+#else
+#define BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK
+#endif
+
+// The base class for all fixture tests.
+class Fixture : public internal::Benchmark {
+ public:
+  Fixture() : internal::Benchmark("") {}
+
+  virtual void Run(State& st) {
+    this->SetUp(st);
+    this->BenchmarkCase(st);
+    this->TearDown(st);
+  }
+
+  // These will be deprecated ...
+  virtual void SetUp(const State&) {}
+  virtual void TearDown(const State&) {}
+  // ... In favor of these.
+  virtual void SetUp(State& st) { SetUp(const_cast<const State&>(st)); }
+  virtual void TearDown(State& st) { TearDown(const_cast<const State&>(st)); }
+
+ protected:
+  virtual void BenchmarkCase(State&) = 0;
+};
+
+}  // namespace benchmark
+
+// ------------------------------------------------------
+// Macro to register benchmarks
+
+// Check that __COUNTER__ is defined and that __COUNTER__ increases by 1
+// every time it is expanded. X + 1 == X + 0 is used in case X is defined to be
+// empty. If X is empty the expression becomes (+1 == +0).
+#if defined(__COUNTER__) && (__COUNTER__ + 1 == __COUNTER__ + 0)
+#define BENCHMARK_PRIVATE_UNIQUE_ID __COUNTER__
+#else
+#define BENCHMARK_PRIVATE_UNIQUE_ID __LINE__
+#endif
+
+// Helpers for generating unique variable names
+#define BENCHMARK_PRIVATE_NAME(n) \
+  BENCHMARK_PRIVATE_CONCAT(_benchmark_, BENCHMARK_PRIVATE_UNIQUE_ID, n)
+#define BENCHMARK_PRIVATE_CONCAT(a, b, c) BENCHMARK_PRIVATE_CONCAT2(a, b, c)
+#define BENCHMARK_PRIVATE_CONCAT2(a, b, c) a##b##c
+
+#define BENCHMARK_PRIVATE_DECLARE(n)                                 \
+  static ::benchmark::internal::Benchmark* BENCHMARK_PRIVATE_NAME(n) \
+      BENCHMARK_UNUSED
+
+#define BENCHMARK(n)                                     \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n, n)))
+
+// Old-style macros
+#define BENCHMARK_WITH_ARG(n, a) BENCHMARK(n)->Arg((a))
+#define BENCHMARK_WITH_ARG2(n, a1, a2) BENCHMARK(n)->Args({(a1), (a2)})
+#define BENCHMARK_WITH_UNIT(n, t) BENCHMARK(n)->Unit((t))
+#define BENCHMARK_RANGE(n, lo, hi) BENCHMARK(n)->Range((lo), (hi))
+#define BENCHMARK_RANGE2(n, l1, h1, l2, h2) \
+  BENCHMARK(n)->RangePair({{(l1), (h1)}, {(l2), (h2)}})
+
+#ifdef BENCHMARK_HAS_CXX11
+
+// Register a benchmark which invokes the function specified by `func`
+// with the additional arguments specified by `...`.
+//
+// For example:
+//
+//   template <class ...ExtraArgs>
+//   void BM_takes_args(benchmark::State& state, ExtraArgs&&... extra_args) {
+//     [...]
+//   }
+//   /* Registers a benchmark named "BM_takes_args/int_string_test" */
+//   BENCHMARK_CAPTURE(BM_takes_args, int_string_test, 42, std::string("abc"));
+#define BENCHMARK_CAPTURE(func, test_case_name, ...)     \
+  BENCHMARK_PRIVATE_DECLARE(func) =                      \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #func "/" #test_case_name,                 \
+              [](::benchmark::State& st) { func(st, __VA_ARGS__); })))
+
+#endif  // BENCHMARK_HAS_CXX11
+
+// This will register a benchmark for a templatized function. For example:
+//
+// template <int arg>
+// void BM_Foo(int iters);
+//
+// BENCHMARK_TEMPLATE(BM_Foo, 1);
+//
+// will register BM_Foo<1> as a benchmark.
+#define BENCHMARK_TEMPLATE1(n, a)                        \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a ">", n<a>)))
+
+#define BENCHMARK_TEMPLATE2(n, a, b)                                         \
+  BENCHMARK_PRIVATE_DECLARE(n) =                                             \
+      (::benchmark::internal::RegisterBenchmarkInternal(                     \
+          new ::benchmark::internal::FunctionBenchmark(#n "<" #a "," #b ">", \
+                                                       n<a, b>)))
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE(n, ...)                       \
+  BENCHMARK_PRIVATE_DECLARE(n) =                         \
+      (::benchmark::internal::RegisterBenchmarkInternal( \
+          new ::benchmark::internal::FunctionBenchmark(  \
+              #n "<" #__VA_ARGS__ ">", n<__VA_ARGS__>)))
+#else
+#define BENCHMARK_TEMPLATE(n, a) BENCHMARK_TEMPLATE1(n, a)
+#endif
+
+#define BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method)        \
+  class BaseClass##_##Method##_Benchmark : public BaseClass { \
+   public:                                                    \
+    BaseClass##_##Method##_Benchmark() : BaseClass() {        \
+      this->SetName(#BaseClass "/" #Method);                  \
+    }                                                         \
+                                                              \
+   protected:                                                 \
+    virtual void BenchmarkCase(::benchmark::State&);          \
+  };
+
+#define BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a> {    \
+   public:                                                          \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a>() {           \
+      this->SetName(#BaseClass "<" #a ">/" #Method);                \
+    }                                                               \
+                                                                    \
+   protected:                                                       \
+    virtual void BenchmarkCase(::benchmark::State&);                \
+  };
+
+#define BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<a, b> {    \
+   public:                                                             \
+    BaseClass##_##Method##_Benchmark() : BaseClass<a, b>() {           \
+      this->SetName(#BaseClass "<" #a "," #b ">/" #Method);            \
+    }                                                                  \
+                                                                       \
+   protected:                                                          \
+    virtual void BenchmarkCase(::benchmark::State&);                   \
+  };
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, ...)       \
+  class BaseClass##_##Method##_Benchmark : public BaseClass<__VA_ARGS__> { \
+   public:                                                                 \
+    BaseClass##_##Method##_Benchmark() : BaseClass<__VA_ARGS__>() {        \
+      this->SetName(#BaseClass "<" #__VA_ARGS__ ">/" #Method);             \
+    }                                                                      \
+                                                                           \
+   protected:                                                              \
+    virtual void BenchmarkCase(::benchmark::State&);                       \
+  };
+#else
+#define BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(n, a) \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(n, a)
+#endif
+
+#define BENCHMARK_DEFINE_F(BaseClass, Method)    \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)    \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_DEFINE_F(BaseClass, Method, a, b)    \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, ...)            \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_DEFINE_F(BaseClass, Method, a) \
+  BENCHMARK_TEMPLATE1_DEFINE_F(BaseClass, Method, a)
+#endif
+
+#define BENCHMARK_REGISTER_F(BaseClass, Method) \
+  BENCHMARK_PRIVATE_REGISTER_F(BaseClass##_##Method##_Benchmark)
+
+#define BENCHMARK_PRIVATE_REGISTER_F(TestName) \
+  BENCHMARK_PRIVATE_DECLARE(TestName) =        \
+      (::benchmark::internal::RegisterBenchmarkInternal(new TestName()))
+
+// This macro will define and register a benchmark within a fixture class.
+#define BENCHMARK_F(BaseClass, Method)           \
+  BENCHMARK_PRIVATE_DECLARE_F(BaseClass, Method) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);       \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)           \
+  BENCHMARK_TEMPLATE1_PRIVATE_DECLARE_F(BaseClass, Method, a) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                    \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#define BENCHMARK_TEMPLATE2_F(BaseClass, Method, a, b)           \
+  BENCHMARK_TEMPLATE2_PRIVATE_DECLARE_F(BaseClass, Method, a, b) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                       \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+
+#ifdef BENCHMARK_HAS_CXX11
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, ...)                   \
+  BENCHMARK_TEMPLATE_PRIVATE_DECLARE_F(BaseClass, Method, __VA_ARGS__) \
+  BENCHMARK_REGISTER_F(BaseClass, Method);                             \
+  void BaseClass##_##Method##_Benchmark::BenchmarkCase
+#else
+#define BENCHMARK_TEMPLATE_F(BaseClass, Method, a) \
+  BENCHMARK_TEMPLATE1_F(BaseClass, Method, a)
+#endif
+
+// Helper macro to create a main routine in a test that runs the benchmarks
+#define BENCHMARK_MAIN()                                                 \
+  int main(int argc, char** argv) {                                      \
+    ::benchmark::Initialize(&argc, argv);                                \
+    if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;  \
+    ::benchmark::RunSpecifiedBenchmarks();                               \
+  }                                                                      \
+  int main(int, char**)
+
+// ------------------------------------------------------
+// Benchmark Reporters
+
+namespace benchmark {
+
+struct CPUInfo {
+  struct CacheInfo {
+    std::string type;
+    int level;
+    int size;
+    int num_sharing;
+  };
+
+  int num_cpus;
+  double cycles_per_second;
+  std::vector<CacheInfo> caches;
+  bool scaling_enabled;
+  std::vector<double> load_avg;
+
+  static const CPUInfo& Get();
+
+ private:
+  CPUInfo();
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(CPUInfo);
+};
+
+// Adding Struct for System Information
+struct SystemInfo {
+  std::string name;
+  static const SystemInfo& Get();
+
+ private:
+  SystemInfo();
+  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(SystemInfo);
+};
+
+// BenchmarkName contains the components of the Benchmark's name
+// which allows individual fields to be modified or cleared before
+// building the final name using 'str()'.
+struct BenchmarkName {
+  std::string function_name;
+  std::string args;
+  std::string min_time;
+  std::string iterations;
+  std::string repetitions;
+  std::string time_type;
+  std::string threads;
+
+  // Return the full name of the benchmark with each non-empty
+  // field separated by a '/'
+  std::string str() const;
+};
+
+// Interface for custom benchmark result printers.
+// By default, benchmark reports are printed to stdout. However an application
+// can control the destination of the reports by calling
+// RunSpecifiedBenchmarks and passing it a custom reporter object.
+// The reporter object must implement the following interface.
+class BenchmarkReporter {
+ public:
+  struct Context {
+    CPUInfo const& cpu_info;
+    SystemInfo const& sys_info;
+    // The number of chars in the longest benchmark name.
+    size_t name_field_width;
+    static const char* executable_name;
+    Context();
+  };
+
+  struct Run {
+    static const int64_t no_repetition_index = -1;
+    enum RunType { RT_Iteration, RT_Aggregate };
+
+    Run()
+        : run_type(RT_Iteration),
+          error_occurred(false),
+          iterations(1),
+          threads(1),
+          time_unit(kNanosecond),
+          real_accumulated_time(0),
+          cpu_accumulated_time(0),
+          max_heapbytes_used(0),
+          complexity(oNone),
+          complexity_lambda(),
+          complexity_n(0),
+          report_big_o(false),
+          report_rms(false),
+          counters(),
+          has_memory_result(false),
+          allocs_per_iter(0.0),
+          max_bytes_used(0) {}
+
+    std::string benchmark_name() const;
+    BenchmarkName run_name;
+    RunType run_type;
+    std::string aggregate_name;
+    std::string report_label;  // Empty if not set by benchmark.
+    bool error_occurred;
+    std::string error_message;
+
+    IterationCount iterations;
+    int64_t threads;
+    int64_t repetition_index;
+    int64_t repetitions;
+    TimeUnit time_unit;
+    double real_accumulated_time;
+    double cpu_accumulated_time;
+
+    // Return a value representing the real time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedRealTime() const;
+
+    // Return a value representing the cpu time per iteration in the unit
+    // specified by 'time_unit'.
+    // NOTE: If 'iterations' is zero the returned value represents the
+    // accumulated time.
+    double GetAdjustedCPUTime() const;
+
+    // This is set to 0.0 if memory tracing is not enabled.
+    double max_heapbytes_used;
+
+    // Keep track of arguments to compute asymptotic complexity
+    BigO complexity;
+    BigOFunc* complexity_lambda;
+    int64_t complexity_n;
+
+    // what statistics to compute from the measurements
+    const std::vector<internal::Statistics>* statistics;
+
+    // Inform print function whether the current run is a complexity report
+    bool report_big_o;
+    bool report_rms;
+
+    UserCounters counters;
+
+    // Memory metrics.
+    bool has_memory_result;
+    double allocs_per_iter;
+    int64_t max_bytes_used;
+  };
+
+  // Construct a BenchmarkReporter with the output stream set to 'std::cout'
+  // and the error stream set to 'std::cerr'
+  BenchmarkReporter();
+
+  // Called once for every suite of benchmarks run.
+  // The parameter "context" contains information that the
+  // reporter may wish to use when generating its report, for example the
+  // platform under which the benchmarks are running. The benchmark run is
+  // never started if this function returns false, allowing the reporter
+  // to skip runs based on the context information.
+  virtual bool ReportContext(const Context& context) = 0;
+
+  // Called once for each group of benchmark runs, gives information about
+  // cpu-time and heap memory usage during the benchmark run. If the group
+  // of runs contained more than two entries then 'report' contains additional
+  // elements representing the mean and standard deviation of those runs.
+  // Additionally if this group of runs was the last in a family of benchmarks
+  // 'reports' contains additional entries representing the asymptotic
+  // complexity and RMS of that benchmark family.
+  virtual void ReportRuns(const std::vector<Run>& report) = 0;
+
+  // Called once and only once after every group of benchmarks is run and
+  // reported.
+  virtual void Finalize() {}
+
+  // REQUIRES: The object referenced by 'out' is valid for the lifetime
+  // of the reporter.
+  void SetOutputStream(std::ostream* out) {
+    assert(out);
+    output_stream_ = out;
+  }
+
+  // REQUIRES: The object referenced by 'err' is valid for the lifetime
+  // of the reporter.
+  void SetErrorStream(std::ostream* err) {
+    assert(err);
+    error_stream_ = err;
+  }
+
+  std::ostream& GetOutputStream() const { return *output_stream_; }
+
+  std::ostream& GetErrorStream() const { return *error_stream_; }
+
+  virtual ~BenchmarkReporter();
+
+  // Write a human readable string to 'out' representing the specified
+  // 'context'.
+  // REQUIRES: 'out' is non-null.
+  static void PrintBasicContext(std::ostream* out, Context const& context);
+
+ private:
+  std::ostream* output_stream_;
+  std::ostream* error_stream_;
+};
+
+// Simple reporter that outputs benchmark data to the console. This is the
+// default reporter used by RunSpecifiedBenchmarks().
+class ConsoleReporter : public BenchmarkReporter {
+ public:
+  enum OutputOptions {
+    OO_None = 0,
+    OO_Color = 1,
+    OO_Tabular = 2,
+    OO_ColorTabular = OO_Color | OO_Tabular,
+    OO_Defaults = OO_ColorTabular
+  };
+  explicit ConsoleReporter(OutputOptions opts_ = OO_Defaults)
+      : output_options_(opts_),
+        name_field_width_(0),
+        prev_counters_(),
+        printed_header_(false) {}
+
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ protected:
+  virtual void PrintRunData(const Run& report);
+  virtual void PrintHeader(const Run& report);
+
+  OutputOptions output_options_;
+  size_t name_field_width_;
+  UserCounters prev_counters_;
+  bool printed_header_;
+};
+
+class JSONReporter : public BenchmarkReporter {
+ public:
+  JSONReporter() : first_report_(true) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+  virtual void Finalize();
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool first_report_;
+};
+
+class BENCHMARK_DEPRECATED_MSG(
+    "The CSV Reporter will be removed in a future release") CSVReporter
+    : public BenchmarkReporter {
+ public:
+  CSVReporter() : printed_header_(false) {}
+  virtual bool ReportContext(const Context& context);
+  virtual void ReportRuns(const std::vector<Run>& reports);
+
+ private:
+  void PrintRunData(const Run& report);
+
+  bool printed_header_;
+  std::set<std::string> user_counter_names_;
+};
+
+// If a MemoryManager is registered, it can be used to collect and report
+// allocation metrics for a run of the benchmark.
+class MemoryManager {
+ public:
+  struct Result {
+    Result() : num_allocs(0), max_bytes_used(0) {}
+
+    // The number of allocations made in total between Start and Stop.
+    int64_t num_allocs;
+
+    // The peak memory use between Start and Stop.
+    int64_t max_bytes_used;
+  };
+
+  virtual ~MemoryManager() {}
+
+  // Implement this to start recording allocation information.
+  virtual void Start() = 0;
+
+  // Implement this to stop recording and fill out the given Result structure.
+  virtual void Stop(Result* result) = 0;
+};
+
+inline const char* GetTimeUnitString(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return "ms";
+    case kMicrosecond:
+      return "us";
+    case kNanosecond:
+      return "ns";
+  }
+  BENCHMARK_UNREACHABLE();
+}
+
+inline double GetTimeUnitMultiplier(TimeUnit unit) {
+  switch (unit) {
+    case kMillisecond:
+      return 1e3;
+    case kMicrosecond:
+      return 1e6;
+    case kNanosecond:
+      return 1e9;
+  }
+  BENCHMARK_UNREACHABLE();
+}
+
+}  // namespace benchmark
+
+#endif  // BENCHMARK_BENCHMARK_H_
diff --git a/thirdparty/benchmark-1.5.0/mingw.py b/thirdparty/benchmark-1.5.0/mingw.py
new file mode 100644
index 0000000000..706ad559db
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/mingw.py
@@ -0,0 +1,320 @@
+#! /usr/bin/env python
+# encoding: utf-8
+
+import argparse
+import errno
+import logging
+import os
+import platform
+import re
+import sys
+import subprocess
+import tempfile
+
+try:
+  import winreg
+except ImportError:
+  import _winreg as winreg
+try:
+  import urllib.request as request
+except ImportError:
+  import urllib as request
+try:
+  import urllib.parse as parse
+except ImportError:
+  import urlparse as parse
+
+class EmptyLogger(object):
+  '''
+  Provides an implementation that performs no logging
+  '''
+  def debug(self, *k, **kw):
+    pass
+  def info(self, *k, **kw):
+    pass
+  def warn(self, *k, **kw):
+    pass
+  def error(self, *k, **kw):
+    pass
+  def critical(self, *k, **kw):
+    pass
+  def setLevel(self, *k, **kw):
+    pass
+
+urls = (
+  'http://downloads.sourceforge.net/project/mingw-w64/Toolchains%20'
+    'targetting%20Win32/Personal%20Builds/mingw-builds/installer/'
+    'repository.txt',
+  'http://downloads.sourceforge.net/project/mingwbuilds/host-windows/'
+    'repository.txt'
+)
+'''
+A list of mingw-build repositories
+'''
+
+def repository(urls = urls, log = EmptyLogger()):
+  '''
+  Downloads and parses mingw-build repository files
+  '''
+  log.info('getting mingw-builds repository')
+  versions = {}
+  re_sourceforge = re.compile(r'http://sourceforge.net/projects/([^/]+)/files')
+  re_sub = r'http://downloads.sourceforge.net/project/\1'
+  for url in urls:
+    log.debug(' - requesting: %s', url)
+    socket = request.urlopen(url)
+    repo = socket.read()
+    if not isinstance(repo, str):
+      repo = repo.decode()
+    socket.close()
+    for entry in repo.split('\n')[:-1]:
+      value = entry.split('|')
+      version = tuple([int(n) for n in value[0].strip().split('.')])
+      version = versions.setdefault(version, {})
+      arch = value[1].strip()
+      if arch == 'x32':
+        arch = 'i686'
+      elif arch == 'x64':
+        arch = 'x86_64'
+      arch = version.setdefault(arch, {})
+      threading = arch.setdefault(value[2].strip(), {})
+      exceptions = threading.setdefault(value[3].strip(), {})
+      revision = exceptions.setdefault(int(value[4].strip()[3:]),
+          re_sourceforge.sub(re_sub, value[5].strip()))
+  return versions
+
+def find_in_path(file, path=None):
+  '''
+  Attempts to find an executable in the path
+  '''
+  if platform.system() == 'Windows':
+    file += '.exe'
+  if path is None:
+    path = os.environ.get('PATH', '')
+  if type(path) is type(''):
+    path = path.split(os.pathsep)
+  return list(filter(os.path.exists,
+      map(lambda dir, file=file: os.path.join(dir, file), path)))
+
+def find_7zip(log = EmptyLogger()):
+  '''
+  Attempts to find 7zip for unpacking the mingw-build archives
+  '''
+  log.info('finding 7zip')
+  path = find_in_path('7z')
+  if not path:
+    key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r'SOFTWARE\7-Zip')
+    path, _ = winreg.QueryValueEx(key, 'Path')
+    path = [os.path.join(path, '7z.exe')]
+  log.debug('found \'%s\'', path[0])
+  return path[0]
+
+find_7zip()
+
+def unpack(archive, location, log = EmptyLogger()):
+  '''
+  Unpacks a mingw-builds archive
+  '''
+  sevenzip = find_7zip(log)
+  log.info('unpacking %s', os.path.basename(archive))
+  cmd = [sevenzip, 'x', archive, '-o' + location, '-y']
+  log.debug(' - %r', cmd)
+  with open(os.devnull, 'w') as devnull:
+    subprocess.check_call(cmd, stdout = devnull)
+
+def download(url, location, log = EmptyLogger()):
+  '''
+  Downloads and unpacks a mingw-builds archive
+  '''
+  log.info('downloading MinGW')
+  log.debug(' - url: %s', url)
+  log.debug(' - location: %s', location)
+
+  re_content = re.compile(r'attachment;[ \t]*filename=(")?([^"]*)(")?[\r\n]*')
+
+  stream = request.urlopen(url)
+  try:
+    content = stream.getheader('Content-Disposition') or ''
+  except AttributeError:
+    content = stream.headers.getheader('Content-Disposition') or ''
+  matches = re_content.match(content)
+  if matches:
+    filename = matches.group(2)
+  else:
+    parsed = parse.urlparse(stream.geturl())
+    filename = os.path.basename(parsed.path)
+
+  try:
+    os.makedirs(location)
+  except OSError as e:
+    if e.errno == errno.EEXIST and os.path.isdir(location):
+      pass
+    else:
+      raise
+
+  archive = os.path.join(location, filename)
+  with open(archive, 'wb') as out:
+    while True:
+      buf = stream.read(1024)
+      if not buf:
+        break
+      out.write(buf)
+  unpack(archive, location, log = log)
+  os.remove(archive)
+
+  possible = os.path.join(location, 'mingw64')
+  if not os.path.exists(possible):
+    possible = os.path.join(location, 'mingw32')
+    if not os.path.exists(possible):
+      raise ValueError('Failed to find unpacked MinGW: ' + possible)
+  return possible
+
+def root(location = None, arch = None, version = None, threading = None,
+    exceptions = None, revision = None, log = EmptyLogger()):
+  '''
+  Returns the root folder of a specific version of the mingw-builds variant
+  of gcc. Will download the compiler if needed
+  '''
+
+  # Get the repository if we don't have all the information
+  if not (arch and version and threading and exceptions and revision):
+    versions = repository(log = log)
+
+  # Determine some defaults
+  version = version or max(versions.keys())
+  if not arch:
+    arch = platform.machine().lower()
+    if arch == 'x86':
+      arch = 'i686'
+    elif arch == 'amd64':
+      arch = 'x86_64'
+  if not threading:
+    keys = versions[version][arch].keys()
+    if 'posix' in keys:
+      threading = 'posix'
+    elif 'win32' in keys:
+      threading = 'win32'
+    else:
+      threading = keys[0]
+  if not exceptions:
+    keys = versions[version][arch][threading].keys()
+    if 'seh' in keys:
+      exceptions = 'seh'
+    elif 'sjlj' in keys:
+      exceptions = 'sjlj'
+    else:
+      exceptions = keys[0]
+  if revision is None:
+    revision = max(versions[version][arch][threading][exceptions].keys())
+  if not location:
+    location = os.path.join(tempfile.gettempdir(), 'mingw-builds')
+
+  # Get the download url
+  url = versions[version][arch][threading][exceptions][revision]
+
+  # Tell the user what is happening
+  log.info('finding MinGW %s', '.'.join(str(v) for v in version))
+  log.debug(' - arch: %s', arch)
+  log.debug(' - threading: %s', threading)
+  log.debug(' - exceptions: %s', exceptions)
+  log.debug(' - revision: %s', revision)
+  log.debug(' - url: %s', url)
+
+  # Store each specific revision differently
+  slug = '{version}-{arch}-{threading}-{exceptions}-rev{revision}'
+  slug = slug.format(
+    version = '.'.join(str(v) for v in version),
+    arch = arch,
+    threading = threading,
+    exceptions = exceptions,
+    revision = revision
+  )
+  if arch == 'x86_64':
+    root_dir = os.path.join(location, slug, 'mingw64')
+  elif arch == 'i686':
+    root_dir = os.path.join(location, slug, 'mingw32')
+  else:
+    raise ValueError('Unknown MinGW arch: ' + arch)
+
+  # Download if needed
+  if not os.path.exists(root_dir):
+    downloaded = download(url, os.path.join(location, slug), log = log)
+    if downloaded != root_dir:
+      raise ValueError('The location of mingw did not match\n%s\n%s'
+        % (downloaded, root_dir))
+
+  return root_dir
+
+def str2ver(string):
+  '''
+  Converts a version string into a tuple
+  '''
+  try:
+    version = tuple(int(v) for v in string.split('.'))
+    if len(version) != 3:
+      raise ValueError()
+  except ValueError:
+    raise argparse.ArgumentTypeError(
+      'please provide a version string with three components, e.g. 8.1.0')
+  return version
+
+def main():
+  '''
+  Invoked when the script is run directly by the python interpreter
+  '''
+  parser = argparse.ArgumentParser(
+    description = 'Downloads a specific version of MinGW',
+    formatter_class = argparse.ArgumentDefaultsHelpFormatter
+  )
+  parser.add_argument('--location',
+    help = 'the location to download the compiler to',
+    default = os.path.join(tempfile.gettempdir(), 'mingw-builds'))
+  parser.add_argument('--arch', required = True, choices = ['i686', 'x86_64'],
+    help = 'the target MinGW architecture string')
+  parser.add_argument('--version', type = str2ver,
+    help = 'the version of GCC to download')
+  parser.add_argument('--threading', choices = ['posix', 'win32'],
+    help = 'the threading type of the compiler')
+  parser.add_argument('--exceptions', choices = ['sjlj', 'seh', 'dwarf'],
+    help = 'the method to throw exceptions')
+  parser.add_argument('--revision', type=int,
+    help = 'the revision of the MinGW release')
+  group = parser.add_mutually_exclusive_group()
+  group.add_argument('-v', '--verbose', action='store_true',
+    help='increase the script output verbosity')
+  group.add_argument('-q', '--quiet', action='store_true',
action='store_true', + help='only print errors and warning') + args = parser.parse_args() + + # Create the logger + logger = logging.getLogger('mingw') + handler = logging.StreamHandler() + formatter = logging.Formatter('%(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + if args.quiet: + logger.setLevel(logging.WARN) + if args.verbose: + logger.setLevel(logging.DEBUG) + + # Get MinGW + root_dir = root(location = args.location, arch = args.arch, + version = args.version, threading = args.threading, + exceptions = args.exceptions, revision = args.revision, + log = logger) + + sys.stdout.write('%s\n' % os.path.join(root_dir, 'bin')) + +if __name__ == '__main__': + try: + main() + except IOError as e: + sys.stderr.write('IO error: %s\n' % e) + sys.exit(1) + except OSError as e: + sys.stderr.write('OS error: %s\n' % e) + sys.exit(1) + except KeyboardInterrupt as e: + sys.stderr.write('Killed\n') + sys.exit(1) diff --git a/thirdparty/benchmark-1.5.0/releasing.md b/thirdparty/benchmark-1.5.0/releasing.md new file mode 100644 index 0000000000..f0cd7010e3 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/releasing.md @@ -0,0 +1,16 @@ +# How to release + +* Make sure you're on master and synced to HEAD +* Ensure the project builds and tests run (sanity check only, obviously) + * `parallel -j0 exec ::: test/*_test` can help ensure everything at least + passes +* Prepare release notes + * `git log $(git describe --abbrev=0 --tags)..HEAD` gives you the list of + commits between the last annotated tag and HEAD + * Pick the most interesting. +* Create a release through github's interface + * Note this will create a lightweight tag. + * Update this to an annotated tag: + * `git pull --tags` + * `git tag -a -f ` + * `git push --force origin` diff --git a/thirdparty/benchmark-1.5.0/src/CMakeLists.txt b/thirdparty/benchmark-1.5.0/src/CMakeLists.txt new file mode 100644 index 0000000000..b47de6791c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/CMakeLists.txt @@ -0,0 +1,112 @@ +# Allow the source files to find headers in src/ +include(GNUInstallDirs) +include_directories(${PROJECT_SOURCE_DIR}/src) + +if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) + list(APPEND CMAKE_SHARED_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) + list(APPEND CMAKE_MODULE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) +endif() + +file(GLOB + SOURCE_FILES + *.cc + ${PROJECT_SOURCE_DIR}/include/benchmark/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.h) +file(GLOB BENCHMARK_MAIN "benchmark_main.cc") +foreach(item ${BENCHMARK_MAIN}) + list(REMOVE_ITEM SOURCE_FILES "${item}") +endforeach() + +add_library(benchmark ${SOURCE_FILES}) +set_target_properties(benchmark PROPERTIES + OUTPUT_NAME "benchmark" + VERSION ${GENERIC_LIB_VERSION} + SOVERSION ${GENERIC_LIB_SOVERSION} +) +target_include_directories(benchmark PUBLIC + $ + ) + +# Link threads. +target_link_libraries(benchmark ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) +find_library(LIBRT rt) +if(LIBRT) + target_link_libraries(benchmark ${LIBRT}) +endif() + +if(CMAKE_BUILD_TYPE) + string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER) +endif() +if(NOT CMAKE_THREAD_LIBS_INIT AND "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}" MATCHES ".*-fsanitize=[^ ]*address.*") + message(WARNING "CMake's FindThreads.cmake did not fail, but CMAKE_THREAD_LIBS_INIT ended up being empty. 
This was fixed in https://github.com/Kitware/CMake/commit/d53317130e84898c5328c237186dbd995aaf1c12 Let's guess that -pthread is sufficient.") + target_link_libraries(benchmark -pthread) +endif() + +# We need extra libraries on Windows +if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + target_link_libraries(benchmark Shlwapi) +endif() + +# We need extra libraries on Solaris +if(${CMAKE_SYSTEM_NAME} MATCHES "SunOS") + target_link_libraries(benchmark kstat) +endif() + +# Benchmark main library +add_library(benchmark_main "benchmark_main.cc") +set_target_properties(benchmark_main PROPERTIES + OUTPUT_NAME "benchmark_main" + VERSION ${GENERIC_LIB_VERSION} + SOVERSION ${GENERIC_LIB_SOVERSION} +) +target_include_directories(benchmark PUBLIC + $ + ) +target_link_libraries(benchmark_main benchmark) + + +set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated") + +set(version_config "${generated_dir}/${PROJECT_NAME}ConfigVersion.cmake") +set(project_config "${generated_dir}/${PROJECT_NAME}Config.cmake") +set(pkg_config "${generated_dir}/${PROJECT_NAME}.pc") +set(targets_export_name "${PROJECT_NAME}Targets") + +set(namespace "${PROJECT_NAME}::") + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${version_config}" VERSION ${GENERIC_LIB_VERSION} COMPATIBILITY SameMajorVersion +) + +configure_file("${PROJECT_SOURCE_DIR}/cmake/Config.cmake.in" "${project_config}" @ONLY) +configure_file("${PROJECT_SOURCE_DIR}/cmake/benchmark.pc.in" "${pkg_config}" @ONLY) + +if (BENCHMARK_ENABLE_INSTALL) + # Install target (will install the library to specified CMAKE_INSTALL_PREFIX variable) + install( + TARGETS benchmark benchmark_main + EXPORT ${targets_export_name} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + install( + DIRECTORY "${PROJECT_SOURCE_DIR}/include/benchmark" + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING PATTERN "*.*h") + + install( + FILES "${project_config}" "${version_config}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") + + install( + FILES "${pkg_config}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") + + install( + EXPORT "${targets_export_name}" + NAMESPACE "${namespace}" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}") +endif() diff --git a/thirdparty/benchmark-1.5.0/src/arraysize.h b/thirdparty/benchmark-1.5.0/src/arraysize.h new file mode 100644 index 0000000000..51a50f2dff --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/arraysize.h @@ -0,0 +1,33 @@ +#ifndef BENCHMARK_ARRAYSIZE_H_ +#define BENCHMARK_ARRAYSIZE_H_ + +#include "internal_macros.h" + +namespace benchmark { +namespace internal { +// The arraysize(arr) macro returns the # of elements in an array arr. +// The expression is a compile-time constant, and therefore can be +// used in defining new arrays, for example. If you use arraysize on +// a pointer by mistake, you will get a compile-time error. +// + +// This template function declaration is used in defining arraysize. +// Note that the function doesn't need an implementation, as we only +// use its type. +template +char (&ArraySizeHelper(T (&array)[N]))[N]; + +// That gcc wants both of these prototypes seems mysterious. VC, for +// its part, can't decide which to use (another mystery). Matching of +// template overloads: the final frontier. 
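// A quick illustration of the contract described above (example code, not
// part of the upstream header): arraysize() is a compile-time constant for
// real arrays and refuses pointers.
//
//   static const int kPrimes[] = {2, 3, 5, 7, 11};
//   char scratch[arraysize(kPrimes)];            // OK: scratch has 5 elements
//   static_assert(arraysize(kPrimes) == 5, "");  // OK: constant expression
//   // const int* p = kPrimes;
//   // arraysize(p);  // compile-time error: no matching ArraySizeHelper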
+#ifndef COMPILER_MSVC +template +char (&ArraySizeHelper(const T (&array)[N]))[N]; +#endif + +#define arraysize(array) (sizeof(::benchmark::internal::ArraySizeHelper(array))) + +} // end namespace internal +} // end namespace benchmark + +#endif // BENCHMARK_ARRAYSIZE_H_ diff --git a/thirdparty/benchmark-1.5.0/src/benchmark.cc b/thirdparty/benchmark-1.5.0/src/benchmark.cc new file mode 100644 index 0000000000..29bfa3512f --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark.cc @@ -0,0 +1,494 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "benchmark/benchmark.h" +#include "benchmark_api_internal.h" +#include "benchmark_runner.h" +#include "internal_macros.h" + +#ifndef BENCHMARK_OS_WINDOWS +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "colorprint.h" +#include "commandlineflags.h" +#include "complexity.h" +#include "counter.h" +#include "internal_macros.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "statistics.h" +#include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" + +DEFINE_bool(benchmark_list_tests, false, + "Print a list of benchmarks. This option overrides all other " + "options."); + +DEFINE_string(benchmark_filter, ".", + "A regular expression that specifies the set of benchmarks " + "to execute. If this flag is empty, or if this flag is the " + "string \"all\", all benchmarks linked into the binary are " + "run."); + +DEFINE_double(benchmark_min_time, 0.5, + "Minimum number of seconds we should run benchmark before " + "results are considered significant. For cpu-time based " + "tests, this is the lower bound on the total cpu time " + "used by all threads that make up the test. For real-time " + "based tests, this is the lower bound on the elapsed time " + "of the benchmark execution, regardless of number of " + "threads."); + +DEFINE_int32(benchmark_repetitions, 1, + "The number of runs of each benchmark. If greater than 1, the " + "mean and standard deviation of the runs will be reported."); + +DEFINE_bool( + benchmark_report_aggregates_only, false, + "Report the result of each benchmark repetitions. When 'true' is specified " + "only the mean, standard deviation, and other statistics are reported for " + "repeated benchmarks. Affects all reporters."); + +DEFINE_bool( + benchmark_display_aggregates_only, false, + "Display the result of each benchmark repetitions. When 'true' is " + "specified only the mean, standard deviation, and other statistics are " + "displayed for repeated benchmarks. Unlike " + "benchmark_report_aggregates_only, only affects the display reporter, but " + "*NOT* file reporter, which will still contain all the output."); + +DEFINE_string(benchmark_format, "console", + "The format to use for console output. 
Valid values are " + "'console', 'json', or 'csv'."); + +DEFINE_string(benchmark_out_format, "json", + "The format to use for file output. Valid values are " + "'console', 'json', or 'csv'."); + +DEFINE_string(benchmark_out, "", "The file to write additional output to"); + +DEFINE_string(benchmark_color, "auto", + "Whether to use colors in the output. Valid values: " + "'true'/'yes'/1, 'false'/'no'/0, and 'auto'. 'auto' means to use " + "colors if the output is being sent to a terminal and the TERM " + "environment variable is set to a terminal type that supports " + "colors."); + +DEFINE_bool(benchmark_counters_tabular, false, + "Whether to use tabular format when printing user counters to " + "the console. Valid values: 'true'/'yes'/1, 'false'/'no'/0." + "Defaults to false."); + +DEFINE_int32(v, 0, "The level of verbose logging to output"); + +namespace benchmark { + +namespace internal { + +// FIXME: wouldn't LTO mess this up? +void UseCharPointer(char const volatile*) {} + +} // namespace internal + +State::State(IterationCount max_iters, const std::vector& ranges, + int thread_i, int n_threads, internal::ThreadTimer* timer, + internal::ThreadManager* manager) + : total_iterations_(0), + batch_leftover_(0), + max_iterations(max_iters), + started_(false), + finished_(false), + error_occurred_(false), + range_(ranges), + complexity_n_(0), + counters(), + thread_index(thread_i), + threads(n_threads), + timer_(timer), + manager_(manager) { + CHECK(max_iterations != 0) << "At least one iteration must be run"; + CHECK_LT(thread_index, threads) << "thread_index must be less than threads"; + + // Note: The use of offsetof below is technically undefined until C++17 + // because State is not a standard layout type. However, all compilers + // currently provide well-defined behavior as an extension (which is + // demonstrated since constexpr evaluation must diagnose all undefined + // behavior). However, GCC and Clang also warn about this use of offsetof, + // which must be suppressed. +#if defined(__INTEL_COMPILER) +#pragma warning push +#pragma warning(disable:1875) +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winvalid-offsetof" +#endif + // Offset tests to ensure commonly accessed data is on the first cache line. + const int cache_line_size = 64; + static_assert(offsetof(State, error_occurred_) <= + (cache_line_size - sizeof(error_occurred_)), + ""); +#if defined(__INTEL_COMPILER) +#pragma warning pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} + +void State::PauseTiming() { + // Add in time accumulated so far + CHECK(started_ && !finished_ && !error_occurred_); + timer_->StopTimer(); +} + +void State::ResumeTiming() { + CHECK(started_ && !finished_ && !error_occurred_); + timer_->StartTimer(); +} + +void State::SkipWithError(const char* msg) { + CHECK(msg); + error_occurred_ = true; + { + MutexLock l(manager_->GetBenchmarkMutex()); + if (manager_->results.has_error_ == false) { + manager_->results.error_message_ = msg; + manager_->results.has_error_ = true; + } + } + total_iterations_ = 0; + if (timer_->running()) timer_->StopTimer(); +} + +void State::SetIterationTime(double seconds) { + timer_->SetIterationTime(seconds); +} + +void State::SetLabel(const char* label) { + MutexLock l(manager_->GetBenchmarkMutex()); + manager_->results.report_label_ = label; +} + +void State::StartKeepRunning() { + CHECK(!started_ && !finished_); + started_ = true; + total_iterations_ = error_occurred_ ? 
0 : max_iterations; + manager_->StartStopBarrier(); + if (!error_occurred_) ResumeTiming(); +} + +void State::FinishKeepRunning() { + CHECK(started_ && (!finished_ || error_occurred_)); + if (!error_occurred_) { + PauseTiming(); + } + // Total iterations has now wrapped around past 0. Fix this. + total_iterations_ = 0; + finished_ = true; + manager_->StartStopBarrier(); +} + +namespace internal { +namespace { + +void RunBenchmarks(const std::vector& benchmarks, + BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter) { + // Note the file_reporter can be null. + CHECK(display_reporter != nullptr); + + // Determine the width of the name field using a minimum width of 10. + bool might_have_aggregates = FLAGS_benchmark_repetitions > 1; + size_t name_field_width = 10; + size_t stat_field_width = 0; + for (const BenchmarkInstance& benchmark : benchmarks) { + name_field_width = + std::max(name_field_width, benchmark.name.str().size()); + might_have_aggregates |= benchmark.repetitions > 1; + + for (const auto& Stat : *benchmark.statistics) + stat_field_width = std::max(stat_field_width, Stat.name_.size()); + } + if (might_have_aggregates) name_field_width += 1 + stat_field_width; + + // Print header here + BenchmarkReporter::Context context; + context.name_field_width = name_field_width; + + // Keep track of running times of all instances of current benchmark + std::vector complexity_reports; + + // We flush streams after invoking reporter methods that write to them. This + // ensures users get timely updates even when streams are not line-buffered. + auto flushStreams = [](BenchmarkReporter* reporter) { + if (!reporter) return; + std::flush(reporter->GetOutputStream()); + std::flush(reporter->GetErrorStream()); + }; + + if (display_reporter->ReportContext(context) && + (!file_reporter || file_reporter->ReportContext(context))) { + flushStreams(display_reporter); + flushStreams(file_reporter); + + for (const auto& benchmark : benchmarks) { + RunResults run_results = RunBenchmark(benchmark, &complexity_reports); + + auto report = [&run_results](BenchmarkReporter* reporter, + bool report_aggregates_only) { + assert(reporter); + // If there are no aggregates, do output non-aggregates. 
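// An illustrative walk-through of the flag interaction (not upstream text):
// with --benchmark_repetitions=3 and --benchmark_report_aggregates_only=true,
// run_results.aggregates_only holds the mean/median/stddev rows, so the &=
// below keeps report_aggregates_only true and only those aggregate rows are
// reported; with a single repetition there are no aggregates, the &= forces
// the flag back to false, and the lone non-aggregate run is reported instead.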
+ report_aggregates_only &= !run_results.aggregates_only.empty(); + if (!report_aggregates_only) + reporter->ReportRuns(run_results.non_aggregates); + if (!run_results.aggregates_only.empty()) + reporter->ReportRuns(run_results.aggregates_only); + }; + + report(display_reporter, run_results.display_report_aggregates_only); + if (file_reporter) + report(file_reporter, run_results.file_report_aggregates_only); + + flushStreams(display_reporter); + flushStreams(file_reporter); + } + } + display_reporter->Finalize(); + if (file_reporter) file_reporter->Finalize(); + flushStreams(display_reporter); + flushStreams(file_reporter); +} + +std::unique_ptr CreateReporter( + std::string const& name, ConsoleReporter::OutputOptions output_opts) { + typedef std::unique_ptr PtrType; + if (name == "console") { + return PtrType(new ConsoleReporter(output_opts)); + } else if (name == "json") { + return PtrType(new JSONReporter); + } else if (name == "csv") { + return PtrType(new CSVReporter); + } else { + std::cerr << "Unexpected format: '" << name << "'\n"; + std::exit(1); + } +} + +} // end namespace + +bool IsZero(double n) { + return std::abs(n) < std::numeric_limits::epsilon(); +} + +ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) { + int output_opts = ConsoleReporter::OO_Defaults; + auto is_benchmark_color = [force_no_color] () -> bool { + if (force_no_color) { + return false; + } + if (FLAGS_benchmark_color == "auto") { + return IsColorTerminal(); + } + return IsTruthyFlagValue(FLAGS_benchmark_color); + }; + if (is_benchmark_color()) { + output_opts |= ConsoleReporter::OO_Color; + } else { + output_opts &= ~ConsoleReporter::OO_Color; + } + if (FLAGS_benchmark_counters_tabular) { + output_opts |= ConsoleReporter::OO_Tabular; + } else { + output_opts &= ~ConsoleReporter::OO_Tabular; + } + return static_cast(output_opts); +} + +} // end namespace internal + +size_t RunSpecifiedBenchmarks() { + return RunSpecifiedBenchmarks(nullptr, nullptr); +} + +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) { + return RunSpecifiedBenchmarks(display_reporter, nullptr); +} + +size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter, + BenchmarkReporter* file_reporter) { + std::string spec = FLAGS_benchmark_filter; + if (spec.empty() || spec == "all") + spec = "."; // Regexp that matches all benchmarks + + // Setup the reporters + std::ofstream output_file; + std::unique_ptr default_display_reporter; + std::unique_ptr default_file_reporter; + if (!display_reporter) { + default_display_reporter = internal::CreateReporter( + FLAGS_benchmark_format, internal::GetOutputOptions()); + display_reporter = default_display_reporter.get(); + } + auto& Out = display_reporter->GetOutputStream(); + auto& Err = display_reporter->GetErrorStream(); + + std::string const& fname = FLAGS_benchmark_out; + if (fname.empty() && file_reporter) { + Err << "A custom file reporter was provided but " + "--benchmark_out= was not specified." 
+ << std::endl; + std::exit(1); + } + if (!fname.empty()) { + output_file.open(fname); + if (!output_file.is_open()) { + Err << "invalid file name: '" << fname << std::endl; + std::exit(1); + } + if (!file_reporter) { + default_file_reporter = internal::CreateReporter( + FLAGS_benchmark_out_format, ConsoleReporter::OO_None); + file_reporter = default_file_reporter.get(); + } + file_reporter->SetOutputStream(&output_file); + file_reporter->SetErrorStream(&output_file); + } + + std::vector benchmarks; + if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0; + + if (benchmarks.empty()) { + Err << "Failed to match any benchmarks against regex: " << spec << "\n"; + return 0; + } + + if (FLAGS_benchmark_list_tests) { + for (auto const& benchmark : benchmarks) + Out << benchmark.name.str() << "\n"; + } else { + internal::RunBenchmarks(benchmarks, display_reporter, file_reporter); + } + + return benchmarks.size(); +} + +void RegisterMemoryManager(MemoryManager* manager) { + internal::memory_manager = manager; +} + +namespace internal { + +void PrintUsageAndExit() { + fprintf(stdout, + "benchmark" + " [--benchmark_list_tests={true|false}]\n" + " [--benchmark_filter=]\n" + " [--benchmark_min_time=]\n" + " [--benchmark_repetitions=]\n" + " [--benchmark_report_aggregates_only={true|false}]\n" + " [--benchmark_display_aggregates_only={true|false}]\n" + " [--benchmark_format=]\n" + " [--benchmark_out=]\n" + " [--benchmark_out_format=]\n" + " [--benchmark_color={auto|true|false}]\n" + " [--benchmark_counters_tabular={true|false}]\n" + " [--v=]\n"); + exit(0); +} + +void ParseCommandLineFlags(int* argc, char** argv) { + using namespace benchmark; + BenchmarkReporter::Context::executable_name = + (argc && *argc > 0) ? argv[0] : "unknown"; + for (int i = 1; i < *argc; ++i) { + if (ParseBoolFlag(argv[i], "benchmark_list_tests", + &FLAGS_benchmark_list_tests) || + ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) || + ParseDoubleFlag(argv[i], "benchmark_min_time", + &FLAGS_benchmark_min_time) || + ParseInt32Flag(argv[i], "benchmark_repetitions", + &FLAGS_benchmark_repetitions) || + ParseBoolFlag(argv[i], "benchmark_report_aggregates_only", + &FLAGS_benchmark_report_aggregates_only) || + ParseBoolFlag(argv[i], "benchmark_display_aggregates_only", + &FLAGS_benchmark_display_aggregates_only) || + ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) || + ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) || + ParseStringFlag(argv[i], "benchmark_out_format", + &FLAGS_benchmark_out_format) || + ParseStringFlag(argv[i], "benchmark_color", &FLAGS_benchmark_color) || + // "color_print" is the deprecated name for "benchmark_color". + // TODO: Remove this. 
+ ParseStringFlag(argv[i], "color_print", &FLAGS_benchmark_color) || + ParseBoolFlag(argv[i], "benchmark_counters_tabular", + &FLAGS_benchmark_counters_tabular) || + ParseInt32Flag(argv[i], "v", &FLAGS_v)) { + for (int j = i; j != *argc - 1; ++j) argv[j] = argv[j + 1]; + + --(*argc); + --i; + } else if (IsFlag(argv[i], "help")) { + PrintUsageAndExit(); + } + } + for (auto const* flag : + {&FLAGS_benchmark_format, &FLAGS_benchmark_out_format}) + if (*flag != "console" && *flag != "json" && *flag != "csv") { + PrintUsageAndExit(); + } + if (FLAGS_benchmark_color.empty()) { + PrintUsageAndExit(); + } +} + +int InitializeStreams() { + static std::ios_base::Init init; + return 0; +} + +} // end namespace internal + +void Initialize(int* argc, char** argv) { + internal::ParseCommandLineFlags(argc, argv); + internal::LogLevel() = FLAGS_v; +} + +bool ReportUnrecognizedArguments(int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + fprintf(stderr, "%s: error: unrecognized command-line flag: %s\n", argv[0], + argv[i]); + } + return argc > 1; +} + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.cc b/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.cc new file mode 100644 index 0000000000..d468a257e3 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.cc @@ -0,0 +1,15 @@ +#include "benchmark_api_internal.h" + +namespace benchmark { +namespace internal { + +State BenchmarkInstance::Run(IterationCount iters, int thread_id, + internal::ThreadTimer* timer, + internal::ThreadManager* manager) const { + State st(iters, arg, thread_id, threads, timer, manager); + benchmark->Run(st); + return st; +} + +} // internal +} // benchmark diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.h b/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.h new file mode 100644 index 0000000000..264eff95c5 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_api_internal.h @@ -0,0 +1,53 @@ +#ifndef BENCHMARK_API_INTERNAL_H +#define BENCHMARK_API_INTERNAL_H + +#include "benchmark/benchmark.h" +#include "commandlineflags.h" + +#include +#include +#include +#include +#include +#include + +namespace benchmark { +namespace internal { + +// Information kept per benchmark we may want to run +struct BenchmarkInstance { + BenchmarkName name; + Benchmark* benchmark; + AggregationReportMode aggregation_report_mode; + std::vector arg; + TimeUnit time_unit; + int range_multiplier; + bool measure_process_cpu_time; + bool use_real_time; + bool use_manual_time; + BigO complexity; + BigOFunc* complexity_lambda; + UserCounters counters; + const std::vector* statistics; + bool last_benchmark_instance; + int repetitions; + double min_time; + IterationCount iterations; + int threads; // Number of concurrent threads to us + + State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer, + internal::ThreadManager* manager) const; +}; + +bool FindBenchmarksInternal(const std::string& re, + std::vector* benchmarks, + std::ostream* Err); + +bool IsZero(double n); + +ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color = false); + +} // end namespace internal +} // end namespace benchmark + +#endif // BENCHMARK_API_INTERNAL_H diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_main.cc b/thirdparty/benchmark-1.5.0/src/benchmark_main.cc new file mode 100644 index 0000000000..b3b2478314 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_main.cc @@ -0,0 +1,17 @@ +// Copyright 2018 Google Inc. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "benchmark/benchmark.h" + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_name.cc b/thirdparty/benchmark-1.5.0/src/benchmark_name.cc new file mode 100644 index 0000000000..2a17ebce27 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_name.cc @@ -0,0 +1,58 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace benchmark { + +namespace { + +// Compute the total size of a pack of std::strings +size_t size_impl() { return 0; } + +template +size_t size_impl(const Head& head, const Tail&... tail) { + return head.size() + size_impl(tail...); +} + +// Join a pack of std::strings using a delimiter +// TODO: use absl::StrJoin +void join_impl(std::string&, char) {} + +template +void join_impl(std::string& s, const char delimiter, const Head& head, + const Tail&... tail) { + if (!s.empty() && !head.empty()) { + s += delimiter; + } + + s += head; + + join_impl(s, delimiter, tail...); +} + +template +std::string join(char delimiter, const Ts&... ts) { + std::string s; + s.reserve(sizeof...(Ts) + size_impl(ts...)); + join_impl(s, delimiter, ts...); + return s; +} +} // namespace + +std::string BenchmarkName::str() const { + return join('/', function_name, args, min_time, iterations, repetitions, + time_type, threads); +} +} // namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_register.cc b/thirdparty/benchmark-1.5.0/src/benchmark_register.cc new file mode 100644 index 0000000000..6696c382b8 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_register.cc @@ -0,0 +1,504 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
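// To make the one-line benchmark_main.cc above concrete, a minimal sketch of
// a user translation unit (BM_Copy and its arguments are invented for
// illustration, not part of the library):
//
//   #include <cstring>
//   #include <vector>
//   #include "benchmark/benchmark.h"
//
//   static void BM_Copy(benchmark::State& state) {
//     std::vector<char> src(state.range(0)), dst(src.size());
//     for (auto _ : state)
//       std::memcpy(dst.data(), src.data(), src.size());
//   }
//   BENCHMARK(BM_Copy)->Arg(64)->Threads(2);
//
// Linking against benchmark_main supplies main() via BENCHMARK_MAIN(), and
// BenchmarkName::str() above renders this instance as "BM_Copy/64/threads:2".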
+ +#include "benchmark_register.h" + +#ifndef BENCHMARK_OS_WINDOWS +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __STDC_FORMAT_MACROS +#include + +#include "benchmark/benchmark.h" +#include "benchmark_api_internal.h" +#include "check.h" +#include "commandlineflags.h" +#include "complexity.h" +#include "internal_macros.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "statistics.h" +#include "string_util.h" +#include "timers.h" + +namespace benchmark { + +namespace { +// For non-dense Range, intermediate values are powers of kRangeMultiplier. +static const int kRangeMultiplier = 8; +// The size of a benchmark family determines is the number of inputs to repeat +// the benchmark on. If this is "large" then warn the user during configuration. +static const size_t kMaxFamilySize = 100; +} // end namespace + +namespace internal { + +//=============================================================================// +// BenchmarkFamilies +//=============================================================================// + +// Class for managing registered benchmarks. Note that each registered +// benchmark identifies a family of related benchmarks to run. +class BenchmarkFamilies { + public: + static BenchmarkFamilies* GetInstance(); + + // Registers a benchmark family and returns the index assigned to it. + size_t AddBenchmark(std::unique_ptr family); + + // Clear all registered benchmark families. + void ClearBenchmarks(); + + // Extract the list of benchmark instances that match the specified + // regular expression. + bool FindBenchmarks(std::string re, + std::vector* benchmarks, + std::ostream* Err); + + private: + BenchmarkFamilies() {} + + std::vector> families_; + Mutex mutex_; +}; + +BenchmarkFamilies* BenchmarkFamilies::GetInstance() { + static BenchmarkFamilies instance; + return &instance; +} + +size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr family) { + MutexLock l(mutex_); + size_t index = families_.size(); + families_.push_back(std::move(family)); + return index; +} + +void BenchmarkFamilies::ClearBenchmarks() { + MutexLock l(mutex_); + families_.clear(); + families_.shrink_to_fit(); +} + +bool BenchmarkFamilies::FindBenchmarks( + std::string spec, std::vector* benchmarks, + std::ostream* ErrStream) { + CHECK(ErrStream); + auto& Err = *ErrStream; + // Make regular expression out of command-line flag + std::string error_msg; + Regex re; + bool isNegativeFilter = false; + if (spec[0] == '-') { + spec.replace(0, 1, ""); + isNegativeFilter = true; + } + if (!re.Init(spec, &error_msg)) { + Err << "Could not compile benchmark re: " << error_msg << std::endl; + return false; + } + + // Special list of thread counts to use when none are specified + const std::vector one_thread = {1}; + + MutexLock l(mutex_); + for (std::unique_ptr& family : families_) { + // Family was deleted or benchmark doesn't match + if (!family) continue; + + if (family->ArgsCnt() == -1) { + family->Args({}); + } + const std::vector* thread_counts = + (family->thread_counts_.empty() + ? &one_thread + : &static_cast&>(family->thread_counts_)); + const size_t family_size = family->args_.size() * thread_counts->size(); + // The benchmark will be run at least 'family_size' different inputs. + // If 'family_size' is very large warn the user. + if (family_size > kMaxFamilySize) { + Err << "The number of inputs is very large. 
" << family->name_ + << " will be repeated at least " << family_size << " times.\n"; + } + // reserve in the special case the regex ".", since we know the final + // family size. + if (spec == ".") benchmarks->reserve(family_size); + + for (auto const& args : family->args_) { + for (int num_threads : *thread_counts) { + BenchmarkInstance instance; + instance.name.function_name = family->name_; + instance.benchmark = family.get(); + instance.aggregation_report_mode = family->aggregation_report_mode_; + instance.arg = args; + instance.time_unit = family->time_unit_; + instance.range_multiplier = family->range_multiplier_; + instance.min_time = family->min_time_; + instance.iterations = family->iterations_; + instance.repetitions = family->repetitions_; + instance.measure_process_cpu_time = family->measure_process_cpu_time_; + instance.use_real_time = family->use_real_time_; + instance.use_manual_time = family->use_manual_time_; + instance.complexity = family->complexity_; + instance.complexity_lambda = family->complexity_lambda_; + instance.statistics = &family->statistics_; + instance.threads = num_threads; + + // Add arguments to instance name + size_t arg_i = 0; + for (auto const& arg : args) { + if (!instance.name.args.empty()) { + instance.name.args += '/'; + } + + if (arg_i < family->arg_names_.size()) { + const auto& arg_name = family->arg_names_[arg_i]; + if (!arg_name.empty()) { + instance.name.args += StrFormat("%s:", arg_name.c_str()); + } + } + + instance.name.args += StrFormat("%" PRId64, arg); + ++arg_i; + } + + if (!IsZero(family->min_time_)) + instance.name.min_time = + StrFormat("min_time:%0.3f", family->min_time_); + if (family->iterations_ != 0) { + instance.name.iterations = + StrFormat("iterations:%lu", + static_cast(family->iterations_)); + } + if (family->repetitions_ != 0) + instance.name.repetitions = + StrFormat("repeats:%d", family->repetitions_); + + if (family->measure_process_cpu_time_) { + instance.name.time_type = "process_time"; + } + + if (family->use_manual_time_) { + if (!instance.name.time_type.empty()) { + instance.name.time_type += '/'; + } + instance.name.time_type += "manual_time"; + } else if (family->use_real_time_) { + if (!instance.name.time_type.empty()) { + instance.name.time_type += '/'; + } + instance.name.time_type += "real_time"; + } + + // Add the number of threads used to the name + if (!family->thread_counts_.empty()) { + instance.name.threads = StrFormat("threads:%d", instance.threads); + } + + const auto full_name = instance.name.str(); + if ((re.Match(full_name) && !isNegativeFilter) || + (!re.Match(full_name) && isNegativeFilter)) { + instance.last_benchmark_instance = (&args == &family->args_.back()); + benchmarks->push_back(std::move(instance)); + } + } + } + } + return true; +} + +Benchmark* RegisterBenchmarkInternal(Benchmark* bench) { + std::unique_ptr bench_ptr(bench); + BenchmarkFamilies* families = BenchmarkFamilies::GetInstance(); + families->AddBenchmark(std::move(bench_ptr)); + return bench; +} + +// FIXME: This function is a hack so that benchmark.cc can access +// `BenchmarkFamilies` +bool FindBenchmarksInternal(const std::string& re, + std::vector* benchmarks, + std::ostream* Err) { + return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err); +} + +//=============================================================================// +// Benchmark +//=============================================================================// + +Benchmark::Benchmark(const char* name) + : name_(name), + 
aggregation_report_mode_(ARM_Unspecified), + time_unit_(kNanosecond), + range_multiplier_(kRangeMultiplier), + min_time_(0), + iterations_(0), + repetitions_(0), + measure_process_cpu_time_(false), + use_real_time_(false), + use_manual_time_(false), + complexity_(oNone), + complexity_lambda_(nullptr) { + ComputeStatistics("mean", StatisticsMean); + ComputeStatistics("median", StatisticsMedian); + ComputeStatistics("stddev", StatisticsStdDev); +} + +Benchmark::~Benchmark() {} + +Benchmark* Benchmark::Arg(int64_t x) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + args_.push_back({x}); + return this; +} + +Benchmark* Benchmark::Unit(TimeUnit unit) { + time_unit_ = unit; + return this; +} + +Benchmark* Benchmark::Range(int64_t start, int64_t limit) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + std::vector arglist; + AddRange(&arglist, start, limit, range_multiplier_); + + for (int64_t i : arglist) { + args_.push_back({i}); + } + return this; +} + +Benchmark* Benchmark::Ranges( + const std::vector>& ranges) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(ranges.size())); + std::vector> arglists(ranges.size()); + std::size_t total = 1; + for (std::size_t i = 0; i < ranges.size(); i++) { + AddRange(&arglists[i], ranges[i].first, ranges[i].second, + range_multiplier_); + total *= arglists[i].size(); + } + + std::vector ctr(arglists.size(), 0); + + for (std::size_t i = 0; i < total; i++) { + std::vector tmp; + tmp.reserve(arglists.size()); + + for (std::size_t j = 0; j < arglists.size(); j++) { + tmp.push_back(arglists[j].at(ctr[j])); + } + + args_.push_back(std::move(tmp)); + + for (std::size_t j = 0; j < arglists.size(); j++) { + if (ctr[j] + 1 < arglists[j].size()) { + ++ctr[j]; + break; + } + ctr[j] = 0; + } + } + return this; +} + +Benchmark* Benchmark::ArgName(const std::string& name) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + arg_names_ = {name}; + return this; +} + +Benchmark* Benchmark::ArgNames(const std::vector& names) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(names.size())); + arg_names_ = names; + return this; +} + +Benchmark* Benchmark::DenseRange(int64_t start, int64_t limit, int step) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == 1); + CHECK_LE(start, limit); + for (int64_t arg = start; arg <= limit; arg += step) { + args_.push_back({arg}); + } + return this; +} + +Benchmark* Benchmark::Args(const std::vector& args) { + CHECK(ArgsCnt() == -1 || ArgsCnt() == static_cast(args.size())); + args_.push_back(args); + return this; +} + +Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) { + custom_arguments(this); + return this; +} + +Benchmark* Benchmark::RangeMultiplier(int multiplier) { + CHECK(multiplier > 1); + range_multiplier_ = multiplier; + return this; +} + +Benchmark* Benchmark::MinTime(double t) { + CHECK(t > 0.0); + CHECK(iterations_ == 0); + min_time_ = t; + return this; +} + +Benchmark* Benchmark::Iterations(IterationCount n) { + CHECK(n > 0); + CHECK(IsZero(min_time_)); + iterations_ = n; + return this; +} + +Benchmark* Benchmark::Repetitions(int n) { + CHECK(n > 0); + repetitions_ = n; + return this; +} + +Benchmark* Benchmark::ReportAggregatesOnly(bool value) { + aggregation_report_mode_ = value ? ARM_ReportAggregatesOnly : ARM_Default; + return this; +} + +Benchmark* Benchmark::DisplayAggregatesOnly(bool value) { + // If we were called, the report mode is no longer 'unspecified', in any case. 
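// (Concretely: ARM_Default is OR-ed in first so the mode counts as explicitly
// set, then the display-only bit is set or cleared below. For example, calling
// DisplayAggregatesOnly(true) and later DisplayAggregatesOnly(false) leaves
// the mode at ARM_Default rather than reverting to ARM_Unspecified.)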
+ aggregation_report_mode_ = static_cast( + aggregation_report_mode_ | ARM_Default); + + if (value) { + aggregation_report_mode_ = static_cast( + aggregation_report_mode_ | ARM_DisplayReportAggregatesOnly); + } else { + aggregation_report_mode_ = static_cast( + aggregation_report_mode_ & ~ARM_DisplayReportAggregatesOnly); + } + + return this; +} + +Benchmark* Benchmark::MeasureProcessCPUTime() { + // Can be used together with UseRealTime() / UseManualTime(). + measure_process_cpu_time_ = true; + return this; +} + +Benchmark* Benchmark::UseRealTime() { + CHECK(!use_manual_time_) + << "Cannot set UseRealTime and UseManualTime simultaneously."; + use_real_time_ = true; + return this; +} + +Benchmark* Benchmark::UseManualTime() { + CHECK(!use_real_time_) + << "Cannot set UseRealTime and UseManualTime simultaneously."; + use_manual_time_ = true; + return this; +} + +Benchmark* Benchmark::Complexity(BigO complexity) { + complexity_ = complexity; + return this; +} + +Benchmark* Benchmark::Complexity(BigOFunc* complexity) { + complexity_lambda_ = complexity; + complexity_ = oLambda; + return this; +} + +Benchmark* Benchmark::ComputeStatistics(std::string name, + StatisticsFunc* statistics) { + statistics_.emplace_back(name, statistics); + return this; +} + +Benchmark* Benchmark::Threads(int t) { + CHECK_GT(t, 0); + thread_counts_.push_back(t); + return this; +} + +Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) { + CHECK_GT(min_threads, 0); + CHECK_GE(max_threads, min_threads); + + AddRange(&thread_counts_, min_threads, max_threads, 2); + return this; +} + +Benchmark* Benchmark::DenseThreadRange(int min_threads, int max_threads, + int stride) { + CHECK_GT(min_threads, 0); + CHECK_GE(max_threads, min_threads); + CHECK_GE(stride, 1); + + for (auto i = min_threads; i < max_threads; i += stride) { + thread_counts_.push_back(i); + } + thread_counts_.push_back(max_threads); + return this; +} + +Benchmark* Benchmark::ThreadPerCpu() { + thread_counts_.push_back(CPUInfo::Get().num_cpus); + return this; +} + +void Benchmark::SetName(const char* name) { name_ = name; } + +int Benchmark::ArgsCnt() const { + if (args_.empty()) { + if (arg_names_.empty()) return -1; + return static_cast(arg_names_.size()); + } + return static_cast(args_.front().size()); +} + +//=============================================================================// +// FunctionBenchmark +//=============================================================================// + +void FunctionBenchmark::Run(State& st) { func_(st); } + +} // end namespace internal + +void ClearRegisteredBenchmarks() { + internal::BenchmarkFamilies::GetInstance()->ClearBenchmarks(); +} + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_register.h b/thirdparty/benchmark-1.5.0/src/benchmark_register.h new file mode 100644 index 0000000000..61377d7423 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_register.h @@ -0,0 +1,107 @@ +#ifndef BENCHMARK_REGISTER_H +#define BENCHMARK_REGISTER_H + +#include + +#include "check.h" + +namespace benchmark { +namespace internal { + +// Append the powers of 'mult' in the closed interval [lo, hi]. +// Returns iterator to the start of the inserted range. 
+template +typename std::vector::iterator +AddPowers(std::vector* dst, T lo, T hi, int mult) { + CHECK_GE(lo, 0); + CHECK_GE(hi, lo); + CHECK_GE(mult, 2); + + const size_t start_offset = dst->size(); + + static const T kmax = std::numeric_limits::max(); + + // Space out the values in multiples of "mult" + for (T i = 1; i <= hi; i *= mult) { + if (i >= lo) { + dst->push_back(i); + } + // Break the loop here since multiplying by + // 'mult' would move outside of the range of T + if (i > kmax / mult) break; + } + + return dst->begin() + start_offset; +} + +template +void AddNegatedPowers(std::vector* dst, T lo, T hi, int mult) { + // We negate lo and hi so we require that they cannot be equal to 'min'. + CHECK_GT(lo, std::numeric_limits::min()); + CHECK_GT(hi, std::numeric_limits::min()); + CHECK_GE(hi, lo); + CHECK_LE(hi, 0); + + // Add positive powers, then negate and reverse. + // Casts necessary since small integers get promoted + // to 'int' when negating. + const auto lo_complement = static_cast(-lo); + const auto hi_complement = static_cast(-hi); + + const auto it = AddPowers(dst, hi_complement, lo_complement, mult); + + std::for_each(it, dst->end(), [](T& t) { t *= -1; }); + std::reverse(it, dst->end()); +} + +template +void AddRange(std::vector* dst, T lo, T hi, int mult) { + static_assert(std::is_integral::value && std::is_signed::value, + "Args type must be a signed integer"); + + CHECK_GE(hi, lo); + CHECK_GE(mult, 2); + + // Add "lo" + dst->push_back(lo); + + // Handle lo == hi as a special case, so we then know + // lo < hi and so it is safe to add 1 to lo and subtract 1 + // from hi without falling outside of the range of T. + if (lo == hi) return; + + // Ensure that lo_inner <= hi_inner below. + if (lo + 1 == hi) { + dst->push_back(hi); + return; + } + + // Add all powers of 'mult' in the range [lo+1, hi-1] (inclusive). + const auto lo_inner = static_cast(lo + 1); + const auto hi_inner = static_cast(hi - 1); + + // Insert negative values + if (lo_inner < 0) { + AddNegatedPowers(dst, lo_inner, std::min(hi_inner, T{-1}), mult); + } + + // Treat 0 as a special case (see discussion on #762). + if (lo <= 0 && hi >= 0) { + dst->push_back(0); + } + + // Insert positive values + if (hi_inner > 0) { + AddPowers(dst, std::max(lo_inner, T{1}), hi_inner, mult); + } + + // Add "hi" (if different from last value). + if (hi != dst->back()) { + dst->push_back(hi); + } +} + +} // namespace internal +} // namespace benchmark + +#endif // BENCHMARK_REGISTER_H diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_runner.cc b/thirdparty/benchmark-1.5.0/src/benchmark_runner.cc new file mode 100644 index 0000000000..0bae6a545e --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_runner.cc @@ -0,0 +1,361 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "benchmark_runner.h" +#include "benchmark/benchmark.h" +#include "benchmark_api_internal.h" +#include "internal_macros.h" + +#ifndef BENCHMARK_OS_WINDOWS +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "colorprint.h" +#include "commandlineflags.h" +#include "complexity.h" +#include "counter.h" +#include "internal_macros.h" +#include "log.h" +#include "mutex.h" +#include "re.h" +#include "statistics.h" +#include "string_util.h" +#include "thread_manager.h" +#include "thread_timer.h" + +namespace benchmark { + +namespace internal { + +MemoryManager* memory_manager = nullptr; + +namespace { + +static constexpr IterationCount kMaxIterations = 1000000000; + +BenchmarkReporter::Run CreateRunReport( + const benchmark::internal::BenchmarkInstance& b, + const internal::ThreadManager::Result& results, + IterationCount memory_iterations, + const MemoryManager::Result& memory_result, double seconds, + int64_t repetition_index) { + // Create report about this benchmark run. + BenchmarkReporter::Run report; + + report.run_name = b.name; + report.error_occurred = results.has_error_; + report.error_message = results.error_message_; + report.report_label = results.report_label_; + // This is the total iterations across all threads. + report.iterations = results.iterations; + report.time_unit = b.time_unit; + report.threads = b.threads; + report.repetition_index = repetition_index; + report.repetitions = b.repetitions; + + if (!report.error_occurred) { + if (b.use_manual_time) { + report.real_accumulated_time = results.manual_time_used; + } else { + report.real_accumulated_time = results.real_time_used; + } + report.cpu_accumulated_time = results.cpu_time_used; + report.complexity_n = results.complexity_n; + report.complexity = b.complexity; + report.complexity_lambda = b.complexity_lambda; + report.statistics = b.statistics; + report.counters = results.counters; + + if (memory_iterations > 0) { + report.has_memory_result = true; + report.allocs_per_iter = + memory_iterations ? static_cast(memory_result.num_allocs) / + memory_iterations + : 0; + report.max_bytes_used = memory_result.max_bytes_used; + } + + internal::Finish(&report.counters, results.iterations, seconds, b.threads); + } + return report; +} + +// Execute one thread of benchmark b for the specified number of iterations. +// Adds the stats collected for the thread into *total. +void RunInThread(const BenchmarkInstance* b, IterationCount iters, + int thread_id, ThreadManager* manager) { + internal::ThreadTimer timer( + b->measure_process_cpu_time + ? 
internal::ThreadTimer::CreateProcessCpuTime() + : internal::ThreadTimer::Create()); + State st = b->Run(iters, thread_id, &timer, manager); + CHECK(st.iterations() >= st.max_iterations) + << "Benchmark returned before State::KeepRunning() returned false!"; + { + MutexLock l(manager->GetBenchmarkMutex()); + internal::ThreadManager::Result& results = manager->results; + results.iterations += st.iterations(); + results.cpu_time_used += timer.cpu_time_used(); + results.real_time_used += timer.real_time_used(); + results.manual_time_used += timer.manual_time_used(); + results.complexity_n += st.complexity_length_n(); + internal::Increment(&results.counters, st.counters); + } + manager->NotifyThreadComplete(); +} + +class BenchmarkRunner { + public: + BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_, + std::vector* complexity_reports_) + : b(b_), + complexity_reports(*complexity_reports_), + min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time), + repeats(b.repetitions != 0 ? b.repetitions + : FLAGS_benchmark_repetitions), + has_explicit_iteration_count(b.iterations != 0), + pool(b.threads - 1), + iters(has_explicit_iteration_count ? b.iterations : 1) { + run_results.display_report_aggregates_only = + (FLAGS_benchmark_report_aggregates_only || + FLAGS_benchmark_display_aggregates_only); + run_results.file_report_aggregates_only = + FLAGS_benchmark_report_aggregates_only; + if (b.aggregation_report_mode != internal::ARM_Unspecified) { + run_results.display_report_aggregates_only = + (b.aggregation_report_mode & + internal::ARM_DisplayReportAggregatesOnly); + run_results.file_report_aggregates_only = + (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly); + } + + for (int repetition_num = 0; repetition_num < repeats; repetition_num++) { + DoOneRepetition(repetition_num); + } + + // Calculate additional statistics + run_results.aggregates_only = ComputeStats(run_results.non_aggregates); + + // Maybe calculate complexity report + if ((b.complexity != oNone) && b.last_benchmark_instance) { + auto additional_run_stats = ComputeBigO(complexity_reports); + run_results.aggregates_only.insert(run_results.aggregates_only.end(), + additional_run_stats.begin(), + additional_run_stats.end()); + complexity_reports.clear(); + } + } + + RunResults&& get_results() { return std::move(run_results); } + + private: + RunResults run_results; + + const benchmark::internal::BenchmarkInstance& b; + std::vector& complexity_reports; + + const double min_time; + const int repeats; + const bool has_explicit_iteration_count; + + std::vector pool; + + IterationCount iters; // preserved between repetitions! + // So only the first repetition has to find/calculate it, + // the other repetitions will just use that precomputed iteration count. + + struct IterationResults { + internal::ThreadManager::Result results; + IterationCount iters; + double seconds; + }; + IterationResults DoNIterations() { + VLOG(2) << "Running " << b.name.str() << " for " << iters << "\n"; + + std::unique_ptr manager; + manager.reset(new internal::ThreadManager(b.threads)); + + // Run all but one thread in separate threads + for (std::size_t ti = 0; ti < pool.size(); ++ti) { + pool[ti] = std::thread(&RunInThread, &b, iters, static_cast(ti + 1), + manager.get()); + } + // And run one thread here directly. + // (If we were asked to run just one thread, we don't create new threads.) + // Yes, we need to do this here *after* we start the separate threads. 
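      // (Illustration: for a benchmark registered with ->Threads(4),
      // pool.size() is 3; the pool threads run thread ids 1..3 and the calling
      // thread runs id 0 below, so exactly b.threads copies of RunInThread
      // execute concurrently.)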
+ RunInThread(&b, iters, 0, manager.get()); + + // The main thread has finished. Now let's wait for the other threads. + manager->WaitForAllThreads(); + for (std::thread& thread : pool) thread.join(); + + IterationResults i; + // Acquire the measurements/counters from the manager, UNDER THE LOCK! + { + MutexLock l(manager->GetBenchmarkMutex()); + i.results = manager->results; + } + + // And get rid of the manager. + manager.reset(); + + // Adjust real/manual time stats since they were reported per thread. + i.results.real_time_used /= b.threads; + i.results.manual_time_used /= b.threads; + // If we were measuring whole-process CPU usage, adjust the CPU time too. + if (b.measure_process_cpu_time) i.results.cpu_time_used /= b.threads; + + VLOG(2) << "Ran in " << i.results.cpu_time_used << "/" + << i.results.real_time_used << "\n"; + + // So for how long were we running? + i.iters = iters; + // Base decisions off of real time if requested by this benchmark. + i.seconds = i.results.cpu_time_used; + if (b.use_manual_time) { + i.seconds = i.results.manual_time_used; + } else if (b.use_real_time) { + i.seconds = i.results.real_time_used; + } + + return i; + } + + IterationCount PredictNumItersNeeded(const IterationResults& i) const { + // See by how much the iteration count should be increased. + // Note: Avoid division by zero with max(seconds, 1ns). + double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9); + // If our last run was at least 10% of FLAGS_benchmark_min_time then we + // use the multiplier directly. + // Otherwise we use at most 10 times expansion. + // NOTE: When the last run was at least 10% of the min time the max + // expansion should be 14x. + bool is_significant = (i.seconds / min_time) > 0.1; + multiplier = is_significant ? multiplier : std::min(10.0, multiplier); + if (multiplier <= 1.0) multiplier = 2.0; + + // So what seems to be the sufficiently-large iteration count? Round up. + const IterationCount max_next_iters = + 0.5 + std::max(multiplier * i.iters, i.iters + 1.0); + // But we do have *some* sanity limits, though. + const IterationCount next_iters = std::min(max_next_iters, kMaxIterations); + + VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n"; + return next_iters; // round up before conversion to integer. + } + + bool ShouldReportIterationResults(const IterationResults& i) const { + // Determine if this run should be reported: + // either it has run for a sufficient amount of time, + // or an error was reported. + return i.results.has_error_ || + i.iters >= kMaxIterations || // Too many iterations already. + i.seconds >= min_time || // The elapsed time is large enough. + // CPU time is specified but the elapsed real time greatly exceeds + // the minimum time. + // Note that user-provided timers are exempt from this sanity check. + ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time); + } + + void DoOneRepetition(int64_t repetition_index) { + const bool is_the_first_repetition = repetition_index == 0; + IterationResults i; + + // We *may* be gradually increasing the length (iteration count) + // of the benchmark until we decide the results are significant. + // And once we do, we report those last results and exit. + // Please do note that if there are repetitions, the iteration count + // is *only* calculated for the *first* repetition, and other repetitions + // simply use that precomputed iteration count. + for (;;) { + i = DoNIterations(); + + // Do we consider the results to be significant?
+ // If we are doing repetitions, and the first repetition was already done, + // it has calculated the correct iteration time, so we have run that very + // iteration count just now. No need to calculate anything. Just report. + // Else, the normal rules apply. + const bool results_are_significant = !is_the_first_repetition || + has_explicit_iteration_count || + ShouldReportIterationResults(i); + + if (results_are_significant) break; // Good, let's report them! + + // Nope, bad iteration. Let's re-estimate the hopefully-sufficient + // iteration count, and run the benchmark again... + + iters = PredictNumItersNeeded(i); + assert(iters > i.iters && + "if we did more iterations than we want to do the next time, " + "then we should have accepted the current iteration run."); + } + + // Oh, one last thing, we need to also produce the 'memory measurements'.. + MemoryManager::Result memory_result; + IterationCount memory_iterations = 0; + if (memory_manager != nullptr) { + // Only run a few iterations to reduce the impact of one-time + // allocations in benchmarks that are not properly managed. + memory_iterations = std::min(16, iters); + memory_manager->Start(); + std::unique_ptr manager; + manager.reset(new internal::ThreadManager(1)); + RunInThread(&b, memory_iterations, 0, manager.get()); + manager->WaitForAllThreads(); + manager.reset(); + + memory_manager->Stop(&memory_result); + } + + // Ok, now actualy report. + BenchmarkReporter::Run report = + CreateRunReport(b, i.results, memory_iterations, memory_result, + i.seconds, repetition_index); + + if (!report.error_occurred && b.complexity != oNone) + complexity_reports.push_back(report); + + run_results.non_aggregates.push_back(report); + } +}; + +} // end namespace + +RunResults RunBenchmark( + const benchmark::internal::BenchmarkInstance& b, + std::vector* complexity_reports) { + internal::BenchmarkRunner r(b, complexity_reports); + return r.get_results(); +} + +} // end namespace internal + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/benchmark_runner.h b/thirdparty/benchmark-1.5.0/src/benchmark_runner.h new file mode 100644 index 0000000000..96e8282a11 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/benchmark_runner.h @@ -0,0 +1,51 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
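// A hedged sketch of how the entry points above are typically driven with
// custom reporters (ConsoleReporter/JSONReporter stand in for any
// BenchmarkReporter subclass; this mirrors the RunSpecifiedBenchmarks()
// overloads in benchmark.cc rather than adding new API):
//
//   int main(int argc, char** argv) {
//     benchmark::Initialize(&argc, argv);
//     if (benchmark::ReportUnrecognizedArguments(argc, argv)) return 1;
//     benchmark::ConsoleReporter display;
//     benchmark::JSONReporter for_file;  // requires --benchmark_out=<file>,
//                                        // otherwise the run exits with an error
//     benchmark::RunSpecifiedBenchmarks(&display, &for_file);
//   }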
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+DECLARE_double(benchmark_min_time);
+
+DECLARE_int32(benchmark_repetitions);
+
+DECLARE_bool(benchmark_report_aggregates_only);
+
+DECLARE_bool(benchmark_display_aggregates_only);
+
+namespace benchmark {
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+struct RunResults {
+  std::vector<BenchmarkReporter::Run> non_aggregates;
+  std::vector<BenchmarkReporter::Run> aggregates_only;
+
+  bool display_report_aggregates_only = false;
+  bool file_report_aggregates_only = false;
+};
+
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports);
+
+}  // namespace internal
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RUNNER_H_
diff --git a/thirdparty/benchmark-1.5.0/src/check.h b/thirdparty/benchmark-1.5.0/src/check.h
new file mode 100644
index 0000000000..f5f8253f80
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/check.h
@@ -0,0 +1,82 @@
+#ifndef CHECK_H_
+#define CHECK_H_
+
+#include <cmath>
+#include <cstdlib>
+#include <ostream>
+
+#include "internal_macros.h"
+#include "log.h"
+
+namespace benchmark {
+namespace internal {
+
+typedef void(AbortHandlerT)();
+
+inline AbortHandlerT*& GetAbortHandler() {
+  static AbortHandlerT* handler = &std::abort;
+  return handler;
+}
+
+BENCHMARK_NORETURN inline void CallAbortHandler() {
+  GetAbortHandler()();
+  std::abort();  // fallback to enforce noreturn
+}
+
+// CheckHandler is the class constructed by failing CHECK macros. CheckHandler
+// will log information about the failures and abort when it is destructed.
+class CheckHandler {
+ public:
+  CheckHandler(const char* check, const char* file, const char* func, int line)
+      : log_(GetErrorLogInstance()) {
+    log_ << file << ":" << line << ": " << func << ": Check `" << check
+         << "' failed. ";
+  }
+
+  LogType& GetLog() { return log_; }
+
+  BENCHMARK_NORETURN ~CheckHandler() BENCHMARK_NOEXCEPT_OP(false) {
+    log_ << std::endl;
+    CallAbortHandler();
+  }
+
+  CheckHandler& operator=(const CheckHandler&) = delete;
+  CheckHandler(const CheckHandler&) = delete;
+  CheckHandler() = delete;
+
+ private:
+  LogType& log_;
+};
+
+}  // end namespace internal
+}  // end namespace benchmark
+
+// The CHECK macro returns a std::ostream object that can have extra
+// information written to it.
+#ifndef NDEBUG
+#define CHECK(b)                                                             \
+  (b ? ::benchmark::internal::GetNullLogInstance()                           \
+     : ::benchmark::internal::CheckHandler(#b, __FILE__, __func__, __LINE__) \
+           .GetLog())
+#else
+#define CHECK(b) ::benchmark::internal::GetNullLogInstance()
+#endif
+
+// clang-format off
+// preserve whitespacing between operators for alignment
+#define CHECK_EQ(a, b) CHECK((a) == (b))
+#define CHECK_NE(a, b) CHECK((a) != (b))
+#define CHECK_GE(a, b) CHECK((a) >= (b))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
+#define CHECK_GT(a, b) CHECK((a) > (b))
+#define CHECK_LT(a, b) CHECK((a) < (b))
+
+#define CHECK_FLOAT_EQ(a, b, eps) CHECK(std::fabs((a) - (b)) <  (eps))
+#define CHECK_FLOAT_NE(a, b, eps) CHECK(std::fabs((a) - (b)) >= (eps))
+#define CHECK_FLOAT_GE(a, b, eps) CHECK((a) - (b) > -(eps))
+#define CHECK_FLOAT_LE(a, b, eps) CHECK((b) - (a) > -(eps))
+#define CHECK_FLOAT_GT(a, b, eps) CHECK((a) - (b) >  (eps))
+#define CHECK_FLOAT_LT(a, b, eps) CHECK((b) - (a) >  (eps))
+// clang-format on
+
+#endif  // CHECK_H_
diff --git a/thirdparty/benchmark-1.5.0/src/colorprint.cc b/thirdparty/benchmark-1.5.0/src/colorprint.cc
new file mode 100644
index 0000000000..fff6a98818
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/colorprint.cc
@@ -0,0 +1,188 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "colorprint.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "check.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <windows.h>
+#include <io.h>
+#else
+#include <unistd.h>
+#endif  // BENCHMARK_OS_WINDOWS
+
+namespace benchmark {
+namespace {
+#ifdef BENCHMARK_OS_WINDOWS
+typedef WORD PlatformColorCode;
+#else
+typedef const char* PlatformColorCode;
+#endif
+
+PlatformColorCode GetPlatformColorCode(LogColor color) {
+#ifdef BENCHMARK_OS_WINDOWS
+  switch (color) {
+    case COLOR_RED:
+      return FOREGROUND_RED;
+    case COLOR_GREEN:
+      return FOREGROUND_GREEN;
+    case COLOR_YELLOW:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
+    case COLOR_BLUE:
+      return FOREGROUND_BLUE;
+    case COLOR_MAGENTA:
+      return FOREGROUND_BLUE | FOREGROUND_RED;
+    case COLOR_CYAN:
+      return FOREGROUND_BLUE | FOREGROUND_GREEN;
+    case COLOR_WHITE:  // fall through to default
+    default:
+      return 0;
+  }
+#else
+  switch (color) {
+    case COLOR_RED:
+      return "1";
+    case COLOR_GREEN:
+      return "2";
+    case COLOR_YELLOW:
+      return "3";
+    case COLOR_BLUE:
+      return "4";
+    case COLOR_MAGENTA:
+      return "5";
+    case COLOR_CYAN:
+      return "6";
+    case COLOR_WHITE:
+      return "7";
+    default:
+      return nullptr;
+  };
+#endif
+}
+
+}  // end namespace
+
+std::string FormatString(const char* msg, va_list args) {
+  // we might need a second shot at this, so pre-emptively make a copy
+  va_list args_cp;
+  va_copy(args_cp, args);
+
+  std::size_t size = 256;
+  char local_buff[256];
+  auto ret = vsnprintf(local_buff, size, msg, args_cp);
+
+  va_end(args_cp);
+
+  // currently there is no error handling for failure, so this is a hack.
+  CHECK(ret >= 0);
+
+  if (ret == 0)  // handle empty expansion
+    return {};
+  else if (static_cast<size_t>(ret) < size)
+    return local_buff;
+  else {
+    // we did not provide a long enough buffer on our first attempt.
+    size = (size_t)ret + 1;  // + 1 for the null byte
+    std::unique_ptr<char[]> buff(new char[size]);
+    ret = vsnprintf(buff.get(), size, msg, args);
+    CHECK(ret > 0 && ((size_t)ret) < size);
+    return buff.get();
+  }
+}
+
+std::string FormatString(const char* msg, ...) {
+  va_list args;
+  va_start(args, msg);
+  auto tmp = FormatString(msg, args);
+  va_end(args);
+  return tmp;
+}
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+  ColorPrintf(out, color, fmt, args);
+  va_end(args);
+}
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args) {
+#ifdef BENCHMARK_OS_WINDOWS
+  ((void)out);  // suppress unused warning
+
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle,
+                          GetPlatformColorCode(color) | FOREGROUND_INTENSITY);
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  const char* color_code = GetPlatformColorCode(color);
+  if (color_code) out << FormatString("\033[0;3%sm", color_code);
+  out << FormatString(fmt, args) << "\033[m";
+#endif
+}
+
+bool IsColorTerminal() {
+#if BENCHMARK_OS_WINDOWS
+  // On Windows the TERM variable is usually not set, but the
+  // console there does support colors.
+  return 0 != _isatty(_fileno(stdout));
+#else
+  // On non-Windows platforms, we rely on the TERM variable. This list of
+  // supported TERM values is copied from Google Test.
+  const char* const SUPPORTED_TERM_VALUES[] = {
+      "xterm",         "xterm-color",     "xterm-256color",
+      "screen",        "screen-256color", "tmux",
+      "tmux-256color", "rxvt-unicode",    "rxvt-unicode-256color",
+      "linux",         "cygwin",
+  };
+
+  const char* const term = getenv("TERM");
+
+  bool term_supports_color = false;
+  for (const char* candidate : SUPPORTED_TERM_VALUES) {
+    if (term && 0 == strcmp(term, candidate)) {
+      term_supports_color = true;
+      break;
+    }
+  }
+
+  return 0 != isatty(fileno(stdout)) && term_supports_color;
+#endif  // BENCHMARK_OS_WINDOWS
+}
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/colorprint.h b/thirdparty/benchmark-1.5.0/src/colorprint.h
new file mode 100644
index 0000000000..9f6fab9b34
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/colorprint.h
@@ -0,0 +1,33 @@
+#ifndef BENCHMARK_COLORPRINT_H_
+#define BENCHMARK_COLORPRINT_H_
+
+#include <cstdarg>
+#include <iostream>
+#include <string>
+
+namespace benchmark {
+enum LogColor {
+  COLOR_DEFAULT,
+  COLOR_RED,
+  COLOR_GREEN,
+  COLOR_YELLOW,
+  COLOR_BLUE,
+  COLOR_MAGENTA,
+  COLOR_CYAN,
+  COLOR_WHITE
+};
+
+std::string FormatString(const char* msg, va_list args);
+std::string FormatString(const char* msg, ...);
+
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt,
+                 va_list args);
+void ColorPrintf(std::ostream& out, LogColor color, const char* fmt, ...);
+
+// Returns true if stdout appears to be a terminal that supports colored
+// output, false otherwise.
+bool IsColorTerminal();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COLORPRINT_H_
diff --git a/thirdparty/benchmark-1.5.0/src/commandlineflags.cc b/thirdparty/benchmark-1.5.0/src/commandlineflags.cc
new file mode 100644
index 0000000000..6bd65c5ae7
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/commandlineflags.cc
@@ -0,0 +1,222 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "commandlineflags.h"
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+
+namespace benchmark {
+namespace {
+
+// Parses 'str' for a 32-bit signed integer. If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const std::string& src_text, const char* str, int32_t* value) {
+  // Parses the string as a decimal integer.
+  char* end = nullptr;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  // Is the parsed value in the range of an Int32?
+  const int32_t result = static_cast<int32_t>(long_value);
+  if (long_value == std::numeric_limits<long>::max() ||
+      long_value == std::numeric_limits<long>::min() ||
+      // The parsed value overflows as a long. (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an Int32.
+  ) {
+    std::cerr << src_text << " is expected to be a 32-bit integer, "
+              << "but actually has value \"" << str << "\", "
+              << "which overflows.\n";
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Parses 'str' for a double. If successful, writes the result to *value and
+// returns true; otherwise leaves *value unchanged and returns false.
+bool ParseDouble(const std::string& src_text, const char* str, double* value) {
+  // Parses the string as a double.
+  char* end = nullptr;
+  const double double_value = strtod(str, &end);  // NOLINT
+
+  // Has strtod() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    std::cerr << src_text << " is expected to be a double, "
+              << "but actually has value \"" << str << "\".\n";
+    return false;
+  }
+
+  *value = double_value;
+  return true;
+}
+
+// Returns the name of the environment variable corresponding to the
+// given flag. For example, FlagToEnvVar("foo") will return
+// "BENCHMARK_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char* flag) {
+  const std::string flag_str(flag);
+
+  std::string env_var;
+  for (size_t i = 0; i != flag_str.length(); ++i)
+    env_var += static_cast<char>(::toupper(flag_str.c_str()[i]));
+
+  return "BENCHMARK_" + env_var;
+}
+
+}  // namespace
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true iff it's not "0".
bool BoolFromEnv(const char* flag, bool default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = getenv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromEnv(const char* flag, int32_t default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const string_value = getenv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(std::string("Environment variable ") + env_var, string_value,
+                  &result)) {
+    std::cout << "The default value " << default_value << " is used.\n";
+    return default_value;
+  }
+
+  return result;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char* StringFromEnv(const char* flag, const char* default_value) {
+  const std::string env_var = FlagToEnvVar(flag);
+  const char* const value = getenv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+}
+
+// Parses a string as a command line flag. The string should have
+// the format "--flag=value". When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or nullptr if the parsing failed.
+const char* ParseFlagValue(const char* str, const char* flag,
+                           bool def_optional) {
+  // str and flag must not be nullptr.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // The flag must start with "--".
+  const std::string flag_str = std::string("--") + std::string(flag);
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char* flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) return flag_end;
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = IsTruthyFlagValue(value_str);
+  return true;
+}
+
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(std::string("The value of flag --") + flag, value_str,
+                    value);
+}
+
+bool ParseDoubleFlag(const char* str, const char* flag, double* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseDouble(std::string("The value of flag --") + flag, value_str,
+                     value);
+}
+
+bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+  // Gets the value of the flag as a string.
+  const char* const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  *value = value_str;
+  return true;
+}
+
+bool IsFlag(const char* str, const char* flag) {
+  return (ParseFlagValue(str, flag, true) != nullptr);
+}
+
+bool IsTruthyFlagValue(const std::string& value) {
+  if (value.empty()) return true;
+  char ch = value[0];
+  return isalnum(ch) &&
+         !(ch == '0' || ch == 'f' || ch == 'F' || ch == 'n' || ch == 'N');
+}
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/commandlineflags.h b/thirdparty/benchmark-1.5.0/src/commandlineflags.h
new file mode 100644
index 0000000000..5eaea82a59
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/commandlineflags.h
@@ -0,0 +1,73 @@
+#ifndef BENCHMARK_COMMANDLINEFLAGS_H_
+#define BENCHMARK_COMMANDLINEFLAGS_H_
+
+#include <cstdint>
+#include <string>
+
+// Macro for referencing flags.
+#define FLAG(name) FLAGS_##name
+
+// Macros for declaring flags.
+#define DECLARE_bool(name) extern bool FLAG(name)
+#define DECLARE_int32(name) extern int32_t FLAG(name)
+#define DECLARE_int64(name) extern int64_t FLAG(name)
+#define DECLARE_double(name) extern double FLAG(name)
+#define DECLARE_string(name) extern std::string FLAG(name)
+
+// Macros for defining flags.
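+// (Illustrative usage, editor-added; the flag name is taken from the
+// declarations in benchmark_runner.h above: one .cc file defines the flag,
+//   DEFINE_int32(benchmark_repetitions, 1, "number of repetitions");
+// other files import it with DECLARE_int32(benchmark_repetitions); and then
+// read or write it as FLAG(benchmark_repetitions).)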
+#define DEFINE_bool(name, default_val, doc) bool FLAG(name) = (default_val)
+#define DEFINE_int32(name, default_val, doc) int32_t FLAG(name) = (default_val)
+#define DEFINE_int64(name, default_val, doc) int64_t FLAG(name) = (default_val)
+#define DEFINE_double(name, default_val, doc) double FLAG(name) = (default_val)
+#define DEFINE_string(name, default_val, doc) \
+  std::string FLAG(name) = (default_val)
+
+namespace benchmark {
+// Parses a bool/Int32/string from the environment variable
+// corresponding to the given flag.
+bool BoolFromEnv(const char* flag, bool default_val);
+int32_t Int32FromEnv(const char* flag, int32_t default_val);
+const char* StringFromEnv(const char* flag, const char* default_val);
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true if it passes
+// IsTruthyFlagValue().
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseBoolFlag(const char* str, const char* flag, bool* value);
+
+// Parses a string for an Int32 flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseInt32Flag(const char* str, const char* flag, int32_t* value);
+
+// Parses a string for a Double flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseDoubleFlag(const char* str, const char* flag, double* value);
+
+// Parses a string for a string flag, in the form of
+// "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true. On failure, returns false without changing *value.
+bool ParseStringFlag(const char* str, const char* flag, std::string* value);
+
+// Returns true if the string matches the flag.
+bool IsFlag(const char* str, const char* flag);
+
+// Returns true unless value starts with one of: '0', 'f', 'F', 'n' or 'N', or
+// some non-alphanumeric character. As a special case, also returns true if
+// value is the empty string.
+bool IsTruthyFlagValue(const std::string& value);
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_COMMANDLINEFLAGS_H_
diff --git a/thirdparty/benchmark-1.5.0/src/complexity.cc b/thirdparty/benchmark-1.5.0/src/complexity.cc
new file mode 100644
index 0000000000..aeed67f0c7
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/complexity.cc
@@ -0,0 +1,238 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#include "benchmark/benchmark.h"
+
+#include <algorithm>
+#include <cmath>
+#include "check.h"
+#include "complexity.h"
+
+namespace benchmark {
+
+// Internal function to calculate the different scalability forms
+BigOFunc* FittingCurve(BigO complexity) {
+  static const double kLog2E = 1.44269504088896340736;
+  switch (complexity) {
+    case oN:
+      return [](IterationCount n) -> double { return static_cast<double>(n); };
+    case oNSquared:
+      return [](IterationCount n) -> double { return std::pow(n, 2); };
+    case oNCubed:
+      return [](IterationCount n) -> double { return std::pow(n, 3); };
+    case oLogN:
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return
+          [](IterationCount n) { return kLog2E * log(static_cast<double>(n)); };
+    case oNLogN:
+      /* Note: can't use log2 because Android's GNU STL lacks it */
+      return [](IterationCount n) {
+        return kLog2E * n * log(static_cast<double>(n));
+      };
+    case o1:
+    default:
+      return [](IterationCount) { return 1.0; };
+  }
+}
+
+// Function to return a string for the calculated complexity
+std::string GetBigOString(BigO complexity) {
+  switch (complexity) {
+    case oN:
+      return "N";
+    case oNSquared:
+      return "N^2";
+    case oNCubed:
+      return "N^3";
+    case oLogN:
+      return "lgN";
+    case oNLogN:
+      return "NlgN";
+    case o1:
+      return "(1)";
+    default:
+      return "f(N)";
+  }
+}
+
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error, for the fitting curve
+// given by the lambda expression.
+//   - n             : Vector containing the size of the benchmark tests.
+//   - time          : Vector containing the times for the benchmark tests.
+//   - fitting_curve : lambda expression (e.g. [](int64_t n) {return n; };).
+
+// For a deeper explanation on the algorithm logic, please refer to
+// https://en.wikipedia.org/wiki/Least_squares#Least_squares,_regression_analysis_and_statistics
+
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+                       const std::vector<double>& time,
+                       BigOFunc* fitting_curve) {
+  double sigma_gn = 0.0;
+  double sigma_gn_squared = 0.0;
+  double sigma_time = 0.0;
+  double sigma_time_gn = 0.0;
+
+  // Calculate least square fitting parameter
+  for (size_t i = 0; i < n.size(); ++i) {
+    double gn_i = fitting_curve(n[i]);
+    sigma_gn += gn_i;
+    sigma_gn_squared += gn_i * gn_i;
+    sigma_time += time[i];
+    sigma_time_gn += time[i] * gn_i;
+  }
+
+  LeastSq result;
+  result.complexity = oLambda;
+
+  // Calculate complexity.
+  result.coef = sigma_time_gn / sigma_gn_squared;
+
+  // Calculate RMS
+  double rms = 0.0;
+  for (size_t i = 0; i < n.size(); ++i) {
+    double fit = result.coef * fitting_curve(n[i]);
+    rms += pow((time[i] - fit), 2);
+  }
+
+  // Normalized RMS by the mean of the observed values
+  double mean = sigma_time / n.size();
+  result.rms = sqrt(rms / n.size()) / mean;
+
+  return result;
+}
+
+// Find the coefficient for the high-order term in the running time, by
+// minimizing the sum of squares of relative error.
+//   - n          : Vector containing the size of the benchmark tests.
+//   - time       : Vector containing the times for the benchmark tests.
+//   - complexity : If different than oAuto, the fitting curve will stick to
+//                  this one. If it is oAuto, the best fitting curve will be
+//                  calculated.
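+// (Editor's note, summarizing the oAuto branch below: each candidate curve in
+// {oLogN, oN, oNLogN, oNSquared, oNCubed} is fitted with the closed-form
+// least-squares routine above, i.e. coef = sum(time_i * g(n_i)) / sum(g(n_i)^2),
+// and the candidate with the smallest normalized RMS wins; o1 is the starting
+// default.)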
+LeastSq MinimalLeastSq(const std::vector<int64_t>& n,
+                       const std::vector<double>& time, const BigO complexity) {
+  CHECK_EQ(n.size(), time.size());
+  CHECK_GE(n.size(), 2);  // Do not compute the fitting curve if fewer than
+                          // two benchmark runs are given
+  CHECK_NE(complexity, oNone);
+
+  LeastSq best_fit;
+
+  if (complexity == oAuto) {
+    std::vector<BigO> fit_curves = {oLogN, oN, oNLogN, oNSquared, oNCubed};
+
+    // Take o1 as default best fitting curve
+    best_fit = MinimalLeastSq(n, time, FittingCurve(o1));
+    best_fit.complexity = o1;
+
+    // Compute all possible fitting curves and stick to the best one
+    for (const auto& fit : fit_curves) {
+      LeastSq current_fit = MinimalLeastSq(n, time, FittingCurve(fit));
+      if (current_fit.rms < best_fit.rms) {
+        best_fit = current_fit;
+        best_fit.complexity = fit;
+      }
+    }
+  } else {
+    best_fit = MinimalLeastSq(n, time, FittingCurve(complexity));
+    best_fit.complexity = complexity;
+  }
+
+  return best_fit;
+}
+
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  if (reports.size() < 2) return results;
+
+  // Accumulators.
+  std::vector<int64_t> n;
+  std::vector<double> real_time;
+  std::vector<double> cpu_time;
+
+  // Populate the accumulators.
+  for (const Run& run : reports) {
+    CHECK_GT(run.complexity_n, 0) << "Did you forget to call SetComplexityN?";
+    n.push_back(run.complexity_n);
+    real_time.push_back(run.real_accumulated_time / run.iterations);
+    cpu_time.push_back(run.cpu_accumulated_time / run.iterations);
+  }
+
+  LeastSq result_cpu;
+  LeastSq result_real;
+
+  if (reports[0].complexity == oLambda) {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity_lambda);
+    result_real = MinimalLeastSq(n, real_time, reports[0].complexity_lambda);
+  } else {
+    result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
+    result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
+  }
+
+  // Drop the 'args' when reporting complexity.
+  auto run_name = reports[0].run_name;
+  run_name.args.clear();
+
+  // Get the data from the accumulator to BenchmarkReporter::Run's.
+  Run big_o;
+  big_o.run_name = run_name;
+  big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.repetitions = reports[0].repetitions;
+  big_o.repetition_index = Run::no_repetition_index;
+  big_o.threads = reports[0].threads;
+  big_o.aggregate_name = "BigO";
+  big_o.report_label = reports[0].report_label;
+  big_o.iterations = 0;
+  big_o.real_accumulated_time = result_real.coef;
+  big_o.cpu_accumulated_time = result_cpu.coef;
+  big_o.report_big_o = true;
+  big_o.complexity = result_cpu.complexity;
+
+  // All the time results are reported after being multiplied by the
+  // time unit multiplier. But since RMS is a relative quantity it
+  // should not be multiplied at all. So, here, we _divide_ it by the
+  // multiplier so that when it is multiplied later the result is the
+  // correct one.
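+  // (Editor-added example of the normalization described above: with
+  // time_unit == kMicrosecond, GetTimeUnitMultiplier() returns 1e6, so an RMS
+  // of 0.05 is stored as 5e-8 here and comes back out as 0.05 once the
+  // reporter applies the unit multiplier.)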
+  double multiplier = GetTimeUnitMultiplier(reports[0].time_unit);
+
+  // Only add label to mean/stddev if it is same for all runs
+  Run rms;
+  rms.run_name = run_name;
+  rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  rms.aggregate_name = "RMS";
+  rms.report_label = big_o.report_label;
+  rms.iterations = 0;
+  rms.repetition_index = Run::no_repetition_index;
+  rms.repetitions = reports[0].repetitions;
+  rms.threads = reports[0].threads;
+  rms.real_accumulated_time = result_real.rms / multiplier;
+  rms.cpu_accumulated_time = result_cpu.rms / multiplier;
+  rms.report_rms = true;
+  rms.complexity = result_cpu.complexity;
+  // don't forget to keep the time unit, or we won't be able to
+  // recover the correct value.
+  rms.time_unit = reports[0].time_unit;
+
+  results.push_back(big_o);
+  results.push_back(rms);
+  return results;
+}
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/complexity.h b/thirdparty/benchmark-1.5.0/src/complexity.h
new file mode 100644
index 0000000000..df29b48d29
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/complexity.h
@@ -0,0 +1,55 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Source project : https://github.com/ismaelJimenez/cpp.leastsq
+// Adapted to be used with google benchmark
+
+#ifndef COMPLEXITY_H_
+#define COMPLEXITY_H_
+
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+namespace benchmark {
+
+// Return a vector containing the bigO and RMS information for the specified
+// list of reports. If 'reports.size() < 2' an empty vector is returned.
+std::vector<BenchmarkReporter::Run> ComputeBigO(
+    const std::vector<BenchmarkReporter::Run>& reports);
+
+// This data structure will contain the result returned by MinimalLeastSq
+//   - coef       : Estimated coefficient for the high-order term as
+//                  interpolated from data.
+//   - rms        : Normalized Root Mean Squared Error.
+//   - complexity : Scalability form (e.g. oN, oNLogN). In case a scalability
+//                  form has been provided to MinimalLeastSq this will return
+//                  the same value. In case BigO::oAuto has been selected, this
+//                  parameter will return the best fitting curve detected.
+
+struct LeastSq {
+  LeastSq() : coef(0.0), rms(0.0), complexity(oNone) {}
+
+  double coef;
+  double rms;
+  BigO complexity;
+};
+
+// Function to return a string for the calculated complexity
+std::string GetBigOString(BigO complexity);
+
+}  // end namespace benchmark
+
+#endif  // COMPLEXITY_H_
diff --git a/thirdparty/benchmark-1.5.0/src/console_reporter.cc b/thirdparty/benchmark-1.5.0/src/console_reporter.cc
new file mode 100644
index 0000000000..cc8ae276f6
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/console_reporter.cc
@@ -0,0 +1,179 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+#include "complexity.h"
+#include "counter.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "internal_macros.h"
+#include "string_util.h"
+#include "timers.h"
+
+namespace benchmark {
+
+bool ConsoleReporter::ReportContext(const Context& context) {
+  name_field_width_ = context.name_field_width;
+  printed_header_ = false;
+  prev_counters_.clear();
+
+  PrintBasicContext(&GetErrorStream(), context);
+
+#ifdef BENCHMARK_OS_WINDOWS
+  if ((output_options_ & OO_Color) && &std::cout != &GetOutputStream()) {
+    GetErrorStream()
+        << "Color printing is only supported for stdout on windows."
+           " Disabling color printing\n";
+    output_options_ = static_cast< OutputOptions >(output_options_ & ~OO_Color);
+  }
+#endif
+
+  return true;
+}
+
+void ConsoleReporter::PrintHeader(const Run& run) {
+  std::string str = FormatString("%-*s %13s %15s %12s", static_cast<int>(name_field_width_),
+                                 "Benchmark", "Time", "CPU", "Iterations");
+  if (!run.counters.empty()) {
+    if (output_options_ & OO_Tabular) {
+      for (auto const& c : run.counters) {
+        str += FormatString(" %10s", c.first.c_str());
+      }
+    } else {
+      str += " UserCounters...";
+    }
+  }
+  std::string line = std::string(str.length(), '-');
+  GetOutputStream() << line << "\n" << str << "\n" << line << "\n";
+}
+
+void ConsoleReporter::ReportRuns(const std::vector<Run>& reports) {
+  for (const auto& run : reports) {
+    // print the header:
+    // --- if none was printed yet
+    bool print_header = !printed_header_;
+    // --- or if the format is tabular and this run
+    //     has different fields from the prev header
+    print_header |= (output_options_ & OO_Tabular) &&
+                    (!internal::SameNames(run.counters, prev_counters_));
+    if (print_header) {
+      printed_header_ = true;
+      prev_counters_ = run.counters;
+      PrintHeader(run);
+    }
+    // As an alternative to printing the headers like this, we could sort
+    // the benchmarks by header and then print. But this would require
+    // waiting for the full results before printing, or printing twice.
+    PrintRunData(run);
+  }
+}
+
+static void IgnoreColorPrint(std::ostream& out, LogColor, const char* fmt,
+                             ...) {
+  va_list args;
+  va_start(args, fmt);
+  out << FormatString(fmt, args);
+  va_end(args);
+}
+
+static std::string FormatTime(double time) {
+  // Align decimal places...
+  if (time < 1.0) {
+    return FormatString("%10.3f", time);
+  }
+  if (time < 10.0) {
+    return FormatString("%10.2f", time);
+  }
+  if (time < 100.0) {
+    return FormatString("%10.1f", time);
+  }
+  return FormatString("%10.0f", time);
+}
+
+void ConsoleReporter::PrintRunData(const Run& result) {
+  typedef void(PrinterFn)(std::ostream&, LogColor, const char*, ...);
+  auto& Out = GetOutputStream();
+  PrinterFn* printer = (output_options_ & OO_Color) ?
+                         (PrinterFn*)ColorPrintf : IgnoreColorPrint;
+  auto name_color =
+      (result.report_big_o || result.report_rms) ?
+          COLOR_BLUE : COLOR_GREEN;
+  printer(Out, name_color, "%-*s ", name_field_width_,
+          result.benchmark_name().c_str());
+
+  if (result.error_occurred) {
+    printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
+            result.error_message.c_str());
+    printer(Out, COLOR_DEFAULT, "\n");
+    return;
+  }
+
+  const double real_time = result.GetAdjustedRealTime();
+  const double cpu_time = result.GetAdjustedCPUTime();
+  const std::string real_time_str = FormatTime(real_time);
+  const std::string cpu_time_str = FormatTime(cpu_time);
+
+  if (result.report_big_o) {
+    std::string big_o = GetBigOString(result.complexity);
+    printer(Out, COLOR_YELLOW, "%10.2f %-4s %10.2f %-4s ", real_time, big_o.c_str(),
+            cpu_time, big_o.c_str());
+  } else if (result.report_rms) {
+    printer(Out, COLOR_YELLOW, "%10.0f %-4s %10.0f %-4s ", real_time * 100, "%",
+            cpu_time * 100, "%");
+  } else {
+    const char* timeLabel = GetTimeUnitString(result.time_unit);
+    printer(Out, COLOR_YELLOW, "%s %-4s %s %-4s ", real_time_str.c_str(), timeLabel,
+            cpu_time_str.c_str(), timeLabel);
+  }
+
+  if (!result.report_big_o && !result.report_rms) {
+    printer(Out, COLOR_CYAN, "%10lld", result.iterations);
+  }
+
+  for (auto& c : result.counters) {
+    const std::size_t cNameLen = std::max(std::string::size_type(10),
+                                          c.first.length());
+    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
+    if (output_options_ & OO_Tabular) {
+      if (c.second.flags & Counter::kIsRate) {
+        printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
+      } else {
+        printer(Out, COLOR_DEFAULT, " %*s", cNameLen, s.c_str());
+      }
+    } else {
+      const char* unit = (c.second.flags & Counter::kIsRate) ? "/s" : "";
+      printer(Out, COLOR_DEFAULT, " %s=%s%s", c.first.c_str(), s.c_str(),
+              unit);
+    }
+  }
+
+  if (!result.report_label.empty()) {
+    printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
+  }
+
+  printer(Out, COLOR_DEFAULT, "\n");
+}
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/counter.cc b/thirdparty/benchmark-1.5.0/src/counter.cc
new file mode 100644
index 0000000000..c248ea110b
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/counter.cc
@@ -0,0 +1,76 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
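+//
+// (Editor's note on the semantics below: Finish() folds a counter's flags into
+// its raw value, so kIsRate divides by CPU time, kAvgThreads divides by the
+// thread count, kIsIterationInvariant multiplies by the iteration count, and
+// kAvgIterations divides by it; the reported number therefore already has the
+// requested normalization applied.)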
+
+#include "counter.h"
+
+namespace benchmark {
+namespace internal {
+
+double Finish(Counter const& c, IterationCount iterations, double cpu_time,
+              double num_threads) {
+  double v = c.value;
+  if (c.flags & Counter::kIsRate) {
+    v /= cpu_time;
+  }
+  if (c.flags & Counter::kAvgThreads) {
+    v /= num_threads;
+  }
+  if (c.flags & Counter::kIsIterationInvariant) {
+    v *= iterations;
+  }
+  if (c.flags & Counter::kAvgIterations) {
+    v /= iterations;
+  }
+  return v;
+}
+
+void Finish(UserCounters* l, IterationCount iterations, double cpu_time,
+            double num_threads) {
+  for (auto& c : *l) {
+    c.second.value = Finish(c.second, iterations, cpu_time, num_threads);
+  }
+}
+
+void Increment(UserCounters* l, UserCounters const& r) {
+  // add counters present in both or just in *l
+  for (auto& c : *l) {
+    auto it = r.find(c.first);
+    if (it != r.end()) {
+      c.second.value = c.second + it->second;
+    }
+  }
+  // add counters present in r, but not in *l
+  for (auto const& tc : r) {
+    auto it = l->find(tc.first);
+    if (it == l->end()) {
+      (*l)[tc.first] = tc.second;
+    }
+  }
+}
+
+bool SameNames(UserCounters const& l, UserCounters const& r) {
+  if (&l == &r) return true;
+  if (l.size() != r.size()) {
+    return false;
+  }
+  for (auto const& c : l) {
+    if (r.find(c.first) == r.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // end namespace internal
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/counter.h b/thirdparty/benchmark-1.5.0/src/counter.h
new file mode 100644
index 0000000000..1ad46d4940
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/counter.h
@@ -0,0 +1,27 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+namespace benchmark {
+
+// these counter-related functions are hidden to reduce API surface.
+namespace internal {
+void Finish(UserCounters* l, IterationCount iterations, double time,
+            double num_threads);
+void Increment(UserCounters* l, UserCounters const& r);
+bool SameNames(UserCounters const& l, UserCounters const& r);
+}  // end namespace internal
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/csv_reporter.cc b/thirdparty/benchmark-1.5.0/src/csv_reporter.cc
new file mode 100644
index 0000000000..af2c18fc8a
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/csv_reporter.cc
@@ -0,0 +1,154 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
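+//
+// (Editor's note: CsvEscape() below wraps every string field in double quotes
+// and doubles any embedded '"' characters, the conventional CSV escaping
+// scheme; see also the file format reference linked below.)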
+
+#include "benchmark/benchmark.h"
+#include "complexity.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "check.h"
+#include "string_util.h"
+#include "timers.h"
+
+// File format reference: http://edoceo.com/utilitas/csv-file-format.
+
+namespace benchmark {
+
+namespace {
+std::vector<std::string> elements = {
+    "name",           "iterations",       "real_time",        "cpu_time",
+    "time_unit",      "bytes_per_second", "items_per_second", "label",
+    "error_occurred", "error_message"};
+}  // namespace
+
+std::string CsvEscape(const std::string& s) {
+  std::string tmp;
+  tmp.reserve(s.size() + 2);
+  for (char c : s) {
+    switch (c) {
+      case '"' : tmp += "\"\""; break;
+      default  : tmp += c;      break;
+    }
+  }
+  return '"' + tmp + '"';
+}
+
+bool CSVReporter::ReportContext(const Context& context) {
+  PrintBasicContext(&GetErrorStream(), context);
+  return true;
+}
+
+void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
+  std::ostream& Out = GetOutputStream();
+
+  if (!printed_header_) {
+    // save the names of all the user counters
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
+        user_counter_names_.insert(cnt.first);
+      }
+    }
+
+    // print the header
+    for (auto B = elements.begin(); B != elements.end();) {
+      Out << *B++;
+      if (B != elements.end()) Out << ",";
+    }
+    for (auto B = user_counter_names_.begin();
+         B != user_counter_names_.end();) {
+      Out << ",\"" << *B++ << "\"";
+    }
+    Out << "\n";
+
+    printed_header_ = true;
+  } else {
+    // check that all the current counters are saved in the name set
+    for (const auto& run : reports) {
+      for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
+        CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
+            << "All counters must be present in each run. "
+            << "Counter named \"" << cnt.first
+            << "\" was not in a run after being added to the header";
+      }
+    }
+  }
+
+  // print results for each run
+  for (const auto& run : reports) {
+    PrintRunData(run);
+  }
+}
+
+void CSVReporter::PrintRunData(const Run& run) {
+  std::ostream& Out = GetOutputStream();
+  Out << CsvEscape(run.benchmark_name()) << ",";
+  if (run.error_occurred) {
+    Out << std::string(elements.size() - 3, ',');
+    Out << "true,";
+    Out << CsvEscape(run.error_message) << "\n";
+    return;
+  }
+
+  // Do not print iteration on bigO and RMS report
+  if (!run.report_big_o && !run.report_rms) {
+    Out << run.iterations;
+  }
+  Out << ",";
+
+  Out << run.GetAdjustedRealTime() << ",";
+  Out << run.GetAdjustedCPUTime() << ",";
+
+  // Do not print timeLabel on bigO and RMS report
+  if (run.report_big_o) {
+    Out << GetBigOString(run.complexity);
+  } else if (!run.report_rms) {
+    Out << GetTimeUnitString(run.time_unit);
+  }
+  Out << ",";
+
+  if (run.counters.find("bytes_per_second") != run.counters.end()) {
+    Out << run.counters.at("bytes_per_second");
+  }
+  Out << ",";
+  if (run.counters.find("items_per_second") != run.counters.end()) {
+    Out << run.counters.at("items_per_second");
+  }
+  Out << ",";
+  if (!run.report_label.empty()) {
+    Out << CsvEscape(run.report_label);
+  }
+  Out << ",,";  // for error_occurred and error_message
+
+  // Print user counters
+  for (const auto& ucn : user_counter_names_) {
+    auto it = run.counters.find(ucn);
+    if (it == run.counters.end()) {
+      Out << ",";
+    } else {
+      Out << "," << it->second;
+    }
+  }
+  Out << '\n';
+}
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/cycleclock.h b/thirdparty/benchmark-1.5.0/src/cycleclock.h
new file mode 100644
index 0000000000..f5e37b011b
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/cycleclock.h
@@ -0,0 +1,177 @@
+// ----------------------------------------------------------------------
+// CycleClock
+// A CycleClock tells you the current time in Cycles. The "time"
+// is actually time since power-on. This is like time() but doesn't
+// involve a system call and is much more precise.
+//
+// NOTE: Not all cpu/platform/kernel combinations guarantee that this
+// clock increments at a constant rate or is synchronized across all logical
+// cpus in a system.
+//
+// If you need the above guarantees, please consider using a different
+// API. There are efforts to provide an interface which provides a millisecond
+// granularity and implemented as a memory read. A memory read is generally
+// cheaper than the CycleClock for many architectures.
+//
+// Also, in some out of order CPU implementations, the CycleClock is not
+// serializing. So if you're trying to count at cycles granularity, your
+// data might be inaccurate due to out of order instruction execution.
+// ----------------------------------------------------------------------
+
+#ifndef BENCHMARK_CYCLECLOCK_H_
+#define BENCHMARK_CYCLECLOCK_H_
+
+#include <cstdint>
+
+#include "benchmark/benchmark.h"
+#include "internal_macros.h"
+
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_time.h>
+#endif
+// For MSVC, we want to use '_asm rdtsc' when possible (since it works
+// with even ancient MSVC compilers), and when not possible the
+// __rdtsc intrinsic, declared in <intrin.h>. Unfortunately, in some
+// environments, <windows.h> and <intrin.h> have conflicting
+// declarations of some other intrinsics, breaking compilation.
+// Therefore, we simply declare __rdtsc ourselves. See also
+// http://connect.microsoft.com/VisualStudio/feedback/details/262047
+#if defined(COMPILER_MSVC) && !defined(_M_IX86)
+extern "C" uint64_t __rdtsc();
+#pragma intrinsic(__rdtsc)
+#endif
+
+#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
+#include <sys/time.h>
+#include <time.h>
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
+#endif
+
+namespace benchmark {
+// NOTE: only i386 and x86_64 have been well tested.
+// PPC, sparc, alpha, and ia64 are based on
+// http://peter.kuscsik.com/wordpress/?p=14
+// with modifications by m3b. See also
+// https://setisvn.ssl.berkeley.edu/svn/lib/fftw-3.0.1/kernel/cycle.h
+namespace cycleclock {
+// This should return the number of cycles since power-on. Thread-safe.
+inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
+#if defined(BENCHMARK_OS_MACOSX)
+  // this goes at the top because we need ALL Macs, regardless of
+  // architecture, to return the number of "mach time units" that
+  // have passed since startup. See sysinfo.cc where
+  // InitializeSystemInfo() sets the supposed cpu clock frequency of
+  // macs to the number of mach time units per second, not actual
+  // CPU clock frequency (which can change in the face of CPU
+  // frequency scaling). Also note that when the Mac sleeps, this
+  // counter pauses; it does not continue counting, nor does it
+  // reset to zero.
+  return mach_absolute_time();
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // this goes above x86-specific code because old versions of Emscripten
+  // define __x86_64__, although they have nothing to do with it.
+  return static_cast<int64_t>(emscripten_get_now() * 1e+6);
+#elif defined(__i386__)
+  int64_t ret;
+  __asm__ volatile("rdtsc" : "=A"(ret));
+  return ret;
+#elif defined(__x86_64__) || defined(__amd64__)
+  uint64_t low, high;
+  __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
+  return (high << 32) | low;
+#elif defined(__powerpc__) || defined(__ppc__)
+  // This returns a time-base, which is not always precisely a cycle-count.
+  int64_t tbl, tbu0, tbu1;
+  asm("mftbu %0" : "=r"(tbu0));
+  asm("mftb %0" : "=r"(tbl));
+  asm("mftbu %0" : "=r"(tbu1));
+  tbl &= -static_cast<int64_t>(tbu0 == tbu1);
+  // high 32 bits in tbu1; low 32 bits in tbl (tbu0 is garbage)
+  return (tbu1 << 32) | tbl;
+#elif defined(__sparc__)
+  int64_t tick;
+  asm(".byte 0x83, 0x41, 0x00, 0x00");
+  asm("mov %%g1, %0" : "=r"(tick));
+  return tick;
+#elif defined(__ia64__)
+  int64_t itc;
+  asm("mov %0 = ar.itc" : "=r"(itc));
+  return itc;
+#elif defined(COMPILER_MSVC) && defined(_M_IX86)
+  // Older MSVC compilers (like 7.x) don't seem to support the
+  // __rdtsc intrinsic properly, so I prefer to use _asm instead
+  // when I know it will work. Otherwise, I'll use __rdtsc and hope
+  // the code is being compiled with a non-ancient compiler.
+  _asm rdtsc
+#elif defined(COMPILER_MSVC)
+  return __rdtsc();
+#elif defined(BENCHMARK_OS_NACL)
+  // Native Client validator on x86/x86-64 allows RDTSC instructions,
+  // and this case is handled above. Native Client validator on ARM
+  // rejects MRC instructions (used in the ARM-specific sequence below),
+  // so we handle it here. Portable Native Client compiles to
+  // architecture-agnostic bytecode, which doesn't provide any
+  // cycle counter access mnemonics.
+
+  // Native Client does not provide any API to access cycle counter.
+  // Use clock_gettime(CLOCK_MONOTONIC, ...) instead of gettimeofday
+  // because it provides nanosecond resolution (which is noticeable at
+  // least for PNaCl modules running on x86 Mac & Linux).
+  // Initialize to always return 0 if clock_gettime fails.
+  struct timespec ts = {0, 0};
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return static_cast<int64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
+#elif defined(__aarch64__)
+  // System timer of ARMv8 runs at a different frequency than the CPU's.
+  // The frequency is fixed, typically in the range 1-50MHz. It can be
+  // read at CNTFRQ special register. We assume the OS has set up
+  // the virtual timer properly.
+  int64_t virtual_timer_value;
+  asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
+  return virtual_timer_value;
+#elif defined(__ARM_ARCH)
+  // V6 is the earliest arch that has a standard cyclecount
+  // Native Client validator doesn't allow MRC instructions.
+#if (__ARM_ARCH >= 6)
+  uint32_t pmccntr;
+  uint32_t pmuseren;
+  uint32_t pmcntenset;
+  // Read the user mode perf monitor counter access permissions.
+  asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+  if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
+    asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+    if (pmcntenset & 0x80000000ul) {  // Is it counting?
+      asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+      // The counter is set up to count every 64th cycle
+      return static_cast<int64_t>(pmccntr) * 64;  // Should optimize to << 6
+    }
+  }
+#endif
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__mips__)
+  // mips apparently only allows rdtsc for superusers, so we fall
+  // back to gettimeofday. It's possible clock_gettime would be better.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(__s390__)  // Covers both s390 and s390x.
+  // Return the CPU clock.
+  uint64_t tsc;
+  asm("stck %0" : "=Q"(tsc) : : "cc");
+  return tsc;
+#else
+// The soft failover to a generic implementation is automatic only for ARM.
+// For other platforms the developer is expected to make an attempt to create
+// a fast implementation and use generic version if nothing better is available.
+#error You need to define CycleTimer for your OS and CPU
+#endif
+}
+}  // end namespace cycleclock
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_CYCLECLOCK_H_
diff --git a/thirdparty/benchmark-1.5.0/src/internal_macros.h b/thirdparty/benchmark-1.5.0/src/internal_macros.h
new file mode 100644
index 0000000000..6adf00d056
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/internal_macros.h
@@ -0,0 +1,94 @@
+#ifndef BENCHMARK_INTERNAL_MACROS_H_
+#define BENCHMARK_INTERNAL_MACROS_H_
+
+#include "benchmark/benchmark.h"
+
+/* Needed to detect STL */
+#include <cstdlib>
+
+// clang-format off
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if defined(__clang__)
+  #if !defined(COMPILER_CLANG)
+    #define COMPILER_CLANG
+  #endif
+#elif defined(_MSC_VER)
+  #if !defined(COMPILER_MSVC)
+    #define COMPILER_MSVC
+  #endif
+#elif defined(__GNUC__)
+  #if !defined(COMPILER_GCC)
+    #define COMPILER_GCC
+  #endif
+#endif
+
+#if __has_feature(cxx_attributes)
+  #define BENCHMARK_NORETURN [[noreturn]]
+#elif defined(__GNUC__)
+  #define BENCHMARK_NORETURN __attribute__((noreturn))
+#elif defined(COMPILER_MSVC)
+  #define BENCHMARK_NORETURN __declspec(noreturn)
+#else
+  #define BENCHMARK_NORETURN
+#endif
+
+#if defined(__CYGWIN__)
+  #define BENCHMARK_OS_CYGWIN 1
+#elif defined(_WIN32)
+  #define BENCHMARK_OS_WINDOWS 1
+  #if defined(__MINGW32__)
+    #define BENCHMARK_OS_MINGW 1
+  #endif
+#elif defined(__APPLE__)
+  #define BENCHMARK_OS_APPLE 1
+  #include "TargetConditionals.h"
+  #if defined(TARGET_OS_MAC)
+    #define BENCHMARK_OS_MACOSX 1
+    #if defined(TARGET_OS_IPHONE)
+      #define BENCHMARK_OS_IOS 1
+    #endif
+  #endif
+#elif defined(__FreeBSD__)
+  #define BENCHMARK_OS_FREEBSD 1
+#elif defined(__NetBSD__)
+  #define BENCHMARK_OS_NETBSD 1
+#elif defined(__OpenBSD__)
+  #define BENCHMARK_OS_OPENBSD 1
+#elif defined(__linux__)
+  #define BENCHMARK_OS_LINUX 1
+#elif defined(__native_client__)
+  #define BENCHMARK_OS_NACL 1
+#elif defined(__EMSCRIPTEN__)
+  #define BENCHMARK_OS_EMSCRIPTEN 1
+#elif defined(__rtems__)
+  #define BENCHMARK_OS_RTEMS 1
+#elif defined(__Fuchsia__)
+#define BENCHMARK_OS_FUCHSIA 1
+#elif defined (__SVR4) && defined (__sun)
+#define BENCHMARK_OS_SOLARIS 1
+#elif defined(__QNX__)
+#define BENCHMARK_OS_QNX 1
+#endif
+
+#if defined(__ANDROID__) && defined(__GLIBCXX__)
+#define BENCHMARK_STL_ANDROID_GNUSTL 1
+#endif
+
+#if !__has_feature(cxx_exceptions) && !defined(__cpp_exceptions) \
+     && !defined(__EXCEPTIONS)
+  #define BENCHMARK_HAS_NO_EXCEPTIONS
+#endif
+
+#if defined(COMPILER_CLANG) || defined(COMPILER_GCC)
+  #define BENCHMARK_MAYBE_UNUSED __attribute__((unused))
+#else
+  #define BENCHMARK_MAYBE_UNUSED
+#endif
+
+// clang-format on
+
+#endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/thirdparty/benchmark-1.5.0/src/json_reporter.cc b/thirdparty/benchmark-1.5.0/src/json_reporter.cc
new file mode 100644
index 0000000000..11db2b99d5
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/json_reporter.cc
@@ -0,0 +1,253 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+#include "complexity.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iomanip>  // for setprecision
+#include <iostream>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "string_util.h"
+#include "timers.h"
+
+namespace benchmark {
+
+namespace {
+
+std::string StrEscape(const std::string& s) {
+  std::string tmp;
+  tmp.reserve(s.size());
+  for (char c : s) {
+    switch (c) {
+      case '\b': tmp += "\\b"; break;
+      case '\f': tmp += "\\f"; break;
+      case '\n': tmp += "\\n"; break;
+      case '\r': tmp += "\\r"; break;
+      case '\t': tmp += "\\t"; break;
+      case '\\': tmp += "\\\\"; break;
+      case '"' : tmp += "\\\""; break;
+      default  : tmp += c; break;
+    }
+  }
+  return tmp;
+}
+
+std::string FormatKV(std::string const& key, std::string const& value) {
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+}
+
+std::string FormatKV(std::string const& key, const char* value) {
+  return StrFormat("\"%s\": \"%s\"", StrEscape(key).c_str(), StrEscape(value).c_str());
+}
+
+std::string FormatKV(std::string const& key, bool value) {
+  return StrFormat("\"%s\": %s", StrEscape(key).c_str(), value ? "true" : "false");
+}
+
+std::string FormatKV(std::string const& key, int64_t value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
+}
+
+std::string FormatKV(std::string const& key, IterationCount value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": " << value;
+  return ss.str();
+}
+
+std::string FormatKV(std::string const& key, double value) {
+  std::stringstream ss;
+  ss << '"' << StrEscape(key) << "\": ";
+
+  if (std::isnan(value))
+    ss << (value < 0 ? "-" : "") << "NaN";
+  else if (std::isinf(value))
+    ss << (value < 0 ? "-" : "") << "Infinity";
+  else {
+    const auto max_digits10 =
+        std::numeric_limits<decltype(value)>::max_digits10;
+    const auto max_fractional_digits10 = max_digits10 - 1;
+    ss << std::scientific << std::setprecision(max_fractional_digits10)
+       << value;
+  }
+  return ss.str();
+}
+
+int64_t RoundDouble(double v) { return static_cast<int64_t>(v + 0.5); }
+
+}  // end namespace
+
+bool JSONReporter::ReportContext(const Context& context) {
+  std::ostream& out = GetOutputStream();
+
+  out << "{\n";
+  std::string inner_indent(2, ' ');
+
+  // Open context block and print context information.
+ out << inner_indent << "\"context\": {\n"; + std::string indent(4, ' '); + + std::string walltime_value = LocalDateTimeString(); + out << indent << FormatKV("date", walltime_value) << ",\n"; + + out << indent << FormatKV("host_name", context.sys_info.name) << ",\n"; + + if (Context::executable_name) { + out << indent << FormatKV("executable", Context::executable_name) << ",\n"; + } + + CPUInfo const& info = context.cpu_info; + out << indent << FormatKV("num_cpus", static_cast(info.num_cpus)) + << ",\n"; + out << indent + << FormatKV("mhz_per_cpu", + RoundDouble(info.cycles_per_second / 1000000.0)) + << ",\n"; + out << indent << FormatKV("cpu_scaling_enabled", info.scaling_enabled) + << ",\n"; + + out << indent << "\"caches\": [\n"; + indent = std::string(6, ' '); + std::string cache_indent(8, ' '); + for (size_t i = 0; i < info.caches.size(); ++i) { + auto& CI = info.caches[i]; + out << indent << "{\n"; + out << cache_indent << FormatKV("type", CI.type) << ",\n"; + out << cache_indent << FormatKV("level", static_cast(CI.level)) + << ",\n"; + out << cache_indent + << FormatKV("size", static_cast(CI.size) * 1000u) << ",\n"; + out << cache_indent + << FormatKV("num_sharing", static_cast(CI.num_sharing)) + << "\n"; + out << indent << "}"; + if (i != info.caches.size() - 1) out << ","; + out << "\n"; + } + indent = std::string(4, ' '); + out << indent << "],\n"; + out << indent << "\"load_avg\": ["; + for (auto it = info.load_avg.begin(); it != info.load_avg.end();) { + out << *it++; + if (it != info.load_avg.end()) out << ","; + } + out << "],\n"; + +#if defined(NDEBUG) + const char build_type[] = "release"; +#else + const char build_type[] = "debug"; +#endif + out << indent << FormatKV("library_build_type", build_type) << "\n"; + // Close context block and open the list of benchmarks. + out << inner_indent << "},\n"; + out << inner_indent << "\"benchmarks\": [\n"; + return true; +} + +void JSONReporter::ReportRuns(std::vector const& reports) { + if (reports.empty()) { + return; + } + std::string indent(4, ' '); + std::ostream& out = GetOutputStream(); + if (!first_report_) { + out << ",\n"; + } + first_report_ = false; + + for (auto it = reports.begin(); it != reports.end(); ++it) { + out << indent << "{\n"; + PrintRunData(*it); + out << indent << '}'; + auto it_cp = it; + if (++it_cp != reports.end()) { + out << ",\n"; + } + } +} + +void JSONReporter::Finalize() { + // Close the list of benchmarks and the top level object. 
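+  // Taken together, ReportContext(), ReportRuns() and Finalize() produce a
+  // document of the following overall shape (abridged):
+  //   {
+  //     "context": { "date": ..., "host_name": ..., "num_cpus": ...,
+  //                  "caches": [...], "load_avg": [...], ... },
+  //     "benchmarks": [ { "name": ..., "iterations": ..., "real_time": ...,
+  //                       "cpu_time": ..., "time_unit": ... }, ... ]
+  //   }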
+ GetOutputStream() << "\n ]\n}\n"; +} + +void JSONReporter::PrintRunData(Run const& run) { + std::string indent(6, ' '); + std::ostream& out = GetOutputStream(); + out << indent << FormatKV("name", run.benchmark_name()) << ",\n"; + out << indent << FormatKV("run_name", run.run_name.str()) << ",\n"; + out << indent << FormatKV("run_type", [&run]() -> const char* { + switch (run.run_type) { + case BenchmarkReporter::Run::RT_Iteration: + return "iteration"; + case BenchmarkReporter::Run::RT_Aggregate: + return "aggregate"; + } + BENCHMARK_UNREACHABLE(); + }()) << ",\n"; + out << indent << FormatKV("repetitions", run.repetitions) << ",\n"; + if (run.run_type != BenchmarkReporter::Run::RT_Aggregate) { + out << indent << FormatKV("repetition_index", run.repetition_index) + << ",\n"; + } + out << indent << FormatKV("threads", run.threads) << ",\n"; + if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) { + out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n"; + } + if (run.error_occurred) { + out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n"; + out << indent << FormatKV("error_message", run.error_message) << ",\n"; + } + if (!run.report_big_o && !run.report_rms) { + out << indent << FormatKV("iterations", run.iterations) << ",\n"; + out << indent << FormatKV("real_time", run.GetAdjustedRealTime()) << ",\n"; + out << indent << FormatKV("cpu_time", run.GetAdjustedCPUTime()); + out << ",\n" + << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); + } else if (run.report_big_o) { + out << indent << FormatKV("cpu_coefficient", run.GetAdjustedCPUTime()) + << ",\n"; + out << indent << FormatKV("real_coefficient", run.GetAdjustedRealTime()) + << ",\n"; + out << indent << FormatKV("big_o", GetBigOString(run.complexity)) << ",\n"; + out << indent << FormatKV("time_unit", GetTimeUnitString(run.time_unit)); + } else if (run.report_rms) { + out << indent << FormatKV("rms", run.GetAdjustedCPUTime()); + } + + for (auto& c : run.counters) { + out << ",\n" << indent << FormatKV(c.first, c.second); + } + + if (run.has_memory_result) { + out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter); + out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used); + } + + if (!run.report_label.empty()) { + out << ",\n" << indent << FormatKV("label", run.report_label); + } + out << '\n'; +} + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/log.h b/thirdparty/benchmark-1.5.0/src/log.h new file mode 100644 index 0000000000..47d0c35c01 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/log.h @@ -0,0 +1,74 @@ +#ifndef BENCHMARK_LOG_H_ +#define BENCHMARK_LOG_H_ + +#include +#include + +#include "benchmark/benchmark.h" + +namespace benchmark { +namespace internal { + +typedef std::basic_ostream&(EndLType)(std::basic_ostream&); + +class LogType { + friend LogType& GetNullLogInstance(); + friend LogType& GetErrorLogInstance(); + + // FIXME: Add locking to output. 
+ template + friend LogType& operator<<(LogType&, Tp const&); + friend LogType& operator<<(LogType&, EndLType*); + + private: + LogType(std::ostream* out) : out_(out) {} + std::ostream* out_; + BENCHMARK_DISALLOW_COPY_AND_ASSIGN(LogType); +}; + +template +LogType& operator<<(LogType& log, Tp const& value) { + if (log.out_) { + *log.out_ << value; + } + return log; +} + +inline LogType& operator<<(LogType& log, EndLType* m) { + if (log.out_) { + *log.out_ << m; + } + return log; +} + +inline int& LogLevel() { + static int log_level = 0; + return log_level; +} + +inline LogType& GetNullLogInstance() { + static LogType log(nullptr); + return log; +} + +inline LogType& GetErrorLogInstance() { + static LogType log(&std::clog); + return log; +} + +inline LogType& GetLogInstanceForLevel(int level) { + if (level <= LogLevel()) { + return GetErrorLogInstance(); + } + return GetNullLogInstance(); +} + +} // end namespace internal +} // end namespace benchmark + +// clang-format off +#define VLOG(x) \ + (::benchmark::internal::GetLogInstanceForLevel(x) << "-- LOG(" << x << "):" \ + " ") +// clang-format on +#endif diff --git a/thirdparty/benchmark-1.5.0/src/mutex.h b/thirdparty/benchmark-1.5.0/src/mutex.h new file mode 100644 index 0000000000..5f461d05a0 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/mutex.h @@ -0,0 +1,155 @@ +#ifndef BENCHMARK_MUTEX_H_ +#define BENCHMARK_MUTEX_H_ + +#include +#include + +#include "check.h" + +// Enable thread safety attributes only with clang. +// The attributes can be safely erased when compiling with other compilers. +#if defined(HAVE_THREAD_SAFETY_ATTRIBUTES) +#define THREAD_ANNOTATION_ATTRIBUTE__(x) __attribute__((x)) +#else +#define THREAD_ANNOTATION_ATTRIBUTE__(x) // no-op +#endif + +#define CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(capability(x)) + +#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable) + +#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x)) + +#define PT_GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(pt_guarded_by(x)) + +#define ACQUIRED_BEFORE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__)) + +#define ACQUIRED_AFTER(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__)) + +#define REQUIRES(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__)) + +#define REQUIRES_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__)) + +#define ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__)) + +#define ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__)) + +#define RELEASE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__)) + +#define RELEASE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__)) + +#define TRY_ACQUIRE_SHARED(...) \ + THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__)) + +#define EXCLUDES(...) 
THREAD_ANNOTATION_ATTRIBUTE__(locks_excluded(__VA_ARGS__)) + +#define ASSERT_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(x)) + +#define ASSERT_SHARED_CAPABILITY(x) \ + THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(x)) + +#define RETURN_CAPABILITY(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x)) + +#define NO_THREAD_SAFETY_ANALYSIS \ + THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis) + +namespace benchmark { + +typedef std::condition_variable Condition; + +// NOTE: Wrappers for std::mutex and std::unique_lock are provided so that +// we can annotate them with thread safety attributes and use the +// -Wthread-safety warning with clang. The standard library types cannot be +// used directly because they do not provided the required annotations. +class CAPABILITY("mutex") Mutex { + public: + Mutex() {} + + void lock() ACQUIRE() { mut_.lock(); } + void unlock() RELEASE() { mut_.unlock(); } + std::mutex& native_handle() { return mut_; } + + private: + std::mutex mut_; +}; + +class SCOPED_CAPABILITY MutexLock { + typedef std::unique_lock MutexLockImp; + + public: + MutexLock(Mutex& m) ACQUIRE(m) : ml_(m.native_handle()) {} + ~MutexLock() RELEASE() {} + MutexLockImp& native_handle() { return ml_; } + + private: + MutexLockImp ml_; +}; + +class Barrier { + public: + Barrier(int num_threads) : running_threads_(num_threads) {} + + // Called by each thread + bool wait() EXCLUDES(lock_) { + bool last_thread = false; + { + MutexLock ml(lock_); + last_thread = createBarrier(ml); + } + if (last_thread) phase_condition_.notify_all(); + return last_thread; + } + + void removeThread() EXCLUDES(lock_) { + MutexLock ml(lock_); + --running_threads_; + if (entered_ != 0) phase_condition_.notify_all(); + } + + private: + Mutex lock_; + Condition phase_condition_; + int running_threads_; + + // State for barrier management + int phase_number_ = 0; + int entered_ = 0; // Number of threads that have entered this barrier + + // Enter the barrier and wait until all other threads have also + // entered the barrier. Returns iff this is the last thread to + // enter the barrier. + bool createBarrier(MutexLock& ml) REQUIRES(lock_) { + CHECK_LT(entered_, running_threads_); + entered_++; + if (entered_ < running_threads_) { + // Wait for all threads to enter + int phase_number_cp = phase_number_; + auto cb = [this, phase_number_cp]() { + return this->phase_number_ > phase_number_cp || + entered_ == running_threads_; // A thread has aborted in error + }; + phase_condition_.wait(ml.native_handle(), cb); + if (phase_number_ > phase_number_cp) return false; + // else (running_threads_ == entered_) and we are the last thread. + } + // Last thread has reached the barrier + phase_number_++; + entered_ = 0; + return true; + } +}; + +} // end namespace benchmark + +#endif // BENCHMARK_MUTEX_H_ diff --git a/thirdparty/benchmark-1.5.0/src/re.h b/thirdparty/benchmark-1.5.0/src/re.h new file mode 100644 index 0000000000..fbe25037b4 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/re.h @@ -0,0 +1,158 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BENCHMARK_RE_H_ +#define BENCHMARK_RE_H_ + +#include "internal_macros.h" + +// clang-format off + +#if !defined(HAVE_STD_REGEX) && \ + !defined(HAVE_GNU_POSIX_REGEX) && \ + !defined(HAVE_POSIX_REGEX) + // No explicit regex selection; detect based on builtin hints. + #if defined(BENCHMARK_OS_LINUX) || defined(BENCHMARK_OS_APPLE) + #define HAVE_POSIX_REGEX 1 + #elif __cplusplus >= 199711L + #define HAVE_STD_REGEX 1 + #endif +#endif + +// Prefer C regex libraries when compiling w/o exceptions so that we can +// correctly report errors. +#if defined(BENCHMARK_HAS_NO_EXCEPTIONS) && \ + defined(BENCHMARK_HAVE_STD_REGEX) && \ + (defined(HAVE_GNU_POSIX_REGEX) || defined(HAVE_POSIX_REGEX)) + #undef HAVE_STD_REGEX +#endif + +#if defined(HAVE_STD_REGEX) + #include +#elif defined(HAVE_GNU_POSIX_REGEX) + #include +#elif defined(HAVE_POSIX_REGEX) + #include +#else +#error No regular expression backend was found! +#endif + +// clang-format on + +#include + +#include "check.h" + +namespace benchmark { + +// A wrapper around the POSIX regular expression API that provides automatic +// cleanup +class Regex { + public: + Regex() : init_(false) {} + + ~Regex(); + + // Compile a regular expression matcher from spec. Returns true on success. + // + // On failure (and if error is not nullptr), error is populated with a human + // readable error message if an error occurs. + bool Init(const std::string& spec, std::string* error); + + // Returns whether str matches the compiled regular expression. + bool Match(const std::string& str); + + private: + bool init_; +// Underlying regular expression object +#if defined(HAVE_STD_REGEX) + std::regex re_; +#elif defined(HAVE_POSIX_REGEX) || defined(HAVE_GNU_POSIX_REGEX) + regex_t re_; +#else +#error No regular expression backend implementation available +#endif +}; + +#if defined(HAVE_STD_REGEX) + +inline bool Regex::Init(const std::string& spec, std::string* error) { +#ifdef BENCHMARK_HAS_NO_EXCEPTIONS + ((void)error); // suppress unused warning +#else + try { +#endif + re_ = std::regex(spec, std::regex_constants::extended); + init_ = true; +#ifndef BENCHMARK_HAS_NO_EXCEPTIONS +} +catch (const std::regex_error& e) { + if (error) { + *error = e.what(); + } +} +#endif +return init_; +} + +inline Regex::~Regex() {} + +inline bool Regex::Match(const std::string& str) { + if (!init_) { + return false; + } + return std::regex_search(str, re_); +} + +#else +inline bool Regex::Init(const std::string& spec, std::string* error) { + int ec = regcomp(&re_, spec.c_str(), REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + if (error) { + size_t needed = regerror(ec, &re_, nullptr, 0); + char* errbuf = new char[needed]; + regerror(ec, &re_, errbuf, needed); + + // regerror returns the number of bytes necessary to null terminate + // the string, so we move that when assigning to error. 
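+      // For example, compiling the malformed pattern "(" fails with an error
+      // code such as REG_EPAREN; the exact message text varies by libc.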
+ CHECK_NE(needed, 0); + error->assign(errbuf, needed - 1); + + delete[] errbuf; + } + + return false; + } + + init_ = true; + return true; +} + +inline Regex::~Regex() { + if (init_) { + regfree(&re_); + } +} + +inline bool Regex::Match(const std::string& str) { + if (!init_) { + return false; + } + return regexec(&re_, str.c_str(), 0, nullptr, 0) == 0; +} +#endif + +} // end namespace benchmark + +#endif // BENCHMARK_RE_H_ diff --git a/thirdparty/benchmark-1.5.0/src/reporter.cc b/thirdparty/benchmark-1.5.0/src/reporter.cc new file mode 100644 index 0000000000..4d3e477d44 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/reporter.cc @@ -0,0 +1,105 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "benchmark/benchmark.h" +#include "timers.h" + +#include + +#include +#include +#include + +#include "check.h" +#include "string_util.h" + +namespace benchmark { + +BenchmarkReporter::BenchmarkReporter() + : output_stream_(&std::cout), error_stream_(&std::cerr) {} + +BenchmarkReporter::~BenchmarkReporter() {} + +void BenchmarkReporter::PrintBasicContext(std::ostream *out, + Context const &context) { + CHECK(out) << "cannot be null"; + auto &Out = *out; + + Out << LocalDateTimeString() << "\n"; + + if (context.executable_name) + Out << "Running " << context.executable_name << "\n"; + + const CPUInfo &info = context.cpu_info; + Out << "Run on (" << info.num_cpus << " X " + << (info.cycles_per_second / 1000000.0) << " MHz CPU " + << ((info.num_cpus > 1) ? "s" : "") << ")\n"; + if (info.caches.size() != 0) { + Out << "CPU Caches:\n"; + for (auto &CInfo : info.caches) { + Out << " L" << CInfo.level << " " << CInfo.type << " " + << (CInfo.size / 1000) << "K"; + if (CInfo.num_sharing != 0) + Out << " (x" << (info.num_cpus / CInfo.num_sharing) << ")"; + Out << "\n"; + } + } + if (!info.load_avg.empty()) { + Out << "Load Average: "; + for (auto It = info.load_avg.begin(); It != info.load_avg.end();) { + Out << StrFormat("%.2f", *It++); + if (It != info.load_avg.end()) Out << ", "; + } + Out << "\n"; + } + + if (info.scaling_enabled) { + Out << "***WARNING*** CPU scaling is enabled, the benchmark " + "real time measurements may be noisy and will incur extra " + "overhead.\n"; + } + +#ifndef NDEBUG + Out << "***WARNING*** Library was built as DEBUG. Timings may be " + "affected.\n"; +#endif +} + +// No initializer because it's already initialized to NULL. 
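+// (Objects with static storage duration are zero-initialized before any other
+// initialization runs, so this pointer reliably starts out as nullptr.)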
+const char *BenchmarkReporter::Context::executable_name; + +BenchmarkReporter::Context::Context() + : cpu_info(CPUInfo::Get()), sys_info(SystemInfo::Get()) {} + +std::string BenchmarkReporter::Run::benchmark_name() const { + std::string name = run_name.str(); + if (run_type == RT_Aggregate) { + name += "_" + aggregate_name; + } + return name; +} + +double BenchmarkReporter::Run::GetAdjustedRealTime() const { + double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit); + if (iterations != 0) new_time /= static_cast(iterations); + return new_time; +} + +double BenchmarkReporter::Run::GetAdjustedCPUTime() const { + double new_time = cpu_accumulated_time * GetTimeUnitMultiplier(time_unit); + if (iterations != 0) new_time /= static_cast(iterations); + return new_time; +} + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/sleep.cc b/thirdparty/benchmark-1.5.0/src/sleep.cc new file mode 100644 index 0000000000..1512ac90f7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/sleep.cc @@ -0,0 +1,51 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sleep.h" + +#include +#include +#include + +#include "internal_macros.h" + +#ifdef BENCHMARK_OS_WINDOWS +#include +#endif + +namespace benchmark { +#ifdef BENCHMARK_OS_WINDOWS +// Window's Sleep takes milliseconds argument. +void SleepForMilliseconds(int milliseconds) { Sleep(milliseconds); } +void SleepForSeconds(double seconds) { + SleepForMilliseconds(static_cast(kNumMillisPerSecond * seconds)); +} +#else // BENCHMARK_OS_WINDOWS +void SleepForMicroseconds(int microseconds) { + struct timespec sleep_time; + sleep_time.tv_sec = microseconds / kNumMicrosPerSecond; + sleep_time.tv_nsec = (microseconds % kNumMicrosPerSecond) * kNumNanosPerMicro; + while (nanosleep(&sleep_time, &sleep_time) != 0 && errno == EINTR) + ; // Ignore signals and wait for the full interval to elapse. 
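+  // Note that nanosleep(&sleep_time, &sleep_time) writes the *remaining* time
+  // back into sleep_time when interrupted by a signal, so each retry resumes
+  // the sleep instead of restarting the full interval.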
+}
+
+void SleepForMilliseconds(int milliseconds) {
+  SleepForMicroseconds(milliseconds * kNumMicrosPerMilli);
+}
+
+void SleepForSeconds(double seconds) {
+  SleepForMicroseconds(static_cast<int>(seconds * kNumMicrosPerSecond));
+}
+#endif  // BENCHMARK_OS_WINDOWS
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/sleep.h b/thirdparty/benchmark-1.5.0/src/sleep.h
new file mode 100644
index 0000000000..f98551afe2
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/sleep.h
@@ -0,0 +1,15 @@
+#ifndef BENCHMARK_SLEEP_H_
+#define BENCHMARK_SLEEP_H_
+
+namespace benchmark {
+const int kNumMillisPerSecond = 1000;
+const int kNumMicrosPerMilli = 1000;
+const int kNumMicrosPerSecond = kNumMillisPerSecond * 1000;
+const int kNumNanosPerMicro = 1000;
+const int kNumNanosPerSecond = kNumNanosPerMicro * kNumMicrosPerSecond;
+
+void SleepForMilliseconds(int milliseconds);
+void SleepForSeconds(double seconds);
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_SLEEP_H_
diff --git a/thirdparty/benchmark-1.5.0/src/statistics.cc b/thirdparty/benchmark-1.5.0/src/statistics.cc
new file mode 100644
index 0000000000..bd5a3d6597
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/statistics.cc
@@ -0,0 +1,193 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <numeric>
+#include "check.h"
+#include "statistics.h"
+
+namespace benchmark {
+
+auto StatisticsSum = [](const std::vector<double>& v) {
+  return std::accumulate(v.begin(), v.end(), 0.0);
+};
+
+double StatisticsMean(const std::vector<double>& v) {
+  if (v.empty()) return 0.0;
+  return StatisticsSum(v) * (1.0 / v.size());
+}
+
+double StatisticsMedian(const std::vector<double>& v) {
+  if (v.size() < 3) return StatisticsMean(v);
+  std::vector<double> copy(v);
+
+  auto center = copy.begin() + v.size() / 2;
+  std::nth_element(copy.begin(), center, copy.end());
+
+  // did we have an odd number of samples?
+  // if yes, then center is the median
+  // if no, then we are looking for the average between center and the value
+  // before
+  if (v.size() % 2 == 1) return *center;
+  auto center2 = copy.begin() + v.size() / 2 - 1;
+  std::nth_element(copy.begin(), center2, copy.end());
+  return (*center + *center2) / 2.0;
+}
+
+// Return the sum of the squares of this sample set
+auto SumSquares = [](const std::vector<double>& v) {
+  return std::inner_product(v.begin(), v.end(), v.begin(), 0.0);
+};
+
+auto Sqr = [](const double dat) { return dat * dat; };
+auto Sqrt = [](const double dat) {
+  // Avoid NaN due to imprecision in the calculations
+  if (dat < 0.0) return 0.0;
+  return std::sqrt(dat);
+};
+
+double StatisticsStdDev(const std::vector<double>& v) {
+  const auto mean = StatisticsMean(v);
+  if (v.empty()) return mean;
+
+  // Sample standard deviation is undefined for n = 1
+  if (v.size() == 1) return 0.0;
+
+  const double avg_squares = SumSquares(v) * (1.0 / v.size());
+  return Sqrt(v.size() / (v.size() - 1.0) * (avg_squares - Sqr(mean)));
+}
+
+std::vector<BenchmarkReporter::Run> ComputeStats(
+    const std::vector<BenchmarkReporter::Run>& reports) {
+  typedef BenchmarkReporter::Run Run;
+  std::vector<Run> results;
+
+  auto error_count =
+      std::count_if(reports.begin(), reports.end(),
+                    [](Run const& run) { return run.error_occurred; });
+
+  if (reports.size() - error_count < 2) {
+    // We don't report aggregated data if there was a single run.
+    return results;
+  }
+
+  // Accumulators.
+  std::vector<double> real_accumulated_time_stat;
+  std::vector<double> cpu_accumulated_time_stat;
+
+  real_accumulated_time_stat.reserve(reports.size());
+  cpu_accumulated_time_stat.reserve(reports.size());
+
+  // All repetitions should be run with the same number of iterations so we
+  // can take this information from the first benchmark.
+  const IterationCount run_iterations = reports.front().iterations;
+  // create stats for user counters
+  struct CounterStat {
+    Counter c;
+    std::vector<double> s;
+  };
+  std::map<std::string, CounterStat> counter_stats;
+  for (Run const& r : reports) {
+    for (auto const& cnt : r.counters) {
+      auto it = counter_stats.find(cnt.first);
+      if (it == counter_stats.end()) {
+        counter_stats.insert({cnt.first, {cnt.second, std::vector<double>{}}});
+        it = counter_stats.find(cnt.first);
+        it->second.s.reserve(reports.size());
+      } else {
+        CHECK_EQ(counter_stats[cnt.first].c.flags, cnt.second.flags);
+      }
+    }
+  }
+
+  // Populate the accumulators.
+  for (Run const& run : reports) {
+    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
+    CHECK_EQ(run_iterations, run.iterations);
+    if (run.error_occurred) continue;
+    real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
+    cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
+    // user counters
+    for (auto const& cnt : run.counters) {
+      auto it = counter_stats.find(cnt.first);
+      CHECK_NE(it, counter_stats.end());
+      it->second.s.emplace_back(cnt.second);
+    }
+  }
+
+  // Only add label if it is same for all runs
+  std::string report_label = reports[0].report_label;
+  for (std::size_t i = 1; i < reports.size(); i++) {
+    if (reports[i].report_label != report_label) {
+      report_label = "";
+      break;
+    }
+  }
+
+  const double iteration_rescale_factor =
+      double(reports.size()) / double(run_iterations);
+
+  for (const auto& Stat : *reports[0].statistics) {
+    // Get the data from the accumulator to BenchmarkReporter::Run's.
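+    // Worked example (illustrative): with 3 repetitions whose real times are
+    // 2.0s, 2.1s and 2.2s, this loop emits one aggregate Run per registered
+    // statistic -- by default mean=2.1s, median=2.1s and stddev=0.1s -- each
+    // with data.iterations == 3, the repetition count.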
+    Run data;
+    data.run_name = reports[0].run_name;
+    data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+    data.threads = reports[0].threads;
+    data.repetitions = reports[0].repetitions;
+    data.repetition_index = Run::no_repetition_index;
+    data.aggregate_name = Stat.name_;
+    data.report_label = report_label;
+
+    // It is incorrect to say that an aggregate is computed over
+    // run's iterations, because those iterations already got averaged.
+    // Similarly, if there are N repetitions with 1 iteration each,
+    // an aggregate will be computed over N measurements, not 1.
+    // Thus it is best to simply use the count of separate reports.
+    data.iterations = reports.size();
+
+    data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
+    data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
+
+    // We will divide these times by data.iterations when reporting, but
+    // data.iterations is not necessarily the scale of these measurements,
+    // because in each repetition, these timers are a sum over all the
+    // iterations. And if we want to say that the stats are over N repetitions
+    // and not M iterations, we need to multiply these by (N/M).
+    data.real_accumulated_time *= iteration_rescale_factor;
+    data.cpu_accumulated_time *= iteration_rescale_factor;
+
+    data.time_unit = reports[0].time_unit;
+
+    // user counters
+    for (auto const& kv : counter_stats) {
+      // Do *NOT* rescale the custom counters. They are already properly scaled.
+      const auto uc_stat = Stat.compute_(kv.second.s);
+      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags,
+                       counter_stats[kv.first].c.oneK);
+      data.counters[kv.first] = c;
+    }
+
+    results.push_back(data);
+  }
+
+  return results;
+}
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/statistics.h b/thirdparty/benchmark-1.5.0/src/statistics.h
new file mode 100644
index 0000000000..7eccc85536
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/statistics.h
@@ -0,0 +1,37 @@
+// Copyright 2016 Ismael Jimenez Martinez. All rights reserved.
+// Copyright 2017 Roman Lebedev. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATISTICS_H_
+#define STATISTICS_H_
+
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+namespace benchmark {
+
+// Return a vector containing the mean, median and standard deviation information
+// (and any user-specified info) for the specified list of reports.
If 'reports' +// contains less than two non-errored runs an empty vector is returned +std::vector ComputeStats( + const std::vector& reports); + +double StatisticsMean(const std::vector& v); +double StatisticsMedian(const std::vector& v); +double StatisticsStdDev(const std::vector& v); + +} // end namespace benchmark + +#endif // STATISTICS_H_ diff --git a/thirdparty/benchmark-1.5.0/src/string_util.cc b/thirdparty/benchmark-1.5.0/src/string_util.cc new file mode 100644 index 0000000000..39b01a1719 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/string_util.cc @@ -0,0 +1,252 @@ +#include "string_util.h" + +#include +#include +#include +#include +#include +#include + +#include "arraysize.h" + +namespace benchmark { +namespace { + +// kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta. +const char kBigSIUnits[] = "kMGTPEZY"; +// Kibi, Mebi, Gibi, Tebi, Pebi, Exbi, Zebi, Yobi. +const char kBigIECUnits[] = "KMGTPEZY"; +// milli, micro, nano, pico, femto, atto, zepto, yocto. +const char kSmallSIUnits[] = "munpfazy"; + +// We require that all three arrays have the same size. +static_assert(arraysize(kBigSIUnits) == arraysize(kBigIECUnits), + "SI and IEC unit arrays must be the same size"); +static_assert(arraysize(kSmallSIUnits) == arraysize(kBigSIUnits), + "Small SI and Big SI unit arrays must be the same size"); + +static const int64_t kUnitsSize = arraysize(kBigSIUnits); + +void ToExponentAndMantissa(double val, double thresh, int precision, + double one_k, std::string* mantissa, + int64_t* exponent) { + std::stringstream mantissa_stream; + + if (val < 0) { + mantissa_stream << "-"; + val = -val; + } + + // Adjust threshold so that it never excludes things which can't be rendered + // in 'precision' digits. + const double adjusted_threshold = + std::max(thresh, 1.0 / std::pow(10.0, precision)); + const double big_threshold = adjusted_threshold * one_k; + const double small_threshold = adjusted_threshold; + // Values in ]simple_threshold,small_threshold[ will be printed as-is + const double simple_threshold = 0.01; + + if (val > big_threshold) { + // Positive powers + double scaled = val; + for (size_t i = 0; i < arraysize(kBigSIUnits); ++i) { + scaled /= one_k; + if (scaled <= big_threshold) { + mantissa_stream << scaled; + *exponent = i + 1; + *mantissa = mantissa_stream.str(); + return; + } + } + mantissa_stream << val; + *exponent = 0; + } else if (val < small_threshold) { + // Negative powers + if (val < simple_threshold) { + double scaled = val; + for (size_t i = 0; i < arraysize(kSmallSIUnits); ++i) { + scaled *= one_k; + if (scaled >= small_threshold) { + mantissa_stream << scaled; + *exponent = -static_cast(i + 1); + *mantissa = mantissa_stream.str(); + return; + } + } + } + mantissa_stream << val; + *exponent = 0; + } else { + mantissa_stream << val; + *exponent = 0; + } + *mantissa = mantissa_stream.str(); +} + +std::string ExponentToPrefix(int64_t exponent, bool iec) { + if (exponent == 0) return ""; + + const int64_t index = (exponent > 0 ? exponent - 1 : -exponent - 1); + if (index >= kUnitsSize) return ""; + + const char* array = + (exponent > 0 ? (iec ? 
kBigIECUnits : kBigSIUnits) : kSmallSIUnits); + if (iec) + return array[index] + std::string("i"); + else + return std::string(1, array[index]); +} + +std::string ToBinaryStringFullySpecified(double value, double threshold, + int precision, double one_k = 1024.0) { + std::string mantissa; + int64_t exponent; + ToExponentAndMantissa(value, threshold, precision, one_k, &mantissa, + &exponent); + return mantissa + ExponentToPrefix(exponent, false); +} + +} // end namespace + +void AppendHumanReadable(int n, std::string* str) { + std::stringstream ss; + // Round down to the nearest SI prefix. + ss << ToBinaryStringFullySpecified(n, 1.0, 0); + *str += ss.str(); +} + +std::string HumanReadableNumber(double n, double one_k) { + // 1.1 means that figures up to 1.1k should be shown with the next unit down; + // this softens edge effects. + // 1 means that we should show one decimal place of precision. + return ToBinaryStringFullySpecified(n, 1.1, 1, one_k); +} + +std::string StrFormatImp(const char* msg, va_list args) { + // we might need a second shot at this, so pre-emptivly make a copy + va_list args_cp; + va_copy(args_cp, args); + + // TODO(ericwf): use std::array for first attempt to avoid one memory + // allocation guess what the size might be + std::array local_buff; + std::size_t size = local_buff.size(); + // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation + // in the android-ndk + auto ret = vsnprintf(local_buff.data(), size, msg, args_cp); + + va_end(args_cp); + + // handle empty expansion + if (ret == 0) return std::string{}; + if (static_cast(ret) < size) + return std::string(local_buff.data()); + + // we did not provide a long enough buffer on our first attempt. + // add 1 to size to account for null-byte in size cast to prevent overflow + size = static_cast(ret) + 1; + auto buff_ptr = std::unique_ptr(new char[size]); + // 2015-10-08: vsnprintf is used instead of snd::vsnprintf due to a limitation + // in the android-ndk + ret = vsnprintf(buff_ptr.get(), size, msg, args); + return std::string(buff_ptr.get()); +} + +std::string StrFormat(const char* format, ...) { + va_list args; + va_start(args, format); + std::string tmp = StrFormatImp(format, args); + va_end(args); + return tmp; +} + +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +/* + * GNU STL in Android NDK lacks support for some C++11 functions, including + * stoul, stoi, stod. We reimplement them here using C functions strtoul, + * strtol, strtod. Note that reimplemented functions are in benchmark:: + * namespace, not std:: namespace. 
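+ * Illustrative usage: benchmark::stoul("ff", nullptr, 16) returns 255,
+ * matching the std::stoul overload it stands in for.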
+ */ +unsigned long stoul(const std::string& str, size_t* pos, int base) { + /* Record previous errno */ + const int oldErrno = errno; + errno = 0; + + const char* strStart = str.c_str(); + char* strEnd = const_cast(strStart); + const unsigned long result = strtoul(strStart, &strEnd, base); + + const int strtoulErrno = errno; + /* Restore previous errno */ + errno = oldErrno; + + /* Check for errors and return */ + if (strtoulErrno == ERANGE) { + throw std::out_of_range( + "stoul failed: " + str + " is outside of range of unsigned long"); + } else if (strEnd == strStart || strtoulErrno != 0) { + throw std::invalid_argument( + "stoul failed: " + str + " is not an integer"); + } + if (pos != nullptr) { + *pos = static_cast(strEnd - strStart); + } + return result; +} + +int stoi(const std::string& str, size_t* pos, int base) { + /* Record previous errno */ + const int oldErrno = errno; + errno = 0; + + const char* strStart = str.c_str(); + char* strEnd = const_cast(strStart); + const long result = strtol(strStart, &strEnd, base); + + const int strtolErrno = errno; + /* Restore previous errno */ + errno = oldErrno; + + /* Check for errors and return */ + if (strtolErrno == ERANGE || long(int(result)) != result) { + throw std::out_of_range( + "stoul failed: " + str + " is outside of range of int"); + } else if (strEnd == strStart || strtolErrno != 0) { + throw std::invalid_argument( + "stoul failed: " + str + " is not an integer"); + } + if (pos != nullptr) { + *pos = static_cast(strEnd - strStart); + } + return int(result); +} + +double stod(const std::string& str, size_t* pos) { + /* Record previous errno */ + const int oldErrno = errno; + errno = 0; + + const char* strStart = str.c_str(); + char* strEnd = const_cast(strStart); + const double result = strtod(strStart, &strEnd); + + /* Restore previous errno */ + const int strtodErrno = errno; + errno = oldErrno; + + /* Check for errors and return */ + if (strtodErrno == ERANGE) { + throw std::out_of_range( + "stoul failed: " + str + " is outside of range of int"); + } else if (strEnd == strStart || strtodErrno != 0) { + throw std::invalid_argument( + "stoul failed: " + str + " is not an integer"); + } + if (pos != nullptr) { + *pos = static_cast(strEnd - strStart); + } + return result; +} +#endif + +} // end namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/src/string_util.h b/thirdparty/benchmark-1.5.0/src/string_util.h new file mode 100644 index 0000000000..09d7b4bd2a --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/string_util.h @@ -0,0 +1,59 @@ +#ifndef BENCHMARK_STRING_UTIL_H_ +#define BENCHMARK_STRING_UTIL_H_ + +#include +#include +#include +#include "internal_macros.h" + +namespace benchmark { + +void AppendHumanReadable(int n, std::string* str); + +std::string HumanReadableNumber(double n, double one_k = 1024.0); + +#if defined(__MINGW32__) +__attribute__((format(__MINGW_PRINTF_FORMAT, 1, 2))) +#elif defined(__GNUC__) +__attribute__((format(printf, 1, 2))) +#endif +std::string +StrFormat(const char* format, ...); + +inline std::ostream& StrCatImp(std::ostream& out) BENCHMARK_NOEXCEPT { + return out; +} + +template +inline std::ostream& StrCatImp(std::ostream& out, First&& f, Rest&&... rest) { + out << std::forward(f); + return StrCatImp(out, std::forward(rest)...); +} + +template +inline std::string StrCat(Args&&... 
args) { + std::ostringstream ss; + StrCatImp(ss, std::forward(args)...); + return ss.str(); +} + +#ifdef BENCHMARK_STL_ANDROID_GNUSTL +/* + * GNU STL in Android NDK lacks support for some C++11 functions, including + * stoul, stoi, stod. We reimplement them here using C functions strtoul, + * strtol, strtod. Note that reimplemented functions are in benchmark:: + * namespace, not std:: namespace. + */ +unsigned long stoul(const std::string& str, size_t* pos = nullptr, + int base = 10); +int stoi(const std::string& str, size_t* pos = nullptr, int base = 10); +double stod(const std::string& str, size_t* pos = nullptr); +#else +using std::stoul; +using std::stoi; +using std::stod; +#endif + +} // end namespace benchmark + +#endif // BENCHMARK_STRING_UTIL_H_ diff --git a/thirdparty/benchmark-1.5.0/src/sysinfo.cc b/thirdparty/benchmark-1.5.0/src/sysinfo.cc new file mode 100644 index 0000000000..28126470ba --- /dev/null +++ b/thirdparty/benchmark-1.5.0/src/sysinfo.cc @@ -0,0 +1,699 @@ +// Copyright 2015 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal_macros.h" + +#ifdef BENCHMARK_OS_WINDOWS +#include +#undef StrCat // Don't let StrCat in string_util.h be renamed to lstrcatA +#include +#include +#include +#else +#include +#ifndef BENCHMARK_OS_FUCHSIA +#include +#endif +#include +#include // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD +#include +#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX || \ + defined BENCHMARK_OS_NETBSD || defined BENCHMARK_OS_OPENBSD +#define BENCHMARK_HAS_SYSCTL +#include +#endif +#endif +#if defined(BENCHMARK_OS_SOLARIS) +#include +#endif +#if defined(BENCHMARK_OS_QNX) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" +#include "cycleclock.h" +#include "internal_macros.h" +#include "log.h" +#include "sleep.h" +#include "string_util.h" + +namespace benchmark { +namespace { + +void PrintImp(std::ostream& out) { out << std::endl; } + +template +void PrintImp(std::ostream& out, First&& f, Rest&&... rest) { + out << std::forward(f); + PrintImp(out, std::forward(rest)...); +} + +template +BENCHMARK_NORETURN void PrintErrorAndDie(Args&&... args) { + PrintImp(std::cerr, std::forward(args)...); + std::exit(EXIT_FAILURE); +} + +#ifdef BENCHMARK_HAS_SYSCTL + +/// ValueUnion - A type used to correctly alias the byte-for-byte output of +/// `sysctl` with the result type it's to be interpreted as. +struct ValueUnion { + union DataT { + uint32_t uint32_value; + uint64_t uint64_value; + // For correct aliasing of union members from bytes. + char bytes[8]; + }; + using DataPtr = std::unique_ptr; + + // The size of the data union member + its trailing array size. 
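+  // sysctl returns raw bytes; recording how many were produced lets
+  // GetAsInteger()/GetAsUnsigned() below distinguish a 32-bit result from a
+  // 64-bit one (e.g. hw.ncpu is an int, hw.memsize an int64_t).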
+ size_t Size; + DataPtr Buff; + + public: + ValueUnion() : Size(0), Buff(nullptr, &std::free) {} + + explicit ValueUnion(size_t BuffSize) + : Size(sizeof(DataT) + BuffSize), + Buff(::new (std::malloc(Size)) DataT(), &std::free) {} + + ValueUnion(ValueUnion&& other) = default; + + explicit operator bool() const { return bool(Buff); } + + char* data() const { return Buff->bytes; } + + std::string GetAsString() const { return std::string(data()); } + + int64_t GetAsInteger() const { + if (Size == sizeof(Buff->uint32_value)) + return static_cast(Buff->uint32_value); + else if (Size == sizeof(Buff->uint64_value)) + return static_cast(Buff->uint64_value); + BENCHMARK_UNREACHABLE(); + } + + uint64_t GetAsUnsigned() const { + if (Size == sizeof(Buff->uint32_value)) + return Buff->uint32_value; + else if (Size == sizeof(Buff->uint64_value)) + return Buff->uint64_value; + BENCHMARK_UNREACHABLE(); + } + + template + std::array GetAsArray() { + const int ArrSize = sizeof(T) * N; + CHECK_LE(ArrSize, Size); + std::array Arr; + std::memcpy(Arr.data(), data(), ArrSize); + return Arr; + } +}; + +ValueUnion GetSysctlImp(std::string const& Name) { +#if defined BENCHMARK_OS_OPENBSD + int mib[2]; + + mib[0] = CTL_HW; + if ((Name == "hw.ncpu") || (Name == "hw.cpuspeed")){ + ValueUnion buff(sizeof(int)); + + if (Name == "hw.ncpu") { + mib[1] = HW_NCPU; + } else { + mib[1] = HW_CPUSPEED; + } + + if (sysctl(mib, 2, buff.data(), &buff.Size, nullptr, 0) == -1) { + return ValueUnion(); + } + return buff; + } + return ValueUnion(); +#else + size_t CurBuffSize = 0; + if (sysctlbyname(Name.c_str(), nullptr, &CurBuffSize, nullptr, 0) == -1) + return ValueUnion(); + + ValueUnion buff(CurBuffSize); + if (sysctlbyname(Name.c_str(), buff.data(), &buff.Size, nullptr, 0) == 0) + return buff; + return ValueUnion(); +#endif +} + +BENCHMARK_MAYBE_UNUSED +bool GetSysctl(std::string const& Name, std::string* Out) { + Out->clear(); + auto Buff = GetSysctlImp(Name); + if (!Buff) return false; + Out->assign(Buff.data()); + return true; +} + +template ::value>::type> +bool GetSysctl(std::string const& Name, Tp* Out) { + *Out = 0; + auto Buff = GetSysctlImp(Name); + if (!Buff) return false; + *Out = static_cast(Buff.GetAsUnsigned()); + return true; +} + +template +bool GetSysctl(std::string const& Name, std::array* Out) { + auto Buff = GetSysctlImp(Name); + if (!Buff) return false; + *Out = Buff.GetAsArray(); + return true; +} +#endif + +template +bool ReadFromFile(std::string const& fname, ArgT* arg) { + *arg = ArgT(); + std::ifstream f(fname.c_str()); + if (!f.is_open()) return false; + f >> *arg; + return f.good(); +} + +bool CpuScalingEnabled(int num_cpus) { + // We don't have a valid CPU count, so don't even bother. + if (num_cpus <= 0) return false; +#ifdef BENCHMARK_OS_QNX + return false; +#endif +#ifndef BENCHMARK_OS_WINDOWS + // On Linux, the CPUfreq subsystem exposes CPU information as files on the + // local file system. If reading the exported files fails, then we may not be + // running on Linux, so we silently ignore all the read errors. 
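+  // Each per-CPU governor file holds a single token such as "performance",
+  // "powersave" or "ondemand"; any value other than "performance" is treated
+  // as scaling being enabled.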
+ std::string res; + for (int cpu = 0; cpu < num_cpus; ++cpu) { + std::string governor_file = + StrCat("/sys/devices/system/cpu/cpu", cpu, "/cpufreq/scaling_governor"); + if (ReadFromFile(governor_file, &res) && res != "performance") return true; + } +#endif + return false; +} + +int CountSetBitsInCPUMap(std::string Val) { + auto CountBits = [](std::string Part) { + using CPUMask = std::bitset; + Part = "0x" + Part; + CPUMask Mask(benchmark::stoul(Part, nullptr, 16)); + return static_cast(Mask.count()); + }; + size_t Pos; + int total = 0; + while ((Pos = Val.find(',')) != std::string::npos) { + total += CountBits(Val.substr(0, Pos)); + Val = Val.substr(Pos + 1); + } + if (!Val.empty()) { + total += CountBits(Val); + } + return total; +} + +BENCHMARK_MAYBE_UNUSED +std::vector GetCacheSizesFromKVFS() { + std::vector res; + std::string dir = "/sys/devices/system/cpu/cpu0/cache/"; + int Idx = 0; + while (true) { + CPUInfo::CacheInfo info; + std::string FPath = StrCat(dir, "index", Idx++, "/"); + std::ifstream f(StrCat(FPath, "size").c_str()); + if (!f.is_open()) break; + std::string suffix; + f >> info.size; + if (f.fail()) + PrintErrorAndDie("Failed while reading file '", FPath, "size'"); + if (f.good()) { + f >> suffix; + if (f.bad()) + PrintErrorAndDie( + "Invalid cache size format: failed to read size suffix"); + else if (f && suffix != "K") + PrintErrorAndDie("Invalid cache size format: Expected bytes ", suffix); + else if (suffix == "K") + info.size *= 1000; + } + if (!ReadFromFile(StrCat(FPath, "type"), &info.type)) + PrintErrorAndDie("Failed to read from file ", FPath, "type"); + if (!ReadFromFile(StrCat(FPath, "level"), &info.level)) + PrintErrorAndDie("Failed to read from file ", FPath, "level"); + std::string map_str; + if (!ReadFromFile(StrCat(FPath, "shared_cpu_map"), &map_str)) + PrintErrorAndDie("Failed to read from file ", FPath, "shared_cpu_map"); + info.num_sharing = CountSetBitsInCPUMap(map_str); + res.push_back(info); + } + + return res; +} + +#ifdef BENCHMARK_OS_MACOSX +std::vector GetCacheSizesMacOSX() { + std::vector res; + std::array CacheCounts{{0, 0, 0, 0}}; + GetSysctl("hw.cacheconfig", &CacheCounts); + + struct { + std::string name; + std::string type; + int level; + uint64_t num_sharing; + } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]}, + {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]}, + {"hw.l2cachesize", "Unified", 2, CacheCounts[2]}, + {"hw.l3cachesize", "Unified", 3, CacheCounts[3]}}; + for (auto& C : Cases) { + int val; + if (!GetSysctl(C.name, &val)) continue; + CPUInfo::CacheInfo info; + info.type = C.type; + info.level = C.level; + info.size = val; + info.num_sharing = static_cast(C.num_sharing); + res.push_back(std::move(info)); + } + return res; +} +#elif defined(BENCHMARK_OS_WINDOWS) +std::vector GetCacheSizesWindows() { + std::vector res; + DWORD buffer_size = 0; + using PInfo = SYSTEM_LOGICAL_PROCESSOR_INFORMATION; + using CInfo = CACHE_DESCRIPTOR; + + using UPtr = std::unique_ptr; + GetLogicalProcessorInformation(nullptr, &buffer_size); + UPtr buff((PInfo*)malloc(buffer_size), &std::free); + if (!GetLogicalProcessorInformation(buff.get(), &buffer_size)) + PrintErrorAndDie("Failed during call to GetLogicalProcessorInformation: ", + GetLastError()); + + PInfo* it = buff.get(); + PInfo* end = buff.get() + (buffer_size / sizeof(PInfo)); + + for (; it != end; ++it) { + if (it->Relationship != RelationCache) continue; + using BitSet = std::bitset; + BitSet B(it->ProcessorMask); + // To prevent duplicates, only consider caches where CPU 0 
is specified + if (!B.test(0)) continue; + CInfo* Cache = &it->Cache; + CPUInfo::CacheInfo C; + C.num_sharing = static_cast(B.count()); + C.level = Cache->Level; + C.size = Cache->Size; + switch (Cache->Type) { + case CacheUnified: + C.type = "Unified"; + break; + case CacheInstruction: + C.type = "Instruction"; + break; + case CacheData: + C.type = "Data"; + break; + case CacheTrace: + C.type = "Trace"; + break; + default: + C.type = "Unknown"; + break; + } + res.push_back(C); + } + return res; +} +#elif BENCHMARK_OS_QNX +std::vector GetCacheSizesQNX() { + std::vector res; + struct cacheattr_entry *cache = SYSPAGE_ENTRY(cacheattr); + uint32_t const elsize = SYSPAGE_ELEMENT_SIZE(cacheattr); + int num = SYSPAGE_ENTRY_SIZE(cacheattr) / elsize ; + for(int i = 0; i < num; ++i ) { + CPUInfo::CacheInfo info; + switch (cache->flags){ + case CACHE_FLAG_INSTR : + info.type = "Instruction"; + info.level = 1; + break; + case CACHE_FLAG_DATA : + info.type = "Data"; + info.level = 1; + break; + case CACHE_FLAG_UNIFIED : + info.type = "Unified"; + info.level = 2; + case CACHE_FLAG_SHARED : + info.type = "Shared"; + info.level = 3; + default : + continue; + break; + } + info.size = cache->line_size * cache->num_lines; + info.num_sharing = 0; + res.push_back(std::move(info)); + cache = SYSPAGE_ARRAY_ADJ_OFFSET(cacheattr, cache, elsize); + } + return res; +} +#endif + +std::vector GetCacheSizes() { +#ifdef BENCHMARK_OS_MACOSX + return GetCacheSizesMacOSX(); +#elif defined(BENCHMARK_OS_WINDOWS) + return GetCacheSizesWindows(); +#elif defined(BENCHMARK_OS_QNX) + return GetCacheSizesQNX(); +#else + return GetCacheSizesFromKVFS(); +#endif +} + +std::string GetSystemName() { +#if defined(BENCHMARK_OS_WINDOWS) + std::string str; + const unsigned COUNT = MAX_COMPUTERNAME_LENGTH+1; + TCHAR hostname[COUNT] = {'\0'}; + DWORD DWCOUNT = COUNT; + if (!GetComputerName(hostname, &DWCOUNT)) + return std::string(""); +#ifndef UNICODE + str = std::string(hostname, DWCOUNT); +#else + //Using wstring_convert, Is deprecated in C++17 + using convert_type = std::codecvt_utf8; + std::wstring_convert converter; + std::wstring wStr(hostname, DWCOUNT); + str = converter.to_bytes(wStr); +#endif + return str; +#else // defined(BENCHMARK_OS_WINDOWS) +#ifdef BENCHMARK_HAS_SYSCTL // BSD/Mac Doesnt have HOST_NAME_MAX defined +#define HOST_NAME_MAX 64 +#elif defined(BENCHMARK_OS_QNX) +#define HOST_NAME_MAX 154 +#endif + char hostname[HOST_NAME_MAX]; + int retVal = gethostname(hostname, HOST_NAME_MAX); + if (retVal != 0) return std::string(""); + return std::string(hostname); +#endif // Catch-all POSIX block. +} + +int GetNumCPUs() { +#ifdef BENCHMARK_HAS_SYSCTL + int NumCPU = -1; + if (GetSysctl("hw.ncpu", &NumCPU)) return NumCPU; + fprintf(stderr, "Err: %s\n", strerror(errno)); + std::exit(EXIT_FAILURE); +#elif defined(BENCHMARK_OS_WINDOWS) + SYSTEM_INFO sysinfo; + // Use memset as opposed to = {} to avoid GCC missing initializer false + // positives. + std::memset(&sysinfo, 0, sizeof(SYSTEM_INFO)); + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; // number of logical + // processors in the current + // group +#elif defined(BENCHMARK_OS_SOLARIS) + // Returns -1 in case of a failure. 
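+  // (_SC_NPROCESSORS_ONLN counts CPUs currently online, which can be fewer
+  // than the number configured in the machine.)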
+ int NumCPU = sysconf(_SC_NPROCESSORS_ONLN); + if (NumCPU < 0) { + fprintf(stderr, + "sysconf(_SC_NPROCESSORS_ONLN) failed with error: %s\n", + strerror(errno)); + } + return NumCPU; +#elif defined(BENCHMARK_OS_QNX) + return static_cast(_syspage_ptr->num_cpu); +#else + int NumCPUs = 0; + int MaxID = -1; + std::ifstream f("/proc/cpuinfo"); + if (!f.is_open()) { + std::cerr << "failed to open /proc/cpuinfo\n"; + return -1; + } + const std::string Key = "processor"; + std::string ln; + while (std::getline(f, ln)) { + if (ln.empty()) continue; + size_t SplitIdx = ln.find(':'); + std::string value; +#if defined(__s390__) + // s390 has another format in /proc/cpuinfo + // it needs to be parsed differently + if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1); +#else + if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1); +#endif + if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) { + NumCPUs++; + if (!value.empty()) { + int CurID = benchmark::stoi(value); + MaxID = std::max(CurID, MaxID); + } + } + } + if (f.bad()) { + std::cerr << "Failure reading /proc/cpuinfo\n"; + return -1; + } + if (!f.eof()) { + std::cerr << "Failed to read to end of /proc/cpuinfo\n"; + return -1; + } + f.close(); + + if ((MaxID + 1) != NumCPUs) { + fprintf(stderr, + "CPU ID assignments in /proc/cpuinfo seem messed up." + " This is usually caused by a bad BIOS.\n"); + } + return NumCPUs; +#endif + BENCHMARK_UNREACHABLE(); +} + +double GetCPUCyclesPerSecond() { +#if defined BENCHMARK_OS_LINUX || defined BENCHMARK_OS_CYGWIN + long freq; + + // If the kernel is exporting the tsc frequency use that. There are issues + // where cpuinfo_max_freq cannot be relied on because the BIOS may be + // exporintg an invalid p-state (on x86) or p-states may be used to put the + // processor in a new mode (turbo mode). Essentially, those frequencies + // cannot always be relied upon. The same reasons apply to /proc/cpuinfo as + // well. + if (ReadFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &freq) + // If CPU scaling is in effect, we want to use the *maximum* frequency, + // not whatever CPU speed some random processor happens to be using now. + || ReadFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", + &freq)) { + // The value is in kHz (as the file name suggests). For example, on a + // 2GHz warpstation, the file contains the value "2000000". + return freq * 1000.0; + } + + const double error_value = -1; + double bogo_clock = error_value; + + std::ifstream f("/proc/cpuinfo"); + if (!f.is_open()) { + std::cerr << "failed to open /proc/cpuinfo\n"; + return error_value; + } + + auto startsWithKey = [](std::string const& Value, std::string const& Key) { + if (Key.size() > Value.size()) return false; + auto Cmp = [&](char X, char Y) { + return std::tolower(X) == std::tolower(Y); + }; + return std::equal(Key.begin(), Key.end(), Value.begin(), Cmp); + }; + + std::string ln; + while (std::getline(f, ln)) { + if (ln.empty()) continue; + size_t SplitIdx = ln.find(':'); + std::string value; + if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1); + // When parsing the "cpu MHz" and "bogomips" (fallback) entries, we only + // accept positive values. Some environments (virtual machines) report zero, + // which would cause infinite looping in WallTime_Init. 
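+    // Illustrative /proc/cpuinfo lines this parser expects:
+    //   cpu MHz         : 2400.000
+    //   bogomips        : 4800.00
+    // i.e. "key : value" pairs split on the first ':'.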
+ if (startsWithKey(ln, "cpu MHz")) { + if (!value.empty()) { + double cycles_per_second = benchmark::stod(value) * 1000000.0; + if (cycles_per_second > 0) return cycles_per_second; + } + } else if (startsWithKey(ln, "bogomips")) { + if (!value.empty()) { + bogo_clock = benchmark::stod(value) * 1000000.0; + if (bogo_clock < 0.0) bogo_clock = error_value; + } + } + } + if (f.bad()) { + std::cerr << "Failure reading /proc/cpuinfo\n"; + return error_value; + } + if (!f.eof()) { + std::cerr << "Failed to read to end of /proc/cpuinfo\n"; + return error_value; + } + f.close(); + // If we found the bogomips clock, but nothing better, we'll use it (but + // we're not happy about it); otherwise, fallback to the rough estimation + // below. + if (bogo_clock >= 0.0) return bogo_clock; + +#elif defined BENCHMARK_HAS_SYSCTL + constexpr auto* FreqStr = +#if defined(BENCHMARK_OS_FREEBSD) || defined(BENCHMARK_OS_NETBSD) + "machdep.tsc_freq"; +#elif defined BENCHMARK_OS_OPENBSD + "hw.cpuspeed"; +#else + "hw.cpufrequency"; +#endif + unsigned long long hz = 0; +#if defined BENCHMARK_OS_OPENBSD + if (GetSysctl(FreqStr, &hz)) return hz * 1000000; +#else + if (GetSysctl(FreqStr, &hz)) return hz; +#endif + fprintf(stderr, "Unable to determine clock rate from sysctl: %s: %s\n", + FreqStr, strerror(errno)); + +#elif defined BENCHMARK_OS_WINDOWS + // In NT, read MHz from the registry. If we fail to do so or we're in win9x + // then make a crude estimate. + DWORD data, data_size = sizeof(data); + if (IsWindowsXPOrGreater() && + SUCCEEDED( + SHGetValueA(HKEY_LOCAL_MACHINE, + "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", + "~MHz", nullptr, &data, &data_size))) + return static_cast((int64_t)data * + (int64_t)(1000 * 1000)); // was mhz +#elif defined (BENCHMARK_OS_SOLARIS) + kstat_ctl_t *kc = kstat_open(); + if (!kc) { + std::cerr << "failed to open /dev/kstat\n"; + return -1; + } + kstat_t *ksp = kstat_lookup(kc, (char*)"cpu_info", -1, (char*)"cpu_info0"); + if (!ksp) { + std::cerr << "failed to lookup in /dev/kstat\n"; + return -1; + } + if (kstat_read(kc, ksp, NULL) < 0) { + std::cerr << "failed to read from /dev/kstat\n"; + return -1; + } + kstat_named_t *knp = + (kstat_named_t*)kstat_data_lookup(ksp, (char*)"current_clock_Hz"); + if (!knp) { + std::cerr << "failed to lookup data in /dev/kstat\n"; + return -1; + } + if (knp->data_type != KSTAT_DATA_UINT64) { + std::cerr << "current_clock_Hz is of unexpected data type: " + << knp->data_type << "\n"; + return -1; + } + double clock_hz = knp->value.ui64; + kstat_close(kc); + return clock_hz; +#elif defined (BENCHMARK_OS_QNX) + return static_cast((int64_t)(SYSPAGE_ENTRY(cpuinfo)->speed) * + (int64_t)(1000 * 1000)); +#endif + // If we've fallen through, attempt to roughly estimate the CPU clock rate. 
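+  // The fallback sleeps for ~1s and counts elapsed cycleclock ticks, so
+  // ticks/second approximates cycles/second; this assumes the cycle counter
+  // ticks at the nominal CPU frequency and that the sleep is accurate.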
+  const int estimate_time_ms = 1000;
+  const auto start_ticks = cycleclock::Now();
+  SleepForMilliseconds(estimate_time_ms);
+  return static_cast<double>(cycleclock::Now() - start_ticks);
+}
+
+std::vector<double> GetLoadAvg() {
+#if (defined BENCHMARK_OS_FREEBSD || defined(BENCHMARK_OS_LINUX) || \
+     defined BENCHMARK_OS_MACOSX || defined BENCHMARK_OS_NETBSD ||  \
+     defined BENCHMARK_OS_OPENBSD) && !defined(__ANDROID__)
+  constexpr int kMaxSamples = 3;
+  std::vector<double> res(kMaxSamples, 0.0);
+  const int nelem = getloadavg(res.data(), kMaxSamples);
+  if (nelem < 1) {
+    res.clear();
+  } else {
+    res.resize(nelem);
+  }
+  return res;
+#else
+  return {};
+#endif
+}
+
+}  // end namespace
+
+const CPUInfo& CPUInfo::Get() {
+  static const CPUInfo* info = new CPUInfo();
+  return *info;
+}
+
+CPUInfo::CPUInfo()
+    : num_cpus(GetNumCPUs()),
+      cycles_per_second(GetCPUCyclesPerSecond()),
+      caches(GetCacheSizes()),
+      scaling_enabled(CpuScalingEnabled(num_cpus)),
+      load_avg(GetLoadAvg()) {}
+
+
+const SystemInfo& SystemInfo::Get() {
+  static const SystemInfo* info = new SystemInfo();
+  return *info;
+}
+
+SystemInfo::SystemInfo() : name(GetSystemName()) {}
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/thread_manager.h b/thirdparty/benchmark-1.5.0/src/thread_manager.h
new file mode 100644
index 0000000000..1720281f0a
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/thread_manager.h
@@ -0,0 +1,64 @@
+#ifndef BENCHMARK_THREAD_MANAGER_H
+#define BENCHMARK_THREAD_MANAGER_H
+
+#include <atomic>
+
+#include "benchmark/benchmark.h"
+#include "mutex.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadManager {
+ public:
+  ThreadManager(int num_threads)
+      : alive_threads_(num_threads), start_stop_barrier_(num_threads) {}
+
+  Mutex& GetBenchmarkMutex() const RETURN_CAPABILITY(benchmark_mutex_) {
+    return benchmark_mutex_;
+  }
+
+  bool StartStopBarrier() EXCLUDES(end_cond_mutex_) {
+    return start_stop_barrier_.wait();
+  }
+
+  void NotifyThreadComplete() EXCLUDES(end_cond_mutex_) {
+    start_stop_barrier_.removeThread();
+    if (--alive_threads_ == 0) {
+      MutexLock lock(end_cond_mutex_);
+      end_condition_.notify_all();
+    }
+  }
+
+  void WaitForAllThreads() EXCLUDES(end_cond_mutex_) {
+    MutexLock lock(end_cond_mutex_);
+    end_condition_.wait(lock.native_handle(),
+                        [this]() { return alive_threads_ == 0; });
+  }
+
+ public:
+  struct Result {
+    IterationCount iterations = 0;
+    double real_time_used = 0;
+    double cpu_time_used = 0;
+    double manual_time_used = 0;
+    int64_t complexity_n = 0;
+    std::string report_label_;
+    std::string error_message_;
+    bool has_error_ = false;
+    UserCounters counters;
+  };
+  GUARDED_BY(GetBenchmarkMutex()) Result results;
+
+ private:
+  mutable Mutex benchmark_mutex_;
+  std::atomic<int> alive_threads_;
+  Barrier start_stop_barrier_;
+  Mutex end_cond_mutex_;
+  Condition end_condition_;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_MANAGER_H
diff --git a/thirdparty/benchmark-1.5.0/src/thread_timer.h b/thirdparty/benchmark-1.5.0/src/thread_timer.h
new file mode 100644
index 0000000000..fbd298d3bd
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/thread_timer.h
@@ -0,0 +1,86 @@
+#ifndef BENCHMARK_THREAD_TIMER_H
+#define BENCHMARK_THREAD_TIMER_H
+
+#include "check.h"
+#include "timers.h"
+
+namespace benchmark {
+namespace internal {
+
+class ThreadTimer {
+  explicit ThreadTimer(bool measure_process_cpu_time_)
+      : measure_process_cpu_time(measure_process_cpu_time_) {}
+
+ public:
+  static ThreadTimer Create() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/false);
+  }
+  static ThreadTimer CreateProcessCpuTime() {
+    return ThreadTimer(/*measure_process_cpu_time_=*/true);
+  }
+
+  // Called by each thread
+  void StartTimer() {
+    running_ = true;
+    start_real_time_ = ChronoClockNow();
+    start_cpu_time_ = ReadCpuTimerOfChoice();
+  }
+
+  // Called by each thread
+  void StopTimer() {
+    CHECK(running_);
+    running_ = false;
+    real_time_used_ += ChronoClockNow() - start_real_time_;
+    // Floating point error can result in the subtraction producing a negative
+    // time. Guard against that.
+    cpu_time_used_ +=
+        std::max<double>(ReadCpuTimerOfChoice() - start_cpu_time_, 0);
+  }
+
+  // Called by each thread
+  void SetIterationTime(double seconds) { manual_time_used_ += seconds; }
+
+  bool running() const { return running_; }
+
+  // REQUIRES: timer is not running
+  double real_time_used() {
+    CHECK(!running_);
+    return real_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double cpu_time_used() {
+    CHECK(!running_);
+    return cpu_time_used_;
+  }
+
+  // REQUIRES: timer is not running
+  double manual_time_used() {
+    CHECK(!running_);
+    return manual_time_used_;
+  }
+
+ private:
+  double ReadCpuTimerOfChoice() const {
+    if (measure_process_cpu_time) return ProcessCPUUsage();
+    return ThreadCPUUsage();
+  }
+
+  // should the thread, or the process, time be measured?
+  const bool measure_process_cpu_time;
+
+  bool running_ = false;        // Is the timer running
+  double start_real_time_ = 0;  // If running_
+  double start_cpu_time_ = 0;   // If running_
+
+  // Accumulated time so far (does not contain current slice if running_)
+  double real_time_used_ = 0;
+  double cpu_time_used_ = 0;
+  // Manually set iteration time. User sets this with SetIterationTime(seconds).
+  double manual_time_used_ = 0;
+};
+
+}  // namespace internal
+}  // namespace benchmark
+
+#endif  // BENCHMARK_THREAD_TIMER_H
diff --git a/thirdparty/benchmark-1.5.0/src/timers.cc b/thirdparty/benchmark-1.5.0/src/timers.cc
new file mode 100644
index 0000000000..7613ff92c6
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/timers.cc
@@ -0,0 +1,217 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
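+
+// timers.cc provides the clock sources declared in timers.h. Roughly:
+// ProcessCPUUsage() and ThreadCPUUsage() return CPU seconds consumed (as a
+// double), picking the best source per platform -- GetProcessTimes()/
+// GetThreadTimes() on Windows, clock_gettime(CLOCK_*_CPUTIME_ID) where
+// available, with getrusage() and Mach thread_info() as fallbacks.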
+
+#include "timers.h"
+#include "internal_macros.h"
+
+#ifdef BENCHMARK_OS_WINDOWS
+#include <shlwapi.h>
+#undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
+#include <versionhelpers.h>
+#include <windows.h>
+#else
+#include <fcntl.h>
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <sys/types.h>  // this header must be included before 'sys/sysctl.h' to avoid compilation error on FreeBSD
+#include <unistd.h>
+#if defined BENCHMARK_OS_FREEBSD || defined BENCHMARK_OS_MACOSX
+#include <sys/sysctl.h>
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#include <mach/thread_act.h>
+#endif
+#endif
+
+#ifdef BENCHMARK_OS_EMSCRIPTEN
+#include <emscripten.h>
+#endif
+
+#include <cerrno>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <limits>
+#include <mutex>
+
+#include "check.h"
+#include "log.h"
+#include "sleep.h"
+#include "string_util.h"
+
+namespace benchmark {
+
+// Suppress unused warnings on helper functions.
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+namespace {
+#if defined(BENCHMARK_OS_WINDOWS)
+double MakeTime(FILETIME const& kernel_time, FILETIME const& user_time) {
+  ULARGE_INTEGER kernel;
+  ULARGE_INTEGER user;
+  kernel.HighPart = kernel_time.dwHighDateTime;
+  kernel.LowPart = kernel_time.dwLowDateTime;
+  user.HighPart = user_time.dwHighDateTime;
+  user.LowPart = user_time.dwLowDateTime;
+  return (static_cast<double>(kernel.QuadPart) +
+          static_cast<double>(user.QuadPart)) *
+         1e-7;
+}
+#elif !defined(BENCHMARK_OS_FUCHSIA)
+double MakeTime(struct rusage const& ru) {
+  return (static_cast<double>(ru.ru_utime.tv_sec) +
+          static_cast<double>(ru.ru_utime.tv_usec) * 1e-6 +
+          static_cast<double>(ru.ru_stime.tv_sec) +
+          static_cast<double>(ru.ru_stime.tv_usec) * 1e-6);
+}
+#endif
+#if defined(BENCHMARK_OS_MACOSX)
+double MakeTime(thread_basic_info_data_t const& info) {
+  return (static_cast<double>(info.user_time.seconds) +
+          static_cast<double>(info.user_time.microseconds) * 1e-6 +
+          static_cast<double>(info.system_time.seconds) +
+          static_cast<double>(info.system_time.microseconds) * 1e-6);
+}
+#endif
+#if defined(CLOCK_PROCESS_CPUTIME_ID) || defined(CLOCK_THREAD_CPUTIME_ID)
+double MakeTime(struct timespec const& ts) {
+  return ts.tv_sec + (static_cast<double>(ts.tv_nsec) * 1e-9);
+}
+#endif
+
+BENCHMARK_NORETURN static void DiagnoseAndExit(const char* msg) {
+  std::cerr << "ERROR: " << msg << std::endl;
+  std::exit(EXIT_FAILURE);
+}
+
+}  // end namespace
+
+double ProcessCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE proc = GetCurrentProcess();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(proc, &creation_time, &exit_time, &kernel_time,
+                      &user_time))
+    return MakeTime(kernel_time, user_time);
+  DiagnoseAndExit("GetProcessTimes() failed");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) returns 0 on Emscripten.
+  // Use Emscripten-specific API. Reported CPU time would be exactly the
+  // same as total time, but this is ok because there aren't long-latency
+  // synchronous system calls in Emscripten.
+  return emscripten_get_now() * 1e-3;
+#elif defined(CLOCK_PROCESS_CPUTIME_ID) && !defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in macOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
+  struct timespec spec;
+  if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &spec) == 0)
+    return MakeTime(spec);
+  DiagnoseAndExit("clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...) failed");
+#else
+  struct rusage ru;
+  if (getrusage(RUSAGE_SELF, &ru) == 0) return MakeTime(ru);
+  DiagnoseAndExit("getrusage(RUSAGE_SELF, ...) failed");
+#endif
+}
+
+double ThreadCPUUsage() {
+#if defined(BENCHMARK_OS_WINDOWS)
+  HANDLE this_thread = GetCurrentThread();
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  GetThreadTimes(this_thread, &creation_time, &exit_time, &kernel_time,
+                 &user_time);
+  return MakeTime(kernel_time, user_time);
+#elif defined(BENCHMARK_OS_MACOSX)
+  // FIXME We want to use clock_gettime, but it's not available in macOS 10.11.
+  // See https://github.com/google/benchmark/pull/292
+  mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
+  thread_basic_info_data_t info;
+  mach_port_t thread = pthread_mach_thread_np(pthread_self());
+  if (thread_info(thread, THREAD_BASIC_INFO, (thread_info_t)&info, &count) ==
+      KERN_SUCCESS) {
+    return MakeTime(info);
+  }
+  DiagnoseAndExit("ThreadCPUUsage() failed when evaluating thread_info");
+#elif defined(BENCHMARK_OS_EMSCRIPTEN)
+  // Emscripten doesn't support traditional threads
+  return ProcessCPUUsage();
+#elif defined(BENCHMARK_OS_RTEMS)
+  // RTEMS doesn't support CLOCK_THREAD_CPUTIME_ID. See
+  // https://github.com/RTEMS/rtems/blob/master/cpukit/posix/src/clockgettime.c
+  return ProcessCPUUsage();
+#elif defined(BENCHMARK_OS_SOLARIS)
+  struct rusage ru;
+  if (getrusage(RUSAGE_LWP, &ru) == 0) return MakeTime(ru);
+  DiagnoseAndExit("getrusage(RUSAGE_LWP, ...) failed");
+#elif defined(CLOCK_THREAD_CPUTIME_ID)
+  struct timespec ts;
+  if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0) return MakeTime(ts);
+  DiagnoseAndExit("clock_gettime(CLOCK_THREAD_CPUTIME_ID, ...) failed");
+#else
+#error Per-thread timing is not available on your system.
+#endif
+}
+
+namespace {
+
+std::string DateTimeString(bool local) {
+  typedef std::chrono::system_clock Clock;
+  std::time_t now = Clock::to_time_t(Clock::now());
+  const std::size_t kStorageSize = 128;
+  char storage[kStorageSize];
+  std::size_t written;
+
+  if (local) {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written =
+        std::strftime(storage, sizeof(storage), "%x %X", ::localtime(&now));
+#else
+    std::tm timeinfo;
+    ::localtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  } else {
+#if defined(BENCHMARK_OS_WINDOWS)
+    written = std::strftime(storage, sizeof(storage), "%x %X", ::gmtime(&now));
+#else
+    std::tm timeinfo;
+    ::gmtime_r(&now, &timeinfo);
+    written = std::strftime(storage, sizeof(storage), "%F %T", &timeinfo);
+#endif
+  }
+  CHECK(written < kStorageSize);
+  ((void)written);  // prevent unused variable in optimized mode.
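+  // (std::strftime returns the number of characters written, or 0 if the
+  // result including the terminator would not fit; 128 bytes is comfortably
+  // large for the "%F %T" / "%x %X" formats used above.)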
+  return std::string(storage);
+}
+
+}  // end namespace
+
+std::string LocalDateTimeString() { return DateTimeString(true); }
+
+}  // end namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/src/timers.h b/thirdparty/benchmark-1.5.0/src/timers.h
new file mode 100644
index 0000000000..65606ccd93
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/src/timers.h
@@ -0,0 +1,48 @@
+#ifndef BENCHMARK_TIMERS_H
+#define BENCHMARK_TIMERS_H
+
+#include <chrono>
+#include <string>
+
+namespace benchmark {
+
+// Return the CPU usage of the current process
+double ProcessCPUUsage();
+
+// Return the CPU usage of the children of the current process
+double ChildrenCPUUsage();
+
+// Return the CPU usage of the current thread
+double ThreadCPUUsage();
+
+#if defined(HAVE_STEADY_CLOCK)
+template <bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
+struct ChooseSteadyClock {
+  typedef std::chrono::high_resolution_clock type;
+};
+
+template <>
+struct ChooseSteadyClock<false> {
+  typedef std::chrono::steady_clock type;
+};
+#endif
+
+struct ChooseClockType {
+#if defined(HAVE_STEADY_CLOCK)
+  typedef ChooseSteadyClock<>::type type;
+#else
+  typedef std::chrono::high_resolution_clock type;
+#endif
+};
+
+inline double ChronoClockNow() {
+  typedef ChooseClockType::type ClockType;
+  using FpSeconds = std::chrono::duration<double, std::chrono::seconds::period>;
+  return FpSeconds(ClockType::now().time_since_epoch()).count();
+}
+
+std::string LocalDateTimeString();
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_TIMERS_H
diff --git a/thirdparty/benchmark-1.5.0/test/AssemblyTests.cmake b/thirdparty/benchmark-1.5.0/test/AssemblyTests.cmake
new file mode 100644
index 0000000000..3d078586f1
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/test/AssemblyTests.cmake
@@ -0,0 +1,46 @@
+
+include(split_list)
+
+set(ASM_TEST_FLAGS "")
+check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG)
+if (BENCHMARK_HAS_O3_FLAG)
+  list(APPEND ASM_TEST_FLAGS -O3)
+endif()
+
+check_cxx_compiler_flag(-g0 BENCHMARK_HAS_G0_FLAG)
+if (BENCHMARK_HAS_G0_FLAG)
+  list(APPEND ASM_TEST_FLAGS -g0)
+endif()
+
+check_cxx_compiler_flag(-fno-stack-protector BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+if (BENCHMARK_HAS_FNO_STACK_PROTECTOR_FLAG)
+  list(APPEND ASM_TEST_FLAGS -fno-stack-protector)
+endif()
+
+split_list(ASM_TEST_FLAGS)
+string(TOUPPER "${CMAKE_CXX_COMPILER_ID}" ASM_TEST_COMPILER)
+
+macro(add_filecheck_test name)
+  cmake_parse_arguments(ARG "" "" "CHECK_PREFIXES" ${ARGV})
+  add_library(${name} OBJECT ${name}.cc)
+  set_target_properties(${name} PROPERTIES COMPILE_FLAGS "-S ${ASM_TEST_FLAGS}")
+  set(ASM_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${name}.s")
+  add_custom_target(copy_${name} ALL
+      COMMAND ${PROJECT_SOURCE_DIR}/tools/strip_asm.py
+        $<TARGET_OBJECTS:${name}>
+        ${ASM_OUTPUT_FILE}
+      BYPRODUCTS ${ASM_OUTPUT_FILE})
+  add_dependencies(copy_${name} ${name})
+  if (NOT ARG_CHECK_PREFIXES)
+    set(ARG_CHECK_PREFIXES "CHECK")
+  endif()
+  foreach(prefix ${ARG_CHECK_PREFIXES})
+    add_test(NAME run_${name}_${prefix}
+        COMMAND
+          ${LLVM_FILECHECK_EXE} ${name}.cc
+          --input-file=${ASM_OUTPUT_FILE}
+          --check-prefixes=CHECK,CHECK-${ASM_TEST_COMPILER}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endforeach()
+endmacro()
+
diff --git a/thirdparty/benchmark-1.5.0/test/BUILD b/thirdparty/benchmark-1.5.0/test/BUILD
new file mode 100644
index 0000000000..3f174c486f
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/test/BUILD
@@ -0,0 +1,65 @@
+TEST_COPTS = [
+    "-pedantic",
+    "-pedantic-errors",
+    "-std=c++11",
+    "-Wall",
+    "-Wextra",
+    "-Wshadow",
+#    "-Wshorten-64-to-32",
+    "-Wfloat-equal",
+    "-fstrict-aliasing",
+]
+
+PER_SRC_COPTS = ({
+    "cxx03_test.cc": ["-std=c++03"],
+    # Some of the
issues with DoNotOptimize only occur when optimization is enabled + "donotoptimize_test.cc": ["-O3"], +}) + + +TEST_ARGS = ["--benchmark_min_time=0.01"] + +PER_SRC_TEST_ARGS = ({ + "user_counters_tabular_test.cc": ["--benchmark_counters_tabular=true"], +}) + +cc_library( + name = "output_test_helper", + testonly = 1, + srcs = ["output_test_helper.cc"], + hdrs = ["output_test.h"], + copts = TEST_COPTS, + deps = [ + "//:benchmark", + "//:benchmark_internal_headers", + ], +) + +[ + cc_test( + name = test_src[:-len(".cc")], + size = "small", + srcs = [test_src], + args = TEST_ARGS + PER_SRC_TEST_ARGS.get(test_src, []), + copts = TEST_COPTS + PER_SRC_COPTS.get(test_src, []), + deps = [ + ":output_test_helper", + "//:benchmark", + "//:benchmark_internal_headers", + "@com_google_googletest//:gtest", + ] + ( + ["@com_google_googletest//:gtest_main"] if (test_src[-len("gtest.cc"):] == "gtest.cc") else [] + ), + # FIXME: Add support for assembly tests to bazel. + # See Issue #556 + # https://github.com/google/benchmark/issues/556 + ) for test_src in glob(["*test.cc"], exclude = ["*_assembly_test.cc", "link_main_test.cc"]) +] + +cc_test( + name = "link_main_test", + size = "small", + srcs = ["link_main_test.cc"], + copts = TEST_COPTS, + deps = ["//:benchmark_main"], +) diff --git a/thirdparty/benchmark-1.5.0/test/CMakeLists.txt b/thirdparty/benchmark-1.5.0/test/CMakeLists.txt new file mode 100644 index 0000000000..030f35aae3 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/CMakeLists.txt @@ -0,0 +1,259 @@ +# Enable the tests + +find_package(Threads REQUIRED) +include(CheckCXXCompilerFlag) + +# NOTE: Some tests use `` to perform the test. Therefore we must +# strip -DNDEBUG from the default CMake flags in DEBUG mode. +string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE) +if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" ) + add_definitions( -UNDEBUG ) + add_definitions(-DTEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) + # Also remove /D NDEBUG to avoid MSVC warnings about conflicting defines. + foreach (flags_var_to_scrub + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS_MINSIZEREL) + string (REGEX REPLACE "(^| )[/-]D *NDEBUG($| )" " " + "${flags_var_to_scrub}" "${${flags_var_to_scrub}}") + endforeach() +endif() + +check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG) +set(BENCHMARK_O3_FLAG "") +if (BENCHMARK_HAS_O3_FLAG) + set(BENCHMARK_O3_FLAG "-O3") +endif() + +# NOTE: These flags must be added after find_package(Threads REQUIRED) otherwise +# they will break the configuration check. 
+if (DEFINED BENCHMARK_CXX_LINKER_FLAGS) + list(APPEND CMAKE_EXE_LINKER_FLAGS ${BENCHMARK_CXX_LINKER_FLAGS}) +endif() + +add_library(output_test_helper STATIC output_test_helper.cc output_test.h) + +macro(compile_benchmark_test name) + add_executable(${name} "${name}.cc") + target_link_libraries(${name} benchmark ${CMAKE_THREAD_LIBS_INIT}) +endmacro(compile_benchmark_test) + +macro(compile_benchmark_test_with_main name) + add_executable(${name} "${name}.cc") + target_link_libraries(${name} benchmark_main) +endmacro(compile_benchmark_test_with_main) + +macro(compile_output_test name) + add_executable(${name} "${name}.cc" output_test.h) + target_link_libraries(${name} output_test_helper benchmark + ${BENCHMARK_CXX_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) +endmacro(compile_output_test) + +# Demonstration executable +compile_benchmark_test(benchmark_test) +add_test(benchmark benchmark_test --benchmark_min_time=0.01) + +compile_benchmark_test(filter_test) +macro(add_filter_test name filter expect) + add_test(${name} filter_test --benchmark_min_time=0.01 --benchmark_filter=${filter} ${expect}) + add_test(${name}_list_only filter_test --benchmark_list_tests --benchmark_filter=${filter} ${expect}) +endmacro(add_filter_test) + +add_filter_test(filter_simple "Foo" 3) +add_filter_test(filter_simple_negative "-Foo" 2) +add_filter_test(filter_suffix "BM_.*" 4) +add_filter_test(filter_suffix_negative "-BM_.*" 1) +add_filter_test(filter_regex_all ".*" 5) +add_filter_test(filter_regex_all_negative "-.*" 0) +add_filter_test(filter_regex_blank "" 5) +add_filter_test(filter_regex_blank_negative "-" 0) +add_filter_test(filter_regex_none "monkey" 0) +add_filter_test(filter_regex_none_negative "-monkey" 5) +add_filter_test(filter_regex_wildcard ".*Foo.*" 3) +add_filter_test(filter_regex_wildcard_negative "-.*Foo.*" 2) +add_filter_test(filter_regex_begin "^BM_.*" 4) +add_filter_test(filter_regex_begin_negative "-^BM_.*" 1) +add_filter_test(filter_regex_begin2 "^N" 1) +add_filter_test(filter_regex_begin2_negative "-^N" 4) +add_filter_test(filter_regex_end ".*Ba$" 1) +add_filter_test(filter_regex_end_negative "-.*Ba$" 4) + +compile_benchmark_test(options_test) +add_test(options_benchmarks options_test --benchmark_min_time=0.01) + +compile_benchmark_test(basic_test) +add_test(basic_benchmark basic_test --benchmark_min_time=0.01) + +compile_benchmark_test(diagnostics_test) +add_test(diagnostics_test diagnostics_test --benchmark_min_time=0.01) + +compile_benchmark_test(skip_with_error_test) +add_test(skip_with_error_test skip_with_error_test --benchmark_min_time=0.01) + +compile_benchmark_test(donotoptimize_test) +# Some of the issues with DoNotOptimize only occur when optimization is enabled +check_cxx_compiler_flag(-O3 BENCHMARK_HAS_O3_FLAG) +if (BENCHMARK_HAS_O3_FLAG) + set_target_properties(donotoptimize_test PROPERTIES COMPILE_FLAGS "-O3") +endif() +add_test(donotoptimize_test donotoptimize_test --benchmark_min_time=0.01) + +compile_benchmark_test(fixture_test) +add_test(fixture_test fixture_test --benchmark_min_time=0.01) + +compile_benchmark_test(register_benchmark_test) +add_test(register_benchmark_test register_benchmark_test --benchmark_min_time=0.01) + +compile_benchmark_test(map_test) +add_test(map_test map_test --benchmark_min_time=0.01) + +compile_benchmark_test(multiple_ranges_test) +add_test(multiple_ranges_test multiple_ranges_test --benchmark_min_time=0.01) + +compile_benchmark_test_with_main(link_main_test) +add_test(link_main_test link_main_test --benchmark_min_time=0.01) + 
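+# The suites below compare reporter output against expected patterns, so they
+# are registered with compile_output_test(), which links in the
+# output_test_helper library (and hence the pattern-matching harness from
+# output_test.h) on top of the benchmark library itself.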
+compile_output_test(reporter_output_test) +add_test(reporter_output_test reporter_output_test --benchmark_min_time=0.01) + +compile_output_test(templated_fixture_test) +add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01) + +compile_output_test(user_counters_test) +add_test(user_counters_test user_counters_test --benchmark_min_time=0.01) + +compile_output_test(internal_threading_test) +add_test(internal_threading_test internal_threading_test --benchmark_min_time=0.01) + +compile_output_test(report_aggregates_only_test) +add_test(report_aggregates_only_test report_aggregates_only_test --benchmark_min_time=0.01) + +compile_output_test(display_aggregates_only_test) +add_test(display_aggregates_only_test display_aggregates_only_test --benchmark_min_time=0.01) + +compile_output_test(user_counters_tabular_test) +add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01) + +compile_output_test(user_counters_thousands_test) +add_test(user_counters_thousands_test user_counters_thousands_test --benchmark_min_time=0.01) + +compile_output_test(memory_manager_test) +add_test(memory_manager_test memory_manager_test --benchmark_min_time=0.01) + +check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG) +if (BENCHMARK_HAS_CXX03_FLAG) + compile_benchmark_test(cxx03_test) + set_target_properties(cxx03_test + PROPERTIES + COMPILE_FLAGS "-std=c++03") + # libstdc++ provides different definitions within between dialects. When + # LTO is enabled and -Werror is specified GCC diagnoses this ODR violation + # causing the test to fail to compile. To prevent this we explicitly disable + # the warning. + check_cxx_compiler_flag(-Wno-odr BENCHMARK_HAS_WNO_ODR) + if (BENCHMARK_ENABLE_LTO AND BENCHMARK_HAS_WNO_ODR) + set_target_properties(cxx03_test + PROPERTIES + LINK_FLAGS "-Wno-odr") + endif() + add_test(cxx03 cxx03_test --benchmark_min_time=0.01) +endif() + +# Attempt to work around flaky test failures when running on Appveyor servers. 
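+# The idea (presumably) being that a longer --benchmark_min_time gives the
+# complexity fit more stable samples on busy CI machines, while local builds
+# keep the fast 0.01s floor used by the rest of this file.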
+if (DEFINED ENV{APPVEYOR}) + set(COMPLEXITY_MIN_TIME "0.5") +else() + set(COMPLEXITY_MIN_TIME "0.01") +endif() +compile_output_test(complexity_test) +add_test(complexity_benchmark complexity_test --benchmark_min_time=${COMPLEXITY_MIN_TIME}) + +############################################################################### +# GoogleTest Unit Tests +############################################################################### + +if (BENCHMARK_ENABLE_GTEST_TESTS) + macro(compile_gtest name) + add_executable(${name} "${name}.cc") + target_link_libraries(${name} benchmark + gmock_main ${CMAKE_THREAD_LIBS_INIT}) + endmacro(compile_gtest) + + macro(add_gtest name) + compile_gtest(${name}) + add_test(${name} ${name}) + endmacro() + + add_gtest(benchmark_gtest) + add_gtest(benchmark_name_gtest) + add_gtest(commandlineflags_gtest) + add_gtest(statistics_gtest) + add_gtest(string_util_gtest) +endif(BENCHMARK_ENABLE_GTEST_TESTS) + +############################################################################### +# Assembly Unit Tests +############################################################################### + +if (BENCHMARK_ENABLE_ASSEMBLY_TESTS) + if (NOT LLVM_FILECHECK_EXE) + message(FATAL_ERROR "LLVM FileCheck is required when including this file") + endif() + include(AssemblyTests.cmake) + add_filecheck_test(donotoptimize_assembly_test) + add_filecheck_test(state_assembly_test) + add_filecheck_test(clobber_memory_assembly_test) +endif() + + + +############################################################################### +# Code Coverage Configuration +############################################################################### + +# Add the coverage command(s) +if(CMAKE_BUILD_TYPE) + string(TOLOWER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_LOWER) +endif() +if (${CMAKE_BUILD_TYPE_LOWER} MATCHES "coverage") + find_program(GCOV gcov) + find_program(LCOV lcov) + find_program(GENHTML genhtml) + find_program(CTEST ctest) + if (GCOV AND LCOV AND GENHTML AND CTEST AND HAVE_CXX_FLAG_COVERAGE) + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/lcov/index.html + COMMAND ${LCOV} -q -z -d . + COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . -o before.lcov -i + COMMAND ${CTEST} --force-new-ctest-process + COMMAND ${LCOV} -q --no-external -c -b "${CMAKE_SOURCE_DIR}" -d . 
-o after.lcov + COMMAND ${LCOV} -q -a before.lcov -a after.lcov --output-file final.lcov + COMMAND ${LCOV} -q -r final.lcov "'${CMAKE_SOURCE_DIR}/test/*'" -o final.lcov + COMMAND ${GENHTML} final.lcov -o lcov --demangle-cpp --sort -p "${CMAKE_BINARY_DIR}" -t benchmark + DEPENDS filter_test benchmark_test options_test basic_test fixture_test cxx03_test complexity_test + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMENT "Running LCOV" + ) + add_custom_target(coverage + DEPENDS ${CMAKE_BINARY_DIR}/lcov/index.html + COMMENT "LCOV report at lcov/index.html" + ) + message(STATUS "Coverage command added") + else() + if (HAVE_CXX_FLAG_COVERAGE) + set(CXX_FLAG_COVERAGE_MESSAGE supported) + else() + set(CXX_FLAG_COVERAGE_MESSAGE unavailable) + endif() + message(WARNING + "Coverage not available:\n" + " gcov: ${GCOV}\n" + " lcov: ${LCOV}\n" + " genhtml: ${GENHTML}\n" + " ctest: ${CTEST}\n" + " --coverage flag: ${CXX_FLAG_COVERAGE_MESSAGE}") + endif() +endif() diff --git a/thirdparty/benchmark-1.5.0/test/basic_test.cc b/thirdparty/benchmark-1.5.0/test/basic_test.cc new file mode 100644 index 0000000000..5f3dd1a3ee --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/basic_test.cc @@ -0,0 +1,136 @@ + +#include "benchmark/benchmark.h" + +#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192) + +void BM_empty(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } +} +BENCHMARK(BM_empty); +BENCHMARK(BM_empty)->ThreadPerCpu(); + +void BM_spin_empty(benchmark::State& state) { + for (auto _ : state) { + for (int x = 0; x < state.range(0); ++x) { + benchmark::DoNotOptimize(x); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_empty); +BASIC_BENCHMARK_TEST(BM_spin_empty)->ThreadPerCpu(); + +void BM_spin_pause_before(benchmark::State& state) { + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + for (auto _ : state) { + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_before); +BASIC_BENCHMARK_TEST(BM_spin_pause_before)->ThreadPerCpu(); + +void BM_spin_pause_during(benchmark::State& state) { + for (auto _ : state) { + state.PauseTiming(); + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + state.ResumeTiming(); + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_during); +BASIC_BENCHMARK_TEST(BM_spin_pause_during)->ThreadPerCpu(); + +void BM_pause_during(benchmark::State& state) { + for (auto _ : state) { + state.PauseTiming(); + state.ResumeTiming(); + } +} +BENCHMARK(BM_pause_during); +BENCHMARK(BM_pause_during)->ThreadPerCpu(); +BENCHMARK(BM_pause_during)->UseRealTime(); +BENCHMARK(BM_pause_during)->UseRealTime()->ThreadPerCpu(); + +void BM_spin_pause_after(benchmark::State& state) { + for (auto _ : state) { + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + } + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } +} +BASIC_BENCHMARK_TEST(BM_spin_pause_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_after)->ThreadPerCpu(); + +void BM_spin_pause_before_and_after(benchmark::State& state) { + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + for (auto _ : state) { + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } + } + for (int i = 0; i < state.range(0); ++i) { + benchmark::DoNotOptimize(i); + } +} 
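+// As above, BASIC_BENCHMARK_TEST registers the benchmark three times, with
+// state.range(0) bound to 8, 512 and 8192 in turn (see the macro definition
+// at the top of this file), so each pause/resume pattern is exercised at
+// several working-set sizes.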
+BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after); +BASIC_BENCHMARK_TEST(BM_spin_pause_before_and_after)->ThreadPerCpu(); + +void BM_empty_stop_start(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_empty_stop_start); +BENCHMARK(BM_empty_stop_start)->ThreadPerCpu(); + + +void BM_KeepRunning(benchmark::State& state) { + benchmark::IterationCount iter_count = 0; + assert(iter_count == state.iterations()); + while (state.KeepRunning()) { + ++iter_count; + } + assert(iter_count == state.iterations()); +} +BENCHMARK(BM_KeepRunning); + +void BM_KeepRunningBatch(benchmark::State& state) { + // Choose a prime batch size to avoid evenly dividing max_iterations. + const benchmark::IterationCount batch_size = 101; + benchmark::IterationCount iter_count = 0; + while (state.KeepRunningBatch(batch_size)) { + iter_count += batch_size; + } + assert(state.iterations() == iter_count); +} +BENCHMARK(BM_KeepRunningBatch); + +void BM_RangedFor(benchmark::State& state) { + benchmark::IterationCount iter_count = 0; + for (auto _ : state) { + ++iter_count; + } + assert(iter_count == state.max_iterations); +} +BENCHMARK(BM_RangedFor); + +// Ensure that StateIterator provides all the necessary typedefs required to +// instantiate std::iterator_traits. +static_assert(std::is_same< + typename std::iterator_traits::value_type, + typename benchmark::State::StateIterator::value_type>::value, ""); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/benchmark_gtest.cc b/thirdparty/benchmark-1.5.0/test/benchmark_gtest.cc new file mode 100644 index 0000000000..9557b20ec7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/benchmark_gtest.cc @@ -0,0 +1,128 @@ +#include + +#include "../src/benchmark_register.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace benchmark { +namespace internal { +namespace { + +TEST(AddRangeTest, Simple) { + std::vector dst; + AddRange(&dst, 1, 2, 2); + EXPECT_THAT(dst, testing::ElementsAre(1, 2)); +} + +TEST(AddRangeTest, Simple64) { + std::vector dst; + AddRange(&dst, static_cast(1), static_cast(2), 2); + EXPECT_THAT(dst, testing::ElementsAre(1, 2)); +} + +TEST(AddRangeTest, Advanced) { + std::vector dst; + AddRange(&dst, 5, 15, 2); + EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15)); +} + +TEST(AddRangeTest, Advanced64) { + std::vector dst; + AddRange(&dst, static_cast(5), static_cast(15), 2); + EXPECT_THAT(dst, testing::ElementsAre(5, 8, 15)); +} + +TEST(AddRangeTest, FullRange8) { + std::vector dst; + AddRange(&dst, int8_t{1}, std::numeric_limits::max(), 8); + EXPECT_THAT(dst, testing::ElementsAre(1, 8, 64, 127)); +} + +TEST(AddRangeTest, FullRange64) { + std::vector dst; + AddRange(&dst, int64_t{1}, std::numeric_limits::max(), 1024); + EXPECT_THAT( + dst, testing::ElementsAre(1LL, 1024LL, 1048576LL, 1073741824LL, + 1099511627776LL, 1125899906842624LL, + 1152921504606846976LL, 9223372036854775807LL)); +} + +TEST(AddRangeTest, NegativeRanges) { + std::vector dst; + AddRange(&dst, -8, 0, 2); + EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0)); +} + +TEST(AddRangeTest, StrictlyNegative) { + std::vector dst; + AddRange(&dst, -8, -1, 2); + EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1)); +} + +TEST(AddRangeTest, SymmetricNegativeRanges) { + std::vector dst; + AddRange(&dst, -8, 8, 2); + EXPECT_THAT(dst, testing::ElementsAre(-8, -4, -2, -1, 0, 1, 2, 4, 8)); +} + +TEST(AddRangeTest, SymmetricNegativeRangesOddMult) { + std::vector dst; + AddRange(&dst, -30, 32, 5); + EXPECT_THAT(dst, testing::ElementsAre(-30, -25, -5, -1, 0, 1, 
5, 25, 32));
+}
+
+TEST(AddRangeTest, NegativeRangesAsymmetric) {
+  std::vector<int> dst;
+  AddRange(&dst, -3, 5, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-3, -2, -1, 0, 1, 2, 4, 5));
+}
+
+TEST(AddRangeTest, NegativeRangesLargeStep) {
+  // Always include -1, 0, 1 when crossing zero.
+  std::vector<int> dst;
+  AddRange(&dst, -8, 8, 10);
+  EXPECT_THAT(dst, testing::ElementsAre(-8, -1, 0, 1, 8));
+}
+
+TEST(AddRangeTest, ZeroOnlyRange) {
+  std::vector<int> dst;
+  AddRange(&dst, 0, 0, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(0));
+}
+
+TEST(AddRangeTest, NegativeRange64) {
+  std::vector<int64_t> dst;
+  AddRange<int64_t>(&dst, -4, 4, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(-4, -2, -1, 0, 1, 2, 4));
+}
+
+TEST(AddRangeTest, NegativeRangePreservesExistingOrder) {
+  // If elements already exist in the range, ensure we don't change
+  // their ordering by adding negative values.
+  std::vector<int64_t> dst = {1, 2, 3};
+  AddRange<int64_t>(&dst, -2, 2, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 3, -2, -1, 0, 1, 2));
+}
+
+TEST(AddRangeTest, FullNegativeRange64) {
+  std::vector<int64_t> dst;
+  const auto min = std::numeric_limits<int64_t>::min();
+  const auto max = std::numeric_limits<int64_t>::max();
+  AddRange(&dst, min, max, 1024);
+  EXPECT_THAT(
+      dst, testing::ElementsAreArray(std::vector<int64_t>{
+               min, -1152921504606846976LL, -1125899906842624LL,
+               -1099511627776LL, -1073741824LL, -1048576LL, -1024LL, -1LL, 0LL,
+               1LL, 1024LL, 1048576LL, 1073741824LL, 1099511627776LL,
+               1125899906842624LL, 1152921504606846976LL, max}));
+}
+
+TEST(AddRangeTest, Simple8) {
+  std::vector<int8_t> dst;
+  AddRange<int8_t>(&dst, 1, 8, 2);
+  EXPECT_THAT(dst, testing::ElementsAre(1, 2, 4, 8));
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
diff --git a/thirdparty/benchmark-1.5.0/test/benchmark_name_gtest.cc b/thirdparty/benchmark-1.5.0/test/benchmark_name_gtest.cc
new file mode 100644
index 0000000000..afb401c1f5
--- /dev/null
+++ b/thirdparty/benchmark-1.5.0/test/benchmark_name_gtest.cc
@@ -0,0 +1,74 @@
+#include "benchmark/benchmark.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+using namespace benchmark;
+using namespace benchmark::internal;
+
+TEST(BenchmarkNameTest, Empty) {
+  const auto name = BenchmarkName();
+  EXPECT_EQ(name.str(), std::string());
+}
+
+TEST(BenchmarkNameTest, FunctionName) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  EXPECT_EQ(name.str(), "function_name");
+}
+
+TEST(BenchmarkNameTest, FunctionNameAndArgs) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4/5";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/5");
+}
+
+TEST(BenchmarkNameTest, MinTime) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.args = "some_args:3/4";
+  name.min_time = "min_time:3.4s";
+  EXPECT_EQ(name.str(), "function_name/some_args:3/4/min_time:3.4s");
+}
+
+TEST(BenchmarkNameTest, Iterations) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.iterations = "iterations:42";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/iterations:42");
+}
+
+TEST(BenchmarkNameTest, Repetitions) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.repetitions = "repetitions:24";
+  EXPECT_EQ(name.str(), "function_name/min_time:3.4s/repetitions:24");
+}
+
+TEST(BenchmarkNameTest, TimeType) {
+  auto name = BenchmarkName();
+  name.function_name = "function_name";
+  name.min_time = "min_time:3.4s";
+  name.time_type = "hammer_time";
+  EXPECT_EQ(name.str(),
"function_name/min_time:3.4s/hammer_time"); +} + +TEST(BenchmarkNameTest, Threads) { + auto name = BenchmarkName(); + name.function_name = "function_name"; + name.min_time = "min_time:3.4s"; + name.threads = "threads:256"; + EXPECT_EQ(name.str(), "function_name/min_time:3.4s/threads:256"); +} + +TEST(BenchmarkNameTest, TestEmptyFunctionName) { + auto name = BenchmarkName(); + name.args = "first:3/second:4"; + name.threads = "threads:22"; + EXPECT_EQ(name.str(), "first:3/second:4/threads:22"); +} + +} // end namespace diff --git a/thirdparty/benchmark-1.5.0/test/benchmark_test.cc b/thirdparty/benchmark-1.5.0/test/benchmark_test.cc new file mode 100644 index 0000000000..3cd4f5565f --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/benchmark_test.cc @@ -0,0 +1,245 @@ +#include "benchmark/benchmark.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) +#define BENCHMARK_NOINLINE __attribute__((noinline)) +#else +#define BENCHMARK_NOINLINE +#endif + +namespace { + +int BENCHMARK_NOINLINE Factorial(uint32_t n) { + return (n == 1) ? 1 : n * Factorial(n - 1); +} + +double CalculatePi(int depth) { + double pi = 0.0; + for (int i = 0; i < depth; ++i) { + double numerator = static_cast(((i % 2) * 2) - 1); + double denominator = static_cast((2 * i) - 1); + pi += numerator / denominator; + } + return (pi - 1.0) * 4; +} + +std::set ConstructRandomSet(int64_t size) { + std::set s; + for (int i = 0; i < size; ++i) s.insert(s.end(), i); + return s; +} + +std::mutex test_vector_mu; +std::vector* test_vector = nullptr; + +} // end namespace + +static void BM_Factorial(benchmark::State& state) { + int fac_42 = 0; + for (auto _ : state) fac_42 = Factorial(8); + // Prevent compiler optimizations + std::stringstream ss; + ss << fac_42; + state.SetLabel(ss.str()); +} +BENCHMARK(BM_Factorial); +BENCHMARK(BM_Factorial)->UseRealTime(); + +static void BM_CalculatePiRange(benchmark::State& state) { + double pi = 0.0; + for (auto _ : state) pi = CalculatePi(static_cast(state.range(0))); + std::stringstream ss; + ss << pi; + state.SetLabel(ss.str()); +} +BENCHMARK_RANGE(BM_CalculatePiRange, 1, 1024 * 1024); + +static void BM_CalculatePi(benchmark::State& state) { + static const int depth = 1024; + for (auto _ : state) { + benchmark::DoNotOptimize(CalculatePi(static_cast(depth))); + } +} +BENCHMARK(BM_CalculatePi)->Threads(8); +BENCHMARK(BM_CalculatePi)->ThreadRange(1, 32); +BENCHMARK(BM_CalculatePi)->ThreadPerCpu(); + +static void BM_SetInsert(benchmark::State& state) { + std::set data; + for (auto _ : state) { + state.PauseTiming(); + data = ConstructRandomSet(state.range(0)); + state.ResumeTiming(); + for (int j = 0; j < state.range(1); ++j) data.insert(rand()); + } + state.SetItemsProcessed(state.iterations() * state.range(1)); + state.SetBytesProcessed(state.iterations() * state.range(1) * sizeof(int)); +} + +// Test many inserts at once to reduce the total iterations needed. Otherwise, the slower, +// non-timed part of each iteration will make the benchmark take forever. 
+BENCHMARK(BM_SetInsert)->Ranges({{1 << 10, 8 << 10}, {128, 512}});
+
+template <typename Container,
+          typename ValueType = typename Container::value_type>
+static void BM_Sequential(benchmark::State& state) {
+  ValueType v = 42;
+  for (auto _ : state) {
+    Container c;
+    for (int64_t i = state.range(0); --i;) c.push_back(v);
+  }
+  const int64_t items_processed = state.iterations() * state.range(0);
+  state.SetItemsProcessed(items_processed);
+  state.SetBytesProcessed(items_processed * sizeof(v));
+}
+BENCHMARK_TEMPLATE2(BM_Sequential, std::vector<int>, int)
+    ->Range(1 << 0, 1 << 10);
+BENCHMARK_TEMPLATE(BM_Sequential, std::list<int>)->Range(1 << 0, 1 << 10);
+// Test the variadic version of BENCHMARK_TEMPLATE in C++11 and beyond.
+#ifdef BENCHMARK_HAS_CXX11
+BENCHMARK_TEMPLATE(BM_Sequential, std::vector<int>, int)->Arg(512);
+#endif
+
+static void BM_StringCompare(benchmark::State& state) {
+  size_t len = static_cast<size_t>(state.range(0));
+  std::string s1(len, '-');
+  std::string s2(len, '-');
+  for (auto _ : state) benchmark::DoNotOptimize(s1.compare(s2));
+}
+BENCHMARK(BM_StringCompare)->Range(1, 1 << 20);
+
+static void BM_SetupTeardown(benchmark::State& state) {
+  if (state.thread_index == 0) {
+    // No need to lock test_vector_mu here as this is running single-threaded.
+    test_vector = new std::vector<int>();
+  }
+  int i = 0;
+  for (auto _ : state) {
+    std::lock_guard<std::mutex> l(test_vector_mu);
+    if (i % 2 == 0)
+      test_vector->push_back(i);
+    else
+      test_vector->pop_back();
+    ++i;
+  }
+  if (state.thread_index == 0) {
+    delete test_vector;
+  }
+}
+BENCHMARK(BM_SetupTeardown)->ThreadPerCpu();
+
+static void BM_LongTest(benchmark::State& state) {
+  double tracker = 0.0;
+  for (auto _ : state) {
+    for (int i = 0; i < state.range(0); ++i)
+      benchmark::DoNotOptimize(tracker += i);
+  }
+}
+BENCHMARK(BM_LongTest)->Range(1 << 16, 1 << 28);
+
+static void BM_ParallelMemset(benchmark::State& state) {
+  int64_t size = state.range(0) / static_cast<int64_t>(sizeof(int));
+  int thread_size = static_cast<int>(size) / state.threads;
+  int from = thread_size * state.thread_index;
+  int to = from + thread_size;
+
+  if (state.thread_index == 0) {
+    test_vector = new std::vector<int>(static_cast<size_t>(size));
+  }
+
+  for (auto _ : state) {
+    for (int i = from; i < to; i++) {
+      // No need to lock test_vector_mu as ranges
+      // do not overlap between threads.
+      benchmark::DoNotOptimize(test_vector->at(i) = 1);
+    }
+  }
+
+  if (state.thread_index == 0) {
+    delete test_vector;
+  }
+}
+BENCHMARK(BM_ParallelMemset)->Arg(10 << 20)->ThreadRange(1, 4);
+
+static void BM_ManualTiming(benchmark::State& state) {
+  int64_t slept_for = 0;
+  int64_t microseconds = state.range(0);
+  std::chrono::duration<double, std::micro> sleep_duration{
+      static_cast<double>(microseconds)};
+
+  for (auto _ : state) {
+    auto start = std::chrono::high_resolution_clock::now();
+    // Simulate some useful workload with a sleep
+    std::this_thread::sleep_for(
+        std::chrono::duration_cast<std::chrono::microseconds>(sleep_duration));
+    auto end = std::chrono::high_resolution_clock::now();
+
+    auto elapsed =
+        std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+
+    state.SetIterationTime(elapsed.count());
+    slept_for += microseconds;
+  }
+  state.SetItemsProcessed(slept_for);
+}
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseRealTime();
+BENCHMARK(BM_ManualTiming)->Range(1, 1 << 14)->UseManualTime();
+
+#ifdef BENCHMARK_HAS_CXX11
+
+template <class... Args>
+void BM_with_args(benchmark::State& state, Args&&...)
{ + for (auto _ : state) { + } +} +BENCHMARK_CAPTURE(BM_with_args, int_test, 42, 43, 44); +BENCHMARK_CAPTURE(BM_with_args, string_and_pair_test, std::string("abc"), + std::pair(42, 3.8)); + +void BM_non_template_args(benchmark::State& state, int, double) { + while(state.KeepRunning()) {} +} +BENCHMARK_CAPTURE(BM_non_template_args, basic_test, 0, 0); + +#endif // BENCHMARK_HAS_CXX11 + +static void BM_DenseThreadRanges(benchmark::State& st) { + switch (st.range(0)) { + case 1: + assert(st.threads == 1 || st.threads == 2 || st.threads == 3); + break; + case 2: + assert(st.threads == 1 || st.threads == 3 || st.threads == 4); + break; + case 3: + assert(st.threads == 5 || st.threads == 8 || st.threads == 11 || + st.threads == 14); + break; + default: + assert(false && "Invalid test case number"); + } + while (st.KeepRunning()) { + } +} +BENCHMARK(BM_DenseThreadRanges)->Arg(1)->DenseThreadRange(1, 3); +BENCHMARK(BM_DenseThreadRanges)->Arg(2)->DenseThreadRange(1, 4, 2); +BENCHMARK(BM_DenseThreadRanges)->Arg(3)->DenseThreadRange(5, 14, 3); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/clobber_memory_assembly_test.cc b/thirdparty/benchmark-1.5.0/test/clobber_memory_assembly_test.cc new file mode 100644 index 0000000000..f41911a39c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/clobber_memory_assembly_test.cc @@ -0,0 +1,64 @@ +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wreturn-type" +#endif + +extern "C" { + +extern int ExternInt; +extern int ExternInt2; +extern int ExternInt3; + +} + +// CHECK-LABEL: test_basic: +extern "C" void test_basic() { + int x; + benchmark::DoNotOptimize(&x); + x = 101; + benchmark::ClobberMemory(); + // CHECK: leaq [[DEST:[^,]+]], %rax + // CHECK: movl $101, [[DEST]] + // CHECK: ret +} + +// CHECK-LABEL: test_redundant_store: +extern "C" void test_redundant_store() { + ExternInt = 3; + benchmark::ClobberMemory(); + ExternInt = 51; + // CHECK-DAG: ExternInt + // CHECK-DAG: movl $3 + // CHECK: movl $51 +} + +// CHECK-LABEL: test_redundant_read: +extern "C" void test_redundant_read() { + int x; + benchmark::DoNotOptimize(&x); + x = ExternInt; + benchmark::ClobberMemory(); + x = ExternInt2; + // CHECK: leaq [[DEST:[^,]+]], %rax + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, [[DEST]] + // CHECK-NOT: ExternInt2 + // CHECK: ret +} + +// CHECK-LABEL: test_redundant_read2: +extern "C" void test_redundant_read2() { + int x; + benchmark::DoNotOptimize(&x); + x = ExternInt; + benchmark::ClobberMemory(); + x = ExternInt2; + benchmark::ClobberMemory(); + // CHECK: leaq [[DEST:[^,]+]], %rax + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, [[DEST]] + // CHECK: ExternInt2(%rip) + // CHECK: movl %eax, [[DEST]] + // CHECK: ret +} diff --git a/thirdparty/benchmark-1.5.0/test/commandlineflags_gtest.cc b/thirdparty/benchmark-1.5.0/test/commandlineflags_gtest.cc new file mode 100644 index 0000000000..5460778c48 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/commandlineflags_gtest.cc @@ -0,0 +1,78 @@ +#include + +#include "../src/commandlineflags.h" +#include "../src/internal_macros.h" +#include "gtest/gtest.h" + +namespace benchmark { +namespace { + +#if defined(BENCHMARK_OS_WINDOWS) +int setenv(const char* name, const char* value, int overwrite) { + if (!overwrite) { + // NOTE: getenv_s is far superior but not available under mingw. 
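+    // (The probe below is best-effort: this shim only approximates POSIX
+    // setenv(), and the tests in this file always pass overwrite=1, so only
+    // the _putenv_s() path is exercised in practice.)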
+ char* env_value = getenv(name); + if (env_value == nullptr) { + return -1; + } + } + return _putenv_s(name, value); +} + +int unsetenv(const char* name) { + return _putenv_s(name, ""); +} + +#endif // BENCHMARK_OS_WINDOWS + +TEST(BoolFromEnv, Default) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_EQ(BoolFromEnv("not_in_env", true), true); +} + +TEST(BoolFromEnv, False) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "0", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", true), false); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(BoolFromEnv, True) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "1", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", false), true); + unsetenv("BENCHMARK_IN_ENV"); + + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_EQ(BoolFromEnv("in_env", false), true); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(Int32FromEnv, NotInEnv) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_EQ(Int32FromEnv("not_in_env", 42), 42); +} + +TEST(Int32FromEnv, InvalidInteger) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_EQ(Int32FromEnv("in_env", 42), 42); + ASSERT_EQ(unsetenv("BENCHMARK_IN_ENV"), 0); +} + +TEST(Int32FromEnv, ValidInteger) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "42", 1), 0); + EXPECT_EQ(Int32FromEnv("in_env", 64), 42); + unsetenv("BENCHMARK_IN_ENV"); +} + +TEST(StringFromEnv, Default) { + ASSERT_EQ(unsetenv("BENCHMARK_NOT_IN_ENV"), 0); + EXPECT_STREQ(StringFromEnv("not_in_env", "foo"), "foo"); +} + +TEST(StringFromEnv, Valid) { + ASSERT_EQ(setenv("BENCHMARK_IN_ENV", "foo", 1), 0); + EXPECT_STREQ(StringFromEnv("in_env", "bar"), "foo"); + unsetenv("BENCHMARK_IN_ENV"); +} + +} // namespace +} // namespace benchmark diff --git a/thirdparty/benchmark-1.5.0/test/complexity_test.cc b/thirdparty/benchmark-1.5.0/test/complexity_test.cc new file mode 100644 index 0000000000..d4febbbc15 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/complexity_test.cc @@ -0,0 +1,211 @@ +#undef NDEBUG +#include +#include +#include +#include +#include +#include "benchmark/benchmark.h" +#include "output_test.h" + +namespace { + +#define ADD_COMPLEXITY_CASES(...) \ + int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__) + +int AddComplexityTest(std::string test_name, std::string big_o_test_name, + std::string rms_test_name, std::string big_o) { + SetSubstitutions({{"%name", test_name}, + {"%bigo_name", big_o_test_name}, + {"%rms_name", rms_test_name}, + {"%bigo_str", "[ ]* %float " + big_o}, + {"%bigo", big_o}, + {"%rms", "[ ]*[0-9]+ %"}}); + AddCases( + TC_ConsoleOut, + {{"^%bigo_name %bigo_str %bigo_str[ ]*$"}, + {"^%bigo_name", MR_Not}, // Assert we we didn't only matched a name. 
+ {"^%rms_name %rms %rms[ ]*$", MR_Next}}); + AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"}, + {"\"run_name\": \"%name\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": %int,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"BigO\",$", MR_Next}, + {"\"cpu_coefficient\": %float,$", MR_Next}, + {"\"real_coefficient\": %float,$", MR_Next}, + {"\"big_o\": \"%bigo\",$", MR_Next}, + {"\"time_unit\": \"ns\"$", MR_Next}, + {"}", MR_Next}, + {"\"name\": \"%rms_name\",$"}, + {"\"run_name\": \"%name\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": %int,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"RMS\",$", MR_Next}, + {"\"rms\": %float$", MR_Next}, + {"}", MR_Next}}); + AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"}, + {"^\"%bigo_name\"", MR_Not}, + {"^\"%rms_name\",,%float,%float,,,,,,$", MR_Next}}); + return 0; +} + +} // end namespace + +// ========================================================================= // +// --------------------------- Testing BigO O(1) --------------------------- // +// ========================================================================= // + +void BM_Complexity_O1(benchmark::State& state) { + for (auto _ : state) { + for (int i = 0; i < 1024; ++i) { + benchmark::DoNotOptimize(&i); + } + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); +BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(); +BENCHMARK(BM_Complexity_O1) + ->Range(1, 1 << 18) + ->Complexity([](benchmark::IterationCount) { return 1.0; }); + +const char *one_test_name = "BM_Complexity_O1"; +const char *big_o_1_test_name = "BM_Complexity_O1_BigO"; +const char *rms_o_1_test_name = "BM_Complexity_O1_RMS"; +const char *enum_big_o_1 = "\\([0-9]+\\)"; +// FIXME: Tolerate both '(1)' and 'lgN' as output when the complexity is auto +// deduced. 
+// See https://github.com/google/benchmark/issues/272 +const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)"; +const char *lambda_big_o_1 = "f\\(N\\)"; + +// Add enum tests +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + enum_big_o_1); + +// Add auto enum tests +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + auto_big_o_1); + +// Add lambda tests +ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name, + lambda_big_o_1); + +// ========================================================================= // +// --------------------------- Testing BigO O(N) --------------------------- // +// ========================================================================= // + +std::vector ConstructRandomVector(int64_t size) { + std::vector v; + v.reserve(static_cast(size)); + for (int i = 0; i < size; ++i) { + v.push_back(static_cast(std::rand() % size)); + } + return v; +} + +void BM_Complexity_O_N(benchmark::State& state) { + auto v = ConstructRandomVector(state.range(0)); + // Test worst case scenario (item not in vector) + const int64_t item_not_in_vector = state.range(0) * 2; + for (auto _ : state) { + benchmark::DoNotOptimize(std::find(v.begin(), v.end(), item_not_in_vector)); + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_Complexity_O_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity(benchmark::oN); +BENCHMARK(BM_Complexity_O_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity([](benchmark::IterationCount n) -> double { + return static_cast(n); + }); +BENCHMARK(BM_Complexity_O_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity(); + +const char *n_test_name = "BM_Complexity_O_N"; +const char *big_o_n_test_name = "BM_Complexity_O_N_BigO"; +const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS"; +const char *enum_auto_big_o_n = "N"; +const char *lambda_big_o_n = "f\\(N\\)"; + +// Add enum tests +ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name, + enum_auto_big_o_n); + +// Add lambda tests +ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name, + lambda_big_o_n); + +// ========================================================================= // +// ------------------------- Testing BigO O(N*lgN) ------------------------- // +// ========================================================================= // + +static void BM_Complexity_O_N_log_N(benchmark::State& state) { + auto v = ConstructRandomVector(state.range(0)); + for (auto _ : state) { + std::sort(v.begin(), v.end()); + } + state.SetComplexityN(state.range(0)); +} +static const double kLog2E = 1.44269504088896340736; +BENCHMARK(BM_Complexity_O_N_log_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity(benchmark::oNLogN); +BENCHMARK(BM_Complexity_O_N_log_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity([](benchmark::IterationCount n) { + return kLog2E * n * log(static_cast(n)); + }); +BENCHMARK(BM_Complexity_O_N_log_N) + ->RangeMultiplier(2) + ->Range(1 << 10, 1 << 16) + ->Complexity(); + +const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N"; +const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO"; +const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS"; +const char *enum_auto_big_o_n_lg_n = "NlgN"; +const char *lambda_big_o_n_lg_n = "f\\(N\\)"; + +// Add enum tests +ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name, + rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n); + +// Add lambda tests 
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name, + rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n); + +// ========================================================================= // +// -------- Testing formatting of Complexity with captured args ------------ // +// ========================================================================= // + +void BM_ComplexityCaptureArgs(benchmark::State& state, int n) { + for (auto _ : state) { + } + state.SetComplexityN(n); +} + +BENCHMARK_CAPTURE(BM_ComplexityCaptureArgs, capture_test, 100) + ->Complexity(benchmark::oN) + ->Ranges({{1, 2}, {3, 4}}); + +const std::string complexity_capture_name = + "BM_ComplexityCaptureArgs/capture_test"; + +ADD_COMPLEXITY_CASES(complexity_capture_name, complexity_capture_name + "_BigO", + complexity_capture_name + "_RMS", "N"); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char *argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/test/cxx03_test.cc b/thirdparty/benchmark-1.5.0/test/cxx03_test.cc new file mode 100644 index 0000000000..c4c9a52273 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/cxx03_test.cc @@ -0,0 +1,63 @@ +#undef NDEBUG +#include +#include + +#include "benchmark/benchmark.h" + +#if __cplusplus >= 201103L +#error C++11 or greater detected. Should be C++03. +#endif + +#ifdef BENCHMARK_HAS_CXX11 +#error C++11 or greater detected by the library. BENCHMARK_HAS_CXX11 is defined. +#endif + +void BM_empty(benchmark::State& state) { + while (state.KeepRunning()) { + volatile benchmark::IterationCount x = state.iterations(); + ((void)x); + } +} +BENCHMARK(BM_empty); + +// The new C++11 interface for args/ranges requires initializer list support. +// Therefore we provide the old interface to support C++03. +void BM_old_arg_range_interface(benchmark::State& state) { + assert((state.range(0) == 1 && state.range(1) == 2) || + (state.range(0) == 5 && state.range(1) == 6)); + while (state.KeepRunning()) { + } +} +BENCHMARK(BM_old_arg_range_interface)->ArgPair(1, 2)->RangePair(5, 5, 6, 6); + +template +void BM_template2(benchmark::State& state) { + BM_empty(state); +} +BENCHMARK_TEMPLATE2(BM_template2, int, long); + +template +void BM_template1(benchmark::State& state) { + BM_empty(state); +} +BENCHMARK_TEMPLATE(BM_template1, long); +BENCHMARK_TEMPLATE1(BM_template1, int); + +template +struct BM_Fixture : public ::benchmark::Fixture { +}; + +BENCHMARK_TEMPLATE_F(BM_Fixture, BM_template1, long)(benchmark::State& state) { + BM_empty(state); +} +BENCHMARK_TEMPLATE1_F(BM_Fixture, BM_template2, int)(benchmark::State& state) { + BM_empty(state); +} + +void BM_counters(benchmark::State& state) { + BM_empty(state); + state.counters["Foo"] = 2; +} +BENCHMARK(BM_counters); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/diagnostics_test.cc b/thirdparty/benchmark-1.5.0/test/diagnostics_test.cc new file mode 100644 index 0000000000..dd64a33655 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/diagnostics_test.cc @@ -0,0 +1,80 @@ +// Testing: +// State::PauseTiming() +// State::ResumeTiming() +// Test that CHECK's within these function diagnose when they are called +// outside of the KeepRunning() loop. +// +// NOTE: Users should NOT include or use src/check.h. This is only done in +// order to test library internals. 
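+//
+// Flow: main() installs TestHandler via GetAbortHandler(), so a failing
+// CHECK inside PauseTiming()/ResumeTiming() throws std::logic_error (or
+// aborts when exceptions are unavailable), which try_invalid_pause_resume()
+// catches to confirm the misuse was diagnosed.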
+ +#include +#include + +#include "../src/check.h" +#include "benchmark/benchmark.h" + +#if defined(__GNUC__) && !defined(__EXCEPTIONS) +#define TEST_HAS_NO_EXCEPTIONS +#endif + +void TestHandler() { +#ifndef TEST_HAS_NO_EXCEPTIONS + throw std::logic_error(""); +#else + std::abort(); +#endif +} + +void try_invalid_pause_resume(benchmark::State& state) { +#if !defined(TEST_BENCHMARK_LIBRARY_HAS_NO_ASSERTIONS) && !defined(TEST_HAS_NO_EXCEPTIONS) + try { + state.PauseTiming(); + std::abort(); + } catch (std::logic_error const&) { + } + try { + state.ResumeTiming(); + std::abort(); + } catch (std::logic_error const&) { + } +#else + (void)state; // avoid unused warning +#endif +} + +void BM_diagnostic_test(benchmark::State& state) { + static bool called_once = false; + + if (called_once == false) try_invalid_pause_resume(state); + + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } + + if (called_once == false) try_invalid_pause_resume(state); + + called_once = true; +} +BENCHMARK(BM_diagnostic_test); + + +void BM_diagnostic_test_keep_running(benchmark::State& state) { + static bool called_once = false; + + if (called_once == false) try_invalid_pause_resume(state); + + while(state.KeepRunning()) { + benchmark::DoNotOptimize(state.iterations()); + } + + if (called_once == false) try_invalid_pause_resume(state); + + called_once = true; +} +BENCHMARK(BM_diagnostic_test_keep_running); + +int main(int argc, char* argv[]) { + benchmark::internal::GetAbortHandler() = &TestHandler; + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/thirdparty/benchmark-1.5.0/test/display_aggregates_only_test.cc b/thirdparty/benchmark-1.5.0/test/display_aggregates_only_test.cc new file mode 100644 index 0000000000..3c36d3f03c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/display_aggregates_only_test.cc @@ -0,0 +1,43 @@ + +#undef NDEBUG +#include +#include + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// Ok this test is super ugly. We want to check what happens with the file +// reporter in the presence of DisplayAggregatesOnly(). +// We do not care about console output, the normal tests check that already. + +void BM_SummaryRepeat(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly(); + +int main(int argc, char* argv[]) { + const std::string output = GetFileReporterOutput(argc, argv); + + if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") != + 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") != + 1) { + std::cout << "Precondition mismatch. 
Expected to only find 6 " + "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n" + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_median\", " + "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire " + "output:\n"; + std::cout << output; + return 1; + } + + return 0; +} diff --git a/thirdparty/benchmark-1.5.0/test/donotoptimize_assembly_test.cc b/thirdparty/benchmark-1.5.0/test/donotoptimize_assembly_test.cc new file mode 100644 index 0000000000..d4b0bab70e --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/donotoptimize_assembly_test.cc @@ -0,0 +1,163 @@ +#include + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wreturn-type" +#endif + +extern "C" { + +extern int ExternInt; +extern int ExternInt2; +extern int ExternInt3; + +inline int Add42(int x) { return x + 42; } + +struct NotTriviallyCopyable { + NotTriviallyCopyable(); + explicit NotTriviallyCopyable(int x) : value(x) {} + NotTriviallyCopyable(NotTriviallyCopyable const&); + int value; +}; + +struct Large { + int value; + int data[2]; +}; + +} +// CHECK-LABEL: test_with_rvalue: +extern "C" void test_with_rvalue() { + benchmark::DoNotOptimize(Add42(0)); + // CHECK: movl $42, %eax + // CHECK: ret +} + +// CHECK-LABEL: test_with_large_rvalue: +extern "C" void test_with_large_rvalue() { + benchmark::DoNotOptimize(Large{ExternInt, {ExternInt, ExternInt}}); + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]] + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: ret +} + +// CHECK-LABEL: test_with_non_trivial_rvalue: +extern "C" void test_with_non_trivial_rvalue() { + benchmark::DoNotOptimize(NotTriviallyCopyable(ExternInt)); + // CHECK: mov{{l|q}} ExternInt(%rip) + // CHECK: ret +} + +// CHECK-LABEL: test_with_lvalue: +extern "C" void test_with_lvalue() { + int x = 101; + benchmark::DoNotOptimize(x); + // CHECK-GNU: movl $101, %eax + // CHECK-CLANG: movl $101, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: ret +} + +// CHECK-LABEL: test_with_large_lvalue: +extern "C" void test_with_large_lvalue() { + Large L{ExternInt, {ExternInt, ExternInt}}; + benchmark::DoNotOptimize(L); + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: ret +} + +// CHECK-LABEL: test_with_non_trivial_lvalue: +extern "C" void test_with_non_trivial_lvalue() { + NotTriviallyCopyable NTC(ExternInt); + benchmark::DoNotOptimize(NTC); + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: ret +} + +// CHECK-LABEL: test_with_const_lvalue: +extern "C" void test_with_const_lvalue() { + const int x = 123; + benchmark::DoNotOptimize(x); + // CHECK: movl $123, %eax + // CHECK: ret +} + +// CHECK-LABEL: test_with_large_const_lvalue: +extern "C" void test_with_large_const_lvalue() { + const Large L{ExternInt, {ExternInt, ExternInt}}; + benchmark::DoNotOptimize(L); + // CHECK: ExternInt(%rip) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: movl %eax, -{{[0-9]+}}(%[[REG]]) + // CHECK: ret +} + +// CHECK-LABEL: test_with_non_trivial_const_lvalue: +extern "C" void test_with_non_trivial_const_lvalue() { + const NotTriviallyCopyable Obj(ExternInt); + benchmark::DoNotOptimize(Obj); + // CHECK: 
mov{{q|l}} ExternInt(%rip) + // CHECK: ret +} + +// CHECK-LABEL: test_div_by_two: +extern "C" int test_div_by_two(int input) { + int divisor = 2; + benchmark::DoNotOptimize(divisor); + return input / divisor; + // CHECK: movl $2, [[DEST:.*]] + // CHECK: idivl [[DEST]] + // CHECK: ret +} + +// CHECK-LABEL: test_inc_integer: +extern "C" int test_inc_integer() { + int x = 0; + for (int i=0; i < 5; ++i) + benchmark::DoNotOptimize(++x); + // CHECK: movl $1, [[DEST:.*]] + // CHECK: {{(addl \$1,|incl)}} [[DEST]] + // CHECK: {{(addl \$1,|incl)}} [[DEST]] + // CHECK: {{(addl \$1,|incl)}} [[DEST]] + // CHECK: {{(addl \$1,|incl)}} [[DEST]] + // CHECK-CLANG: movl [[DEST]], %eax + // CHECK: ret + return x; +} + +// CHECK-LABEL: test_pointer_rvalue +extern "C" void test_pointer_rvalue() { + // CHECK: movl $42, [[DEST:.*]] + // CHECK: leaq [[DEST]], %rax + // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: ret + int x = 42; + benchmark::DoNotOptimize(&x); +} + +// CHECK-LABEL: test_pointer_const_lvalue: +extern "C" void test_pointer_const_lvalue() { + // CHECK: movl $42, [[DEST:.*]] + // CHECK: leaq [[DEST]], %rax + // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z]+]]) + // CHECK: ret + int x = 42; + int * const xp = &x; + benchmark::DoNotOptimize(xp); +} + +// CHECK-LABEL: test_pointer_lvalue: +extern "C" void test_pointer_lvalue() { + // CHECK: movl $42, [[DEST:.*]] + // CHECK: leaq [[DEST]], %rax + // CHECK-CLANG: movq %rax, -{{[0-9]+}}(%[[REG:[a-z+]+]]) + // CHECK: ret + int x = 42; + int *xp = &x; + benchmark::DoNotOptimize(xp); +} diff --git a/thirdparty/benchmark-1.5.0/test/donotoptimize_test.cc b/thirdparty/benchmark-1.5.0/test/donotoptimize_test.cc new file mode 100644 index 0000000000..2ce92d1c72 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/donotoptimize_test.cc @@ -0,0 +1,52 @@ +#include "benchmark/benchmark.h" + +#include + +namespace { +#if defined(__GNUC__) +std::uint64_t double_up(const std::uint64_t x) __attribute__((const)); +#endif +std::uint64_t double_up(const std::uint64_t x) { return x * 2; } +} + +// Using DoNotOptimize on types like BitRef seem to cause a lot of problems +// with the inline assembly on both GCC and Clang. 
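+//
+// For context, the usual role of DoNotOptimize() is to keep a value "live" so
+// the optimizer cannot delete the work under test. A minimal sketch (ours,
+// not part of the upstream test):
+//
+//   static void BM_Add(benchmark::State& state) {
+//     int a = 1, b = 2;
+//     for (auto _ : state) {
+//       benchmark::DoNotOptimize(a + b);  // result is observed, not elided
+//     }
+//   }
+//   BENCHMARK(BM_Add);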
+struct BitRef { + int index; + unsigned char &byte; + +public: + static BitRef Make() { + static unsigned char arr[2] = {}; + BitRef b(1, arr[0]); + return b; + } +private: + BitRef(int i, unsigned char& b) : index(i), byte(b) {} +}; + +int main(int, char*[]) { + // this test verifies compilation of DoNotOptimize() for some types + + char buffer8[8] = ""; + benchmark::DoNotOptimize(buffer8); + + char buffer20[20] = ""; + benchmark::DoNotOptimize(buffer20); + + char buffer1024[1024] = ""; + benchmark::DoNotOptimize(buffer1024); + benchmark::DoNotOptimize(&buffer1024[0]); + + int x = 123; + benchmark::DoNotOptimize(x); + benchmark::DoNotOptimize(&x); + benchmark::DoNotOptimize(x += 42); + + benchmark::DoNotOptimize(double_up(x)); + + // These tests are to e + benchmark::DoNotOptimize(BitRef::Make()); + BitRef lval = BitRef::Make(); + benchmark::DoNotOptimize(lval); +} diff --git a/thirdparty/benchmark-1.5.0/test/filter_test.cc b/thirdparty/benchmark-1.5.0/test/filter_test.cc new file mode 100644 index 0000000000..0e27065c15 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/filter_test.cc @@ -0,0 +1,104 @@ +#include "benchmark/benchmark.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace { + +class TestReporter : public benchmark::ConsoleReporter { + public: + virtual bool ReportContext(const Context& context) { + return ConsoleReporter::ReportContext(context); + }; + + virtual void ReportRuns(const std::vector& report) { + ++count_; + ConsoleReporter::ReportRuns(report); + }; + + TestReporter() : count_(0) {} + + virtual ~TestReporter() {} + + size_t GetCount() const { return count_; } + + private: + mutable size_t count_; +}; + +} // end namespace + +static void NoPrefix(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(NoPrefix); + +static void BM_Foo(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_Foo); + +static void BM_Bar(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_Bar); + +static void BM_FooBar(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_FooBar); + +static void BM_FooBa(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_FooBa); + +int main(int argc, char **argv) { + bool list_only = false; + for (int i = 0; i < argc; ++i) + list_only |= std::string(argv[i]).find("--benchmark_list_tests") != + std::string::npos; + + benchmark::Initialize(&argc, argv); + + TestReporter test_reporter; + const size_t returned_count = + benchmark::RunSpecifiedBenchmarks(&test_reporter); + + if (argc == 2) { + // Make sure we ran all of the tests + std::stringstream ss(argv[1]); + size_t expected_return; + ss >> expected_return; + + if (returned_count != expected_return) { + std::cerr << "ERROR: Expected " << expected_return + << " tests to match the filter but returned_count = " + << returned_count << std::endl; + return -1; + } + + const size_t expected_reports = list_only ? 
0 : expected_return; + const size_t reports_count = test_reporter.GetCount(); + if (reports_count != expected_reports) { + std::cerr << "ERROR: Expected " << expected_reports + << " tests to be run but reported_count = " << reports_count + << std::endl; + return -1; + } + } + + return 0; +} diff --git a/thirdparty/benchmark-1.5.0/test/fixture_test.cc b/thirdparty/benchmark-1.5.0/test/fixture_test.cc new file mode 100644 index 0000000000..1462b10f02 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/fixture_test.cc @@ -0,0 +1,49 @@ + +#include "benchmark/benchmark.h" + +#include +#include + +class MyFixture : public ::benchmark::Fixture { + public: + void SetUp(const ::benchmark::State& state) { + if (state.thread_index == 0) { + assert(data.get() == nullptr); + data.reset(new int(42)); + } + } + + void TearDown(const ::benchmark::State& state) { + if (state.thread_index == 0) { + assert(data.get() != nullptr); + data.reset(); + } + } + + ~MyFixture() { assert(data == nullptr); } + + std::unique_ptr data; +}; + +BENCHMARK_F(MyFixture, Foo)(benchmark::State &st) { + assert(data.get() != nullptr); + assert(*data == 42); + for (auto _ : st) { + } +} + +BENCHMARK_DEFINE_F(MyFixture, Bar)(benchmark::State& st) { + if (st.thread_index == 0) { + assert(data.get() != nullptr); + assert(*data == 42); + } + for (auto _ : st) { + assert(data.get() != nullptr); + assert(*data == 42); + } + st.SetItemsProcessed(st.range(0)); +} +BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42); +BENCHMARK_REGISTER_F(MyFixture, Bar)->Arg(42)->ThreadPerCpu(); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/internal_threading_test.cc b/thirdparty/benchmark-1.5.0/test/internal_threading_test.cc new file mode 100644 index 0000000000..039d7c14a8 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/internal_threading_test.cc @@ -0,0 +1,184 @@ + +#undef NDEBUG + +#include +#include +#include "../src/timers.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +static const std::chrono::duration time_frame(50); +static const double time_frame_in_sec( + std::chrono::duration_cast>>( + time_frame) + .count()); + +void MyBusySpinwait() { + const auto start = benchmark::ChronoClockNow(); + + while (true) { + const auto now = benchmark::ChronoClockNow(); + const auto elapsed = now - start; + + if (std::chrono::duration(elapsed) >= + time_frame) + return; + } +} + +// ========================================================================= // +// --------------------------- TEST CASES BEGIN ---------------------------- // +// ========================================================================= // + +// ========================================================================= // +// BM_MainThread + +void BM_MainThread(benchmark::State& state) { + for (auto _ : state) { + MyBusySpinwait(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseRealTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->UseManualTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2); 
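+
+// The registrations above and below deliberately enumerate the cross product
+// of {Threads(1), Threads(2)} with every timing mode: wall clock via
+// UseRealTime(), caller-supplied time via UseManualTime() plus
+// SetIterationTime(), and whole-process CPU time via MeasureProcessCPUTime().
+// Each mode is therefore exercised both single- and multi-threaded.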
+BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseRealTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->UseManualTime(); +BENCHMARK(BM_MainThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_WorkerThread + +void BM_WorkerThread(benchmark::State& state) { + for (auto _ : state) { + std::thread Worker(&MyBusySpinwait); + Worker.join(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseRealTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->UseManualTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(1)->MeasureProcessCPUTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseRealTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->UseManualTime(); +BENCHMARK(BM_WorkerThread)->Iterations(1)->Threads(2)->MeasureProcessCPUTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_WorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// BM_MainThreadAndWorkerThread + +void BM_MainThreadAndWorkerThread(benchmark::State& state) { + for (auto _ : state) { + std::thread Worker(&MyBusySpinwait); + MyBusySpinwait(); + Worker.join(); + state.SetIterationTime(time_frame_in_sec); + } + state.counters["invtime"] = + benchmark::Counter{1, benchmark::Counter::kIsRate}; +} + +BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(1); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->UseManualTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(1) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +BENCHMARK(BM_MainThreadAndWorkerThread)->Iterations(1)->Threads(2); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->UseManualTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseRealTime(); +BENCHMARK(BM_MainThreadAndWorkerThread) + ->Iterations(1) + ->Threads(2) + ->MeasureProcessCPUTime() + ->UseManualTime(); + +// ========================================================================= // +// ---------------------------- 
TEST CASES END ----------------------------- // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/test/link_main_test.cc b/thirdparty/benchmark-1.5.0/test/link_main_test.cc new file mode 100644 index 0000000000..241ad5c390 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/link_main_test.cc @@ -0,0 +1,8 @@ +#include "benchmark/benchmark.h" + +void BM_empty(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } +} +BENCHMARK(BM_empty); diff --git a/thirdparty/benchmark-1.5.0/test/map_test.cc b/thirdparty/benchmark-1.5.0/test/map_test.cc new file mode 100644 index 0000000000..dbf7982a36 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/map_test.cc @@ -0,0 +1,57 @@ +#include "benchmark/benchmark.h" + +#include +#include + +namespace { + +std::map ConstructRandomMap(int size) { + std::map m; + for (int i = 0; i < size; ++i) { + m.insert(std::make_pair(std::rand() % size, std::rand() % size)); + } + return m; +} + +} // namespace + +// Basic version. +static void BM_MapLookup(benchmark::State& state) { + const int size = static_cast(state.range(0)); + std::map m; + for (auto _ : state) { + state.PauseTiming(); + m = ConstructRandomMap(size); + state.ResumeTiming(); + for (int i = 0; i < size; ++i) { + benchmark::DoNotOptimize(m.find(std::rand() % size)); + } + } + state.SetItemsProcessed(state.iterations() * size); +} +BENCHMARK(BM_MapLookup)->Range(1 << 3, 1 << 12); + +// Using fixtures. +class MapFixture : public ::benchmark::Fixture { + public: + void SetUp(const ::benchmark::State& st) { + m = ConstructRandomMap(static_cast(st.range(0))); + } + + void TearDown(const ::benchmark::State&) { m.clear(); } + + std::map m; +}; + +BENCHMARK_DEFINE_F(MapFixture, Lookup)(benchmark::State& state) { + const int size = static_cast(state.range(0)); + for (auto _ : state) { + for (int i = 0; i < size; ++i) { + benchmark::DoNotOptimize(m.find(std::rand() % size)); + } + } + state.SetItemsProcessed(state.iterations() * size); +} +BENCHMARK_REGISTER_F(MapFixture, Lookup)->Range(1 << 3, 1 << 12); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/memory_manager_test.cc b/thirdparty/benchmark-1.5.0/test/memory_manager_test.cc new file mode 100644 index 0000000000..90bed16cff --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/memory_manager_test.cc @@ -0,0 +1,44 @@ +#include + +#include "../src/check.h" +#include "benchmark/benchmark.h" +#include "output_test.h" + +class TestMemoryManager : public benchmark::MemoryManager { + void Start() {} + void Stop(Result* result) { + result->num_allocs = 42; + result->max_bytes_used = 42000; + } +}; + +void BM_empty(benchmark::State& state) { + for (auto _ : state) { + benchmark::DoNotOptimize(state.iterations()); + } +} +BENCHMARK(BM_empty); + +ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"}, + {"\"run_name\": \"BM_empty\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"allocs_per_iter\": %float,$", MR_Next}, + {"\"max_bytes_used\": 42000$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, 
{{"^\"BM_empty\",%csv_report$"}}); + +int main(int argc, char* argv[]) { + std::unique_ptr mm(new TestMemoryManager()); + + benchmark::RegisterMemoryManager(mm.get()); + RunOutputTests(argc, argv); + benchmark::RegisterMemoryManager(nullptr); +} diff --git a/thirdparty/benchmark-1.5.0/test/multiple_ranges_test.cc b/thirdparty/benchmark-1.5.0/test/multiple_ranges_test.cc new file mode 100644 index 0000000000..b25f40eb52 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/multiple_ranges_test.cc @@ -0,0 +1,96 @@ +#include "benchmark/benchmark.h" + +#include +#include +#include +#include + +class MultipleRangesFixture : public ::benchmark::Fixture { + public: + MultipleRangesFixture() + : expectedValues({{1, 3, 5}, + {1, 3, 8}, + {1, 3, 15}, + {2, 3, 5}, + {2, 3, 8}, + {2, 3, 15}, + {1, 4, 5}, + {1, 4, 8}, + {1, 4, 15}, + {2, 4, 5}, + {2, 4, 8}, + {2, 4, 15}, + {1, 7, 5}, + {1, 7, 8}, + {1, 7, 15}, + {2, 7, 5}, + {2, 7, 8}, + {2, 7, 15}, + {7, 6, 3}}) {} + + void SetUp(const ::benchmark::State& state) { + std::vector ranges = {state.range(0), state.range(1), + state.range(2)}; + + assert(expectedValues.find(ranges) != expectedValues.end()); + + actualValues.insert(ranges); + } + + // NOTE: This is not TearDown as we want to check after _all_ runs are + // complete. + virtual ~MultipleRangesFixture() { + if (actualValues != expectedValues) { + std::cout << "EXPECTED\n"; + for (auto v : expectedValues) { + std::cout << "{"; + for (int64_t iv : v) { + std::cout << iv << ", "; + } + std::cout << "}\n"; + } + std::cout << "ACTUAL\n"; + for (auto v : actualValues) { + std::cout << "{"; + for (int64_t iv : v) { + std::cout << iv << ", "; + } + std::cout << "}\n"; + } + } + } + + std::set> expectedValues; + std::set> actualValues; +}; + +BENCHMARK_DEFINE_F(MultipleRangesFixture, Empty)(benchmark::State& state) { + for (auto _ : state) { + int64_t product = state.range(0) * state.range(1) * state.range(2); + for (int64_t x = 0; x < product; x++) { + benchmark::DoNotOptimize(x); + } + } +} + +BENCHMARK_REGISTER_F(MultipleRangesFixture, Empty) + ->RangeMultiplier(2) + ->Ranges({{1, 2}, {3, 7}, {5, 15}}) + ->Args({7, 6, 3}); + +void BM_CheckDefaultArgument(benchmark::State& state) { + // Test that the 'range()' without an argument is the same as 'range(0)'. 
+ assert(state.range() == state.range(0)); + assert(state.range() != state.range(1)); + for (auto _ : state) { + } +} +BENCHMARK(BM_CheckDefaultArgument)->Ranges({{1, 5}, {6, 10}}); + +static void BM_MultipleRanges(benchmark::State& st) { + for (auto _ : st) { + } +} +BENCHMARK(BM_MultipleRanges)->Ranges({{5, 5}, {6, 6}}); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/options_test.cc b/thirdparty/benchmark-1.5.0/test/options_test.cc new file mode 100644 index 0000000000..7bfc235465 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/options_test.cc @@ -0,0 +1,75 @@ +#include "benchmark/benchmark.h" +#include +#include + +#if defined(NDEBUG) +#undef NDEBUG +#endif +#include + +void BM_basic(benchmark::State& state) { + for (auto _ : state) { + } +} + +void BM_basic_slow(benchmark::State& state) { + std::chrono::milliseconds sleep_duration(state.range(0)); + for (auto _ : state) { + std::this_thread::sleep_for( + std::chrono::duration_cast(sleep_duration)); + } +} + +BENCHMARK(BM_basic); +BENCHMARK(BM_basic)->Arg(42); +BENCHMARK(BM_basic_slow)->Arg(10)->Unit(benchmark::kNanosecond); +BENCHMARK(BM_basic_slow)->Arg(100)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_basic_slow)->Arg(1000)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_basic)->Range(1, 8); +BENCHMARK(BM_basic)->RangeMultiplier(2)->Range(1, 8); +BENCHMARK(BM_basic)->DenseRange(10, 15); +BENCHMARK(BM_basic)->Args({42, 42}); +BENCHMARK(BM_basic)->Ranges({{64, 512}, {64, 512}}); +BENCHMARK(BM_basic)->MinTime(0.7); +BENCHMARK(BM_basic)->UseRealTime(); +BENCHMARK(BM_basic)->ThreadRange(2, 4); +BENCHMARK(BM_basic)->ThreadPerCpu(); +BENCHMARK(BM_basic)->Repetitions(3); +BENCHMARK(BM_basic) + ->RangeMultiplier(std::numeric_limits::max()) + ->Range(std::numeric_limits::min(), + std::numeric_limits::max()); + +// Negative ranges +BENCHMARK(BM_basic)->Range(-64, -1); +BENCHMARK(BM_basic)->RangeMultiplier(4)->Range(-8, 8); +BENCHMARK(BM_basic)->DenseRange(-2, 2, 1); +BENCHMARK(BM_basic)->Ranges({{-64, 1}, {-8, -1}}); + +void CustomArgs(benchmark::internal::Benchmark* b) { + for (int i = 0; i < 10; ++i) { + b->Arg(i); + } +} + +BENCHMARK(BM_basic)->Apply(CustomArgs); + +void BM_explicit_iteration_count(benchmark::State& state) { + // Test that benchmarks specified with an explicit iteration count are + // only run once. + static bool invoked_before = false; + assert(!invoked_before); + invoked_before = true; + + // Test that the requested iteration count is respected. + assert(state.max_iterations == 42); + size_t actual_iterations = 0; + for (auto _ : state) + ++actual_iterations; + assert(state.iterations() == state.max_iterations); + assert(state.iterations() == 42); + +} +BENCHMARK(BM_explicit_iteration_count)->Iterations(42); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/output_test.h b/thirdparty/benchmark-1.5.0/test/output_test.h new file mode 100644 index 0000000000..9385761b21 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/output_test.h @@ -0,0 +1,213 @@ +#ifndef TEST_OUTPUT_TEST_H +#define TEST_OUTPUT_TEST_H + +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include + +#include "../src/re.h" +#include "benchmark/benchmark.h" + +#define CONCAT2(x, y) x##y +#define CONCAT(x, y) CONCAT2(x, y) + +#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = ::AddCases(__VA_ARGS__) + +#define SET_SUBSTITUTIONS(...) \ + int CONCAT(dummy, __LINE__) = ::SetSubstitutions(__VA_ARGS__) + +enum MatchRules { + MR_Default, // Skip non-matching lines until a match is found. 
+ MR_Next, // Match must occur on the next line. + MR_Not // No line between the current position and the next match matches + // the regex +}; + +struct TestCase { + TestCase(std::string re, int rule = MR_Default); + + std::string regex_str; + int match_rule; + std::string substituted_regex; + std::shared_ptr regex; +}; + +enum TestCaseID { + TC_ConsoleOut, + TC_ConsoleErr, + TC_JSONOut, + TC_JSONErr, + TC_CSVOut, + TC_CSVErr, + + TC_NumID // PRIVATE +}; + +// Add a list of test cases to be run against the output specified by +// 'ID' +int AddCases(TestCaseID ID, std::initializer_list il); + +// Add or set a list of substitutions to be performed on constructed regex's +// See 'output_test_helper.cc' for a list of default substitutions. +int SetSubstitutions( + std::initializer_list> il); + +// Run all output tests. +void RunOutputTests(int argc, char* argv[]); + +// Count the number of 'pat' substrings in the 'haystack' string. +int SubstrCnt(const std::string& haystack, const std::string& pat); + +// Run registered benchmarks with file reporter enabled, and return the content +// outputted by the file reporter. +std::string GetFileReporterOutput(int argc, char* argv[]); + +// ========================================================================= // +// ------------------------- Results checking ------------------------------ // +// ========================================================================= // + +// Call this macro to register a benchmark for checking its results. This +// should be all that's needed. It subscribes a function to check the (CSV) +// results of a benchmark. This is done only after verifying that the output +// strings are really as expected. +// bm_name_pattern: a name or a regex pattern which will be matched against +// all the benchmark names. Matching benchmarks +// will be the subject of a call to checker_function +// checker_function: should be of type ResultsCheckFn (see below) +#define CHECK_BENCHMARK_RESULTS(bm_name_pattern, checker_function) \ + size_t CONCAT(dummy, __LINE__) = AddChecker(bm_name_pattern, checker_function) + +struct Results; +typedef std::function ResultsCheckFn; + +size_t AddChecker(const char* bm_name_pattern, ResultsCheckFn fn); + +// Class holding the results of a benchmark. +// It is passed in calls to checker functions. +struct Results { + // the benchmark name + std::string name; + // the benchmark fields + std::map values; + + Results(const std::string& n) : name(n) {} + + int NumThreads() const; + + double NumIterations() const; + + typedef enum { kCpuTime, kRealTime } BenchmarkTime; + + // get cpu_time or real_time in seconds + double GetTime(BenchmarkTime which) const; + + // get the real_time duration of the benchmark in seconds. + // it is better to use fuzzy float checks for this, as the float + // ASCII formatting is lossy. + double DurationRealTime() const { + return NumIterations() * GetTime(kRealTime); + } + // get the cpu_time duration of the benchmark in seconds + double DurationCPUTime() const { + return NumIterations() * GetTime(kCpuTime); + } + + // get the string for a result by name, or nullptr if the name + // is not found + const std::string* Get(const char* entry_name) const { + auto it = values.find(entry_name); + if (it == values.end()) return nullptr; + return &it->second; + } + + // get a result by name, parsed as a specific type. + // NOTE: for counters, use GetCounterAs instead. 
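+  //
+  // Hedged usage sketch (ours): inside a CHECK_BENCHMARK_RESULTS callback,
+  // given 'const Results& e',
+  //   double iters = e.GetAs<double>("iterations");
+  //   int foo = e.GetCounterAs<int>("Foo");  // "Foo" is a hypothetical counter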
+ template + T GetAs(const char* entry_name) const; + + // counters are written as doubles, so they have to be read first + // as a double, and only then converted to the asked type. + template + T GetCounterAs(const char* entry_name) const { + double dval = GetAs(entry_name); + T tval = static_cast(dval); + return tval; + } +}; + +template +T Results::GetAs(const char* entry_name) const { + auto* sv = Get(entry_name); + CHECK(sv != nullptr && !sv->empty()); + std::stringstream ss; + ss << *sv; + T out; + ss >> out; + CHECK(!ss.fail()); + return out; +} + +//---------------------------------- +// Macros to help in result checking. Do not use them with arguments causing +// side-effects. + +// clang-format off + +#define _CHECK_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value) \ + CONCAT(CHECK_, relationship) \ + (entry.getfn< var_type >(var_name), (value)) << "\n" \ + << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ + << __FILE__ << ":" << __LINE__ << ": " \ + << "expected (" << #var_type << ")" << (var_name) \ + << "=" << (entry).getfn< var_type >(var_name) \ + << " to be " #relationship " to " << (value) << "\n" + +// check with tolerance. eps_factor is the tolerance window, which is +// interpreted relative to value (eg, 0.1 means 10% of value). +#define _CHECK_FLOAT_RESULT_VALUE(entry, getfn, var_type, var_name, relationship, value, eps_factor) \ + CONCAT(CHECK_FLOAT_, relationship) \ + (entry.getfn< var_type >(var_name), (value), (eps_factor) * (value)) << "\n" \ + << __FILE__ << ":" << __LINE__ << ": " << (entry).name << ":\n" \ + << __FILE__ << ":" << __LINE__ << ": " \ + << "expected (" << #var_type << ")" << (var_name) \ + << "=" << (entry).getfn< var_type >(var_name) \ + << " to be " #relationship " to " << (value) << "\n" \ + << __FILE__ << ":" << __LINE__ << ": " \ + << "with tolerance of " << (eps_factor) * (value) \ + << " (" << (eps_factor)*100. << "%), " \ + << "but delta was " << ((entry).getfn< var_type >(var_name) - (value)) \ + << " (" << (((entry).getfn< var_type >(var_name) - (value)) \ + / \ + ((value) > 1.e-5 || value < -1.e-5 ? value : 1.e-5)*100.) 
\ + << "%)" + +#define CHECK_RESULT_VALUE(entry, var_type, var_name, relationship, value) \ + _CHECK_RESULT_VALUE(entry, GetAs, var_type, var_name, relationship, value) + +#define CHECK_COUNTER_VALUE(entry, var_type, var_name, relationship, value) \ + _CHECK_RESULT_VALUE(entry, GetCounterAs, var_type, var_name, relationship, value) + +#define CHECK_FLOAT_RESULT_VALUE(entry, var_name, relationship, value, eps_factor) \ + _CHECK_FLOAT_RESULT_VALUE(entry, GetAs, double, var_name, relationship, value, eps_factor) + +#define CHECK_FLOAT_COUNTER_VALUE(entry, var_name, relationship, value, eps_factor) \ + _CHECK_FLOAT_RESULT_VALUE(entry, GetCounterAs, double, var_name, relationship, value, eps_factor) + +// clang-format on + +// ========================================================================= // +// --------------------------- Misc Utilities ------------------------------ // +// ========================================================================= // + +namespace { + +const char* const dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; + +} // end namespace + +#endif // TEST_OUTPUT_TEST_H diff --git a/thirdparty/benchmark-1.5.0/test/output_test_helper.cc b/thirdparty/benchmark-1.5.0/test/output_test_helper.cc new file mode 100644 index 0000000000..5dc951d2bc --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/output_test_helper.cc @@ -0,0 +1,505 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../src/benchmark_api_internal.h" +#include "../src/check.h" // NOTE: check.h is for internal use only! +#include "../src/re.h" // NOTE: re.h is for internal use only +#include "output_test.h" + +// ========================================================================= // +// ------------------------------ Internals -------------------------------- // +// ========================================================================= // +namespace internal { +namespace { + +using TestCaseList = std::vector; + +// Use a vector because the order elements are added matters during iteration. +// std::map/unordered_map don't guarantee that. +// For example: +// SetSubstitutions({{"%HelloWorld", "Hello"}, {"%Hello", "Hi"}}); +// Substitute("%HelloWorld") // Always expands to Hello. +using SubMap = std::vector>; + +TestCaseList& GetTestCaseList(TestCaseID ID) { + // Uses function-local statics to ensure initialization occurs + // before first use. + static TestCaseList lists[TC_NumID]; + return lists[ID]; +} + +SubMap& GetSubstitutions() { + // Don't use 'dec_re' from header because it may not yet be initialized. 
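+  // Each entry maps a '%token' to a regex fragment; PerformSubstitutions()
+  // (below) splices these fragments into test-case patterns, so e.g. the
+  // pattern "^BM_basic %console_report$" expands to a full line-matching
+  // regex.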
+ // clang-format off + static std::string safe_dec_re = "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"; + static std::string time_re = "([0-9]+[.])?[0-9]+"; + static SubMap map = { + {"%float", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?"}, + // human-readable float + {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"}, + {"%int", "[ ]*[0-9]+"}, + {" %s ", "[ ]+"}, + {"%time", "[ ]*" + time_re + "[ ]+ns"}, + {"%console_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns [ ]*[0-9]+"}, + {"%console_time_only_report", "[ ]*" + time_re + "[ ]+ns [ ]*" + time_re + "[ ]+ns"}, + {"%console_us_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us [ ]*[0-9]+"}, + {"%console_us_time_only_report", "[ ]*" + time_re + "[ ]+us [ ]*" + time_re + "[ ]+us"}, + {"%csv_header", + "name,iterations,real_time,cpu_time,time_unit,bytes_per_second," + "items_per_second,label,error_occurred,error_message"}, + {"%csv_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,,,"}, + {"%csv_us_report", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",us,,,,,"}, + {"%csv_bytes_report", + "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + ",,,,"}, + {"%csv_items_report", + "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,," + safe_dec_re + ",,,"}, + {"%csv_bytes_items_report", + "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns," + safe_dec_re + + "," + safe_dec_re + ",,,"}, + {"%csv_label_report_begin", "[0-9]+," + safe_dec_re + "," + safe_dec_re + ",ns,,,"}, + {"%csv_label_report_end", ",,"}}; + // clang-format on + return map; +} + +std::string PerformSubstitutions(std::string source) { + SubMap const& subs = GetSubstitutions(); + using SizeT = std::string::size_type; + for (auto const& KV : subs) { + SizeT pos; + SizeT next_start = 0; + while ((pos = source.find(KV.first, next_start)) != std::string::npos) { + next_start = pos + KV.second.size(); + source.replace(pos, KV.first.size(), KV.second); + } + } + return source; +} + +void CheckCase(std::stringstream& remaining_output, TestCase const& TC, + TestCaseList const& not_checks) { + std::string first_line; + bool on_first = true; + std::string line; + while (remaining_output.eof() == false) { + CHECK(remaining_output.good()); + std::getline(remaining_output, line); + if (on_first) { + first_line = line; + on_first = false; + } + for (const auto& NC : not_checks) { + CHECK(!NC.regex->Match(line)) + << "Unexpected match for line \"" << line << "\" for MR_Not regex \"" + << NC.regex_str << "\"" + << "\n actual regex string \"" << TC.substituted_regex << "\"" + << "\n started matching near: " << first_line; + } + if (TC.regex->Match(line)) return; + CHECK(TC.match_rule != MR_Next) + << "Expected line \"" << line << "\" to match regex \"" << TC.regex_str + << "\"" + << "\n actual regex string \"" << TC.substituted_regex << "\"" + << "\n started matching near: " << first_line; + } + CHECK(remaining_output.eof() == false) + << "End of output reached before match for regex \"" << TC.regex_str + << "\" was found" + << "\n actual regex string \"" << TC.substituted_regex << "\"" + << "\n started matching near: " << first_line; +} + +void CheckCases(TestCaseList const& checks, std::stringstream& output) { + std::vector not_checks; + for (size_t i = 0; i < checks.size(); ++i) { + const auto& TC = checks[i]; + if (TC.match_rule == MR_Not) { + not_checks.push_back(TC); + continue; + } + CheckCase(output, TC, not_checks); + not_checks.clear(); + } +} + +class TestReporter : public benchmark::BenchmarkReporter { + public: + 
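+  // Fans each reporter callback out to all wrapped reporters (console, JSON,
+  // CSV) so a single benchmark run can be checked against all three output
+  // formats at once.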
TestReporter(std::vector reps) + : reporters_(reps) {} + + virtual bool ReportContext(const Context& context) { + bool last_ret = false; + bool first = true; + for (auto rep : reporters_) { + bool new_ret = rep->ReportContext(context); + CHECK(first || new_ret == last_ret) + << "Reports return different values for ReportContext"; + first = false; + last_ret = new_ret; + } + (void)first; + return last_ret; + } + + void ReportRuns(const std::vector& report) { + for (auto rep : reporters_) rep->ReportRuns(report); + } + void Finalize() { + for (auto rep : reporters_) rep->Finalize(); + } + + private: + std::vector reporters_; +}; +} // namespace + +} // end namespace internal + +// ========================================================================= // +// -------------------------- Results checking ----------------------------- // +// ========================================================================= // + +namespace internal { + +// Utility class to manage subscribers for checking benchmark results. +// It works by parsing the CSV output to read the results. +class ResultsChecker { + public: + struct PatternAndFn : public TestCase { // reusing TestCase for its regexes + PatternAndFn(const std::string& rx, ResultsCheckFn fn_) + : TestCase(rx), fn(fn_) {} + ResultsCheckFn fn; + }; + + std::vector check_patterns; + std::vector results; + std::vector field_names; + + void Add(const std::string& entry_pattern, ResultsCheckFn fn); + + void CheckResults(std::stringstream& output); + + private: + void SetHeader_(const std::string& csv_header); + void SetValues_(const std::string& entry_csv_line); + + std::vector SplitCsv_(const std::string& line); +}; + +// store the static ResultsChecker in a function to prevent initialization +// order problems +ResultsChecker& GetResultsChecker() { + static ResultsChecker rc; + return rc; +} + +// add a results checker for a benchmark +void ResultsChecker::Add(const std::string& entry_pattern, ResultsCheckFn fn) { + check_patterns.emplace_back(entry_pattern, fn); +} + +// check the results of all subscribed benchmarks +void ResultsChecker::CheckResults(std::stringstream& output) { + // first reset the stream to the start + { + auto start = std::stringstream::pos_type(0); + // clear before calling tellg() + output.clear(); + // seek to zero only when needed + if (output.tellg() > start) output.seekg(start); + // and just in case + output.clear(); + } + // now go over every line and publish it to the ResultsChecker + std::string line; + bool on_first = true; + while (output.eof() == false) { + CHECK(output.good()); + std::getline(output, line); + if (on_first) { + SetHeader_(line); // this is important + on_first = false; + continue; + } + SetValues_(line); + } + // finally we can call the subscribed check functions + for (const auto& p : check_patterns) { + VLOG(2) << "--------------------------------\n"; + VLOG(2) << "checking for benchmarks matching " << p.regex_str << "...\n"; + for (const auto& r : results) { + if (!p.regex->Match(r.name)) { + VLOG(2) << p.regex_str << " is not matched by " << r.name << "\n"; + continue; + } else { + VLOG(2) << p.regex_str << " is matched by " << r.name << "\n"; + } + VLOG(1) << "Checking results of " << r.name << ": ... 
\n"; + p.fn(r); + VLOG(1) << "Checking results of " << r.name << ": OK.\n"; + } + } +} + +// prepare for the names in this header +void ResultsChecker::SetHeader_(const std::string& csv_header) { + field_names = SplitCsv_(csv_header); +} + +// set the values for a benchmark +void ResultsChecker::SetValues_(const std::string& entry_csv_line) { + if (entry_csv_line.empty()) return; // some lines are empty + CHECK(!field_names.empty()); + auto vals = SplitCsv_(entry_csv_line); + CHECK_EQ(vals.size(), field_names.size()); + results.emplace_back(vals[0]); // vals[0] is the benchmark name + auto& entry = results.back(); + for (size_t i = 1, e = vals.size(); i < e; ++i) { + entry.values[field_names[i]] = vals[i]; + } +} + +// a quick'n'dirty csv splitter (eliminating quotes) +std::vector ResultsChecker::SplitCsv_(const std::string& line) { + std::vector out; + if (line.empty()) return out; + if (!field_names.empty()) out.reserve(field_names.size()); + size_t prev = 0, pos = line.find_first_of(','), curr = pos; + while (pos != line.npos) { + CHECK(curr > 0); + if (line[prev] == '"') ++prev; + if (line[curr - 1] == '"') --curr; + out.push_back(line.substr(prev, curr - prev)); + prev = pos + 1; + pos = line.find_first_of(',', pos + 1); + curr = pos; + } + curr = line.size(); + if (line[prev] == '"') ++prev; + if (line[curr - 1] == '"') --curr; + out.push_back(line.substr(prev, curr - prev)); + return out; +} + +} // end namespace internal + +size_t AddChecker(const char* bm_name, ResultsCheckFn fn) { + auto& rc = internal::GetResultsChecker(); + rc.Add(bm_name, fn); + return rc.results.size(); +} + +int Results::NumThreads() const { + auto pos = name.find("/threads:"); + if (pos == name.npos) return 1; + auto end = name.find('/', pos + 9); + std::stringstream ss; + ss << name.substr(pos + 9, end); + int num = 1; + ss >> num; + CHECK(!ss.fail()); + return num; +} + +double Results::NumIterations() const { + return GetAs("iterations"); +} + +double Results::GetTime(BenchmarkTime which) const { + CHECK(which == kCpuTime || which == kRealTime); + const char* which_str = which == kCpuTime ? 
"cpu_time" : "real_time"; + double val = GetAs(which_str); + auto unit = Get("time_unit"); + CHECK(unit); + if (*unit == "ns") { + return val * 1.e-9; + } else if (*unit == "us") { + return val * 1.e-6; + } else if (*unit == "ms") { + return val * 1.e-3; + } else if (*unit == "s") { + return val; + } else { + CHECK(1 == 0) << "unknown time unit: " << *unit; + return 0; + } +} + +// ========================================================================= // +// -------------------------- Public API Definitions------------------------ // +// ========================================================================= // + +TestCase::TestCase(std::string re, int rule) + : regex_str(std::move(re)), + match_rule(rule), + substituted_regex(internal::PerformSubstitutions(regex_str)), + regex(std::make_shared()) { + std::string err_str; + regex->Init(substituted_regex, &err_str); + CHECK(err_str.empty()) << "Could not construct regex \"" << substituted_regex + << "\"" + << "\n originally \"" << regex_str << "\"" + << "\n got error: " << err_str; +} + +int AddCases(TestCaseID ID, std::initializer_list il) { + auto& L = internal::GetTestCaseList(ID); + L.insert(L.end(), il); + return 0; +} + +int SetSubstitutions( + std::initializer_list> il) { + auto& subs = internal::GetSubstitutions(); + for (auto KV : il) { + bool exists = false; + KV.second = internal::PerformSubstitutions(KV.second); + for (auto& EKV : subs) { + if (EKV.first == KV.first) { + EKV.second = std::move(KV.second); + exists = true; + break; + } + } + if (!exists) subs.push_back(std::move(KV)); + } + return 0; +} + +void RunOutputTests(int argc, char* argv[]) { + using internal::GetTestCaseList; + benchmark::Initialize(&argc, argv); + auto options = benchmark::internal::GetOutputOptions(/*force_no_color*/ true); + benchmark::ConsoleReporter CR(options); + benchmark::JSONReporter JR; + benchmark::CSVReporter CSVR; + struct ReporterTest { + const char* name; + std::vector& output_cases; + std::vector& error_cases; + benchmark::BenchmarkReporter& reporter; + std::stringstream out_stream; + std::stringstream err_stream; + + ReporterTest(const char* n, std::vector& out_tc, + std::vector& err_tc, + benchmark::BenchmarkReporter& br) + : name(n), output_cases(out_tc), error_cases(err_tc), reporter(br) { + reporter.SetOutputStream(&out_stream); + reporter.SetErrorStream(&err_stream); + } + } TestCases[] = { + {"ConsoleReporter", GetTestCaseList(TC_ConsoleOut), + GetTestCaseList(TC_ConsoleErr), CR}, + {"JSONReporter", GetTestCaseList(TC_JSONOut), GetTestCaseList(TC_JSONErr), + JR}, + {"CSVReporter", GetTestCaseList(TC_CSVOut), GetTestCaseList(TC_CSVErr), + CSVR}, + }; + + // Create the test reporter and run the benchmarks. + std::cout << "Running benchmarks...\n"; + internal::TestReporter test_rep({&CR, &JR, &CSVR}); + benchmark::RunSpecifiedBenchmarks(&test_rep); + + for (auto& rep_test : TestCases) { + std::string msg = std::string("\nTesting ") + rep_test.name + " Output\n"; + std::string banner(msg.size() - 1, '-'); + std::cout << banner << msg << banner << "\n"; + + std::cerr << rep_test.err_stream.str(); + std::cout << rep_test.out_stream.str(); + + internal::CheckCases(rep_test.error_cases, rep_test.err_stream); + internal::CheckCases(rep_test.output_cases, rep_test.out_stream); + + std::cout << "\n"; + } + + // now that we know the output is as expected, we can dispatch + // the checks to subscribees. 
+ auto& csv = TestCases[2]; + // would use == but gcc spits a warning + CHECK(std::strcmp(csv.name, "CSVReporter") == 0); + internal::GetResultsChecker().CheckResults(csv.out_stream); +} + +int SubstrCnt(const std::string& haystack, const std::string& pat) { + if (pat.length() == 0) return 0; + int count = 0; + for (size_t offset = haystack.find(pat); offset != std::string::npos; + offset = haystack.find(pat, offset + pat.length())) + ++count; + return count; +} + +static char ToHex(int ch) { + return ch < 10 ? static_cast('0' + ch) + : static_cast('a' + (ch - 10)); +} + +static char RandomHexChar() { + static std::mt19937 rd{std::random_device{}()}; + static std::uniform_int_distribution mrand{0, 15}; + return ToHex(mrand(rd)); +} + +static std::string GetRandomFileName() { + std::string model = "test.%%%%%%"; + for (auto & ch : model) { + if (ch == '%') + ch = RandomHexChar(); + } + return model; +} + +static bool FileExists(std::string const& name) { + std::ifstream in(name.c_str()); + return in.good(); +} + +static std::string GetTempFileName() { + // This function attempts to avoid race conditions where two tests + // create the same file at the same time. However, it still introduces races + // similar to tmpnam. + int retries = 3; + while (--retries) { + std::string name = GetRandomFileName(); + if (!FileExists(name)) + return name; + } + std::cerr << "Failed to create unique temporary file name" << std::endl; + std::abort(); +} + +std::string GetFileReporterOutput(int argc, char* argv[]) { + std::vector new_argv(argv, argv + argc); + assert(static_cast(argc) == new_argv.size()); + + std::string tmp_file_name = GetTempFileName(); + std::cout << "Will be using this as the tmp file: " << tmp_file_name << '\n'; + + std::string tmp = "--benchmark_out="; + tmp += tmp_file_name; + new_argv.emplace_back(const_cast(tmp.c_str())); + + argc = int(new_argv.size()); + + benchmark::Initialize(&argc, new_argv.data()); + benchmark::RunSpecifiedBenchmarks(); + + // Read the output back from the file, and delete the file. + std::ifstream tmp_stream(tmp_file_name); + std::string output = std::string((std::istreambuf_iterator(tmp_stream)), + std::istreambuf_iterator()); + std::remove(tmp_file_name.c_str()); + + return output; +} diff --git a/thirdparty/benchmark-1.5.0/test/register_benchmark_test.cc b/thirdparty/benchmark-1.5.0/test/register_benchmark_test.cc new file mode 100644 index 0000000000..3ac5b21fb3 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/register_benchmark_test.cc @@ -0,0 +1,184 @@ + +#undef NDEBUG +#include +#include + +#include "../src/check.h" // NOTE: check.h is for internal use only! +#include "benchmark/benchmark.h" + +namespace { + +class TestReporter : public benchmark::ConsoleReporter { + public: + virtual void ReportRuns(const std::vector& report) { + all_runs_.insert(all_runs_.end(), begin(report), end(report)); + ConsoleReporter::ReportRuns(report); + } + + std::vector all_runs_; +}; + +struct TestCase { + std::string name; + const char* label; + // Note: not explicit as we rely on it being converted through ADD_CASES. 
+ TestCase(const char* xname) : TestCase(xname, nullptr) {} + TestCase(const char* xname, const char* xlabel) + : name(xname), label(xlabel) {} + + typedef benchmark::BenchmarkReporter::Run Run; + + void CheckRun(Run const& run) const { + // clang-format off + CHECK(name == run.benchmark_name()) << "expected " << name << " got " + << run.benchmark_name(); + if (label) { + CHECK(run.report_label == label) << "expected " << label << " got " + << run.report_label; + } else { + CHECK(run.report_label == ""); + } + // clang-format on + } +}; + +std::vector ExpectedResults; + +int AddCases(std::initializer_list const& v) { + for (auto N : v) { + ExpectedResults.push_back(N); + } + return 0; +} + +#define CONCAT(x, y) CONCAT2(x, y) +#define CONCAT2(x, y) x##y +#define ADD_CASES(...) int CONCAT(dummy, __LINE__) = AddCases({__VA_ARGS__}) + +} // end namespace + +typedef benchmark::internal::Benchmark* ReturnVal; + +//----------------------------------------------------------------------------// +// Test RegisterBenchmark with no additional arguments +//----------------------------------------------------------------------------// +void BM_function(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_function); +ReturnVal dummy = benchmark::RegisterBenchmark( + "BM_function_manual_registration", BM_function); +ADD_CASES({"BM_function"}, {"BM_function_manual_registration"}); + +//----------------------------------------------------------------------------// +// Test RegisterBenchmark with additional arguments +// Note: GCC <= 4.8 do not support this form of RegisterBenchmark because they +// reject the variadic pack expansion of lambda captures. +//----------------------------------------------------------------------------// +#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK + +void BM_extra_args(benchmark::State& st, const char* label) { + for (auto _ : st) { + } + st.SetLabel(label); +} +int RegisterFromFunction() { + std::pair cases[] = { + {"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}}; + for (auto const& c : cases) + benchmark::RegisterBenchmark(c.first, &BM_extra_args, c.second); + return 0; +} +int dummy2 = RegisterFromFunction(); +ADD_CASES({"test1", "One"}, {"test2", "Two"}, {"test3", "Three"}); + +#endif // BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK + +//----------------------------------------------------------------------------// +// Test RegisterBenchmark with different callable types +//----------------------------------------------------------------------------// + +struct CustomFixture { + void operator()(benchmark::State& st) { + for (auto _ : st) { + } + } +}; + +void TestRegistrationAtRuntime() { +#ifdef BENCHMARK_HAS_CXX11 + { + CustomFixture fx; + benchmark::RegisterBenchmark("custom_fixture", fx); + AddCases({"custom_fixture"}); + } +#endif +#ifndef BENCHMARK_HAS_NO_VARIADIC_REGISTER_BENCHMARK + { + const char* x = "42"; + auto capturing_lam = [=](benchmark::State& st) { + for (auto _ : st) { + } + st.SetLabel(x); + }; + benchmark::RegisterBenchmark("lambda_benchmark", capturing_lam); + AddCases({{"lambda_benchmark", x}}); + } +#endif +} + +// Test that all benchmarks, registered at either during static init or runtime, +// are run and the results are passed to the reported. 
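+//
+// A hedged reminder (ours, assuming a C++11 toolchain) of the
+// runtime-registration shape under test:
+//
+//   auto* bm = benchmark::RegisterBenchmark(
+//       "my_runtime_bm", [](benchmark::State& st) {
+//         for (auto _ : st) {
+//         }
+//       });
+//   bm->Arg(8);  // the returned Benchmark* takes the usual chained options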
+void RunTestOne() { + TestRegistrationAtRuntime(); + + TestReporter test_reporter; + benchmark::RunSpecifiedBenchmarks(&test_reporter); + + typedef benchmark::BenchmarkReporter::Run Run; + auto EB = ExpectedResults.begin(); + + for (Run const& run : test_reporter.all_runs_) { + assert(EB != ExpectedResults.end()); + EB->CheckRun(run); + ++EB; + } + assert(EB == ExpectedResults.end()); +} + +// Test that ClearRegisteredBenchmarks() clears all previously registered +// benchmarks. +// Also test that new benchmarks can be registered and ran afterwards. +void RunTestTwo() { + assert(ExpectedResults.size() != 0 && + "must have at least one registered benchmark"); + ExpectedResults.clear(); + benchmark::ClearRegisteredBenchmarks(); + + TestReporter test_reporter; + size_t num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); + assert(num_ran == 0); + assert(test_reporter.all_runs_.begin() == test_reporter.all_runs_.end()); + + TestRegistrationAtRuntime(); + num_ran = benchmark::RunSpecifiedBenchmarks(&test_reporter); + assert(num_ran == ExpectedResults.size()); + + typedef benchmark::BenchmarkReporter::Run Run; + auto EB = ExpectedResults.begin(); + + for (Run const& run : test_reporter.all_runs_) { + assert(EB != ExpectedResults.end()); + EB->CheckRun(run); + ++EB; + } + assert(EB == ExpectedResults.end()); +} + +int main(int argc, char* argv[]) { + benchmark::Initialize(&argc, argv); + + RunTestOne(); + RunTestTwo(); +} diff --git a/thirdparty/benchmark-1.5.0/test/report_aggregates_only_test.cc b/thirdparty/benchmark-1.5.0/test/report_aggregates_only_test.cc new file mode 100644 index 0000000000..9646b9be53 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/report_aggregates_only_test.cc @@ -0,0 +1,39 @@ + +#undef NDEBUG +#include +#include + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// Ok this test is super ugly. We want to check what happens with the file +// reporter in the presence of ReportAggregatesOnly(). +// We do not care about console output, the normal tests check that already. + +void BM_SummaryRepeat(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); + +int main(int argc, char* argv[]) { + const std::string output = GetFileReporterOutput(argc, argv); + + if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") != + 1 || + SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") != + 1) { + std::cout << "Precondition mismatch. 
Expected to only find three " +                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n" +                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", " +                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", " +                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire " +                 "output:\n"; +    std::cout << output; +    return 1; +  } + +  return 0; +} diff --git a/thirdparty/benchmark-1.5.0/test/reporter_output_test.cc b/thirdparty/benchmark-1.5.0/test/reporter_output_test.cc new file mode 100644 index 0000000000..c8090d4aca --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/reporter_output_test.cc @@ -0,0 +1,742 @@ + +#undef NDEBUG +#include <string> + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// ========================================================================= // +// ---------------------- Testing Prologue Output -------------------------- // +// ========================================================================= // + +ADD_CASES(TC_ConsoleOut, {{"^[-]+$", MR_Next}, +                          {"^Benchmark %s Time %s CPU %s Iterations$", MR_Next}, +                          {"^[-]+$", MR_Next}}); +static int AddContextCases() { +  AddCases(TC_ConsoleErr, +           { +               {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default}, +               {"Running .*/reporter_output_test(\\.exe)?$", MR_Next}, +               {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next}, +           }); +  AddCases(TC_JSONOut, +           {{"^\\{", MR_Default}, +            {"\"context\":", MR_Next}, +            {"\"date\": \"", MR_Next}, +            {"\"host_name\":", MR_Next}, +            {"\"executable\": \".*(/|\\\\)reporter_output_test(\\.exe)?\",", +             MR_Next}, +            {"\"num_cpus\": %int,$", MR_Next}, +            {"\"mhz_per_cpu\": %float,$", MR_Next}, +            {"\"cpu_scaling_enabled\": ", MR_Next}, +            {"\"caches\": \\[$", MR_Next}}); +  auto const& Info = benchmark::CPUInfo::Get(); +  auto const& Caches = Info.caches; +  if (!Caches.empty()) { +    AddCases(TC_ConsoleErr, {{"CPU Caches:$", MR_Next}}); +  } +  for (size_t I = 0; I < Caches.size(); ++I) { +    std::string num_caches_str = +        Caches[I].num_sharing != 0 ? 
" \\(x%int\\)$" : "$"; + AddCases( + TC_ConsoleErr, + {{"L%int (Data|Instruction|Unified) %intK" + num_caches_str, MR_Next}}); + AddCases(TC_JSONOut, {{"\\{$", MR_Next}, + {"\"type\": \"", MR_Next}, + {"\"level\": %int,$", MR_Next}, + {"\"size\": %int,$", MR_Next}, + {"\"num_sharing\": %int$", MR_Next}, + {"}[,]{0,1}$", MR_Next}}); + } + AddCases(TC_JSONOut, {{"],$"}}); + auto const& LoadAvg = Info.load_avg; + if (!LoadAvg.empty()) { + AddCases(TC_ConsoleErr, + {{"Load Average: (%float, ){0,2}%float$", MR_Next}}); + } + AddCases(TC_JSONOut, {{"\"load_avg\": \\[(%float,?){0,3}],$", MR_Next}}); + return 0; +} +int dummy_register = AddContextCases(); +ADD_CASES(TC_CSVOut, {{"%csv_header"}}); + +// ========================================================================= // +// ------------------------ Testing Basic Output --------------------------- // +// ========================================================================= // + +void BM_basic(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_basic); + +ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"}, + {"\"run_name\": \"BM_basic\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\"$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_basic\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Bytes per Second Output ---------------- // +// ========================================================================= // + +void BM_bytes_per_second(benchmark::State& state) { + for (auto _ : state) { + } + state.SetBytesProcessed(1); +} +BENCHMARK(BM_bytes_per_second); + +ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report " + "bytes_per_second=%float[kM]{0,1}/s$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"}, + {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bytes_per_second\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_bytes_per_second\",%csv_bytes_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Items per Second Output ---------------- // +// ========================================================================= // + +void BM_items_per_second(benchmark::State& state) { + for (auto _ : state) { + } + state.SetItemsProcessed(1); +} +BENCHMARK(BM_items_per_second); + +ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report " + "items_per_second=%float[kM]{0,1}/s$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"}, + {"\"run_name\": \"BM_items_per_second\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + 
{"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"items_per_second\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_items_per_second\",%csv_items_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Label Output --------------------------- // +// ========================================================================= // + +void BM_label(benchmark::State& state) { + for (auto _ : state) { + } + state.SetLabel("some label"); +} +BENCHMARK(BM_label); + +ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"}, + {"\"run_name\": \"BM_label\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"label\": \"some label\"$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_label\",%csv_label_report_begin\"some " + "label\"%csv_label_report_end$"}}); + +// ========================================================================= // +// ------------------------ Testing Error Output --------------------------- // +// ========================================================================= // + +void BM_error(benchmark::State& state) { + state.SkipWithError("message"); + for (auto _ : state) { + } +} +BENCHMARK(BM_error); +ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"}, + {"\"run_name\": \"BM_error\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"error_occurred\": true,$", MR_Next}, + {"\"error_message\": \"message\",$", MR_Next}}); + +ADD_CASES(TC_CSVOut, {{"^\"BM_error\",,,,,,,,true,\"message\"$"}}); + +// ========================================================================= // +// ------------------------ Testing No Arg Name Output ----------------------- +// // +// ========================================================================= // + +void BM_no_arg_name(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_no_arg_name)->Arg(3); +ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}, + {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Arg Name Output ----------------------- // +// ========================================================================= // + +void BM_arg_name(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3); +ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}, + {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, 
+ {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Arg Names Output ----------------------- // +// ========================================================================= // + +void BM_arg_names(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"}); +ADD_CASES(TC_ConsoleOut, + {{"^BM_arg_names/first:2/5/third:4 %console_report$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}, + {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------ Testing Big Args Output ------------------------ // +// ========================================================================= // + +void BM_BigArgs(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_BigArgs)->RangeMultiplier(2)->Range(1U << 30U, 1U << 31U); +ADD_CASES(TC_ConsoleOut, {{"^BM_BigArgs/1073741824 %console_report$"}, + {"^BM_BigArgs/2147483648 %console_report$"}}); + +// ========================================================================= // +// ----------------------- Testing Complexity Output ----------------------- // +// ========================================================================= // + +void BM_Complexity_O1(benchmark::State& state) { + for (auto _ : state) { + } + state.SetComplexityN(state.range(0)); +} +BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity(benchmark::o1); +SET_SUBSTITUTIONS({{"%bigOStr", "[ ]* %float \\([0-9]+\\)"}, + {"%RMS", "[ ]*[0-9]+ %"}}); +ADD_CASES(TC_ConsoleOut, {{"^BM_Complexity_O1_BigO %bigOStr %bigOStr[ ]*$"}, + {"^BM_Complexity_O1_RMS %RMS %RMS[ ]*$"}}); + +// ========================================================================= // +// ----------------------- Testing Aggregate Output ------------------------ // +// ========================================================================= // + +// Test that non-aggregate data is printed by default +void BM_Repeat(benchmark::State& state) { + for (auto _ : state) { + } +} +// need two repetitions min to be able to output any aggregate output +BENCHMARK(BM_Repeat)->Repetitions(2); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:2 %console_report$"}, + {"^BM_Repeat/repeats:2 %console_report$"}, + {"^BM_Repeat/repeats:2_mean %console_time_only_report [ ]*2$"}, + {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"}, + {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:2\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": 
\"BM_Repeat/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:2_median\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"}, + {"^\"BM_Repeat/repeats:2\",%csv_report$"}, + {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"}, + {"^\"BM_Repeat/repeats:2_median\",%csv_report$"}, + {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}}); +// but for two repetitions, mean and median is the same, so let's repeat.. +BENCHMARK(BM_Repeat)->Repetitions(3); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3 %console_report$"}, + {"^BM_Repeat/repeats:3_mean %console_time_only_report [ ]*3$"}, + {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"}, + {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3_median\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"}, + {"^\"BM_Repeat/repeats:3\",%csv_report$"}, + {"^\"BM_Repeat/repeats:3\",%csv_report$"}, + 
{"^\"BM_Repeat/repeats:3_mean\",%csv_report$"}, + {"^\"BM_Repeat/repeats:3_median\",%csv_report$"}, + {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}}); +// median differs between even/odd number of repetitions, so just to be sure +BENCHMARK(BM_Repeat)->Repetitions(4); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4 %console_report$"}, + {"^BM_Repeat/repeats:4_mean %console_time_only_report [ ]*4$"}, + {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"}, + {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"repetition_index\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4_mean\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4_median\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}, + {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}, + {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 4,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 4,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4_mean\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4_median\",%csv_report$"}, + {"^\"BM_Repeat/repeats:4_stddev\",%csv_report$"}}); + +// Test that a non-repeated test still prints non-aggregate results even when +// only-aggregate reports have been requested +void BM_RepeatOnce(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly(); +ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}, + {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 
1,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}}); + +// Test that non-aggregate data is not reported +void BM_SummaryRepeat(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly(); +ADD_CASES( + TC_ConsoleOut, + {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, + {"^BM_SummaryRepeat/repeats:3_mean %console_time_only_report [ ]*3$"}, + {"^BM_SummaryRepeat/repeats:3_median %console_time_only_report [ ]*3$"}, + {"^BM_SummaryRepeat/repeats:3_stddev %console_time_only_report [ ]*3$"}}); +ADD_CASES(TC_JSONOut, + {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, + {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}, + {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not}, + {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"}, + {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"}, + {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}}); + +// Test that non-aggregate data is not displayed. +// NOTE: this test is kinda bad. we are only testing the display output. +// But we don't check that the file output still contains everything... 
+void BM_SummaryDisplay(benchmark::State& state) { + for (auto _ : state) { + } +} +BENCHMARK(BM_SummaryDisplay)->Repetitions(2)->DisplayAggregatesOnly(); +ADD_CASES( + TC_ConsoleOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"^BM_SummaryDisplay/repeats:2_mean %console_time_only_report [ ]*2$"}, + {"^BM_SummaryDisplay/repeats:2_median %console_time_only_report [ ]*2$"}, + {"^BM_SummaryDisplay/repeats:2_stddev %console_time_only_report [ ]*2$"}}); +ADD_CASES(TC_JSONOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{".*BM_SummaryDisplay/repeats:2 ", MR_Not}, + {"^\"BM_SummaryDisplay/repeats:2_mean\",%csv_report$"}, + {"^\"BM_SummaryDisplay/repeats:2_median\",%csv_report$"}, + {"^\"BM_SummaryDisplay/repeats:2_stddev\",%csv_report$"}}); + +// Test repeats with custom time unit. 
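+// (Illustrative aside, not part of the upstream test: Unit() affects only how
+// results are printed; the measurement itself is unchanged. For slow
+// benchmarks the default nanoseconds get unwieldy, so a typical declaration,
+// with BM_io as an invented name, is:
+//
+//   BENCHMARK(BM_io)->Unit(benchmark::kMillisecond);
+//
+// which is the same mechanism the kMicrosecond cases below rely on.)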
+void BM_RepeatTimeUnit(benchmark::State& state) { +  for (auto _ : state) { +  } +} +BENCHMARK(BM_RepeatTimeUnit) +    ->Repetitions(3) +    ->ReportAggregatesOnly() +    ->Unit(benchmark::kMicrosecond); +ADD_CASES( +    TC_ConsoleOut, +    {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, +     {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_time_only_report [ ]*3$"}, +     {"^BM_RepeatTimeUnit/repeats:3_median %console_us_time_only_report [ " +      "]*3$"}, +     {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_time_only_report [ " +      "]*3$"}}); +ADD_CASES(TC_JSONOut, +          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, +           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"}, +           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, +           {"\"run_type\": \"aggregate\",$", MR_Next}, +           {"\"repetitions\": 3,$", MR_Next}, +           {"\"threads\": 1,$", MR_Next}, +           {"\"aggregate_name\": \"mean\",$", MR_Next}, +           {"\"iterations\": 3,$", MR_Next}, +           {"\"time_unit\": \"us\",?$"}, +           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"}, +           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, +           {"\"run_type\": \"aggregate\",$", MR_Next}, +           {"\"repetitions\": 3,$", MR_Next}, +           {"\"threads\": 1,$", MR_Next}, +           {"\"aggregate_name\": \"median\",$", MR_Next}, +           {"\"iterations\": 3,$", MR_Next}, +           {"\"time_unit\": \"us\",?$"}, +           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"}, +           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next}, +           {"\"run_type\": \"aggregate\",$", MR_Next}, +           {"\"repetitions\": 3,$", MR_Next}, +           {"\"threads\": 1,$", MR_Next}, +           {"\"aggregate_name\": \"stddev\",$", MR_Next}, +           {"\"iterations\": 3,$", MR_Next}, +           {"\"time_unit\": \"us\",?$"}}); +ADD_CASES(TC_CSVOut, +          {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not}, +           {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"}, +           {"^\"BM_RepeatTimeUnit/repeats:3_median\",%csv_us_report$"}, +           {"^\"BM_RepeatTimeUnit/repeats:3_stddev\",%csv_us_report$"}}); + +// ========================================================================= // +// -------------------- Testing user-provided statistics ------------------- // +// ========================================================================= // + +const auto UserStatistics = [](const std::vector<double>& v) { +  return v.back(); +}; +void BM_UserStats(benchmark::State& state) { +  for (auto _ : state) { +    state.SetIterationTime(150 / 10e8); +  } +} +// clang-format off +BENCHMARK(BM_UserStats) +  ->Repetitions(3) +  ->Iterations(5) +  ->UseManualTime() +  ->ComputeStatistics("", UserStatistics); +// clang-format on + +// check that the user-provided statistic is calculated, and comes after the +// default ones. The empty string as name is intentional: it sorts before +// anything else. +ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ " +                           "]* 150 ns %time [ ]*5$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ " +                           "]* 150 ns %time [ ]*5$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ " +                           "]* 150 ns %time [ ]*5$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/" +                           "manual_time_mean [ ]* 150 ns %time [ ]*3$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/" +                           "manual_time_median [ ]* 150 ns %time [ ]*3$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/" +                           "manual_time_stddev [ ]* 0.000 ns %time [ ]*3$"}, +                          {"^BM_UserStats/iterations:5/repeats:3/manual_time_ " +                           "[ ]* 150 ns %time [ ]*3$"}}); +ADD_CASES( +    TC_JSONOut, +    {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, +     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", +      MR_Next}, +     {"\"run_type\": \"iteration\",$", MR_Next}, 
+ {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"repetition_index\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": 5,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"}, + {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$", + MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 3,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"\",$", MR_Next}, + {"\"iterations\": 3,$", MR_Next}, + {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/" + "manual_time_median\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/" + "manual_time_stddev\",%csv_report$"}, + {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}}); + +// ========================================================================= // +// ------------------------- Testing StrEscape JSON ------------------------ // +// ========================================================================= // +#if 0 // enable when csv testing code correctly handles multi-line fields +void 
BM_JSON_Format(benchmark::State& state) { +  state.SkipWithError("val\b\f\n\r\t\\\"with\"es,capes"); +  for (auto _ : state) { +  } +} +BENCHMARK(BM_JSON_Format); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_JSON_Format\",$"}, +                       {"\"run_name\": \"BM_JSON_Format\",$", MR_Next}, +                       {"\"run_type\": \"iteration\",$", MR_Next}, +                       {"\"repetitions\": 0,$", MR_Next}, +                       {"\"repetition_index\": 0,$", MR_Next}, +                       {"\"threads\": 1,$", MR_Next}, +                       {"\"error_occurred\": true,$", MR_Next}, +                       {R"("error_message": "val\\b\\f\\n\\r\\t\\\\\\"with\\"es,capes",$)", MR_Next}}); +#endif +// ========================================================================= // +// -------------------------- Testing CsvEscape ---------------------------- // +// ========================================================================= // + +void BM_CSV_Format(benchmark::State& state) { +  state.SkipWithError("\"freedom\""); +  for (auto _ : state) { +  } +} +BENCHMARK(BM_CSV_Format); +ADD_CASES(TC_CSVOut, {{"^\"BM_CSV_Format\",,,,,,,,true,\"\"\"freedom\"\"\"$"}}); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/test/skip_with_error_test.cc b/thirdparty/benchmark-1.5.0/test/skip_with_error_test.cc new file mode 100644 index 0000000000..06579772ff --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/skip_with_error_test.cc @@ -0,0 +1,189 @@ + +#undef NDEBUG +#include <cassert> +#include <vector> + +#include "../src/check.h"  // NOTE: check.h is for internal use only! +#include "benchmark/benchmark.h" + +namespace { + +class TestReporter : public benchmark::ConsoleReporter { + public: +  virtual bool ReportContext(const Context& context) { +    return ConsoleReporter::ReportContext(context); +  } + +  virtual void ReportRuns(const std::vector<Run>& report) { +    all_runs_.insert(all_runs_.end(), begin(report), end(report)); +    ConsoleReporter::ReportRuns(report); +  } + +  TestReporter() {} +  virtual ~TestReporter() {} + +  mutable std::vector<Run> all_runs_; +}; + +struct TestCase { +  std::string name; +  bool error_occurred; +  std::string error_message; + +  typedef benchmark::BenchmarkReporter::Run Run; + +  void CheckRun(Run const& run) const { +    CHECK(name == run.benchmark_name()) +        << "expected " << name << " got " << run.benchmark_name(); +    CHECK(error_occurred == run.error_occurred); +    CHECK(error_message == run.error_message); +    if (error_occurred) { +      // CHECK(run.iterations == 0); +    } else { +      CHECK(run.iterations != 0); +    } +  } +}; + +std::vector<TestCase> ExpectedResults; + +int AddCases(const char* base_name, std::initializer_list<TestCase> const& v) { +  for (auto TC : v) { +    TC.name = base_name + TC.name; +    ExpectedResults.push_back(std::move(TC)); +  } +  return 0; +} + +#define CONCAT(x, y) CONCAT2(x, y) +#define CONCAT2(x, y) x##y +#define ADD_CASES(...) 
int CONCAT(dummy, __LINE__) = AddCases(__VA_ARGS__) + +} // end namespace + +void BM_error_before_running(benchmark::State& state) { + state.SkipWithError("error message"); + while (state.KeepRunning()) { + assert(false); + } +} +BENCHMARK(BM_error_before_running); +ADD_CASES("BM_error_before_running", {{"", true, "error message"}}); + +void BM_error_before_running_batch(benchmark::State& state) { + state.SkipWithError("error message"); + while (state.KeepRunningBatch(17)) { + assert(false); + } +} +BENCHMARK(BM_error_before_running_batch); +ADD_CASES("BM_error_before_running_batch", {{"", true, "error message"}}); + +void BM_error_before_running_range_for(benchmark::State& state) { + state.SkipWithError("error message"); + for (auto _ : state) { + assert(false); + } +} +BENCHMARK(BM_error_before_running_range_for); +ADD_CASES("BM_error_before_running_range_for", {{"", true, "error message"}}); + +void BM_error_during_running(benchmark::State& state) { + int first_iter = true; + while (state.KeepRunning()) { + if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { + assert(first_iter); + first_iter = false; + state.SkipWithError("error message"); + } else { + state.PauseTiming(); + state.ResumeTiming(); + } + } +} +BENCHMARK(BM_error_during_running)->Arg(1)->Arg(2)->ThreadRange(1, 8); +ADD_CASES("BM_error_during_running", {{"/1/threads:1", true, "error message"}, + {"/1/threads:2", true, "error message"}, + {"/1/threads:4", true, "error message"}, + {"/1/threads:8", true, "error message"}, + {"/2/threads:1", false, ""}, + {"/2/threads:2", false, ""}, + {"/2/threads:4", false, ""}, + {"/2/threads:8", false, ""}}); + +void BM_error_during_running_ranged_for(benchmark::State& state) { + assert(state.max_iterations > 3 && "test requires at least a few iterations"); + int first_iter = true; + // NOTE: Users should not write the for loop explicitly. + for (auto It = state.begin(), End = state.end(); It != End; ++It) { + if (state.range(0) == 1) { + assert(first_iter); + first_iter = false; + state.SkipWithError("error message"); + // Test the unfortunate but documented behavior that the ranged-for loop + // doesn't automatically terminate when SkipWithError is set. 
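+    // (Illustrative aside: the documented contract is that after calling
+    // SkipWithError() the user code must leave the loop itself, e.g.
+    //
+    //   for (auto _ : state) {
+    //     if (!SetupResource()) {  // SetupResource is an invented name
+    //       state.SkipWithError("resource unavailable");
+    //       break;
+    //     }
+    //     /* timed work */
+    //   }
+    //
+    // The assert and break below pin down exactly that contract: one more
+    // increment still succeeds, and the loop is then exited explicitly.)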
+      assert(++It != End); +      break;  // Required behavior +    } +  } +} +BENCHMARK(BM_error_during_running_ranged_for)->Arg(1)->Arg(2)->Iterations(5); +ADD_CASES("BM_error_during_running_ranged_for", +          {{"/1/iterations:5", true, "error message"}, +           {"/2/iterations:5", false, ""}}); + +void BM_error_after_running(benchmark::State& state) { +  for (auto _ : state) { +    benchmark::DoNotOptimize(state.iterations()); +  } +  if (state.thread_index <= (state.threads / 2)) +    state.SkipWithError("error message"); +} +BENCHMARK(BM_error_after_running)->ThreadRange(1, 8); +ADD_CASES("BM_error_after_running", {{"/threads:1", true, "error message"}, +                                     {"/threads:2", true, "error message"}, +                                     {"/threads:4", true, "error message"}, +                                     {"/threads:8", true, "error message"}}); + +void BM_error_while_paused(benchmark::State& state) { +  bool first_iter = true; +  while (state.KeepRunning()) { +    if (state.range(0) == 1 && state.thread_index <= (state.threads / 2)) { +      assert(first_iter); +      first_iter = false; +      state.PauseTiming(); +      state.SkipWithError("error message"); +    } else { +      state.PauseTiming(); +      state.ResumeTiming(); +    } +  } +} +BENCHMARK(BM_error_while_paused)->Arg(1)->Arg(2)->ThreadRange(1, 8); +ADD_CASES("BM_error_while_paused", {{"/1/threads:1", true, "error message"}, +                                    {"/1/threads:2", true, "error message"}, +                                    {"/1/threads:4", true, "error message"}, +                                    {"/1/threads:8", true, "error message"}, +                                    {"/2/threads:1", false, ""}, +                                    {"/2/threads:2", false, ""}, +                                    {"/2/threads:4", false, ""}, +                                    {"/2/threads:8", false, ""}}); + +int main(int argc, char* argv[]) { +  benchmark::Initialize(&argc, argv); + +  TestReporter test_reporter; +  benchmark::RunSpecifiedBenchmarks(&test_reporter); + +  typedef benchmark::BenchmarkReporter::Run Run; +  auto EB = ExpectedResults.begin(); + +  for (Run const& run : test_reporter.all_runs_) { +    assert(EB != ExpectedResults.end()); +    EB->CheckRun(run); +    ++EB; +  } +  assert(EB == ExpectedResults.end()); + +  return 0; +} diff --git a/thirdparty/benchmark-1.5.0/test/state_assembly_test.cc b/thirdparty/benchmark-1.5.0/test/state_assembly_test.cc new file mode 100644 index 0000000000..7ddbb3b2a9 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/state_assembly_test.cc @@ -0,0 +1,68 @@ +#include <benchmark/benchmark.h> + +#ifdef __clang__ +#pragma clang diagnostic ignored "-Wreturn-type" +#endif + +// clang-format off +extern "C" { +  extern int ExternInt; +  benchmark::State& GetState(); +  void Fn(); +} +// clang-format on + +using benchmark::State; + +// CHECK-LABEL: test_for_auto_loop: +extern "C" int test_for_auto_loop() { +  State& S = GetState(); +  int x = 42; +  // CHECK: [[CALL:call(q)*]] _ZN9benchmark5State16StartKeepRunningEv +  // CHECK-NEXT: testq %rbx, %rbx +  // CHECK-NEXT: je [[LOOP_END:.*]] + +  for (auto _ : S) { +    // CHECK: .L[[LOOP_HEAD:[a-zA-Z0-9_]+]]: +    // CHECK-GNU-NEXT: subq $1, %rbx +    // CHECK-CLANG-NEXT: {{(addq \$1, %rax|incq %rax|addq \$-1, %rbx)}} +    // CHECK-NEXT: jne .L[[LOOP_HEAD]] +    benchmark::DoNotOptimize(x); +  } +  // CHECK: [[LOOP_END]]: +  // CHECK: [[CALL]] _ZN9benchmark5State17FinishKeepRunningEv + +  // CHECK: movl $101, %eax +  // CHECK: ret +  return 101; +} + +// CHECK-LABEL: test_while_loop: +extern "C" int test_while_loop() { +  State& S = GetState(); +  int x = 42; + +  // CHECK: j{{(e|mp)}} .L[[LOOP_HEADER:[a-zA-Z0-9_]+]] +  // CHECK-NEXT: .L[[LOOP_BODY:[a-zA-Z0-9_]+]]: +  while (S.KeepRunning()) { +    // CHECK-GNU-NEXT: subq $1, %[[IREG:[a-z]+]] +    // CHECK-CLANG-NEXT: {{(addq \$-1,|decq)}} %[[IREG:[a-z]+]] +    // CHECK: movq %[[IREG]], [[DEST:.*]] +    benchmark::DoNotOptimize(x); +  } +  // 
CHECK-DAG: movq [[DEST]], %[[IREG]] +  // CHECK-DAG: testq %[[IREG]], %[[IREG]] +  // CHECK-DAG: jne .L[[LOOP_BODY]] +  // CHECK-DAG: .L[[LOOP_HEADER]]: + +  // CHECK: cmpb $0 +  // CHECK-NEXT: jne .L[[LOOP_END:[a-zA-Z0-9_]+]] +  // CHECK: [[CALL:call(q)*]] _ZN9benchmark5State16StartKeepRunningEv + +  // CHECK: .L[[LOOP_END]]: +  // CHECK: [[CALL]] _ZN9benchmark5State17FinishKeepRunningEv + +  // CHECK: movl $101, %eax +  // CHECK: ret +  return 101; +} diff --git a/thirdparty/benchmark-1.5.0/test/statistics_gtest.cc b/thirdparty/benchmark-1.5.0/test/statistics_gtest.cc new file mode 100644 index 0000000000..99e314920c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/statistics_gtest.cc @@ -0,0 +1,28 @@ +//===---------------------------------------------------------------------===// +// statistics_test - Unit tests for src/statistics.cc +//===---------------------------------------------------------------------===// + +#include "../src/statistics.h" +#include "gtest/gtest.h" + +namespace { +TEST(StatisticsTest, Mean) { +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({42, 42, 42, 42}), 42.0); +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 3, 4}), 2.5); +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMean({1, 2, 5, 10, 10, 14}), 7.0); +} + +TEST(StatisticsTest, Median) { +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({42, 42, 42, 42}), 42.0); +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 3, 4}), 2.5); +  EXPECT_DOUBLE_EQ(benchmark::StatisticsMedian({1, 2, 5, 10, 10}), 5.0); +} + +TEST(StatisticsTest, StdDev) { +  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({101, 101, 101, 101}), 0.0); +  EXPECT_DOUBLE_EQ(benchmark::StatisticsStdDev({1, 2, 3}), 1.0); +  EXPECT_FLOAT_EQ(benchmark::StatisticsStdDev({1.5, 2.4, 3.3, 4.2, 5.1}), +                  1.42302495); +} + +}  // end namespace diff --git a/thirdparty/benchmark-1.5.0/test/string_util_gtest.cc b/thirdparty/benchmark-1.5.0/test/string_util_gtest.cc new file mode 100644 index 0000000000..2c5d073f61 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/string_util_gtest.cc @@ -0,0 +1,146 @@ +//===---------------------------------------------------------------------===// +// string_util_test - Unit tests for src/string_util.cc +//===---------------------------------------------------------------------===// + +#include "../src/string_util.h" +#include "gtest/gtest.h" + +namespace { +TEST(StringUtilTest, stoul) { +  { +    size_t pos = 0; +    EXPECT_EQ(0ul, benchmark::stoul("0", &pos)); +    EXPECT_EQ(1ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(7ul, benchmark::stoul("7", &pos)); +    EXPECT_EQ(1ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(135ul, benchmark::stoul("135", &pos)); +    EXPECT_EQ(3ul, pos); +  } +#if ULONG_MAX == 0xFFFFFFFFul +  { +    size_t pos = 0; +    EXPECT_EQ(0xFFFFFFFFul, benchmark::stoul("4294967295", &pos)); +    EXPECT_EQ(10ul, pos); +  } +#elif ULONG_MAX == 0xFFFFFFFFFFFFFFFFul +  { +    size_t pos = 0; +    EXPECT_EQ(0xFFFFFFFFFFFFFFFFul, benchmark::stoul("18446744073709551615", &pos)); +    EXPECT_EQ(20ul, pos); +  } +#endif +  { +    size_t pos = 0; +    EXPECT_EQ(10ul, benchmark::stoul("1010", &pos, 2)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(520ul, benchmark::stoul("1010", &pos, 8)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(1010ul, benchmark::stoul("1010", &pos, 10)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(4112ul, benchmark::stoul("1010", &pos, 16)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(0xBEEFul, benchmark::stoul("BEEF", &pos, 16)); +    EXPECT_EQ(4ul, pos); +  } +  { +    
ASSERT_THROW(benchmark::stoul("this is a test"), std::invalid_argument); +  } +} + +TEST(StringUtilTest, stoi) { +  { +    size_t pos = 0; +    EXPECT_EQ(0, benchmark::stoi("0", &pos)); +    EXPECT_EQ(1ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(-17, benchmark::stoi("-17", &pos)); +    EXPECT_EQ(3ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(1357, benchmark::stoi("1357", &pos)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(10, benchmark::stoi("1010", &pos, 2)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(520, benchmark::stoi("1010", &pos, 8)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(1010, benchmark::stoi("1010", &pos, 10)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(4112, benchmark::stoi("1010", &pos, 16)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(0xBEEF, benchmark::stoi("BEEF", &pos, 16)); +    EXPECT_EQ(4ul, pos); +  } +  { +    ASSERT_THROW(benchmark::stoi("this is a test"), std::invalid_argument); +  } +} + +TEST(StringUtilTest, stod) { +  { +    size_t pos = 0; +    EXPECT_EQ(0.0, benchmark::stod("0", &pos)); +    EXPECT_EQ(1ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(-84.0, benchmark::stod("-84", &pos)); +    EXPECT_EQ(3ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(1234.0, benchmark::stod("1234", &pos)); +    EXPECT_EQ(4ul, pos); +  } +  { +    size_t pos = 0; +    EXPECT_EQ(1.5, benchmark::stod("1.5", &pos)); +    EXPECT_EQ(3ul, pos); +  } +  { +    size_t pos = 0; +    /* Note: exactly representable as double */ +    EXPECT_EQ(-1.25e+9, benchmark::stod("-1.25e+9", &pos)); +    EXPECT_EQ(8ul, pos); +  } +  { +    ASSERT_THROW(benchmark::stod("this is a test"), std::invalid_argument); +  } +} + +}  // end namespace diff --git a/thirdparty/benchmark-1.5.0/test/templated_fixture_test.cc b/thirdparty/benchmark-1.5.0/test/templated_fixture_test.cc new file mode 100644 index 0000000000..fe9865cc77 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/templated_fixture_test.cc @@ -0,0 +1,28 @@ + +#include "benchmark/benchmark.h" + +#include <cassert> +#include <memory> + +template <typename T> +class MyFixture : public ::benchmark::Fixture { + public: +  MyFixture() : data(0) {} + +  T data; +}; + +BENCHMARK_TEMPLATE_F(MyFixture, Foo, int)(benchmark::State& st) { +  for (auto _ : st) { +    data += 1; +  } +} + +BENCHMARK_TEMPLATE_DEFINE_F(MyFixture, Bar, double)(benchmark::State& st) { +  for (auto _ : st) { +    data += 1.0; +  } +} +BENCHMARK_REGISTER_F(MyFixture, Bar); + +BENCHMARK_MAIN(); diff --git a/thirdparty/benchmark-1.5.0/test/user_counters_tabular_test.cc b/thirdparty/benchmark-1.5.0/test/user_counters_tabular_test.cc new file mode 100644 index 0000000000..099464ef99 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/user_counters_tabular_test.cc @@ -0,0 +1,283 @@ + +#undef NDEBUG + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// @todo: this checks the full output at once; the rule for +// CounterSet1 was failing because it was not matching "^[-]+$". +// @todo: check that the counters are vertically aligned.
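+// (Illustrative aside, not part of the upstream test: each benchmark below
+// publishes counters by inserting named benchmark::Counter values, e.g.
+//
+//   state.counters.insert({{"Foo", {1, benchmark::Counter::kAvgThreads}}});
+//
+// kAvgThreads averages the value over the participating threads, which is why
+// the tabular columns checked here stay constant across the ThreadRange runs.)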
+ADD_CASES( + TC_ConsoleOut, + { + // keeping these lines long improves readability, so: + // clang-format off + {"^[-]+$", MR_Next}, + {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Bat %s Baz %s Foo %s Frob %s Lob$", MR_Next}, + {"^[-]+$", MR_Next}, + {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_Counters_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, + {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, + {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, + {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, + {"^BM_CounterRates_Tabular/threads:%int %console_report [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s [ ]*%hrfloat/s$", MR_Next}, + {"^[-]+$", MR_Next}, + {"^Benchmark %s Time %s CPU %s Iterations %s Bar %s Baz %s Foo$", MR_Next}, + {"^[-]+$", MR_Next}, + {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet0_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet1_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^[-]+$", MR_Next}, + {"^Benchmark %s Time %s CPU %s Iterations %s Bat %s Baz %s Foo$", MR_Next}, + {"^[-]+$", MR_Next}, + {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", MR_Next}, + {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$", 
MR_Next}, + {"^BM_CounterSet2_Tabular/threads:%int %console_report [ ]*%hrfloat [ ]*%hrfloat [ ]*%hrfloat$"}, + // clang-format on + }); +ADD_CASES(TC_CSVOut, {{"%csv_header," + "\"Bar\",\"Bat\",\"Baz\",\"Foo\",\"Frob\",\"Lob\""}}); + +// ========================================================================= // +// ------------------------- Tabular Counters Output ----------------------- // +// ========================================================================= // + +void BM_Counters_Tabular(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"Foo", {1, bm::Counter::kAvgThreads}}, + {"Bar", {2, bm::Counter::kAvgThreads}}, + {"Baz", {4, bm::Counter::kAvgThreads}}, + {"Bat", {8, bm::Counter::kAvgThreads}}, + {"Frob", {16, bm::Counter::kAvgThreads}}, + {"Lob", {32, bm::Counter::kAvgThreads}}, + }); +} +BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float,$", MR_Next}, + {"\"Frob\": %float,$", MR_Next}, + {"\"Lob\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report," + "%float,%float,%float,%float,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckTabular(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 1); + CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 2); + CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 4); + CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 8); + CHECK_COUNTER_VALUE(e, int, "Frob", EQ, 16); + CHECK_COUNTER_VALUE(e, int, "Lob", EQ, 32); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Tabular/threads:%int", &CheckTabular); + +// ========================================================================= // +// -------------------- Tabular+Rate Counters Output ----------------------- // +// ========================================================================= // + +void BM_CounterRates_Tabular(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"Foo", {1, bm::Counter::kAvgThreadsRate}}, + {"Bar", {2, bm::Counter::kAvgThreadsRate}}, + {"Baz", {4, bm::Counter::kAvgThreadsRate}}, + {"Bat", {8, bm::Counter::kAvgThreadsRate}}, + {"Frob", {16, bm::Counter::kAvgThreadsRate}}, + {"Lob", {32, bm::Counter::kAvgThreadsRate}}, + }); +} +BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", 
MR_Next}, + {"\"Foo\": %float,$", MR_Next}, + {"\"Frob\": %float,$", MR_Next}, + {"\"Lob\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report," + "%float,%float,%float,%float,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckTabularRate(Results const& e) { + double t = e.DurationCPUTime(); + CHECK_FLOAT_COUNTER_VALUE(e, "Foo", EQ, 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Bar", EQ, 2. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Baz", EQ, 4. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Bat", EQ, 8. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Frob", EQ, 16. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "Lob", EQ, 32. / t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_CounterRates_Tabular/threads:%int", + &CheckTabularRate); + +// ========================================================================= // +// ------------------------- Tabular Counters Output ----------------------- // +// ========================================================================= // + +// set only some of the counters +void BM_CounterSet0_Tabular(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"Foo", {10, bm::Counter::kAvgThreads}}, + {"Bar", {20, bm::Counter::kAvgThreads}}, + {"Baz", {40, bm::Counter::kAvgThreads}}, + }); +} +BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report," + "%float,,%float,%float,,"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckSet0(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); + CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 20); + CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); +} +CHECK_BENCHMARK_RESULTS("BM_CounterSet0_Tabular", &CheckSet0); + +// again. 
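+// (Illustrative aside: in the CSV pattern just above, unset counters appear as
+// empty fields. With the header order Bar,Bat,Baz,Foo,Frob,Lob, a benchmark
+// that sets only Foo/Bar/Baz matches "%float,,%float,%float,," with Bat, Frob
+// and Lob left blank. BM_CounterSet1_Tabular below repeats the exercise with
+// different values, and BM_CounterSet2_Tabular with a different counter set.)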
+void BM_CounterSet1_Tabular(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"Foo", {15, bm::Counter::kAvgThreads}}, + {"Bar", {25, bm::Counter::kAvgThreads}}, + {"Baz", {45, bm::Counter::kAvgThreads}}, + }); +} +BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bar\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report," + "%float,,%float,%float,,"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckSet1(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 15); + CHECK_COUNTER_VALUE(e, int, "Bar", EQ, 25); + CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 45); +} +CHECK_BENCHMARK_RESULTS("BM_CounterSet1_Tabular/threads:%int", &CheckSet1); + +// ========================================================================= // +// ------------------------- Tabular Counters Output ----------------------- // +// ========================================================================= // + +// set only some of the counters, different set now. +void BM_CounterSet2_Tabular(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"Foo", {10, bm::Counter::kAvgThreads}}, + {"Bat", {30, bm::Counter::kAvgThreads}}, + {"Baz", {40, bm::Counter::kAvgThreads}}, + }); +} +BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"}, + {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"Bat\": %float,$", MR_Next}, + {"\"Baz\": %float,$", MR_Next}, + {"\"Foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report," + ",%float,%float,%float,,"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckSet2(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "Foo", EQ, 10); + CHECK_COUNTER_VALUE(e, int, "Bat", EQ, 30); + CHECK_COUNTER_VALUE(e, int, "Baz", EQ, 40); +} +CHECK_BENCHMARK_RESULTS("BM_CounterSet2_Tabular", &CheckSet2); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/test/user_counters_test.cc b/thirdparty/benchmark-1.5.0/test/user_counters_test.cc new file mode 100644 index 
0000000000..0775bc01f7 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/user_counters_test.cc @@ -0,0 +1,438 @@ + +#undef NDEBUG + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// ========================================================================= // +// ---------------------- Testing Prologue Output -------------------------- // +// ========================================================================= // + +// clang-format off + +ADD_CASES(TC_ConsoleOut, + {{"^[-]+$", MR_Next}, + {"^Benchmark %s Time %s CPU %s Iterations UserCounters...$", MR_Next}, + {"^[-]+$", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"%csv_header,\"bar\",\"foo\""}}); + +// clang-format on + +// ========================================================================= // +// ------------------------- Simple Counters Output ------------------------ // +// ========================================================================= // + +void BM_Counters_Simple(benchmark::State& state) { + for (auto _ : state) { + } + state.counters["foo"] = 1; + state.counters["bar"] = 2 * (double)state.iterations(); +} +BENCHMARK(BM_Counters_Simple); +ADD_CASES(TC_ConsoleOut, + {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"}, + {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Simple\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckSimple(Results const& e) { + double its = e.NumIterations(); + CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); + // check that the value of bar is within 0.1% of the expected value + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
* its, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Simple", &CheckSimple); + +// ========================================================================= // +// --------------------- Counters+Items+Bytes/s Output --------------------- // +// ========================================================================= // + +namespace { +int num_calls1 = 0; +} +void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) { + for (auto _ : state) { + } + state.counters["foo"] = 1; + state.counters["bar"] = ++num_calls1; + state.SetBytesProcessed(364); + state.SetItemsProcessed(150); +} +BENCHMARK(BM_Counters_WithBytesAndItemsPSec); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report " + "bar=%hrfloat bytes_per_second=%hrfloat/s " + "foo=%hrfloat items_per_second=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"}, + {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"bytes_per_second\": %float,$", MR_Next}, + {"\"foo\": %float,$", MR_Next}, + {"\"items_per_second\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\"," + "%csv_bytes_items_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckBytesAndItemsPSec(Results const& e) { + double t = e.DurationCPUTime(); // this (and not real time) is the time used + CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); + CHECK_COUNTER_VALUE(e, int, "bar", EQ, num_calls1); + // check that the values are within 0.1% of the expected values + CHECK_FLOAT_RESULT_VALUE(e, "bytes_per_second", EQ, 364. / t, 0.001); + CHECK_FLOAT_RESULT_VALUE(e, "items_per_second", EQ, 150. 
/ t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_WithBytesAndItemsPSec", + &CheckBytesAndItemsPSec); + +// ========================================================================= // +// ------------------------- Rate Counters Output -------------------------- // +// ========================================================================= // + +void BM_Counters_Rate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kIsRate}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kIsRate}; +} +BENCHMARK(BM_Counters_Rate); +ADD_CASES( + TC_ConsoleOut, + {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"}, + {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Rate\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckRate(Results const& e) { + double t = e.DurationCPUTime(); // this (and not real time) is the time used + // check that the values are within 0.1% of the expected values + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Rate", &CheckRate); + +// ========================================================================= // +// ------------------------- Thread Counters Output ------------------------ // +// ========================================================================= // + +void BM_Counters_Threads(benchmark::State& state) { + for (auto _ : state) { + } + state.counters["foo"] = 1; + state.counters["bar"] = 2; +} +BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckThreads(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "foo", EQ, e.NumThreads()); + CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2 * e.NumThreads()); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Threads/threads:%int", &CheckThreads); + +// ========================================================================= // +// ---------------------- ThreadAvg Counters Output ------------------------ // +// 
========================================================================= // + +void BM_Counters_AvgThreads(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreads}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreads}; +} +BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int " + "%console_report bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgThreads(Results const& e) { + CHECK_COUNTER_VALUE(e, int, "foo", EQ, 1); + CHECK_COUNTER_VALUE(e, int, "bar", EQ, 2); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreads/threads:%int", + &CheckAvgThreads); + +// ========================================================================= // +// ---------------------- ThreadAvg Counters Output ------------------------ // +// ========================================================================= // + +void BM_Counters_AvgThreadsRate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgThreadsRate}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgThreadsRate}; +} +BENCHMARK(BM_Counters_AvgThreadsRate)->ThreadRange(1, 8); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"}, + {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_AvgThreadsRate/" + "threads:%int\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgThreadsRate(Results const& e) { + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / e.DurationCPUTime(), 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
/ e.DurationCPUTime(), 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_AvgThreadsRate/threads:%int", + &CheckAvgThreadsRate); + +// ========================================================================= // +// ------------------- IterationInvariant Counters Output ------------------ // +// ========================================================================= // + +void BM_Counters_IterationInvariant(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kIsIterationInvariant}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kIsIterationInvariant}; +} +BENCHMARK(BM_Counters_IterationInvariant); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_IterationInvariant\",$"}, + {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{"^\"BM_Counters_IterationInvariant\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckIterationInvariant(Results const& e) { + double its = e.NumIterations(); + // check that the values are within 0.1% of the expected value + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. 
* its, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_IterationInvariant", + &CheckIterationInvariant); + +// ========================================================================= // +// ----------------- IterationInvariantRate Counters Output ---------------- // +// ========================================================================= // + +void BM_Counters_kIsIterationInvariantRate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = + bm::Counter{1, bm::Counter::kIsIterationInvariantRate}; + state.counters["bar"] = + bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kIsIterationInvariant}; +} +BENCHMARK(BM_Counters_kIsIterationInvariantRate); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"}, + {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$", + MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kIsIterationInvariantRate\",%csv_report," + "%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckIsIterationInvariantRate(Results const& e) { + double its = e.NumIterations(); + double t = e.DurationCPUTime(); // this (and not real time) is the time used + // check that the values are within 0.1% of the expected values + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, its * 1. / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, its * 2. 
/ t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_kIsIterationInvariantRate", + &CheckIsIterationInvariantRate); + +// ========================================================================= // +// ------------------- AvgIterations Counters Output ------------------ // +// ========================================================================= // + +void BM_Counters_AvgIterations(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterations}; + state.counters["bar"] = bm::Counter{2, bm::Counter::kAvgIterations}; +} +BENCHMARK(BM_Counters_AvgIterations); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report " + "bar=%hrfloat foo=%hrfloat$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_AvgIterations\",$"}, + {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, + {{"^\"BM_Counters_AvgIterations\",%csv_report,%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgIterations(Results const& e) { + double its = e.NumIterations(); + // check that the values are within 0.1% of the expected value + CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_AvgIterations", &CheckAvgIterations); + +// ========================================================================= // +// ----------------- AvgIterationsRate Counters Output ---------------- // +// ========================================================================= // + +void BM_Counters_kAvgIterationsRate(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters["foo"] = bm::Counter{1, bm::Counter::kAvgIterationsRate}; + state.counters["bar"] = + bm::Counter{2, bm::Counter::kIsRate | bm::Counter::kAvgIterations}; +} +BENCHMARK(BM_Counters_kAvgIterationsRate); +ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate " + "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"}, + {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 0,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"bar\": %float,$", MR_Next}, + {"\"foo\": %float$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kAvgIterationsRate\",%csv_report," + "%float,%float$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckAvgIterationsRate(Results const& e) { + double its = e.NumIterations(); + double t = e.DurationCPUTime(); // this (and not real time) is the time used + // check that the values are within 0.1% of the expected values + 
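// kAvgIterationsRate is kIsRate | kAvgIterations (as "bar" above spells out):
// the raw counter value is divided both by CPU time and by the iteration
// count, i.e. reported = v / t / its. For example, v = 2 over its = 1000
// iterations with t = 2e-3 s of CPU time reports 2 / 2e-3 / 1000 = 1. The
// expectations below apply exactly that formula to the raw values 1 ("foo")
// and 2 ("bar").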
CHECK_FLOAT_COUNTER_VALUE(e, "foo", EQ, 1. / its / t, 0.001); + CHECK_FLOAT_COUNTER_VALUE(e, "bar", EQ, 2. / its / t, 0.001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_kAvgIterationsRate", + &CheckAvgIterationsRate); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/test/user_counters_thousands_test.cc b/thirdparty/benchmark-1.5.0/test/user_counters_thousands_test.cc new file mode 100644 index 0000000000..21d8285ded --- /dev/null +++ b/thirdparty/benchmark-1.5.0/test/user_counters_thousands_test.cc @@ -0,0 +1,173 @@ + +#undef NDEBUG + +#include "benchmark/benchmark.h" +#include "output_test.h" + +// ========================================================================= // +// ------------------------ Thousands Customisation ------------------------ // +// ========================================================================= // + +void BM_Counters_Thousands(benchmark::State& state) { + for (auto _ : state) { + } + namespace bm = benchmark; + state.counters.insert({ + {"t0_1000000DefaultBase", + bm::Counter(1000 * 1000, bm::Counter::kDefaults)}, + {"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1000)}, + {"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024)}, + {"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1000)}, + {"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults, + benchmark::Counter::OneK::kIs1024)}, + }); +} +BENCHMARK(BM_Counters_Thousands)->Repetitions(2); +ADD_CASES( + TC_ConsoleOut, + { + {"^BM_Counters_Thousands/repeats:2 %console_report " + "t0_1000000DefaultBase=1000k " + "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k " + "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2 %console_report " + "t0_1000000DefaultBase=1000k " + "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k " + "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_mean %console_report " + "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k " + "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k " + "t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_median %console_report " + "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k " + "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k " + "t4_1048576Base1024=1024k$"}, + {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ " + "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 " + "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"}, + }); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 0,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + 
{"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"iteration\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"repetition_index\": 1,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"iterations\": %int,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"mean\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"median\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next}, + {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next}, + {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next}, + {"}", MR_Next}}); +ADD_CASES(TC_JSONOut, + {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"}, + {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next}, + {"\"run_type\": \"aggregate\",$", MR_Next}, + {"\"repetitions\": 2,$", MR_Next}, + {"\"threads\": 1,$", MR_Next}, + {"\"aggregate_name\": \"stddev\",$", MR_Next}, + {"\"iterations\": 2,$", MR_Next}, + {"\"real_time\": %float,$", MR_Next}, + {"\"cpu_time\": %float,$", MR_Next}, + {"\"time_unit\": \"ns\",$", MR_Next}, + {"\"t0_1000000DefaultBase\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t1_1000000Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t2_1000000Base1024\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t3_1048576Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next}, + {"\"t4_1048576Base1024\": 0\\.(0)*e\\+(0)*$", MR_Next}, + {"}", MR_Next}}); + +ADD_CASES( + TC_CSVOut, + {{"^\"BM_Counters_Thousands/" + "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+(" + 
"0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+(" + "0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + "repeats:2_mean\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\." + "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/" + "repeats:2_median\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\." + "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"}, + {"^\"BM_Counters_Thousands/repeats:2_stddev\",%csv_report,0,0,0,0,0$"}}); +// VS2013 does not allow this function to be passed as a lambda argument +// to CHECK_BENCHMARK_RESULTS() +void CheckThousands(Results const& e) { + if (e.name != "BM_Counters_Thousands/repeats:2") + return; // Do not check the aggregates! + + // check that the values are within 0.01% of the expected values + CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, 1000 * 1000, + 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t1_1000000Base1000", EQ, 1000 * 1000, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t2_1000000Base1024", EQ, 1000 * 1000, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t3_1048576Base1000", EQ, 1024 * 1024, 0.0001); + CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, 1024 * 1024, 0.0001); +} +CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands); + +// ========================================================================= // +// --------------------------- TEST CASES END ------------------------------ // +// ========================================================================= // + +int main(int argc, char* argv[]) { RunOutputTests(argc, argv); } diff --git a/thirdparty/benchmark-1.5.0/tools/compare.py b/thirdparty/benchmark-1.5.0/tools/compare.py new file mode 100755 index 0000000000..539ace6fb1 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/compare.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python + +import unittest +""" +compare.py - versatile benchmark output compare tool +""" + +import argparse +from argparse import ArgumentParser +import sys +import gbench +from gbench import util, report +from gbench.util import * + + +def check_inputs(in1, in2, flags): + """ + Perform checking on the user provided inputs and diagnose any abnormalities + """ + in1_kind, in1_err = classify_input_file(in1) + in2_kind, in2_err = classify_input_file(in2) + output_file = find_benchmark_flag('--benchmark_out=', flags) + output_type = find_benchmark_flag('--benchmark_out_format=', flags) + if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file: + print(("WARNING: '--benchmark_out=%s' will be passed to both " + "benchmarks causing it to be overwritten") % output_file) + if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0: + print("WARNING: passing optional flags has no effect since both " + "inputs are JSON") + if output_type is not None and output_type != 'json': + print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`" + " is not supported.") % output_type) + sys.exit(1) + + +def create_parser(): + parser = ArgumentParser( + description='versatile benchmark output compare tool') + + parser.add_argument( + '-a', + '--display_aggregates_only', + dest='display_aggregates_only', + action="store_true", + help="If there are repetitions, by default, we display everything - the" + " actual runs, and the aggregates computed. Sometimes, it is " + "desirable to only view the aggregates. E.g. when there are a lot " + "of repetitions. Do note that only the display is affected. 
" + "Internally, all the actual runs are still used, e.g. for U test.") + + utest = parser.add_argument_group() + utest.add_argument( + '--no-utest', + dest='utest', + default=True, + action="store_false", + help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS)) + alpha_default = 0.05 + utest.add_argument( + "--alpha", + dest='utest_alpha', + default=alpha_default, + type=float, + help=("significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") % + alpha_default) + + subparsers = parser.add_subparsers( + help='This tool has multiple modes of operation:', + dest='mode') + + parser_a = subparsers.add_parser( + 'benchmarks', + help='The most simple use-case, compare all the output of these two benchmarks') + baseline = parser_a.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test_baseline', + metavar='test_baseline', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + contender = parser_a.add_argument_group( + 'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'test_contender', + metavar='test_contender', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + parser_a.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + parser_b = subparsers.add_parser( + 'filters', help='Compare filter one with the filter two of benchmark') + baseline = parser_b.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test', + metavar='test', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + baseline.add_argument( + 'filter_baseline', + metavar='filter_baseline', + type=str, + nargs=1, + help='The first filter, that will be used as baseline') + contender = parser_b.add_argument_group( + 'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'filter_contender', + metavar='filter_contender', + type=str, + nargs=1, + help='The second filter, that will be compared against the baseline') + parser_b.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + parser_c = subparsers.add_parser( + 'benchmarksfiltered', + help='Compare filter one of first benchmark with filter two of the second benchmark') + baseline = parser_c.add_argument_group( + 'baseline', 'The benchmark baseline') + baseline.add_argument( + 'test_baseline', + metavar='test_baseline', + type=argparse.FileType('r'), + nargs=1, + help='A benchmark executable or JSON output file') + baseline.add_argument( + 'filter_baseline', + metavar='filter_baseline', + type=str, + nargs=1, + help='The first filter, that will be used as baseline') + contender = parser_c.add_argument_group( + 
'contender', 'The benchmark that will be compared against the baseline') + contender.add_argument( + 'test_contender', + metavar='test_contender', + type=argparse.FileType('r'), + nargs=1, + help='The second benchmark executable or JSON output file, that will be compared against the baseline') + contender.add_argument( + 'filter_contender', + metavar='filter_contender', + type=str, + nargs=1, + help='The second filter, that will be compared against the baseline') + parser_c.add_argument( + 'benchmark_options', + metavar='benchmark_options', + nargs=argparse.REMAINDER, + help='Arguments to pass when running benchmark executables') + + return parser + + +def main(): + # Parse the command line flags + parser = create_parser() + args, unknown_args = parser.parse_known_args() + if args.mode is None: + parser.print_help() + exit(1) + assert not unknown_args + benchmark_options = args.benchmark_options + + if args.mode == 'benchmarks': + test_baseline = args.test_baseline[0].name + test_contender = args.test_contender[0].name + filter_baseline = '' + filter_contender = '' + + # NOTE: if test_baseline == test_contender, you are analyzing the stdev + + description = 'Comparing %s to %s' % (test_baseline, test_contender) + elif args.mode == 'filters': + test_baseline = args.test[0].name + test_contender = args.test[0].name + filter_baseline = args.filter_baseline[0] + filter_contender = args.filter_contender[0] + + # NOTE: if filter_baseline == filter_contender, you are analyzing the + # stdev + + description = 'Comparing %s to %s (from %s)' % ( + filter_baseline, filter_contender, args.test[0].name) + elif args.mode == 'benchmarksfiltered': + test_baseline = args.test_baseline[0].name + test_contender = args.test_contender[0].name + filter_baseline = args.filter_baseline[0] + filter_contender = args.filter_contender[0] + + # NOTE: if test_baseline == test_contender and + # filter_baseline == filter_contender, you are analyzing the stdev + + description = 'Comparing %s (from %s) to %s (from %s)' % ( + filter_baseline, test_baseline, filter_contender, test_contender) + else: + # should never happen + print("Unrecognized mode of operation: '%s'" % args.mode) + parser.print_help() + exit(1) + + check_inputs(test_baseline, test_contender, benchmark_options) + + if args.display_aggregates_only: + benchmark_options += ['--benchmark_display_aggregates_only=true'] + + options_baseline = [] + options_contender = [] + + if filter_baseline and filter_contender: + options_baseline = ['--benchmark_filter=%s' % filter_baseline] + options_contender = ['--benchmark_filter=%s' % filter_contender] + + # Run the benchmarks and report the results + json1 = json1_orig = gbench.util.run_or_load_benchmark( + test_baseline, benchmark_options + options_baseline) + json2 = json2_orig = gbench.util.run_or_load_benchmark( + test_contender, benchmark_options + options_contender) + + # Now, filter the benchmarks so that the difference report can work + if filter_baseline and filter_contender: + replacement = '[%s vs. 
%s]' % (filter_baseline, filter_contender) + json1 = gbench.report.filter_benchmark( + json1_orig, filter_baseline, replacement) + json2 = gbench.report.filter_benchmark( + json2_orig, filter_contender, replacement) + + # Diff and output + output_lines = gbench.report.generate_difference_report( + json1, json2, args.display_aggregates_only, + args.utest, args.utest_alpha) + print(description) + for ln in output_lines: + print(ln) + + +class TestParser(unittest.TestCase): + def setUp(self): + self.parser = create_parser() + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'gbench', + 'Inputs') + self.testInput0 = os.path.join(testInputs, 'test1_run1.json') + self.testInput1 = os.path.join(testInputs, 'test1_run2.json') + + def test_benchmarks_basic(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest(self): + parsed = self.parser.parse_args( + ['--no-utest', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.05) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_display_aggregates_only(self): + parsed = self.parser.parse_args( + ['-a', 'benchmarks', self.testInput0, self.testInput1]) + self.assertTrue(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_basic_without_utest_with_utest_alpha(self): + parsed = self.parser.parse_args( + ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1]) + self.assertFalse(parsed.display_aggregates_only) + self.assertFalse(parsed.utest) + self.assertEqual(parsed.utest_alpha, 0.314) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertFalse(parsed.benchmark_options) + + def test_benchmarks_with_remainder(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1, 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + 
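# nargs=argparse.REMAINDER on benchmark_options collects everything left
# after the two positional inputs, so the bare trailing 'd' must end up
# there -- which the final assertion below verifies.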
self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.benchmark_options, ['d']) + + def test_benchmarks_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['benchmarks', self.testInput0, self.testInput1, '--', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarks') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.benchmark_options, ['e']) + + def test_filters_basic(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertFalse(parsed.benchmark_options) + + def test_filters_with_remainder(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd', 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertEqual(parsed.benchmark_options, ['e']) + + def test_filters_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['filters', self.testInput0, 'c', 'd', '--', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'filters') + self.assertEqual(parsed.test[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.filter_contender[0], 'd') + self.assertEqual(parsed.benchmark_options, ['f']) + + def test_benchmarksfiltered_basic(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertFalse(parsed.benchmark_options) + + def test_benchmarksfiltered_with_remainder(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertEqual(parsed.benchmark_options[0], 'f') + + def test_benchmarksfiltered_with_remainder_after_doubleminus(self): + parsed = self.parser.parse_args( + ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g']) + self.assertFalse(parsed.display_aggregates_only) + self.assertTrue(parsed.utest) + self.assertEqual(parsed.mode, 'benchmarksfiltered') + self.assertEqual(parsed.test_baseline[0].name, self.testInput0) + 
self.assertEqual(parsed.filter_baseline[0], 'c') + self.assertEqual(parsed.test_contender[0].name, self.testInput1) + self.assertEqual(parsed.filter_contender[0], 'e') + self.assertEqual(parsed.benchmark_options[0], 'g') + + +if __name__ == '__main__': + # unittest.main() + main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run1.json b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run1.json new file mode 100644 index 0000000000..601e327aef --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run1.json @@ -0,0 +1,119 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_SameTimes", + "iterations": 1000, + "real_time": 10, + "cpu_time": 10, + "time_unit": "ns" + }, + { + "name": "BM_2xFaster", + "iterations": 1000, + "real_time": 50, + "cpu_time": 50, + "time_unit": "ns" + }, + { + "name": "BM_2xSlower", + "iterations": 1000, + "real_time": 50, + "cpu_time": 50, + "time_unit": "ns" + }, + { + "name": "BM_1PercentFaster", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_1PercentSlower", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_10PercentFaster", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_10PercentSlower", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_100xSlower", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_100xFaster", + "iterations": 1000, + "real_time": 10000, + "cpu_time": 10000, + "time_unit": "ns" + }, + { + "name": "BM_10PercentCPUToTime", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_ThirdFaster", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_BigO", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "BigO", + "cpu_coefficient": 4.2749856294592886e+00, + "real_coefficient": 6.4789275289789780e+00, + "big_o": "N", + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_RMS", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "RMS", + "rms": 4.5097802512472874e-03 + }, + { + "name": "BM_NotBadTimeUnit", + "iterations": 1000, + "real_time": 0.4, + "cpu_time": 0.5, + "time_unit": "s" + }, + { + "name": "BM_DifferentTimeUnit", + "iterations": 1, + "real_time": 1, + "cpu_time": 1, + "time_unit": "s" + } + ] +} diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run2.json b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run2.json new file mode 100644 index 0000000000..3cbcf39b0c --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test1_run2.json @@ -0,0 +1,119 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_SameTimes", + "iterations": 1000, + "real_time": 10, + "cpu_time": 10, + "time_unit": "ns" + }, + { + "name": "BM_2xFaster", + "iterations": 
1000, + "real_time": 25, + "cpu_time": 25, + "time_unit": "ns" + }, + { + "name": "BM_2xSlower", + "iterations": 20833333, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_1PercentFaster", + "iterations": 1000, + "real_time": 98.9999999, + "cpu_time": 98.9999999, + "time_unit": "ns" + }, + { + "name": "BM_1PercentSlower", + "iterations": 1000, + "real_time": 100.9999999, + "cpu_time": 100.9999999, + "time_unit": "ns" + }, + { + "name": "BM_10PercentFaster", + "iterations": 1000, + "real_time": 90, + "cpu_time": 90, + "time_unit": "ns" + }, + { + "name": "BM_10PercentSlower", + "iterations": 1000, + "real_time": 110, + "cpu_time": 110, + "time_unit": "ns" + }, + { + "name": "BM_100xSlower", + "iterations": 1000, + "real_time": 1.0000e+04, + "cpu_time": 1.0000e+04, + "time_unit": "ns" + }, + { + "name": "BM_100xFaster", + "iterations": 1000, + "real_time": 100, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_10PercentCPUToTime", + "iterations": 1000, + "real_time": 110, + "cpu_time": 90, + "time_unit": "ns" + }, + { + "name": "BM_ThirdFaster", + "iterations": 1000, + "real_time": 66.665, + "cpu_time": 66.664, + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_BigO", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "BigO", + "cpu_coefficient": 5.6215779594361486e+00, + "real_coefficient": 5.6288314793554610e+00, + "big_o": "N", + "time_unit": "ns" + }, + { + "name": "MyComplexityTest_RMS", + "run_name": "MyComplexityTest", + "run_type": "aggregate", + "aggregate_name": "RMS", + "rms": 3.3128901852342174e-03 + }, + { + "name": "BM_NotBadTimeUnit", + "iterations": 1000, + "real_time": 0.04, + "cpu_time": 0.6, + "time_unit": "s" + }, + { + "name": "BM_DifferentTimeUnit", + "iterations": 1, + "real_time": 1, + "cpu_time": 1, + "time_unit": "ns" + } + ] +} diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test2_run.json b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test2_run.json new file mode 100644 index 0000000000..15bc698030 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test2_run.json @@ -0,0 +1,81 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_Hi", + "iterations": 1234, + "real_time": 42, + "cpu_time": 24, + "time_unit": "ms" + }, + { + "name": "BM_Zero", + "iterations": 1000, + "real_time": 10, + "cpu_time": 10, + "time_unit": "ns" + }, + { + "name": "BM_Zero/4", + "iterations": 4000, + "real_time": 40, + "cpu_time": 40, + "time_unit": "ns" + }, + { + "name": "Prefix/BM_Zero", + "iterations": 2000, + "real_time": 20, + "cpu_time": 20, + "time_unit": "ns" + }, + { + "name": "Prefix/BM_Zero/3", + "iterations": 3000, + "real_time": 30, + "cpu_time": 30, + "time_unit": "ns" + }, + { + "name": "BM_One", + "iterations": 5000, + "real_time": 5, + "cpu_time": 5, + "time_unit": "ns" + }, + { + "name": "BM_One/4", + "iterations": 2000, + "real_time": 20, + "cpu_time": 20, + "time_unit": "ns" + }, + { + "name": "Prefix/BM_One", + "iterations": 1000, + "real_time": 10, + "cpu_time": 10, + "time_unit": "ns" + }, + { + "name": "Prefix/BM_One/3", + "iterations": 1500, + "real_time": 15, + "cpu_time": 15, + "time_unit": "ns" + }, + { + "name": "BM_Bye", + "iterations": 5321, + "real_time": 11, + "cpu_time": 63, + "time_unit": "ns" + } + ] +} diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run0.json 
b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run0.json new file mode 100644 index 0000000000..49f8b06143 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run0.json @@ -0,0 +1,65 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_One", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 10, + "cpu_time": 100, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 9, + "cpu_time": 90, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 8, + "cpu_time": 86, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 8, + "cpu_time": 80, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 8, + "cpu_time": 77, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1000, + "real_time": 8, + "cpu_time": 80, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1000, + "real_time": 9, + "cpu_time": 82, + "time_unit": "ns" + } + ] +} diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run1.json b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run1.json new file mode 100644 index 0000000000..acc5ba17ae --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/Inputs/test3_run1.json @@ -0,0 +1,65 @@ +{ + "context": { + "date": "2016-08-02 17:44:46", + "num_cpus": 4, + "mhz_per_cpu": 4228, + "cpu_scaling_enabled": false, + "library_build_type": "release" + }, + "benchmarks": [ + { + "name": "BM_One", + "iterations": 1000, + "real_time": 9, + "cpu_time": 110, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 10, + "cpu_time": 89, + "time_unit": "ns" + }, + { + "name": "BM_Two", + "iterations": 1000, + "real_time": 7, + "cpu_time": 72, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 1000, + "real_time": 7, + "cpu_time": 75, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "aggregate", + "iterations": 762, + "real_time": 4.54, + "cpu_time": 66.6, + "time_unit": "ns" + }, + { + "name": "short", + "run_type": "iteration", + "iterations": 1000, + "real_time": 800, + "cpu_time": 1, + "time_unit": "ns" + }, + { + "name": "medium", + "run_type": "iteration", + "iterations": 1200, + "real_time": 5, + "cpu_time": 53, + "time_unit": "ns" + } + ] +} diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/__init__.py b/thirdparty/benchmark-1.5.0/tools/gbench/__init__.py new file mode 100644 index 0000000000..fce1a1acfb --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/__init__.py @@ -0,0 +1,8 @@ +"""Google Benchmark tooling""" + +__author__ = 'Eric Fiselier' +__email__ = 'eric@efcs.ca' +__versioninfo__ = (0, 5, 0) +__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev' + +__all__ = [] diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/report.py b/thirdparty/benchmark-1.5.0/tools/gbench/report.py new file mode 100644 index 0000000000..5bd3a8d85d --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/report.py @@ -0,0 +1,541 @@ +import unittest +"""report.py - Utilities for reporting statistics about benchmark results +""" +import os +import re +import copy + +from scipy.stats import mannwhitneyu + + +class 
BenchmarkColor(object): + def __init__(self, name, code): + self.name = name + self.code = code + + def __repr__(self): + return '%s%r' % (self.__class__.__name__, + (self.name, self.code)) + + def __format__(self, format): + return self.code + + +# Benchmark Colors Enumeration +BC_NONE = BenchmarkColor('NONE', '') +BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m') +BC_CYAN = BenchmarkColor('CYAN', '\033[96m') +BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m') +BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m') +BC_HEADER = BenchmarkColor('HEADER', '\033[92m') +BC_WARNING = BenchmarkColor('WARNING', '\033[93m') +BC_WHITE = BenchmarkColor('WHITE', '\033[97m') +BC_FAIL = BenchmarkColor('FAIL', '\033[91m') +BC_ENDC = BenchmarkColor('ENDC', '\033[0m') +BC_BOLD = BenchmarkColor('BOLD', '\033[1m') +BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m') + +UTEST_MIN_REPETITIONS = 2 +UTEST_OPTIMAL_REPETITIONS = 9 # Lowest reasonable number, More is better. +UTEST_COL_NAME = "_pvalue" + + +def color_format(use_color, fmt_str, *args, **kwargs): + """ + Return the result of 'fmt_str.format(*args, **kwargs)' after transforming + 'args' and 'kwargs' according to the value of 'use_color'. If 'use_color' + is False then all color codes in 'args' and 'kwargs' are replaced with + the empty string. + """ + assert use_color is True or use_color is False + if not use_color: + args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE + for arg in args] + kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE + for key, arg in kwargs.items()} + return fmt_str.format(*args, **kwargs) + + +def find_longest_name(benchmark_list): + """ + Return the length of the longest benchmark name in a given list of + benchmark JSON objects + """ + longest_name = 1 + for bc in benchmark_list: + if len(bc['name']) > longest_name: + longest_name = len(bc['name']) + return longest_name + + +def calculate_change(old_val, new_val): + """ + Return a float representing the decimal change between old_val and new_val. + """ + if old_val == 0 and new_val == 0: + return 0.0 + if old_val == 0: + return float(new_val - old_val) / (float(old_val + new_val) / 2) + return float(new_val - old_val) / abs(old_val) + + +def filter_benchmark(json_orig, family, replacement=""): + """ + Apply a filter to the json, and only leave the 'family' of benchmarks. + """ + regex = re.compile(family) + filtered = {} + filtered['benchmarks'] = [] + for be in json_orig['benchmarks']: + if not regex.search(be['name']): + continue + filteredbench = copy.deepcopy(be) # Do NOT modify the old name! + filteredbench['name'] = regex.sub(replacement, filteredbench['name']) + filtered['benchmarks'].append(filteredbench) + return filtered + + +def get_unique_benchmark_names(json): + """ + While *keeping* the order, give all the unique 'names' used for benchmarks. + """ + seen = set() + uniqued = [x['name'] for x in json['benchmarks'] + if x['name'] not in seen and + (seen.add(x['name']) or True)] + return uniqued + + +def intersect(list1, list2): + """ + Given two lists, get a new list consisting of the elements only contained + in *both of the input lists*, while preserving the ordering. + """ + return [x for x in list1 if x in list2] + + +def is_potentially_comparable_benchmark(x): + return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x) + + +def partition_benchmarks(json1, json2): + """ + While preserving the ordering, find benchmarks with the same names in + both of the inputs, and group them. + (i.e. 
partition/filter into groups with common name) + """ + json1_unique_names = get_unique_benchmark_names(json1) + json2_unique_names = get_unique_benchmark_names(json2) + names = intersect(json1_unique_names, json2_unique_names) + partitions = [] + for name in names: + time_unit = None + # Pick the time unit from the first entry of the lhs benchmark. + # We should be careful not to crash with unexpected input. + for x in json1['benchmarks']: + if (x['name'] == name and is_potentially_comparable_benchmark(x)): + time_unit = x['time_unit'] + break + if time_unit is None: + continue + # Filter by name and time unit. + # All the repetitions are assumed to be comparable. + lhs = [x for x in json1['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + rhs = [x for x in json2['benchmarks'] if x['name'] == name and + x['time_unit'] == time_unit] + partitions.append([lhs, rhs]) + return partitions + + +def extract_field(partition, field_name): + # The count of elements may be different. We want *all* of them. + lhs = [x[field_name] for x in partition[0]] + rhs = [x[field_name] for x in partition[1]] + return [lhs, rhs] + +def calc_utest(timings_cpu, timings_time): + min_rep_cnt = min(len(timings_time[0]), + len(timings_time[1]), + len(timings_cpu[0]), + len(timings_cpu[1])) + + # Does *everything* have at least UTEST_MIN_REPETITIONS repetitions? + if min_rep_cnt < UTEST_MIN_REPETITIONS: + return False, None, None + + time_pvalue = mannwhitneyu( + timings_time[0], timings_time[1], alternative='two-sided').pvalue + cpu_pvalue = mannwhitneyu( + timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue + + return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue + +def print_utest(partition, utest_alpha, first_col_width, use_color=True): + def get_utest_color(pval): + return BC_FAIL if pval >= utest_alpha else BC_OKGREEN + + timings_time = extract_field(partition, 'real_time') + timings_cpu = extract_field(partition, 'cpu_time') + have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(timings_cpu, timings_time) + + # Check if we failed miserably with minimum required repetitions for utest + if not have_optimal_repetitions and cpu_pvalue is None and time_pvalue is None: + return [] + + dsc = "U Test, Repetitions: {} vs {}".format( + len(timings_cpu[0]), len(timings_cpu[1])) + dsc_color = BC_OKGREEN + + # We still got some results to show but issue a warning about it. + if not have_optimal_repetitions: + dsc_color = BC_WARNING + dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format( + UTEST_OPTIMAL_REPETITIONS) + + special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{} {}" + + last_name = partition[0][0]['name'] + return [color_format(use_color, + special_str, + BC_HEADER, + "{}{}".format(last_name, UTEST_COL_NAME), + first_col_width, + get_utest_color(time_pvalue), time_pvalue, + get_utest_color(cpu_pvalue), cpu_pvalue, + dsc_color, dsc, + endc=BC_ENDC)] + + +def generate_difference_report( + json1, + json2, + display_aggregates_only=False, + utest=False, + utest_alpha=0.05, + use_color=True): + """ + Calculate and report the difference between each test of two benchmark + runs specified as 'json1' and 'json2'. 
+ """ + assert utest is True or utest is False + first_col_width = find_longest_name(json1['benchmarks']) + + def find_test(name): + for b in json2['benchmarks']: + if b['name'] == name: + return b + return None + + first_col_width = max( + first_col_width, + len('Benchmark')) + first_col_width += len(UTEST_COL_NAME) + first_line = "{:<{}s}Time CPU Time Old Time New CPU Old CPU New".format( + 'Benchmark', 12 + first_col_width) + output_strs = [first_line, '-' * len(first_line)] + + partitions = partition_benchmarks(json1, json2) + for partition in partitions: + # Careful, we may have different repetition count. + for i in range(min(len(partition[0]), len(partition[1]))): + bn = partition[0][i] + other_bench = partition[1][i] + + # *If* we were asked to only display aggregates, + # and if it is non-aggregate, then skip it. + if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench: + assert bn['run_type'] == other_bench['run_type'] + if bn['run_type'] != 'aggregate': + continue + + fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}" + + def get_color(res): + if res > 0.05: + return BC_FAIL + elif res > -0.07: + return BC_WHITE + else: + return BC_CYAN + + tres = calculate_change(bn['real_time'], other_bench['real_time']) + cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time']) + output_strs += [color_format(use_color, + fmt_str, + BC_HEADER, + bn['name'], + first_col_width, + get_color(tres), + tres, + get_color(cpures), + cpures, + bn['real_time'], + other_bench['real_time'], + bn['cpu_time'], + other_bench['cpu_time'], + endc=BC_ENDC)] + + # After processing the whole partition, if requested, do the U test. + if utest: + output_strs += print_utest(partition, + utest_alpha=utest_alpha, + first_col_width=first_col_width, + use_color=use_color) + + return output_strs + + +############################################################################### +# Unit tests + + +class TestGetUniqueBenchmarkNames(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test3_run0.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + def test_basic(self): + expect_lines = [ + 'BM_One', + 'BM_Two', + 'short', # These two are not sorted + 'medium', # These two are not sorted + ] + json = self.load_results() + output_lines = get_unique_benchmark_names(json) + print("\n") + print("\n".join(output_lines)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + self.assertEqual(expect_lines[i], output_lines[i]) + + +class TestReportDifference(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test1_run1.json') + testOutput2 = os.path.join(testInputs, 'test1_run2.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_basic(self): + expect_lines = [ + ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'], + ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'], + ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'], + ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'], + ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', 
'101'], + ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'], + ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'], + ['BM_100xSlower', '+99.0000', '+99.0000', + '100', '10000', '100', '10000'], + ['BM_100xFaster', '-0.9900', '-0.9900', + '10000', '100', '10000', '100'], + ['BM_10PercentCPUToTime', '+0.1000', + '-0.1000', '100', '110', '100', '90'], + ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'], + ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(len(parts), 7) + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceBetweenFamilies(unittest.TestCase): + def load_result(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput = os.path.join(testInputs, 'test2_run.json') + with open(testOutput, 'r') as f: + json = json.load(f) + return json + + def test_basic(self): + expect_lines = [ + ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'], + ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'], + ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'], + ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'], + ] + json = self.load_result() + json1 = filter_benchmark(json, "BM_Z.ro", ".") + json2 = filter_benchmark(json, "BM_O.e", ".") + output_lines_with_header = generate_difference_report( + json1, json2, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(len(parts), 7) + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTest(unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines = [] + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + '0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, utest=True, utest_alpha=0.05, 
use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + +class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly( + unittest.TestCase): + def load_results(self): + import json + testInputs = os.path.join( + os.path.dirname( + os.path.realpath(__file__)), + 'Inputs') + testOutput1 = os.path.join(testInputs, 'test3_run0.json') + testOutput2 = os.path.join(testInputs, 'test3_run1.json') + with open(testOutput1, 'r') as f: + json1 = json.load(f) + with open(testOutput2, 'r') as f: + json2 = json.load(f) + return json1, json2 + + def test_utest(self): + expect_lines = [] + expect_lines = [ + ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'], + ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'], + ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'], + ['BM_Two_pvalue', + '0.6985', + '0.6985', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '2.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'], + ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'], + ['short_pvalue', + '0.7671', + '0.1489', + 'U', + 'Test,', + 'Repetitions:', + '2', + 'vs', + '3.', + 'WARNING:', + 'Results', + 'unreliable!', + '9+', + 'repetitions', + 'recommended.'], + ] + json1, json2 = self.load_results() + output_lines_with_header = generate_difference_report( + json1, json2, display_aggregates_only=True, + utest=True, utest_alpha=0.05, use_color=False) + output_lines = output_lines_with_header[2:] + print("\n") + print("\n".join(output_lines_with_header)) + self.assertEqual(len(output_lines), len(expect_lines)) + for i in range(0, len(output_lines)): + parts = [x for x in output_lines[i].split(' ') if x] + self.assertEqual(expect_lines[i], parts) + + +if __name__ == '__main__': + unittest.main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/thirdparty/benchmark-1.5.0/tools/gbench/util.py b/thirdparty/benchmark-1.5.0/tools/gbench/util.py new file mode 100644 index 0000000000..1f8e8e2c47 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/gbench/util.py @@ -0,0 +1,164 @@ +"""util.py - General utilities for running, loading, and processing benchmarks +""" +import json +import os +import tempfile +import subprocess +import sys + +# Input file type enumeration +IT_Invalid = 0 +IT_JSON = 1 +IT_Executable = 2 + +_num_magic_bytes = 2 if sys.platform.startswith('win') else 4 + + +def is_executable_file(filename): + """ + Return 'True' if 'filename' names a valid file which is likely + an executable. A file is considered an executable if it starts with the + magic bytes for a EXE, Mach O, or ELF file. 
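+    Only the first few bytes of the file are inspected: 2 on Windows ('MZ')
+    and 4 elsewhere, per '_num_magic_bytes' above.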
+ """ + if not os.path.isfile(filename): + return False + with open(filename, mode='rb') as f: + magic_bytes = f.read(_num_magic_bytes) + if sys.platform == 'darwin': + return magic_bytes in [ + b'\xfe\xed\xfa\xce', # MH_MAGIC + b'\xce\xfa\xed\xfe', # MH_CIGAM + b'\xfe\xed\xfa\xcf', # MH_MAGIC_64 + b'\xcf\xfa\xed\xfe', # MH_CIGAM_64 + b'\xca\xfe\xba\xbe', # FAT_MAGIC + b'\xbe\xba\xfe\xca' # FAT_CIGAM + ] + elif sys.platform.startswith('win'): + return magic_bytes == b'MZ' + else: + return magic_bytes == b'\x7FELF' + + +def is_json_file(filename): + """ + Returns 'True' if 'filename' names a valid JSON output file. + 'False' otherwise. + """ + try: + with open(filename, 'r') as f: + json.load(f) + return True + except BaseException: + pass + return False + + +def classify_input_file(filename): + """ + Return a tuple (type, msg) where 'type' specifies the classified type + of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable + string represeting the error. + """ + ftype = IT_Invalid + err_msg = None + if not os.path.exists(filename): + err_msg = "'%s' does not exist" % filename + elif not os.path.isfile(filename): + err_msg = "'%s' does not name a file" % filename + elif is_executable_file(filename): + ftype = IT_Executable + elif is_json_file(filename): + ftype = IT_JSON + else: + err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename + return ftype, err_msg + + +def check_input_file(filename): + """ + Classify the file named by 'filename' and return the classification. + If the file is classified as 'IT_Invalid' print an error message and exit + the program. + """ + ftype, msg = classify_input_file(filename) + if ftype == IT_Invalid: + print("Invalid input file: %s" % msg) + sys.exit(1) + return ftype + + +def find_benchmark_flag(prefix, benchmark_flags): + """ + Search the specified list of flags for a flag matching `` and + if it is found return the arg it specifies. If specified more than once the + last value is returned. If the flag is not found None is returned. + """ + assert prefix.startswith('--') and prefix.endswith('=') + result = None + for f in benchmark_flags: + if f.startswith(prefix): + result = f[len(prefix):] + return result + + +def remove_benchmark_flags(prefix, benchmark_flags): + """ + Return a new list containing the specified benchmark_flags except those + with the specified prefix. + """ + assert prefix.startswith('--') and prefix.endswith('=') + return [f for f in benchmark_flags if not f.startswith(prefix)] + + +def load_benchmark_results(fname): + """ + Read benchmark output from a file and return the JSON object. + REQUIRES: 'fname' names a file containing JSON benchmark output. + """ + with open(fname, 'r') as f: + return json.load(f) + + +def run_benchmark(exe_name, benchmark_flags): + """ + Run a benchmark specified by 'exe_name' with the specified + 'benchmark_flags'. The benchmark is run directly as a subprocess to preserve + real time console output. 
+ RETURNS: A JSON object representing the benchmark output + """ + output_name = find_benchmark_flag('--benchmark_out=', + benchmark_flags) + is_temp_output = False + if output_name is None: + is_temp_output = True + thandle, output_name = tempfile.mkstemp() + os.close(thandle) + benchmark_flags = list(benchmark_flags) + \ + ['--benchmark_out=%s' % output_name] + + cmd = [exe_name] + benchmark_flags + print("RUNNING: %s" % ' '.join(cmd)) + exitCode = subprocess.call(cmd) + if exitCode != 0: + print('TEST FAILED...') + sys.exit(exitCode) + json_res = load_benchmark_results(output_name) + if is_temp_output: + os.unlink(output_name) + return json_res + + +def run_or_load_benchmark(filename, benchmark_flags): + """ + Get the results for a specified benchmark. If 'filename' specifies + an executable benchmark then the results are generated by running the + benchmark. Otherwise 'filename' must name a valid JSON output file, + which is loaded and the result returned. + """ + ftype = check_input_file(filename) + if ftype == IT_JSON: + return load_benchmark_results(filename) + elif ftype == IT_Executable: + return run_benchmark(filename, benchmark_flags) + else: + assert False # This branch is unreachable diff --git a/thirdparty/benchmark-1.5.0/tools/strip_asm.py b/thirdparty/benchmark-1.5.0/tools/strip_asm.py new file mode 100755 index 0000000000..9030550b43 --- /dev/null +++ b/thirdparty/benchmark-1.5.0/tools/strip_asm.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +""" +strip_asm.py - Cleanup ASM output for the specified file +""" + +from argparse import ArgumentParser +import sys +import os +import re + +def find_used_labels(asm): + found = set() + label_re = re.compile("\s*j[a-z]+\s+\.L([a-zA-Z0-9][a-zA-Z0-9_]*)") + for l in asm.splitlines(): + m = label_re.match(l) + if m: + found.add('.L%s' % m.group(1)) + return found + + +def normalize_labels(asm): + decls = set() + label_decl = re.compile("^[.]{0,1}L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)") + for l in asm.splitlines(): + m = label_decl.match(l) + if m: + decls.add(m.group(0)) + if len(decls) == 0: + return asm + needs_dot = next(iter(decls))[0] != '.' + if not needs_dot: + return asm + for ld in decls: + asm = re.sub("(^|\s+)" + ld + "(?=:|\s)", '\\1.' + ld, asm) + return asm + + +def transform_labels(asm): + asm = normalize_labels(asm) + used_decls = find_used_labels(asm) + new_asm = '' + label_decl = re.compile("^\.L([a-zA-Z0-9][a-zA-Z0-9_]*)(?=:)") + for l in asm.splitlines(): + m = label_decl.match(l) + if not m or m.group(0) in used_decls: + new_asm += l + new_asm += '\n' + return new_asm + + +def is_identifier(tk): + if len(tk) == 0: + return False + first = tk[0] + if not first.isalpha() and first != '_': + return False + for i in range(1, len(tk)): + c = tk[i] + if not c.isalnum() and c != '_': + return False + return True + +def process_identifiers(l): + """ + process_identifiers - process all identifiers and modify them to have + consistent names across all platforms; specifically across ELF and MachO. + For example, MachO inserts an additional understore at the beginning of + names. This function removes that. 
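+    For example, '_main' is rewritten as 'main', and the mangled '__Z3foov'
+    as '_Z3foov'.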
+ """ + parts = re.split(r'([a-zA-Z0-9_]+)', l) + new_line = '' + for tk in parts: + if is_identifier(tk): + if tk.startswith('__Z'): + tk = tk[1:] + elif tk.startswith('_') and len(tk) > 1 and \ + tk[1].isalpha() and tk[1] != 'Z': + tk = tk[1:] + new_line += tk + return new_line + + +def process_asm(asm): + """ + Strip the ASM of unwanted directives and lines + """ + new_contents = '' + asm = transform_labels(asm) + + # TODO: Add more things we want to remove + discard_regexes = [ + re.compile("\s+\..*$"), # directive + re.compile("\s*#(NO_APP|APP)$"), #inline ASM + re.compile("\s*#.*$"), # comment line + re.compile("\s*\.globa?l\s*([.a-zA-Z_][a-zA-Z0-9$_.]*)"), #global directive + re.compile("\s*\.(string|asciz|ascii|[1248]?byte|short|word|long|quad|value|zero)"), + ] + keep_regexes = [ + + ] + fn_label_def = re.compile("^[a-zA-Z_][a-zA-Z0-9_.]*:") + for l in asm.splitlines(): + # Remove Mach-O attribute + l = l.replace('@GOTPCREL', '') + add_line = True + for reg in discard_regexes: + if reg.match(l) is not None: + add_line = False + break + for reg in keep_regexes: + if reg.match(l) is not None: + add_line = True + break + if add_line: + if fn_label_def.match(l) and len(new_contents) != 0: + new_contents += '\n' + l = process_identifiers(l) + new_contents += l + new_contents += '\n' + return new_contents + +def main(): + parser = ArgumentParser( + description='generate a stripped assembly file') + parser.add_argument( + 'input', metavar='input', type=str, nargs=1, + help='An input assembly file') + parser.add_argument( + 'out', metavar='output', type=str, nargs=1, + help='The output file') + args, unknown_args = parser.parse_known_args() + input = args.input[0] + output = args.out[0] + if not os.path.isfile(input): + print(("ERROR: input file '%s' does not exist") % input) + sys.exit(1) + contents = None + with open(input, 'r') as f: + contents = f.read() + new_contents = process_asm(contents) + with open(output, 'w') as f: + f.write(new_contents) + + +if __name__ == '__main__': + main() + +# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 +# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off; +# kate: indent-mode python; remove-trailing-spaces modified; diff --git a/thirdparty/mio/.gitignore b/thirdparty/mio/.gitignore new file mode 100644 index 0000000000..f37d11baea --- /dev/null +++ b/thirdparty/mio/.gitignore @@ -0,0 +1,5 @@ +test/** +!test/test.cpp +!test/example.cpp +!test/CMakeLists.txt +build/ diff --git a/thirdparty/mio/CMakeLists.txt b/thirdparty/mio/CMakeLists.txt new file mode 100644 index 0000000000..339430f916 --- /dev/null +++ b/thirdparty/mio/CMakeLists.txt @@ -0,0 +1,152 @@ +cmake_minimum_required(VERSION 3.8) + +# +# Here we check whether mio is being configured in isolation or as a component +# of a larger project. To do so, we query whether the `PROJECT_NAME` CMake +# variable has been defined. In the case it has, we can conclude mio is a +# subproject. +# +# This convention has been borrowed from the Catch C++ unit testing library. +# +if(DEFINED PROJECT_NAME) + set(subproject ON) +else() + set(subproject OFF) +endif() + +project(mio VERSION 1.0.0 LANGUAGES C CXX) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") +include (CTest) +include (CMakeDependentOption) + +# Generate 'compile_commands.json' for clang_complete +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# +# The `mio.testing` options only appear as cmake-gui and ccmake options iff +# mio is the highest level project. 
In the case that mio is a subproject, these +# options are hidden from the user interface and set to `OFF` +# +# Iff mio is the highest level project, this option is defaulted to the value +# of the traditional course grain testing option `BUILD_TESTING` established by +# the CTest module +# +CMAKE_DEPENDENT_OPTION(mio.tests + "Build the mio tests and integrate with ctest" + ${BUILD_TESTING} "NOT subproject" OFF) + +# +# mio has no compiled components. As such, we declare it as an `INTERFACE` +# library, which denotes a collection of target properties to be applied +# transitively to linking targets. In our case, this amounts to an include +# directory and project header files. +# +add_library(mio_base INTERFACE) + +# +# mio requires C++ 11 support, at a minimum. Setting the `cxx_std_11` compile +# features ensures that the corresponding C++ standard flag is populated in +# targets linking to mio +# +target_compile_features(mio_base INTERFACE cxx_std_11) + +# +# On Windows, so as to be a "good citizen", mio offers two different +# targets that control the imported surface area of the Windows API. The +# default `mio` target sets the necessary flags for a minimal Win API +# (`WIN32_LEAN_AND_MEAN`, etc.), while the `mio_full_winapi` target sets +# none of these flags so will not disable any of the modules. +# +if(WIN32) + include(WinApiLevels) +else() + # On non-Windows systems, the `mio` and `mio_base` targets are + # effectively identical. + add_library(mio INTERFACE) + target_link_libraries(mio + INTERFACE mio_base + ) +endif() + +add_library(mio::mio ALIAS mio) + +# +# The include directory for mio can be expected to vary between build +# and installaion. Here we use a CMake generator expression to dispatch +# on how the configuration under which this library is being consumed. +# +target_include_directories(mio_base INTERFACE + $ + $) + +add_subdirectory(include/mio) + +if(mio.tests) + add_subdirectory(test) +endif() + +# +# Non-testing header files (preserving relative paths) are installed to the +# `include` subdirectory of the `$INSTALL_DIR/${CMAKE_INSTALL_PREFIX}` +# directory. Source file permissions preserved. +# +install(DIRECTORY include/ + DESTINATION include + USE_SOURCE_PERMISSIONS + FILES_MATCHING PATTERN "*.*pp") + +# +# As a header-only library, there are no target components to be installed +# directly (the PUBLIC_HEADER property is not white listed for INTERFACE +# targets for some reason). +# +# However, it is worthwhile export our target description in order to later +# generate a CMake configuration file for consumption by CMake's `find_package` +# intrinsic +# +install(TARGETS mio_base mio EXPORT mioConfig) +install(EXPORT mioConfig + FILE mioConfig.cmake + NAMESPACE mio:: + DESTINATION share/cmake/mio + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ) + +include(CMakePackageConfigHelpers) # provides `write_basic_package_version_file` +write_basic_package_version_file("mioConfigVersion.cmake" + VERSION ${mio_VERSION} + COMPATIBILITY SameMajorVersion) + +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mioConfigVersion.cmake" + DESTINATION share/cmake/mio + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ) + +# +# Rudimentary CPack support. +# +# CPack provides a mechanism to generate installation packaging for a project, +# e.g., self-extracting shell scripts, compressed tarballs, Debian Package files, +# RPM Package Manager files, Windows NSIS installation wizards, +# Apple Disk Images (.dmg), etc. 
+# +# Any system libraries required (runtimes, threading, etc) should be bundled +# with the project for this type of installation. The +# `InstallRequiredSystemLibraries` CMake module attempts to provide this +# functionality in an automated way. Additional libraries may be specified as +# +# ```cmake +# list(APPEND CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS ) +# ``` +# +# A packaged installation can be generated by calling +# +# ```sh +# cpack -G --config CPackConfig.cmake +# ``` +# +# See `cpack --help` or the CPack documentation for more information. +# +include( InstallRequiredSystemLibraries ) +set( CPACK_PACKAGE_VENDOR "mandreyel" ) +set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) +set( CMAKE_PROJECT_HOMEPAGE_URL "https://github.com/mandreyel/mio" ) +include( CPack ) diff --git a/thirdparty/mio/LICENSE b/thirdparty/mio/LICENSE new file mode 100644 index 0000000000..361770744b --- /dev/null +++ b/thirdparty/mio/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 https://github.com/mandreyel/ + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/thirdparty/mio/README.md b/thirdparty/mio/README.md new file mode 100644 index 0000000000..e50b23ffc7 --- /dev/null +++ b/thirdparty/mio/README.md @@ -0,0 +1,337 @@ +# mio +An easy to use header-only cross-platform C++11 memory mapping library with an MIT license. + +mio has been created with the goal to be easily includable (i.e. no dependencies) in any C++ project that needs memory mapped file IO without the need to pull in Boost. + +Please feel free to open an issue, I'll try to address any concerns as best I can. + +### Why? +Because memory mapping is the best thing since sliced bread! + +More seriously, the primary motivation for writing this library instead of using Boost.Iostreams, was the lack of support for establishing a memory mapping with an already open file handle/descriptor. This is possible with mio. + +Furthermore, Boost.Iostreams' solution requires that the user pick offsets exactly at page boundaries, which is cumbersome and error prone. mio, on the other hand, manages this internally, accepting any offset and finding the nearest page boundary. + +Albeit a minor nitpick, Boost.Iostreams implements memory mapped file IO with a `std::shared_ptr` to provide shared semantics, even if not needed, and the overhead of the heap allocation may be unnecessary and/or unwanted. 
+In mio, there are two classes to cover the two use-cases: one that is move-only (basically a zero-cost abstraction over the system-specific mmapping functions), and the other that acts just like its Boost.Iostreams counterpart, with shared semantics.
+
+### How to create a mapping
+NOTE: the file must exist before creating a mapping.
+
+There are three ways to map a file into memory:
+
+- Using the constructor, which throws a `std::system_error` on failure:
+```c++
+mio::mmap_source mmap(path, offset, size_to_map);
+```
+or you can omit the `offset` and `size_to_map` arguments, in which case the
+entire file is mapped:
+```c++
+mio::mmap_source mmap(path);
+```
+
+- Using the factory function:
+```c++
+std::error_code error;
+mio::mmap_source mmap = mio::make_mmap_source(path, offset, size_to_map, error);
+```
+or:
+```c++
+mio::mmap_source mmap = mio::make_mmap_source(path, error);
+```
+
+- Using the `map` member function:
+```c++
+std::error_code error;
+mio::mmap_source mmap;
+mmap.map(path, offset, size_to_map, error);
+```
+or:
+```c++
+mmap.map(path, error);
+```
+**NOTE:** The constructors **require** exceptions to be enabled. If you prefer
+to build your projects with `-fno-exceptions`, you can still use the other ways.
+
+Moreover, in each case, you can provide either some string type for the file's path, or you can use an existing, valid file handle.
+```c++
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <mio/mmap.hpp>
+#include <algorithm>
+
+int main()
+{
+    // NOTE: error handling omitted for brevity.
+    const int fd = open("file.txt", O_RDONLY);
+    mio::mmap_source mmap(fd, 0, mio::map_entire_file);
+    // ...
+}
+```
+However, mio does not check whether the provided file descriptor has the same access permissions as the desired mapping, so the mapping may fail. Such errors are reported via the `std::error_code` out parameter that is passed to the mapping function.
+
+**WINDOWS USERS**: This library *does* support the use of wide character types
+for functions where character strings are expected (e.g. path parameters).
+
+### Example
+
+```c++
+#include <mio/mmap.hpp>
+#include <system_error> // for std::error_code
+#include <cstdio> // for std::printf
+#include <cassert>
+#include <string>
+#include <algorithm>
+#include <fstream>
+
+int handle_error(const std::error_code& error);
+void allocate_file(const std::string& path, const int size);
+
+int main()
+{
+    const auto path = "file.txt";
+
+    // NOTE: mio does *not* create the file for you if it doesn't exist! You
+    // must ensure that the file exists before establishing a mapping. It
+    // must also be non-empty. So for illustrative purposes the file is
+    // created now.
+    allocate_file(path, 155);
+
+    // Read-write memory map the whole file by using `map_entire_file` where the
+    // length of the mapping is otherwise expected, with the factory method.
+    std::error_code error;
+    mio::mmap_sink rw_mmap = mio::make_mmap_sink(
+        path, 0, mio::map_entire_file, error);
+    if (error) { return handle_error(error); }
+
+    // You can use any iterator based function.
+    std::fill(rw_mmap.begin(), rw_mmap.end(), 'a');
+
+    // Or manually iterate through the mapped region just as if it were any other
+    // container, and change each byte's value (since this is a read-write mapping).
+    for (auto& b : rw_mmap) {
+        b += 10;
+    }
+
+    // Or just change one value with the subscript operator.
+    const int answer_index = rw_mmap.size() / 2;
+    rw_mmap[answer_index] = 42;
+
+    // Don't forget to flush changes to disk before unmapping. However, if
+    // `rw_mmap` were to go out of scope at this point, the destructor would also
+    // automatically invoke `sync` before `unmap`.
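+    // (`sync` is only available on read-write mappings and reports failures
+    // through the `std::error_code` out parameter rather than by throwing.)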
+    rw_mmap.sync(error);
+    if (error) { return handle_error(error); }
+
+    // We can then remove the mapping, after which rw_mmap will be in a default
+    // constructed state, i.e. this and the above call to `sync` have the same
+    // effect as if the destructor had been invoked.
+    rw_mmap.unmap();
+
+    // Now create the same mapping, but in read-only mode. Note that calling the
+    // overload without the offset and file length parameters maps the entire
+    // file.
+    mio::mmap_source ro_mmap;
+    ro_mmap.map(path, error);
+    if (error) { return handle_error(error); }
+
+    const int the_answer_to_everything = ro_mmap[answer_index];
+    assert(the_answer_to_everything == 42);
+}
+
+int handle_error(const std::error_code& error)
+{
+    const auto& errmsg = error.message();
+    std::printf("error mapping file: %s, exiting...\n", errmsg.c_str());
+    return error.value();
+}
+
+void allocate_file(const std::string& path, const int size)
+{
+    std::ofstream file(path);
+    std::string s(size, '0');
+    file << s;
+}
+```
+
+`mio::basic_mmap` is move-only, but if multiple copies to the same mapping are needed, use `mio::basic_shared_mmap`, which has `std::shared_ptr` semantics and the same interface as `mio::basic_mmap`.
+```c++
+#include <mio/shared_mmap.hpp>
+
+mio::shared_mmap_source shared_mmap1("path", offset, size_to_map);
+mio::shared_mmap_source shared_mmap2(std::move(mmap1)); // or use operator=
+mio::shared_mmap_source shared_mmap3(std::make_shared<mio::mmap_source>(std::move(mmap1))); // or use operator=
+mio::shared_mmap_source shared_mmap4;
+shared_mmap4.map("path", offset, size_to_map, error);
+```
+
+It's possible to define the type of a byte (which has to be the same width as `char`), though aliases for the most common ones are provided by default:
+```c++
+using mmap_source = basic_mmap_source<char>;
+using ummap_source = basic_mmap_source<unsigned char>;
+
+using mmap_sink = basic_mmap_sink<char>;
+using ummap_sink = basic_mmap_sink<unsigned char>;
+```
+But it may be useful to define your own types, say when using the new `std::byte` type in C++17:
+```c++
+using mmap_source = mio::basic_mmap_source<std::byte>;
+using mmap_sink = mio::basic_mmap_sink<std::byte>;
+```
+
+Though generally not needed, since mio maps user-requested offsets to page boundaries, you can query the underlying system's page allocation granularity by invoking `mio::page_size()`, which is located in `mio/page.hpp`.
+
+### Single Header File
+Mio can be added to your project as a single header file simply by including `\single_include\mio\mio.hpp`. The single header file can be regenerated at any time by running the `amalgamate.py` script within `\third_party`.
+```
+python amalgamate.py -c config.json -s ../include
+```
+
+## CMake
+As a header-only library, mio has no compiled components. Nevertheless, a [CMake](https://cmake.org/overview/) build system is provided to allow easy testing, installation, and subproject composition on many platforms and operating systems.
+
+### Testing
+Mio is distributed with a small suite of tests and examples.
+When mio is configured as the highest level CMake project, this suite of executables is built by default.
+Mio's test executables are integrated with the CMake test driver program, [CTest](https://cmake.org/cmake/help/latest/manual/ctest.1.html).
+
+CMake supports a number of backends for compilation and linking.
+
+To use a static configuration build tool, such as GNU Make or Ninja:
+
+```sh
+cd <path to mio>
+mkdir build
+cd build
+
+# Configure the build
+cmake -D CMAKE_BUILD_TYPE=<Debug | Release> \
+      -G <"Unix Makefiles" | "Ninja"> ..
+
+# build the tests
+< make | ninja | cmake --build . >
> + +# run the tests +< make test | ninja test | cmake --build . --target test | ctest > +``` + +To use a dynamic configuration build tool, such as Visual Studio or Xcode: + +```sh +cd +mkdir build +cd build + +# Configure the build +cmake -G <"Visual Studio 14 2015 Win64" | "Xcode"> .. + +# build the tests +cmake --build . --config + +# run the tests via ctest... +ctest --build-config + +# ... or via CMake build tool mode... +cmake --build . --config --target test +``` + +Of course the **build** and **test** steps can also be executed via the **all** and **test** targets, respectively, from within the IDE after opening the project file generated during the configuration step. + +Mio's testing is also configured to operate as a client to the [CDash](https://www.cdash.org/) software quality dashboard application. Please see the [Kitware documentation](https://cmake.org/cmake/help/latest/manual/ctest.1.html#dashboard-client) for more information on this mode of operation. + +### Installation + +Mio's build system provides an installation target and support for downstream consumption via CMake's [`find_package`](https://cmake.org/cmake/help/v3.0/command/find_package.html) intrinsic function. +CMake allows installation to an arbitrary location, which may be specified by defining `CMAKE_INSTALL_PREFIX` at configure time. +In the absense of a user specification, CMake will install mio to conventional location based on the platform operating system. + +To use a static configuration build tool, such as GNU Make or Ninja: + +```sh +cd +mkdir build +cd build + +# Configure the build +cmake [-D CMAKE_INSTALL_PREFIX="path/to/installation"] \ + [-D BUILD_TESTING=False] \ + -D CMAKE_BUILD_TYPE=Release \ + -G <"Unix Makefiles" | "Ninja"> .. + +# install mio + +``` + +To use a dynamic configuration build tool, such as Visual Studio or Xcode: + +```sh +cd +mkdir build +cd build + +# Configure the project +cmake [-D CMAKE_INSTALL_PREFIX="path/to/installation"] \ + [-D BUILD_TESTING=False] \ + -G <"Visual Studio 14 2015 Win64" | "Xcode"> .. + +# install mio +cmake --build . --config Release --target install +``` + +Note that the last command of the installation sequence may require administrator privileges (e.g. `sudo`) if the installation root directory lies outside your home directory. + +This installation ++ copies the mio header files to the `include/mio` subdirectory of the installation root ++ generates and copies several CMake configuration files to the `share/cmake/mio` subdirectory of the installation root + +This latter step allows downstream CMake projects to consume mio via `find_package`, e.g. + +```cmake +find_package( mio REQUIRED ) +target_link_libraries( MyTarget PUBLIC mio::mio ) +``` + +**WINDOWS USERS**: The `mio::mio` target `#define`s `WIN32_LEAN_AND_MEAN` and `NOMINMAX`. The former ensures the imported surface area of the Win API is minimal, and the latter disables Windows' `min` and `max` macros so they don't intefere with `std::min` and `std::max`. Because *mio* is a header only library, these defintions will leak into downstream CMake builds. If their presence is causing problems with your build then you can use the alternative `mio::mio_full_winapi` target, which adds none of these defintions. 
+
+If mio was installed to a non-conventional location, it may be necessary for downstream projects to specify the mio installation root directory via either
+
++ the `CMAKE_PREFIX_PATH` configuration option,
++ the `CMAKE_PREFIX_PATH` environment variable, or
++ the `mio_DIR` environment variable.
+
+Please see the [Kitware documentation](https://cmake.org/cmake/help/v3.0/command/find_package.html) for more information.
+
+In addition, mio supports packaged relocatable installations via [CPack](https://cmake.org/cmake/help/latest/manual/cpack.1.html).
+Following configuration, from the build directory, invoke cpack as follows to generate a packaged installation:
+
+```sh
+cpack -G <generator> -C Release
+```
+
+The list of supported generators varies from platform to platform. See the output of `cpack --help` for a complete list of supported generators on your platform.
+
+### Subproject Composition
+To use mio as a subproject, copy the mio repository to your project's dependencies/externals folder.
+If your project is version controlled using git, a git submodule or git subtree can be used to synchronize with the upstream repository.
+The [use](https://services.github.com/on-demand/downloads/submodule-vs-subtree-cheat-sheet/) and [relative advantages](https://andrey.nering.com.br/2016/git-submodules-vs-subtrees/) of these git facilities are beyond the scope of this document, but in brief, each may be established as follows:
+
+```sh
+# via git submodule
+cd <path to dependencies>
+git submodule add -b master https://github.com/mandreyel/mio.git
+
+# via git subtree
+cd <path to project root>
+git subtree add --prefix <path to dependencies>/mio \
+    https://github.com/mandreyel/mio.git master --squash
+```
+
+Given a mio subdirectory in a project, simply add the following lines to your project's `CMakeLists.txt` to add the mio include directories to your target's include path.
+
+```cmake
+add_subdirectory( path/to/mio/ )
+target_link_libraries( MyTarget PUBLIC mio::mio )
+```
+
+Note that, as a subproject, mio's tests and examples will not be built and CPack integration is deferred to the host project.
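+
+As a quick smoke test of either setup, a minimal consumer might look like the following sketch (the file name `example.txt` is an assumption; the file must exist and be non-empty):
+```c++
+#include <mio/mmap.hpp>
+#include <system_error>
+#include <cstdio>
+
+int main()
+{
+    std::error_code error;
+    mio::mmap_source mmap = mio::make_mmap_source("example.txt", error);
+    if (error) { return error.value(); }
+    std::printf("mapped %zu bytes\n", mmap.size());
+}
+```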
+ diff --git a/thirdparty/mio/cmake/WinApiLevels.cmake b/thirdparty/mio/cmake/WinApiLevels.cmake new file mode 100644 index 0000000000..5f3ebad84e --- /dev/null +++ b/thirdparty/mio/cmake/WinApiLevels.cmake @@ -0,0 +1,14 @@ +add_library(mio_full_winapi INTERFACE) +target_link_libraries(mio_full_winapi + INTERFACE mio_base +) +add_library(mio::mio_full_winapi ALIAS mio_full_winapi) + +add_library(mio INTERFACE) +target_link_libraries(mio + INTERFACE mio_full_winapi +) +target_compile_definitions(mio + INTERFACE WIN32_LEAN_AND_MEAN NOMINMAX +) +install(TARGETS mio_full_winapi EXPORT mioConfig) diff --git a/thirdparty/mio/include/mio/CMakeLists.txt b/thirdparty/mio/include/mio/CMakeLists.txt new file mode 100644 index 0000000000..c2d495c881 --- /dev/null +++ b/thirdparty/mio/include/mio/CMakeLists.txt @@ -0,0 +1,16 @@ +# +# While not strictly necessary to specify header files as target sources, +# doing so populates these files in the source listing when CMake is used +# to generate XCode and Visual Studios projects +# +target_sources(mio_base INTERFACE + $ + $) + +add_subdirectory(detail) diff --git a/thirdparty/mio/include/mio/detail/CMakeLists.txt b/thirdparty/mio/include/mio/detail/CMakeLists.txt new file mode 100644 index 0000000000..d4f818c23a --- /dev/null +++ b/thirdparty/mio/include/mio/detail/CMakeLists.txt @@ -0,0 +1,10 @@ +# +# iff mio is the highest level project, include the implementation +# detail files in the source listing for CMake-generated IDE projects +# +if(NOT subproject) + target_sources(mio_base INTERFACE + $) +endif() diff --git a/thirdparty/mio/include/mio/detail/mmap.ipp b/thirdparty/mio/include/mio/detail/mmap.ipp new file mode 100644 index 0000000000..361db300be --- /dev/null +++ b/thirdparty/mio/include/mio/detail/mmap.ipp @@ -0,0 +1,518 @@ +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_BASIC_MMAP_IMPL +#define MIO_BASIC_MMAP_IMPL + +#include "mio/mmap.hpp" +#include "mio/page.hpp" +#include "mio/detail/string_util.hpp" + +#include + +#ifndef _WIN32 +# include +# include +# include +# include +#endif + +namespace mio { +namespace detail { + +#ifdef _WIN32 +namespace win { + +/** Returns the 4 upper bytes of an 8-byte integer. */ +inline DWORD int64_high(int64_t n) noexcept +{ + return n >> 32; +} + +/** Returns the 4 lower bytes of an 8-byte integer. 
*/ +inline DWORD int64_low(int64_t n) noexcept +{ + return n & 0xffffffff; +} + +template< + typename String, + typename = typename std::enable_if< + std::is_same::type, char>::value + >::type +> file_handle_type open_file_helper(const String& path, const access_mode mode) +{ + return ::CreateFileA(c_str(path), + mode == access_mode::read ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); +} + +template +typename std::enable_if< + std::is_same::type, wchar_t>::value, + file_handle_type +>::type open_file_helper(const String& path, const access_mode mode) +{ + return ::CreateFileW(c_str(path), + mode == access_mode::read ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); +} + +} // win +#endif // _WIN32 + +/** + * Returns the last platform specific system error (errno on POSIX and + * GetLastError on Win) as a `std::error_code`. + */ +inline std::error_code last_error() noexcept +{ + std::error_code error; +#ifdef _WIN32 + error.assign(GetLastError(), std::system_category()); +#else + error.assign(errno, std::system_category()); +#endif + return error; +} + +template +file_handle_type open_file(const String& path, const access_mode mode, + std::error_code& error) +{ + error.clear(); + if(detail::empty(path)) + { + error = std::make_error_code(std::errc::invalid_argument); + return invalid_handle; + } +#ifdef _WIN32 + const auto handle = win::open_file_helper(path, mode); +#else // POSIX + const auto handle = ::open(c_str(path), + mode == access_mode::read ? O_RDONLY : O_RDWR); +#endif + if(handle == invalid_handle) + { + error = detail::last_error(); + } + return handle; +} + +inline size_t query_file_size(file_handle_type handle, std::error_code& error) +{ + error.clear(); +#ifdef _WIN32 + LARGE_INTEGER file_size; + if(::GetFileSizeEx(handle, &file_size) == 0) + { + error = detail::last_error(); + return 0; + } + return static_cast(file_size.QuadPart); +#else // POSIX + struct stat sbuf; + if(::fstat(handle, &sbuf) == -1) + { + error = detail::last_error(); + return 0; + } + return sbuf.st_size; +#endif +} + +struct mmap_context +{ + char* data; + int64_t length; + int64_t mapped_length; +#ifdef _WIN32 + file_handle_type file_mapping_handle; +#endif +}; + +inline mmap_context memory_map(const file_handle_type file_handle, const int64_t offset, + const int64_t length, const access_mode mode, std::error_code& error) +{ + const int64_t aligned_offset = make_offset_page_aligned(offset); + const int64_t length_to_map = offset - aligned_offset + length; +#ifdef _WIN32 + const int64_t max_file_size = offset + length; + const auto file_mapping_handle = ::CreateFileMapping( + file_handle, + 0, + mode == access_mode::read ? PAGE_READONLY : PAGE_READWRITE, + win::int64_high(max_file_size), + win::int64_low(max_file_size), + 0); + if(file_mapping_handle == invalid_handle) + { + error = detail::last_error(); + return {}; + } + char* mapping_start = static_cast(::MapViewOfFile( + file_mapping_handle, + mode == access_mode::read ? FILE_MAP_READ : FILE_MAP_WRITE, + win::int64_high(aligned_offset), + win::int64_low(aligned_offset), + length_to_map)); + if(mapping_start == nullptr) + { + error = detail::last_error(); + return {}; + } +#else // POSIX + char* mapping_start = static_cast(::mmap( + 0, // Don't give hint as to where to map. + length_to_map, + mode == access_mode::read ? 
PROT_READ : PROT_WRITE, + MAP_SHARED, + file_handle, + aligned_offset)); + if(mapping_start == MAP_FAILED) + { + error = detail::last_error(); + return {}; + } +#endif + mmap_context ctx; + ctx.data = mapping_start + offset - aligned_offset; + ctx.length = length; + ctx.mapped_length = length_to_map; +#ifdef _WIN32 + ctx.file_mapping_handle = file_mapping_handle; +#endif + return ctx; +} + +} // namespace detail + +// -- basic_mmap -- + +template +basic_mmap::~basic_mmap() +{ + conditional_sync(); + unmap(); +} + +template +basic_mmap::basic_mmap(basic_mmap&& other) + : data_(std::move(other.data_)) + , length_(std::move(other.length_)) + , mapped_length_(std::move(other.mapped_length_)) + , file_handle_(std::move(other.file_handle_)) +#ifdef _WIN32 + , file_mapping_handle_(std::move(other.file_mapping_handle_)) +#endif + , is_handle_internal_(std::move(other.is_handle_internal_)) +{ + other.data_ = nullptr; + other.length_ = other.mapped_length_ = 0; + other.file_handle_ = invalid_handle; +#ifdef _WIN32 + other.file_mapping_handle_ = invalid_handle; +#endif +} + +template +basic_mmap& +basic_mmap::operator=(basic_mmap&& other) +{ + if(this != &other) + { + // First the existing mapping needs to be removed. + unmap(); + data_ = std::move(other.data_); + length_ = std::move(other.length_); + mapped_length_ = std::move(other.mapped_length_); + file_handle_ = std::move(other.file_handle_); +#ifdef _WIN32 + file_mapping_handle_ = std::move(other.file_mapping_handle_); +#endif + is_handle_internal_ = std::move(other.is_handle_internal_); + + // The moved from basic_mmap's fields need to be reset, because + // otherwise other's destructor will unmap the same mapping that was + // just moved into this. + other.data_ = nullptr; + other.length_ = other.mapped_length_ = 0; + other.file_handle_ = invalid_handle; +#ifdef _WIN32 + other.file_mapping_handle_ = invalid_handle; +#endif + other.is_handle_internal_ = false; + } + return *this; +} + +template +typename basic_mmap::handle_type +basic_mmap::mapping_handle() const noexcept +{ +#ifdef _WIN32 + return file_mapping_handle_; +#else + return file_handle_; +#endif +} + +template +template +void basic_mmap::map(const String& path, const size_type offset, + const size_type length, std::error_code& error) +{ + error.clear(); + if(detail::empty(path)) + { + error = std::make_error_code(std::errc::invalid_argument); + return; + } + const auto handle = detail::open_file(path, AccessMode, error); + if(error) + { + return; + } + + map(handle, offset, length, error); + // This MUST be after the call to map, as that sets this to true. + if(!error) + { + is_handle_internal_ = true; + } +} + +template +void basic_mmap::map(const handle_type handle, + const size_type offset, const size_type length, std::error_code& error) +{ + error.clear(); + if(handle == invalid_handle) + { + error = std::make_error_code(std::errc::bad_file_descriptor); + return; + } + + const auto file_size = detail::query_file_size(handle, error); + if(error) + { + return; + } + + if(offset + length > file_size) + { + error = std::make_error_code(std::errc::invalid_argument); + return; + } + + const auto ctx = detail::memory_map(handle, offset, + length == map_entire_file ? (file_size - offset) : length, + AccessMode, error); + if(!error) + { + // We must unmap the previous mapping that may have existed prior to this call. 
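+        // (Otherwise the old view, and on Windows the old file mapping
+        // handle, would simply leak.)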
+ // Note that this must only be invoked after a new mapping has been created in + // order to provide the strong guarantee that, should the new mapping fail, the + // `map` function leaves this instance in a state as though the function had + // never been invoked. + unmap(); + file_handle_ = handle; + is_handle_internal_ = false; + data_ = reinterpret_cast(ctx.data); + length_ = ctx.length; + mapped_length_ = ctx.mapped_length; +#ifdef _WIN32 + file_mapping_handle_ = ctx.file_mapping_handle; +#endif + } +} + +template +template +typename std::enable_if::type +basic_mmap::sync(std::error_code& error) +{ + error.clear(); + if(!is_open()) + { + error = std::make_error_code(std::errc::bad_file_descriptor); + return; + } + + if(data()) + { +#ifdef _WIN32 + if(::FlushViewOfFile(get_mapping_start(), mapped_length_) == 0 + || ::FlushFileBuffers(file_handle_) == 0) +#else // POSIX + if(::msync(get_mapping_start(), mapped_length_, MS_SYNC) != 0) +#endif + { + error = detail::last_error(); + return; + } + } +#ifdef _WIN32 + if(::FlushFileBuffers(file_handle_) == 0) + { + error = detail::last_error(); + } +#endif +} + +template +void basic_mmap::unmap() +{ + if(!is_open()) { return; } + // TODO do we care about errors here? +#ifdef _WIN32 + if(is_mapped()) + { + ::UnmapViewOfFile(get_mapping_start()); + ::CloseHandle(file_mapping_handle_); + } +#else // POSIX + if(data_) { ::munmap(const_cast(get_mapping_start()), mapped_length_); } +#endif + + // If `file_handle_` was obtained by our opening it (when map is called with + // a path, rather than an existing file handle), we need to close it, + // otherwise it must not be closed as it may still be used outside this + // instance. + if(is_handle_internal_) + { +#ifdef _WIN32 + ::CloseHandle(file_handle_); +#else // POSIX + ::close(file_handle_); +#endif + } + + // Reset fields to their default values. + data_ = nullptr; + length_ = mapped_length_ = 0; + file_handle_ = invalid_handle; +#ifdef _WIN32 + file_mapping_handle_ = invalid_handle; +#endif +} + +template +bool basic_mmap::is_mapped() const noexcept +{ +#ifdef _WIN32 + return file_mapping_handle_ != invalid_handle; +#else // POSIX + return is_open(); +#endif +} + +template +void basic_mmap::swap(basic_mmap& other) +{ + if(this != &other) + { + using std::swap; + swap(data_, other.data_); + swap(file_handle_, other.file_handle_); +#ifdef _WIN32 + swap(file_mapping_handle_, other.file_mapping_handle_); +#endif + swap(length_, other.length_); + swap(mapped_length_, other.mapped_length_); + swap(is_handle_internal_, other.is_handle_internal_); + } +} + +template +template +typename std::enable_if::type +basic_mmap::conditional_sync() +{ + // This is invoked from the destructor, so not much we can do about + // failures here. 
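+    // We therefore swallow the error rather than let it escape a destructor.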
+ std::error_code ec; + sync(ec); +} + +template +template +typename std::enable_if::type +basic_mmap::conditional_sync() +{ + // noop +} + +template +bool operator==(const basic_mmap& a, + const basic_mmap& b) +{ + return a.data() == b.data() + && a.size() == b.size(); +} + +template +bool operator!=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a == b); +} + +template +bool operator<(const basic_mmap& a, + const basic_mmap& b) +{ + if(a.data() == b.data()) { return a.size() < b.size(); } + return a.data() < b.data(); +} + +template +bool operator<=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a > b); +} + +template +bool operator>(const basic_mmap& a, + const basic_mmap& b) +{ + if(a.data() == b.data()) { return a.size() > b.size(); } + return a.data() > b.data(); +} + +template +bool operator>=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a < b); +} + +} // namespace mio + +#endif // MIO_BASIC_MMAP_IMPL diff --git a/thirdparty/mio/include/mio/detail/string_util.hpp b/thirdparty/mio/include/mio/detail/string_util.hpp new file mode 100644 index 0000000000..2f375aa2c2 --- /dev/null +++ b/thirdparty/mio/include/mio/detail/string_util.hpp @@ -0,0 +1,170 @@ +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_STRING_UTIL_HEADER +#define MIO_STRING_UTIL_HEADER + +#include + +namespace mio { +namespace detail { + +template< + typename S, + typename C = typename std::decay::type, + typename = decltype(std::declval().data()), + typename = typename std::enable_if< + std::is_same::value +#ifdef _WIN32 + || std::is_same::value +#endif + >::type +> struct char_type_helper { + using type = typename C::value_type; +}; + +template +struct char_type { + using type = typename char_type_helper::type; +}; + +// TODO: can we avoid this brute force approach? +template<> +struct char_type { + using type = char; +}; + +template<> +struct char_type { + using type = char; +}; + +template +struct char_type { + using type = char; +}; + +template +struct char_type { + using type = char; +}; + +#ifdef _WIN32 +template<> +struct char_type { + using type = wchar_t; +}; + +template<> +struct char_type { + using type = wchar_t; +}; + +template +struct char_type { + using type = wchar_t; +}; + +template +struct char_type { + using type = wchar_t; +}; +#endif // _WIN32 + +template +struct is_c_str_helper +{ + static constexpr bool value = std::is_same< + CharT*, + // TODO: I'm so sorry for this... 
Can this be made cleaner? + typename std::add_pointer< + typename std::remove_cv< + typename std::remove_pointer< + typename std::decay< + S + >::type + >::type + >::type + >::type + >::value; +}; + +template +struct is_c_str +{ + static constexpr bool value = is_c_str_helper::value; +}; + +#ifdef _WIN32 +template +struct is_c_wstr +{ + static constexpr bool value = is_c_str_helper::value; +}; +#endif // _WIN32 + +template +struct is_c_str_or_c_wstr +{ + static constexpr bool value = is_c_str::value +#ifdef _WIN32 + || is_c_wstr::value +#endif + ; +}; + +template< + typename String, + typename = decltype(std::declval().data()), + typename = typename std::enable_if::value>::type +> const typename char_type::type* c_str(const String& path) +{ + return path.data(); +} + +template< + typename String, + typename = decltype(std::declval().empty()), + typename = typename std::enable_if::value>::type +> bool empty(const String& path) +{ + return path.empty(); +} + +template< + typename String, + typename = typename std::enable_if::value>::type +> const typename char_type::type* c_str(String path) +{ + return path; +} + +template< + typename String, + typename = typename std::enable_if::value>::type +> bool empty(String path) +{ + return !path || (*path == 0); +} + +} // namespace detail +} // namespace mio + +#endif // MIO_STRING_UTIL_HEADER diff --git a/thirdparty/mio/include/mio/mmap.hpp b/thirdparty/mio/include/mio/mmap.hpp new file mode 100644 index 0000000000..def559a9a2 --- /dev/null +++ b/thirdparty/mio/include/mio/mmap.hpp @@ -0,0 +1,492 @@ +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_MMAP_HEADER +#define MIO_MMAP_HEADER + +#include "mio/page.hpp" + +#include +#include +#include +#include + +#ifdef _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif // WIN32_LEAN_AND_MEAN +# include +#else // ifdef _WIN32 +# define INVALID_HANDLE_VALUE -1 +#endif // ifdef _WIN32 + +namespace mio { + +// This value may be provided as the `length` parameter to the constructor or +// `map`, in which case a memory mapping of the entire file is created. +enum { map_entire_file = 0 }; + +#ifdef _WIN32 +using file_handle_type = HANDLE; +#else +using file_handle_type = int; +#endif + +// This value represents an invalid file handle type. This can be used to +// determine whether `basic_mmap::file_handle` is valid, for example. 
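+// (On Windows this is the native INVALID_HANDLE_VALUE; on POSIX it is defined
+// to -1 above.)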
+const static file_handle_type invalid_handle = INVALID_HANDLE_VALUE; + +template +struct basic_mmap +{ + using value_type = ByteT; + using size_type = size_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using difference_type = std::ptrdiff_t; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using iterator_category = std::random_access_iterator_tag; + using handle_type = file_handle_type; + + static_assert(sizeof(ByteT) == sizeof(char), "ByteT must be the same size as char."); + +private: + // Points to the first requested byte, and not to the actual start of the mapping. + pointer data_ = nullptr; + + // Length--in bytes--requested by user (which may not be the length of the + // full mapping) and the length of the full mapping. + size_type length_ = 0; + size_type mapped_length_ = 0; + + // Letting user map a file using both an existing file handle and a path + // introcudes some complexity (see `is_handle_internal_`). + // On POSIX, we only need a file handle to create a mapping, while on + // Windows systems the file handle is necessary to retrieve a file mapping + // handle, but any subsequent operations on the mapped region must be done + // through the latter. + handle_type file_handle_ = INVALID_HANDLE_VALUE; +#ifdef _WIN32 + handle_type file_mapping_handle_ = INVALID_HANDLE_VALUE; +#endif + + // Letting user map a file using both an existing file handle and a path + // introcudes some complexity in that we must not close the file handle if + // user provided it, but we must close it if we obtained it using the + // provided path. For this reason, this flag is used to determine when to + // close `file_handle_`. + bool is_handle_internal_; + +public: + /** + * The default constructed mmap object is in a non-mapped state, that is, + * any operation that attempts to access nonexistent underlying data will + * result in undefined behaviour/segmentation faults. + */ + basic_mmap() = default; + +#ifdef __cpp_exceptions + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + template + basic_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(path, offset, length, error); + if(error) { throw std::system_error(error); } + } + + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + basic_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(handle, offset, length, error); + if(error) { throw std::system_error(error); } + } +#endif // __cpp_exceptions + + /** + * `basic_mmap` has single-ownership semantics, so transferring ownership + * may only be accomplished by moving the object. + */ + basic_mmap(const basic_mmap&) = delete; + basic_mmap(basic_mmap&&); + basic_mmap& operator=(const basic_mmap&) = delete; + basic_mmap& operator=(basic_mmap&&); + + /** + * If this is a read-write mapping, the destructor invokes sync. Regardless + * of the access mode, unmap is invoked as a final step. 
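+ *
+ * Illustrative sketch of this lifecycle (not part of the library; assumes a
+ * writable, non-empty file "example.txt" exists):
+ *
+ *     std::error_code error;
+ *     mio::mmap_sink sink = mio::make_mmap_sink("example.txt", error);
+ *     if(!error) { sink[0] = 'X'; }
+ *     // on scope exit the destructor syncs the change and unmaps the file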
+ */ + ~basic_mmap(); + + /** + * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows, + * however, a mapped region of a file gets its own handle, which is returned by + * 'mapping_handle'. + */ + handle_type file_handle() const noexcept { return file_handle_; } + handle_type mapping_handle() const noexcept; + + /** Returns whether a valid memory mapping has been created. */ + bool is_open() const noexcept { return file_handle_ != invalid_handle; } + + /** + * Returns true if no mapping was established, that is, conceptually the + * same as though the length that was mapped was 0. This function is + * provided so that this class has Container semantics. + */ + bool empty() const noexcept { return length() == 0; } + + /** Returns true if a mapping was established. */ + bool is_mapped() const noexcept; + + /** + * `size` and `length` both return the logical length, i.e. the number of bytes + * user requested to be mapped, while `mapped_length` returns the actual number of + * bytes that were mapped which is a multiple of the underlying operating system's + * page allocation granularity. + */ + size_type size() const noexcept { return length(); } + size_type length() const noexcept { return length_; } + size_type mapped_length() const noexcept { return mapped_length_; } + + /** Returns the offset relative to the start of the mapping. */ + size_type mapping_offset() const noexcept + { + return mapped_length_ - length_; + } + + /** + * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping + * exists. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > pointer data() noexcept { return data_; } + const_pointer data() const noexcept { return data_; } + + /** + * Returns an iterator to the first requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > iterator begin() noexcept { return data(); } + const_iterator begin() const noexcept { return data(); } + const_iterator cbegin() const noexcept { return data(); } + + /** + * Returns an iterator one past the last requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > iterator end() noexcept { return data() + length(); } + const_iterator end() const noexcept { return data() + length(); } + const_iterator cend() const noexcept { return data() + length(); } + + /** + * Returns a reverse iterator to the last memory mapped byte, if a valid + * memory mapping exists, otherwise this function call is undefined + * behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const noexcept + { return const_reverse_iterator(end()); } + const_reverse_iterator crbegin() const noexcept + { return const_reverse_iterator(end()); } + + /** + * Returns a reverse iterator past the first mapped byte, if a valid memory + * mapping exists, otherwise this function call is undefined behaviour. 
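+ *
+ * E.g. a reverse byte scan (sketch; assumes `src` is an already mapped
+ * `mio::mmap_source`):
+ *
+ *     for(auto it = src.rbegin(); it != src.rend(); ++it) {
+ *         // *it visits the mapped bytes last-to-first
+ *     }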
+ */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + const_reverse_iterator rend() const noexcept + { return const_reverse_iterator(begin()); } + const_reverse_iterator crend() const noexcept + { return const_reverse_iterator(begin()); } + + /** + * Returns a reference to the `i`th byte from the first requested byte (as returned + * by `data`). If this is invoked when no valid memory mapping has been created + * prior to this call, undefined behaviour ensues. + */ + reference operator[](const size_type i) noexcept { return data_[i]; } + const_reference operator[](const size_type i) const noexcept { return data_[i]; } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. + * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + template + void map(const String& path, const size_type offset, + const size_type length, std::error_code& error); + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * The entire file is mapped. + */ + template + void map(const String& path, std::error_code& error) + { + map(path, 0, map_entire_file, error); + } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is + * unsuccesful, the reason is reported via `error` and the object remains in + * a state as if this function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. 
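+ *
+ * For instance (sketch; "log.bin" is a hypothetical path), an offset of 100
+ * works even though it is not a multiple of the page size:
+ *
+ *     std::error_code error;
+ *     mio::mmap_source src;
+ *     src.map("log.bin", 100, 4096, error); // src.data()[0] is byte 100 of the file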
+ * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + void map(const handle_type handle, const size_type offset, + const size_type length, std::error_code& error); + + /** + * Establishes a memory mapping with AccessMode. If the mapping is + * unsuccesful, the reason is reported via `error` and the object remains in + * a state as if this function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * The entire file is mapped. + */ + void map(const handle_type handle, std::error_code& error) + { + map(handle, 0, map_entire_file, error); + } + + /** + * If a valid memory mapping has been created prior to this call, this call + * instructs the kernel to unmap the memory region and disassociate this object + * from the file. + * + * The file handle associated with the file that is mapped is only closed if the + * mapping was created using a file path. If, on the other hand, an existing + * file handle was used to create the mapping, the file handle is not closed. + */ + void unmap(); + + void swap(basic_mmap& other); + + /** Flushes the memory mapped page to disk. Errors are reported via `error`. */ + template + typename std::enable_if::type + sync(std::error_code& error); + + /** + * All operators compare the address of the first byte and size of the two mapped + * regions. + */ + +private: + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > pointer get_mapping_start() noexcept + { + return !data() ? nullptr : data() - mapping_offset(); + } + + const_pointer get_mapping_start() const noexcept + { + return !data() ? nullptr : data() - mapping_offset(); + } + + /** + * The destructor syncs changes to disk if `AccessMode` is `write`, but not + * if it's `read`, but since the destructor cannot be templated, we need to + * do SFINAE in a dedicated function, where one syncs and the other is a noop. + */ + template + typename std::enable_if::type + conditional_sync(); + template + typename std::enable_if::type conditional_sync(); +}; + +template +bool operator==(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator!=(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator<(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator<=(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator>(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator>=(const basic_mmap& a, + const basic_mmap& b); + +/** + * This is the basis for all read-only mmap objects and should be preferred over + * directly using `basic_mmap`. + */ +template +using basic_mmap_source = basic_mmap; + +/** + * This is the basis for all read-write mmap objects and should be preferred over + * directly using `basic_mmap`. + */ +template +using basic_mmap_sink = basic_mmap; + +/** + * These aliases cover the most common use cases, both representing a raw byte stream + * (either with a char or an unsigned char/uint8_t). + */ +using mmap_source = basic_mmap_source; +using ummap_source = basic_mmap_source; + +using mmap_sink = basic_mmap_sink; +using ummap_sink = basic_mmap_sink; + +/** + * Convenience factory method that constructs a mapping for any `basic_mmap` or + * `basic_mmap` type. 
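+ *
+ * A minimal sketch (the path "data.bin" is hypothetical):
+ *
+ *     std::error_code error;
+ *     auto src = mio::make_mmap<mio::mmap_source>(
+ *         "data.bin", 0, mio::map_entire_file, error);
+ *     if(error) { return; } // mapping failed; src is in an unmapped state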
+ */
+template<
+    typename MMap,
+    typename MappingToken
+> MMap make_mmap(const MappingToken& token,
+    int64_t offset, int64_t length, std::error_code& error)
+{
+    MMap mmap;
+    mmap.map(token, offset, length, error);
+    return mmap;
+}
+
+/**
+ * Convenience factory method.
+ *
+ * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`,
+ * `std::filesystem::path`, `std::vector<char>`, or similar), or a
+ * `mmap_source::handle_type`.
+ */
+template<typename MappingToken>
+mmap_source make_mmap_source(const MappingToken& token, mmap_source::size_type offset,
+    mmap_source::size_type length, std::error_code& error)
+{
+    return make_mmap<mmap_source>(token, offset, length, error);
+}
+
+template<typename MappingToken>
+mmap_source make_mmap_source(const MappingToken& token, std::error_code& error)
+{
+    return make_mmap_source(token, 0, map_entire_file, error);
+}
+
+/**
+ * Convenience factory method.
+ *
+ * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`,
+ * `std::filesystem::path`, `std::vector<char>`, or similar), or a
+ * `mmap_sink::handle_type`.
+ */
+template<typename MappingToken>
+mmap_sink make_mmap_sink(const MappingToken& token, mmap_sink::size_type offset,
+    mmap_sink::size_type length, std::error_code& error)
+{
+    return make_mmap<mmap_sink>(token, offset, length, error);
+}
+
+template<typename MappingToken>
+mmap_sink make_mmap_sink(const MappingToken& token, std::error_code& error)
+{
+    return make_mmap_sink(token, 0, map_entire_file, error);
+}
+
+} // namespace mio
+
+#include "detail/mmap.ipp"
+
+#endif // MIO_MMAP_HEADER
diff --git a/thirdparty/mio/include/mio/page.hpp b/thirdparty/mio/include/mio/page.hpp
new file mode 100644
index 0000000000..cae73775fd
--- /dev/null
+++ b/thirdparty/mio/include/mio/page.hpp
@@ -0,0 +1,78 @@
+/* Copyright 2017 https://github.com/mandreyel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ * software and associated documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies
+ * or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef MIO_PAGE_HEADER
+#define MIO_PAGE_HEADER
+
+#ifdef _WIN32
+# include <windows.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace mio {
+
+/**
+ * This is used by `basic_mmap` to determine whether to create a read-only or
+ * a read-write memory mapping.
+ */
+enum class access_mode
+{
+    read,
+    write
+};
+
+/**
+ * Determines the operating system's page allocation granularity.
+ *
+ * On the first call to this function, it invokes the operating system specific syscall
+ * to determine the page size, caches the value, and returns it. Any subsequent call to
+ * this function serves the cached value, so no further syscalls are made.
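+ *
+ * Sketch of the caching behaviour:
+ *
+ *     const size_t granularity = mio::page_size(); // first call: one syscall
+ *     const size_t cached = mio::page_size();      // subsequent calls: cached value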
+ */
+inline size_t page_size()
+{
+    static const size_t page_size = []
+    {
+#ifdef _WIN32
+        SYSTEM_INFO SystemInfo;
+        GetSystemInfo(&SystemInfo);
+        return SystemInfo.dwAllocationGranularity;
+#else
+        return sysconf(_SC_PAGE_SIZE);
+#endif
+    }();
+    return page_size;
+}
+
+/**
+ * Aligns `offset` to the operating system's page size such that it subtracts the
+ * difference until the nearest page boundary before `offset`, or does nothing if
+ * `offset` is already page aligned.
+ */
+inline size_t make_offset_page_aligned(size_t offset) noexcept
+{
+    const size_t page_size_ = page_size();
+    // Use integer division to round down to the nearest page alignment.
+    return offset / page_size_ * page_size_;
+}
+
+} // namespace mio
+
+#endif // MIO_PAGE_HEADER
diff --git a/thirdparty/mio/include/mio/shared_mmap.hpp b/thirdparty/mio/include/mio/shared_mmap.hpp
new file mode 100644
index 0000000000..f125a59af8
--- /dev/null
+++ b/thirdparty/mio/include/mio/shared_mmap.hpp
@@ -0,0 +1,406 @@
+/* Copyright 2017 https://github.com/mandreyel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ * software and associated documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies
+ * or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef MIO_SHARED_MMAP_HEADER
+#define MIO_SHARED_MMAP_HEADER
+
+#include "mio/mmap.hpp"
+
+#include <system_error> // std::error_code
+#include <memory> // std::shared_ptr
+
+namespace mio {
+
+/**
+ * Exposes (nearly) the same interface as `basic_mmap`, but endows it with
+ * `std::shared_ptr` semantics.
+ *
+ * This is not the default behaviour of `basic_mmap` to avoid allocating on the heap if
+ * shared semantics are not required.
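+ *
+ * Sharing sketch (illustrative; "input.bin" is a stand-in path):
+ *
+ *     std::error_code error;
+ *     mio::shared_mmap_source a;
+ *     a.map("input.bin", error);
+ *     mio::shared_mmap_source b = a; // a and b now reference one mapping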
+ */ +template< + access_mode AccessMode, + typename ByteT +> class basic_shared_mmap +{ + using impl_type = basic_mmap; + std::shared_ptr pimpl_; + +public: + using value_type = typename impl_type::value_type; + using size_type = typename impl_type::size_type; + using reference = typename impl_type::reference; + using const_reference = typename impl_type::const_reference; + using pointer = typename impl_type::pointer; + using const_pointer = typename impl_type::const_pointer; + using difference_type = typename impl_type::difference_type; + using iterator = typename impl_type::iterator; + using const_iterator = typename impl_type::const_iterator; + using reverse_iterator = typename impl_type::reverse_iterator; + using const_reverse_iterator = typename impl_type::const_reverse_iterator; + using iterator_category = typename impl_type::iterator_category; + using handle_type = typename impl_type::handle_type; + using mmap_type = impl_type; + + basic_shared_mmap() = default; + basic_shared_mmap(const basic_shared_mmap&) = default; + basic_shared_mmap& operator=(const basic_shared_mmap&) = default; + basic_shared_mmap(basic_shared_mmap&&) = default; + basic_shared_mmap& operator=(basic_shared_mmap&&) = default; + + /** Takes ownership of an existing mmap object. */ + basic_shared_mmap(mmap_type&& mmap) + : pimpl_(std::make_shared(std::move(mmap))) + {} + + /** Takes ownership of an existing mmap object. */ + basic_shared_mmap& operator=(mmap_type&& mmap) + { + pimpl_ = std::make_shared(std::move(mmap)); + return *this; + } + + /** Initializes this object with an already established shared mmap. */ + basic_shared_mmap(std::shared_ptr mmap) : pimpl_(std::move(mmap)) {} + + /** Initializes this object with an already established shared mmap. */ + basic_shared_mmap& operator=(std::shared_ptr mmap) + { + pimpl_ = std::move(mmap); + return *this; + } + +#ifdef __cpp_exceptions + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + template + basic_shared_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(path, offset, length, error); + if(error) { throw std::system_error(error); } + } + + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + basic_shared_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(handle, offset, length, error); + if(error) { throw std::system_error(error); } + } +#endif // __cpp_exceptions + + /** + * If this is a read-write mapping and the last reference to the mapping, + * the destructor invokes sync. Regardless of the access mode, unmap is + * invoked as a final step. + */ + ~basic_shared_mmap() = default; + + /** Returns the underlying `std::shared_ptr` instance that holds the mmap. */ + std::shared_ptr get_shared_ptr() { return pimpl_; } + + /** + * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows, + * however, a mapped region of a file gets its own handle, which is returned by + * 'mapping_handle'. + */ + handle_type file_handle() const noexcept + { + return pimpl_ ? pimpl_->file_handle() : invalid_handle; + } + + handle_type mapping_handle() const noexcept + { + return pimpl_ ? 
pimpl_->mapping_handle() : invalid_handle; + } + + /** Returns whether a valid memory mapping has been created. */ + bool is_open() const noexcept { return pimpl_ && pimpl_->is_open(); } + + /** + * Returns true if no mapping was established, that is, conceptually the + * same as though the length that was mapped was 0. This function is + * provided so that this class has Container semantics. + */ + bool empty() const noexcept { return !pimpl_ || pimpl_->empty(); } + + /** + * `size` and `length` both return the logical length, i.e. the number of bytes + * user requested to be mapped, while `mapped_length` returns the actual number of + * bytes that were mapped which is a multiple of the underlying operating system's + * page allocation granularity. + */ + size_type size() const noexcept { return pimpl_ ? pimpl_->length() : 0; } + size_type length() const noexcept { return pimpl_ ? pimpl_->length() : 0; } + size_type mapped_length() const noexcept + { + return pimpl_ ? pimpl_->mapped_length() : 0; + } + + /** + * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping + * exists. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > pointer data() noexcept { return pimpl_->data(); } + const_pointer data() const noexcept { return pimpl_ ? pimpl_->data() : nullptr; } + + /** + * Returns an iterator to the first requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. + */ + iterator begin() noexcept { return pimpl_->begin(); } + const_iterator begin() const noexcept { return pimpl_->begin(); } + const_iterator cbegin() const noexcept { return pimpl_->cbegin(); } + + /** + * Returns an iterator one past the last requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > iterator end() noexcept { return pimpl_->end(); } + const_iterator end() const noexcept { return pimpl_->end(); } + const_iterator cend() const noexcept { return pimpl_->cend(); } + + /** + * Returns a reverse iterator to the last memory mapped byte, if a valid + * memory mapping exists, otherwise this function call is undefined + * behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rbegin() noexcept { return pimpl_->rbegin(); } + const_reverse_iterator rbegin() const noexcept { return pimpl_->rbegin(); } + const_reverse_iterator crbegin() const noexcept { return pimpl_->crbegin(); } + + /** + * Returns a reverse iterator past the first mapped byte, if a valid memory + * mapping exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rend() noexcept { return pimpl_->rend(); } + const_reverse_iterator rend() const noexcept { return pimpl_->rend(); } + const_reverse_iterator crend() const noexcept { return pimpl_->crend(); } + + /** + * Returns a reference to the `i`th byte from the first requested byte (as returned + * by `data`). If this is invoked when no valid memory mapping has been created + * prior to this call, undefined behaviour ensues. + */ + reference operator[](const size_type i) noexcept { return (*pimpl_)[i]; } + const_reference operator[](const size_type i) const noexcept { return (*pimpl_)[i]; } + + /** + * Establishes a memory mapping with AccessMode. 
If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. + * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + template + void map(const String& path, const size_type offset, + const size_type length, std::error_code& error) + { + map_impl(path, offset, length, error); + } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * The entire file is mapped. + */ + template + void map(const String& path, std::error_code& error) + { + map_impl(path, 0, map_entire_file, error); + } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. + * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + void map(const handle_type handle, const size_type offset, + const size_type length, std::error_code& error) + { + map_impl(handle, offset, length, error); + } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * The entire file is mapped. 
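+ *
+ * E.g. on POSIX (sketch; needs <fcntl.h>, error handling elided):
+ *
+ *     const int fd = ::open("scratch.bin", O_RDWR);
+ *     std::error_code error;
+ *     mio::shared_mmap_sink sink;
+ *     sink.map(fd, error); // fd is not closed by unmap() or the destructor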
+ */ + void map(const handle_type handle, std::error_code& error) + { + map_impl(handle, 0, map_entire_file, error); + } + + /** + * If a valid memory mapping has been created prior to this call, this call + * instructs the kernel to unmap the memory region and disassociate this object + * from the file. + * + * The file handle associated with the file that is mapped is only closed if the + * mapping was created using a file path. If, on the other hand, an existing + * file handle was used to create the mapping, the file handle is not closed. + */ + void unmap() { if(pimpl_) pimpl_->unmap(); } + + void swap(basic_shared_mmap& other) { pimpl_.swap(other.pimpl_); } + + /** Flushes the memory mapped page to disk. Errors are reported via `error`. */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > void sync(std::error_code& error) { if(pimpl_) pimpl_->sync(error); } + + /** All operators compare the underlying `basic_mmap`'s addresses. */ + + friend bool operator==(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return a.pimpl_ == b.pimpl_; + } + + friend bool operator!=(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return !(a == b); + } + + friend bool operator<(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return a.pimpl_ < b.pimpl_; + } + + friend bool operator<=(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return a.pimpl_ <= b.pimpl_; + } + + friend bool operator>(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return a.pimpl_ > b.pimpl_; + } + + friend bool operator>=(const basic_shared_mmap& a, const basic_shared_mmap& b) + { + return a.pimpl_ >= b.pimpl_; + } + +private: + template + void map_impl(const MappingToken& token, const size_type offset, + const size_type length, std::error_code& error) + { + if(!pimpl_) + { + mmap_type mmap = make_mmap(token, offset, length, error); + if(error) { return; } + pimpl_ = std::make_shared(std::move(mmap)); + } + else + { + pimpl_->map(token, offset, length, error); + } + } +}; + +/** + * This is the basis for all read-only mmap objects and should be preferred over + * directly using basic_shared_mmap. + */ +template +using basic_shared_mmap_source = basic_shared_mmap; + +/** + * This is the basis for all read-write mmap objects and should be preferred over + * directly using basic_shared_mmap. + */ +template +using basic_shared_mmap_sink = basic_shared_mmap; + +/** + * These aliases cover the most common use cases, both representing a raw byte stream + * (either with a char or an unsigned char/uint8_t). 
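+ *
+ * E.g. read-only access to raw unsigned bytes (sketch; hypothetical path):
+ *
+ *     std::error_code error;
+ *     mio::shared_ummap_source bytes;
+ *     bytes.map("blob.bin", error);
+ *     if(!error && !bytes.empty()) { const uint8_t first = bytes[0]; }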
+ */ +using shared_mmap_source = basic_shared_mmap_source; +using shared_ummap_source = basic_shared_mmap_source; + +using shared_mmap_sink = basic_shared_mmap_sink; +using shared_ummap_sink = basic_shared_mmap_sink; + +} // namespace mio + +#endif // MIO_SHARED_MMAP_HEADER diff --git a/thirdparty/mio/single_include/mio/mio.hpp b/thirdparty/mio/single_include/mio/mio.hpp new file mode 100644 index 0000000000..b4b8cd5e95 --- /dev/null +++ b/thirdparty/mio/single_include/mio/mio.hpp @@ -0,0 +1,1748 @@ +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_MMAP_HEADER +#define MIO_MMAP_HEADER + +// #include "mio/page.hpp" +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_PAGE_HEADER +#define MIO_PAGE_HEADER + +#ifdef _WIN32 +# include +#else +# include +#endif + +namespace mio { + +/** + * This is used by `basic_mmap` to determine whether to create a read-only or + * a read-write memory mapping. + */ +enum class access_mode +{ + read, + write +}; + +/** + * Determines the operating system's page allocation granularity. + * + * On the first call to this function, it invokes the operating system specific syscall + * to determine the page size, caches the value, and returns it. Any subsequent call to + * this function serves the cached value, so no further syscalls are made. 
+ */ +inline size_t page_size() +{ + static const size_t page_size = [] + { +#ifdef _WIN32 + SYSTEM_INFO SystemInfo; + GetSystemInfo(&SystemInfo); + return SystemInfo.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif + }(); + return page_size; +} + +/** + * Alligns `offset` to the operating's system page size such that it subtracts the + * difference until the nearest page boundary before `offset`, or does nothing if + * `offset` is already page aligned. + */ +inline size_t make_offset_page_aligned(size_t offset) noexcept +{ + const size_t page_size_ = page_size(); + // Use integer division to round down to the nearest page alignment. + return offset / page_size_ * page_size_; +} + +} // namespace mio + +#endif // MIO_PAGE_HEADER + + +#include +#include +#include +#include + +#ifdef _WIN32 +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif // WIN32_LEAN_AND_MEAN +# include +#else // ifdef _WIN32 +# define INVALID_HANDLE_VALUE -1 +#endif // ifdef _WIN32 + +namespace mio { + +// This value may be provided as the `length` parameter to the constructor or +// `map`, in which case a memory mapping of the entire file is created. +enum { map_entire_file = 0 }; + +#ifdef _WIN32 +using file_handle_type = HANDLE; +#else +using file_handle_type = int; +#endif + +// This value represents an invalid file handle type. This can be used to +// determine whether `basic_mmap::file_handle` is valid, for example. +const static file_handle_type invalid_handle = INVALID_HANDLE_VALUE; + +template +struct basic_mmap +{ + using value_type = ByteT; + using size_type = size_t; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + using difference_type = std::ptrdiff_t; + using iterator = pointer; + using const_iterator = const_pointer; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using iterator_category = std::random_access_iterator_tag; + using handle_type = file_handle_type; + + static_assert(sizeof(ByteT) == sizeof(char), "ByteT must be the same size as char."); + +private: + // Points to the first requested byte, and not to the actual start of the mapping. + pointer data_ = nullptr; + + // Length--in bytes--requested by user (which may not be the length of the + // full mapping) and the length of the full mapping. + size_type length_ = 0; + size_type mapped_length_ = 0; + + // Letting user map a file using both an existing file handle and a path + // introcudes some complexity (see `is_handle_internal_`). + // On POSIX, we only need a file handle to create a mapping, while on + // Windows systems the file handle is necessary to retrieve a file mapping + // handle, but any subsequent operations on the mapped region must be done + // through the latter. + handle_type file_handle_ = INVALID_HANDLE_VALUE; +#ifdef _WIN32 + handle_type file_mapping_handle_ = INVALID_HANDLE_VALUE; +#endif + + // Letting user map a file using both an existing file handle and a path + // introcudes some complexity in that we must not close the file handle if + // user provided it, but we must close it if we obtained it using the + // provided path. For this reason, this flag is used to determine when to + // close `file_handle_`. 
+ bool is_handle_internal_; + +public: + /** + * The default constructed mmap object is in a non-mapped state, that is, + * any operation that attempts to access nonexistent underlying data will + * result in undefined behaviour/segmentation faults. + */ + basic_mmap() = default; + +#ifdef __cpp_exceptions + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + template + basic_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(path, offset, length, error); + if(error) { throw std::system_error(error); } + } + + /** + * The same as invoking the `map` function, except any error that may occur + * while establishing the mapping is wrapped in a `std::system_error` and is + * thrown. + */ + basic_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file) + { + std::error_code error; + map(handle, offset, length, error); + if(error) { throw std::system_error(error); } + } +#endif // __cpp_exceptions + + /** + * `basic_mmap` has single-ownership semantics, so transferring ownership + * may only be accomplished by moving the object. + */ + basic_mmap(const basic_mmap&) = delete; + basic_mmap(basic_mmap&&); + basic_mmap& operator=(const basic_mmap&) = delete; + basic_mmap& operator=(basic_mmap&&); + + /** + * If this is a read-write mapping, the destructor invokes sync. Regardless + * of the access mode, unmap is invoked as a final step. + */ + ~basic_mmap(); + + /** + * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows, + * however, a mapped region of a file gets its own handle, which is returned by + * 'mapping_handle'. + */ + handle_type file_handle() const noexcept { return file_handle_; } + handle_type mapping_handle() const noexcept; + + /** Returns whether a valid memory mapping has been created. */ + bool is_open() const noexcept { return file_handle_ != invalid_handle; } + + /** + * Returns true if no mapping was established, that is, conceptually the + * same as though the length that was mapped was 0. This function is + * provided so that this class has Container semantics. + */ + bool empty() const noexcept { return length() == 0; } + + /** Returns true if a mapping was established. */ + bool is_mapped() const noexcept; + + /** + * `size` and `length` both return the logical length, i.e. the number of bytes + * user requested to be mapped, while `mapped_length` returns the actual number of + * bytes that were mapped which is a multiple of the underlying operating system's + * page allocation granularity. + */ + size_type size() const noexcept { return length(); } + size_type length() const noexcept { return length_; } + size_type mapped_length() const noexcept { return mapped_length_; } + + /** Returns the offset relative to the start of the mapping. */ + size_type mapping_offset() const noexcept + { + return mapped_length_ - length_; + } + + /** + * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping + * exists. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > pointer data() noexcept { return data_; } + const_pointer data() const noexcept { return data_; } + + /** + * Returns an iterator to the first requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. 
+ */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > iterator begin() noexcept { return data(); } + const_iterator begin() const noexcept { return data(); } + const_iterator cbegin() const noexcept { return data(); } + + /** + * Returns an iterator one past the last requested byte, if a valid memory mapping + * exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > iterator end() noexcept { return data() + length(); } + const_iterator end() const noexcept { return data() + length(); } + const_iterator cend() const noexcept { return data() + length(); } + + /** + * Returns a reverse iterator to the last memory mapped byte, if a valid + * memory mapping exists, otherwise this function call is undefined + * behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const noexcept + { return const_reverse_iterator(end()); } + const_reverse_iterator crbegin() const noexcept + { return const_reverse_iterator(end()); } + + /** + * Returns a reverse iterator past the first mapped byte, if a valid memory + * mapping exists, otherwise this function call is undefined behaviour. + */ + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + const_reverse_iterator rend() const noexcept + { return const_reverse_iterator(begin()); } + const_reverse_iterator crend() const noexcept + { return const_reverse_iterator(begin()); } + + /** + * Returns a reference to the `i`th byte from the first requested byte (as returned + * by `data`). If this is invoked when no valid memory mapping has been created + * prior to this call, undefined behaviour ensues. + */ + reference operator[](const size_type i) noexcept { return data_[i]; } + const_reference operator[](const size_type i) const noexcept { return data_[i]; } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. + * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + template + void map(const String& path, const size_type offset, + const size_type length, std::error_code& error); + + /** + * Establishes a memory mapping with AccessMode. 
If the mapping is unsuccesful, the + * reason is reported via `error` and the object remains in a state as if this + * function hadn't been called. + * + * `path`, which must be a path to an existing file, is used to retrieve a file + * handle (which is closed when the object destructs or `unmap` is called), which is + * then used to memory map the requested region. Upon failure, `error` is set to + * indicate the reason and the object remains in an unmapped state. + * + * The entire file is mapped. + */ + template + void map(const String& path, std::error_code& error) + { + map(path, 0, map_entire_file, error); + } + + /** + * Establishes a memory mapping with AccessMode. If the mapping is + * unsuccesful, the reason is reported via `error` and the object remains in + * a state as if this function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * `offset` is the number of bytes, relative to the start of the file, where the + * mapping should begin. When specifying it, there is no need to worry about + * providing a value that is aligned with the operating system's page allocation + * granularity. This is adjusted by the implementation such that the first requested + * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at + * `offset` from the start of the file. + * + * `length` is the number of bytes to map. It may be `map_entire_file`, in which + * case a mapping of the entire file is created. + */ + void map(const handle_type handle, const size_type offset, + const size_type length, std::error_code& error); + + /** + * Establishes a memory mapping with AccessMode. If the mapping is + * unsuccesful, the reason is reported via `error` and the object remains in + * a state as if this function hadn't been called. + * + * `handle`, which must be a valid file handle, which is used to memory map the + * requested region. Upon failure, `error` is set to indicate the reason and the + * object remains in an unmapped state. + * + * The entire file is mapped. + */ + void map(const handle_type handle, std::error_code& error) + { + map(handle, 0, map_entire_file, error); + } + + /** + * If a valid memory mapping has been created prior to this call, this call + * instructs the kernel to unmap the memory region and disassociate this object + * from the file. + * + * The file handle associated with the file that is mapped is only closed if the + * mapping was created using a file path. If, on the other hand, an existing + * file handle was used to create the mapping, the file handle is not closed. + */ + void unmap(); + + void swap(basic_mmap& other); + + /** Flushes the memory mapped page to disk. Errors are reported via `error`. */ + template + typename std::enable_if::type + sync(std::error_code& error); + + /** + * All operators compare the address of the first byte and size of the two mapped + * regions. + */ + +private: + template< + access_mode A = AccessMode, + typename = typename std::enable_if::type + > pointer get_mapping_start() noexcept + { + return !data() ? nullptr : data() - mapping_offset(); + } + + const_pointer get_mapping_start() const noexcept + { + return !data() ? 
nullptr : data() - mapping_offset(); + } + + /** + * The destructor syncs changes to disk if `AccessMode` is `write`, but not + * if it's `read`, but since the destructor cannot be templated, we need to + * do SFINAE in a dedicated function, where one syncs and the other is a noop. + */ + template + typename std::enable_if::type + conditional_sync(); + template + typename std::enable_if::type conditional_sync(); +}; + +template +bool operator==(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator!=(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator<(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator<=(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator>(const basic_mmap& a, + const basic_mmap& b); + +template +bool operator>=(const basic_mmap& a, + const basic_mmap& b); + +/** + * This is the basis for all read-only mmap objects and should be preferred over + * directly using `basic_mmap`. + */ +template +using basic_mmap_source = basic_mmap; + +/** + * This is the basis for all read-write mmap objects and should be preferred over + * directly using `basic_mmap`. + */ +template +using basic_mmap_sink = basic_mmap; + +/** + * These aliases cover the most common use cases, both representing a raw byte stream + * (either with a char or an unsigned char/uint8_t). + */ +using mmap_source = basic_mmap_source; +using ummap_source = basic_mmap_source; + +using mmap_sink = basic_mmap_sink; +using ummap_sink = basic_mmap_sink; + +/** + * Convenience factory method that constructs a mapping for any `basic_mmap` or + * `basic_mmap` type. + */ +template< + typename MMap, + typename MappingToken +> MMap make_mmap(const MappingToken& token, + int64_t offset, int64_t length, std::error_code& error) +{ + MMap mmap; + mmap.map(token, offset, length, error); + return mmap; +} + +/** + * Convenience factory method. + * + * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`, + * `std::filesystem::path`, `std::vector`, or similar), or a + * `mmap_source::handle_type`. + */ +template +mmap_source make_mmap_source(const MappingToken& token, mmap_source::size_type offset, + mmap_source::size_type length, std::error_code& error) +{ + return make_mmap(token, offset, length, error); +} + +template +mmap_source make_mmap_source(const MappingToken& token, std::error_code& error) +{ + return make_mmap_source(token, 0, map_entire_file, error); +} + +/** + * Convenience factory method. + * + * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`, + * `std::filesystem::path`, `std::vector`, or similar), or a + * `mmap_sink::handle_type`. 
+ */ +template +mmap_sink make_mmap_sink(const MappingToken& token, mmap_sink::size_type offset, + mmap_sink::size_type length, std::error_code& error) +{ + return make_mmap(token, offset, length, error); +} + +template +mmap_sink make_mmap_sink(const MappingToken& token, std::error_code& error) +{ + return make_mmap_sink(token, 0, map_entire_file, error); +} + +} // namespace mio + +// #include "detail/mmap.ipp" +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_BASIC_MMAP_IMPL +#define MIO_BASIC_MMAP_IMPL + +// #include "mio/mmap.hpp" + +// #include "mio/page.hpp" + +// #include "mio/detail/string_util.hpp" +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_STRING_UTIL_HEADER +#define MIO_STRING_UTIL_HEADER + +#include + +namespace mio { +namespace detail { + +template< + typename S, + typename C = typename std::decay::type, + typename = decltype(std::declval().data()), + typename = typename std::enable_if< + std::is_same::value +#ifdef _WIN32 + || std::is_same::value +#endif + >::type +> struct char_type_helper { + using type = typename C::value_type; +}; + +template +struct char_type { + using type = typename char_type_helper::type; +}; + +// TODO: can we avoid this brute force approach? 
+template<> +struct char_type { + using type = char; +}; + +template<> +struct char_type { + using type = char; +}; + +template +struct char_type { + using type = char; +}; + +template +struct char_type { + using type = char; +}; + +#ifdef _WIN32 +template<> +struct char_type { + using type = wchar_t; +}; + +template<> +struct char_type { + using type = wchar_t; +}; + +template +struct char_type { + using type = wchar_t; +}; + +template +struct char_type { + using type = wchar_t; +}; +#endif // _WIN32 + +template +struct is_c_str_helper +{ + static constexpr bool value = std::is_same< + CharT*, + // TODO: I'm so sorry for this... Can this be made cleaner? + typename std::add_pointer< + typename std::remove_cv< + typename std::remove_pointer< + typename std::decay< + S + >::type + >::type + >::type + >::type + >::value; +}; + +template +struct is_c_str +{ + static constexpr bool value = is_c_str_helper::value; +}; + +#ifdef _WIN32 +template +struct is_c_wstr +{ + static constexpr bool value = is_c_str_helper::value; +}; +#endif // _WIN32 + +template +struct is_c_str_or_c_wstr +{ + static constexpr bool value = is_c_str::value +#ifdef _WIN32 + || is_c_wstr::value +#endif + ; +}; + +template< + typename String, + typename = decltype(std::declval().data()), + typename = typename std::enable_if::value>::type +> const typename char_type::type* c_str(const String& path) +{ + return path.data(); +} + +template< + typename String, + typename = decltype(std::declval().empty()), + typename = typename std::enable_if::value>::type +> bool empty(const String& path) +{ + return path.empty(); +} + +template< + typename String, + typename = typename std::enable_if::value>::type +> const typename char_type::type* c_str(String path) +{ + return path; +} + +template< + typename String, + typename = typename std::enable_if::value>::type +> bool empty(String path) +{ + return !path || (*path == 0); +} + +} // namespace detail +} // namespace mio + +#endif // MIO_STRING_UTIL_HEADER + + +#include + +#ifndef _WIN32 +# include +# include +# include +# include +#endif + +namespace mio { +namespace detail { + +#ifdef _WIN32 +namespace win { + +/** Returns the 4 upper bytes of an 8-byte integer. */ +inline DWORD int64_high(int64_t n) noexcept +{ + return n >> 32; +} + +/** Returns the 4 lower bytes of an 8-byte integer. */ +inline DWORD int64_low(int64_t n) noexcept +{ + return n & 0xffffffff; +} + +template< + typename String, + typename = typename std::enable_if< + std::is_same::type, char>::value + >::type +> file_handle_type open_file_helper(const String& path, const access_mode mode) +{ + return ::CreateFileA(c_str(path), + mode == access_mode::read ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); +} + +template +typename std::enable_if< + std::is_same::type, wchar_t>::value, + file_handle_type +>::type open_file_helper(const String& path, const access_mode mode) +{ + return ::CreateFileW(c_str(path), + mode == access_mode::read ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, + 0, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + 0); +} + +} // win +#endif // _WIN32 + +/** + * Returns the last platform specific system error (errno on POSIX and + * GetLastError on Win) as a `std::error_code`. 
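+ *
+ * Illustrative use (sketch; `fd` is a hypothetical descriptor):
+ *
+ *     if(::close(fd) == -1) {
+ *         const std::error_code ec = detail::last_error(); // wraps errno
+ *     }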
+ */ +inline std::error_code last_error() noexcept +{ + std::error_code error; +#ifdef _WIN32 + error.assign(GetLastError(), std::system_category()); +#else + error.assign(errno, std::system_category()); +#endif + return error; +} + +template +file_handle_type open_file(const String& path, const access_mode mode, + std::error_code& error) +{ + error.clear(); + if(detail::empty(path)) + { + error = std::make_error_code(std::errc::invalid_argument); + return invalid_handle; + } +#ifdef _WIN32 + const auto handle = win::open_file_helper(path, mode); +#else // POSIX + const auto handle = ::open(c_str(path), + mode == access_mode::read ? O_RDONLY : O_RDWR); +#endif + if(handle == invalid_handle) + { + error = detail::last_error(); + } + return handle; +} + +inline size_t query_file_size(file_handle_type handle, std::error_code& error) +{ + error.clear(); +#ifdef _WIN32 + LARGE_INTEGER file_size; + if(::GetFileSizeEx(handle, &file_size) == 0) + { + error = detail::last_error(); + return 0; + } + return static_cast(file_size.QuadPart); +#else // POSIX + struct stat sbuf; + if(::fstat(handle, &sbuf) == -1) + { + error = detail::last_error(); + return 0; + } + return sbuf.st_size; +#endif +} + +struct mmap_context +{ + char* data; + int64_t length; + int64_t mapped_length; +#ifdef _WIN32 + file_handle_type file_mapping_handle; +#endif +}; + +inline mmap_context memory_map(const file_handle_type file_handle, const int64_t offset, + const int64_t length, const access_mode mode, std::error_code& error) +{ + const int64_t aligned_offset = make_offset_page_aligned(offset); + const int64_t length_to_map = offset - aligned_offset + length; +#ifdef _WIN32 + const int64_t max_file_size = offset + length; + const auto file_mapping_handle = ::CreateFileMapping( + file_handle, + 0, + mode == access_mode::read ? PAGE_READONLY : PAGE_READWRITE, + win::int64_high(max_file_size), + win::int64_low(max_file_size), + 0); + if(file_mapping_handle == invalid_handle) + { + error = detail::last_error(); + return {}; + } + char* mapping_start = static_cast(::MapViewOfFile( + file_mapping_handle, + mode == access_mode::read ? FILE_MAP_READ : FILE_MAP_WRITE, + win::int64_high(aligned_offset), + win::int64_low(aligned_offset), + length_to_map)); + if(mapping_start == nullptr) + { + error = detail::last_error(); + return {}; + } +#else // POSIX + char* mapping_start = static_cast(::mmap( + 0, // Don't give hint as to where to map. + length_to_map, + mode == access_mode::read ? 
PROT_READ : PROT_WRITE, + MAP_SHARED, + file_handle, + aligned_offset)); + if(mapping_start == MAP_FAILED) + { + error = detail::last_error(); + return {}; + } +#endif + mmap_context ctx; + ctx.data = mapping_start + offset - aligned_offset; + ctx.length = length; + ctx.mapped_length = length_to_map; +#ifdef _WIN32 + ctx.file_mapping_handle = file_mapping_handle; +#endif + return ctx; +} + +} // namespace detail + +// -- basic_mmap -- + +template +basic_mmap::~basic_mmap() +{ + conditional_sync(); + unmap(); +} + +template +basic_mmap::basic_mmap(basic_mmap&& other) + : data_(std::move(other.data_)) + , length_(std::move(other.length_)) + , mapped_length_(std::move(other.mapped_length_)) + , file_handle_(std::move(other.file_handle_)) +#ifdef _WIN32 + , file_mapping_handle_(std::move(other.file_mapping_handle_)) +#endif + , is_handle_internal_(std::move(other.is_handle_internal_)) +{ + other.data_ = nullptr; + other.length_ = other.mapped_length_ = 0; + other.file_handle_ = invalid_handle; +#ifdef _WIN32 + other.file_mapping_handle_ = invalid_handle; +#endif +} + +template +basic_mmap& +basic_mmap::operator=(basic_mmap&& other) +{ + if(this != &other) + { + // First the existing mapping needs to be removed. + unmap(); + data_ = std::move(other.data_); + length_ = std::move(other.length_); + mapped_length_ = std::move(other.mapped_length_); + file_handle_ = std::move(other.file_handle_); +#ifdef _WIN32 + file_mapping_handle_ = std::move(other.file_mapping_handle_); +#endif + is_handle_internal_ = std::move(other.is_handle_internal_); + + // The moved from basic_mmap's fields need to be reset, because + // otherwise other's destructor will unmap the same mapping that was + // just moved into this. + other.data_ = nullptr; + other.length_ = other.mapped_length_ = 0; + other.file_handle_ = invalid_handle; +#ifdef _WIN32 + other.file_mapping_handle_ = invalid_handle; +#endif + other.is_handle_internal_ = false; + } + return *this; +} + +template +typename basic_mmap::handle_type +basic_mmap::mapping_handle() const noexcept +{ +#ifdef _WIN32 + return file_mapping_handle_; +#else + return file_handle_; +#endif +} + +template +template +void basic_mmap::map(const String& path, const size_type offset, + const size_type length, std::error_code& error) +{ + error.clear(); + if(detail::empty(path)) + { + error = std::make_error_code(std::errc::invalid_argument); + return; + } + const auto handle = detail::open_file(path, AccessMode, error); + if(error) + { + return; + } + + map(handle, offset, length, error); + // This MUST be after the call to map, as that sets this to true. + if(!error) + { + is_handle_internal_ = true; + } +} + +template +void basic_mmap::map(const handle_type handle, + const size_type offset, const size_type length, std::error_code& error) +{ + error.clear(); + if(handle == invalid_handle) + { + error = std::make_error_code(std::errc::bad_file_descriptor); + return; + } + + const auto file_size = detail::query_file_size(handle, error); + if(error) + { + return; + } + + if(offset + length > file_size) + { + error = std::make_error_code(std::errc::invalid_argument); + return; + } + + const auto ctx = detail::memory_map(handle, offset, + length == map_entire_file ? (file_size - offset) : length, + AccessMode, error); + if(!error) + { + // We must unmap the previous mapping that may have existed prior to this call. 
+ // Note that this must only be invoked after a new mapping has been created in + // order to provide the strong guarantee that, should the new mapping fail, the + // `map` function leaves this instance in a state as though the function had + // never been invoked. + unmap(); + file_handle_ = handle; + is_handle_internal_ = false; + data_ = reinterpret_cast(ctx.data); + length_ = ctx.length; + mapped_length_ = ctx.mapped_length; +#ifdef _WIN32 + file_mapping_handle_ = ctx.file_mapping_handle; +#endif + } +} + +template +template +typename std::enable_if::type +basic_mmap::sync(std::error_code& error) +{ + error.clear(); + if(!is_open()) + { + error = std::make_error_code(std::errc::bad_file_descriptor); + return; + } + + if(data()) + { +#ifdef _WIN32 + if(::FlushViewOfFile(get_mapping_start(), mapped_length_) == 0 + || ::FlushFileBuffers(file_handle_) == 0) +#else // POSIX + if(::msync(get_mapping_start(), mapped_length_, MS_SYNC) != 0) +#endif + { + error = detail::last_error(); + return; + } + } +#ifdef _WIN32 + if(::FlushFileBuffers(file_handle_) == 0) + { + error = detail::last_error(); + } +#endif +} + +template +void basic_mmap::unmap() +{ + if(!is_open()) { return; } + // TODO do we care about errors here? +#ifdef _WIN32 + if(is_mapped()) + { + ::UnmapViewOfFile(get_mapping_start()); + ::CloseHandle(file_mapping_handle_); + } +#else // POSIX + if(data_) { ::munmap(const_cast(get_mapping_start()), mapped_length_); } +#endif + + // If `file_handle_` was obtained by our opening it (when map is called with + // a path, rather than an existing file handle), we need to close it, + // otherwise it must not be closed as it may still be used outside this + // instance. + if(is_handle_internal_) + { +#ifdef _WIN32 + ::CloseHandle(file_handle_); +#else // POSIX + ::close(file_handle_); +#endif + } + + // Reset fields to their default values. + data_ = nullptr; + length_ = mapped_length_ = 0; + file_handle_ = invalid_handle; +#ifdef _WIN32 + file_mapping_handle_ = invalid_handle; +#endif +} + +template +bool basic_mmap::is_mapped() const noexcept +{ +#ifdef _WIN32 + return file_mapping_handle_ != invalid_handle; +#else // POSIX + return is_open(); +#endif +} + +template +void basic_mmap::swap(basic_mmap& other) +{ + if(this != &other) + { + using std::swap; + swap(data_, other.data_); + swap(file_handle_, other.file_handle_); +#ifdef _WIN32 + swap(file_mapping_handle_, other.file_mapping_handle_); +#endif + swap(length_, other.length_); + swap(mapped_length_, other.mapped_length_); + swap(is_handle_internal_, other.is_handle_internal_); + } +} + +template +template +typename std::enable_if::type +basic_mmap::conditional_sync() +{ + // This is invoked from the destructor, so not much we can do about + // failures here. 
+ std::error_code ec; + sync(ec); +} + +template +template +typename std::enable_if::type +basic_mmap::conditional_sync() +{ + // noop +} + +template +bool operator==(const basic_mmap& a, + const basic_mmap& b) +{ + return a.data() == b.data() + && a.size() == b.size(); +} + +template +bool operator!=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a == b); +} + +template +bool operator<(const basic_mmap& a, + const basic_mmap& b) +{ + if(a.data() == b.data()) { return a.size() < b.size(); } + return a.data() < b.data(); +} + +template +bool operator<=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a > b); +} + +template +bool operator>(const basic_mmap& a, + const basic_mmap& b) +{ + if(a.data() == b.data()) { return a.size() > b.size(); } + return a.data() > b.data(); +} + +template +bool operator>=(const basic_mmap& a, + const basic_mmap& b) +{ + return !(a < b); +} + +} // namespace mio + +#endif // MIO_BASIC_MMAP_IMPL + + +#endif // MIO_MMAP_HEADER +/* Copyright 2017 https://github.com/mandreyel + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this + * software and associated documentation files (the "Software"), to deal in the Software + * without restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be included in all copies + * or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A + * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF + * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef MIO_PAGE_HEADER +#define MIO_PAGE_HEADER + +#ifdef _WIN32 +# include +#else +# include +#endif + +namespace mio { + +/** + * This is used by `basic_mmap` to determine whether to create a read-only or + * a read-write memory mapping. + */ +enum class access_mode +{ + read, + write +}; + +/** + * Determines the operating system's page allocation granularity. + * + * On the first call to this function, it invokes the operating system specific syscall + * to determine the page size, caches the value, and returns it. Any subsequent call to + * this function serves the cached value, so no further syscalls are made. + */ +inline size_t page_size() +{ + static const size_t page_size = [] + { +#ifdef _WIN32 + SYSTEM_INFO SystemInfo; + GetSystemInfo(&SystemInfo); + return SystemInfo.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif + }(); + return page_size; +} + +/** + * Alligns `offset` to the operating's system page size such that it subtracts the + * difference until the nearest page boundary before `offset`, or does nothing if + * `offset` is already page aligned. + */ +inline size_t make_offset_page_aligned(size_t offset) noexcept +{ + const size_t page_size_ = page_size(); + // Use integer division to round down to the nearest page alignment. 
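To make the `basic_mmap` semantics above concrete, here is a minimal usage sketch. It is not part of the vendored sources; the file name is illustrative and the file is assumed to exist and be non-empty:

    #include <mio/mmap.hpp>
    #include <system_error>
    #include <utility>

    void sketch()
    {
        std::error_code error;
        mio::mmap_source a;
        a.map("data.bin", 0, mio::map_entire_file, error); // "data.bin" is illustrative
        if (error) { return; }

        // The move constructor above nulls out the moved-from object, so only
        // `b`'s destructor will unmap the region.
        mio::mmap_source b = std::move(a);
    } // `b` unmaps here; a read-write `mmap_sink` would also sync first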
+/* Copyright 2017 https://github.com/mandreyel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ * software and associated documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies
+ * or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef MIO_PAGE_HEADER
+#define MIO_PAGE_HEADER
+
+#ifdef _WIN32
+# include <windows.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace mio {
+
+/**
+ * This is used by `basic_mmap` to determine whether to create a read-only or
+ * a read-write memory mapping.
+ */
+enum class access_mode
+{
+    read,
+    write
+};
+
+/**
+ * Determines the operating system's page allocation granularity.
+ *
+ * On the first call to this function, it invokes the operating system specific syscall
+ * to determine the page size, caches the value, and returns it. Any subsequent call to
+ * this function serves the cached value, so no further syscalls are made.
+ */
+inline size_t page_size()
+{
+    static const size_t page_size = []
+    {
+#ifdef _WIN32
+        SYSTEM_INFO SystemInfo;
+        GetSystemInfo(&SystemInfo);
+        return SystemInfo.dwAllocationGranularity;
+#else
+        return sysconf(_SC_PAGE_SIZE);
+#endif
+    }();
+    return page_size;
+}
+
+/**
+ * Aligns `offset` to the operating system's page size such that it subtracts the
+ * difference until the nearest page boundary before `offset`, or does nothing if
+ * `offset` is already page aligned.
+ */
+inline size_t make_offset_page_aligned(size_t offset) noexcept
+{
+    const size_t page_size_ = page_size();
+    // Use integer division to round down to the nearest page alignment.
+    return offset / page_size_ * page_size_;
+}
+
+} // namespace mio
+
+#endif // MIO_PAGE_HEADER
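As a quick check of the round-down behaviour documented above, here is a small self-contained sketch; the 4096-byte granularity is assumed purely for illustration (real systems report it via `page_size()`):

    #include <cstddef>

    // Mirrors mio::make_offset_page_aligned for a fixed 4096-byte page size.
    constexpr std::size_t align_to_page(std::size_t offset, std::size_t page = 4096)
    {
        return offset / page * page; // integer division rounds down
    }

    static_assert(align_to_page(0) == 0, "already aligned");
    static_assert(align_to_page(4096) == 4096, "already aligned");
    static_assert(align_to_page(6000) == 4096, "rounded down to page boundary");
    static_assert(align_to_page(8191) == 4096, "still within the second page");

The mapping code above then maps from the aligned offset but returns a pointer advanced by `offset - aligned_offset`, so callers never see the alignment adjustment.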
+/* Copyright 2017 https://github.com/mandreyel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this
+ * software and associated documentation files (the "Software"), to deal in the Software
+ * without restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all copies
+ * or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef MIO_SHARED_MMAP_HEADER
+#define MIO_SHARED_MMAP_HEADER
+
+// #include "mio/mmap.hpp"
+
+
+#include <system_error> // std::error_code
+#include <memory> // std::shared_ptr
+
+namespace mio {
+
+/**
+ * Exposes (nearly) the same interface as `basic_mmap`, but endows it with
+ * `std::shared_ptr` semantics.
+ *
+ * This is not the default behaviour of `basic_mmap` to avoid allocating on the heap if
+ * shared semantics are not required.
+ */
+template<
+    access_mode AccessMode,
+    typename ByteT
+> class basic_shared_mmap
+{
+    using impl_type = basic_mmap<AccessMode, ByteT>;
+    std::shared_ptr<impl_type> pimpl_;
+
+public:
+    using value_type = typename impl_type::value_type;
+    using size_type = typename impl_type::size_type;
+    using reference = typename impl_type::reference;
+    using const_reference = typename impl_type::const_reference;
+    using pointer = typename impl_type::pointer;
+    using const_pointer = typename impl_type::const_pointer;
+    using difference_type = typename impl_type::difference_type;
+    using iterator = typename impl_type::iterator;
+    using const_iterator = typename impl_type::const_iterator;
+    using reverse_iterator = typename impl_type::reverse_iterator;
+    using const_reverse_iterator = typename impl_type::const_reverse_iterator;
+    using iterator_category = typename impl_type::iterator_category;
+    using handle_type = typename impl_type::handle_type;
+    using mmap_type = impl_type;
+
+    basic_shared_mmap() = default;
+    basic_shared_mmap(const basic_shared_mmap&) = default;
+    basic_shared_mmap& operator=(const basic_shared_mmap&) = default;
+    basic_shared_mmap(basic_shared_mmap&&) = default;
+    basic_shared_mmap& operator=(basic_shared_mmap&&) = default;
+
+    /** Takes ownership of an existing mmap object. */
+    basic_shared_mmap(mmap_type&& mmap)
+        : pimpl_(std::make_shared<impl_type>(std::move(mmap)))
+    {}
+
+    /** Takes ownership of an existing mmap object. */
+    basic_shared_mmap& operator=(mmap_type&& mmap)
+    {
+        pimpl_ = std::make_shared<impl_type>(std::move(mmap));
+        return *this;
+    }
+
+    /** Initializes this object with an already established shared mmap. */
+    basic_shared_mmap(std::shared_ptr<mmap_type> mmap) : pimpl_(std::move(mmap)) {}
+
+    /** Initializes this object with an already established shared mmap. */
+    basic_shared_mmap& operator=(std::shared_ptr<mmap_type> mmap)
+    {
+        pimpl_ = std::move(mmap);
+        return *this;
+    }
+
+#ifdef __cpp_exceptions
+    /**
+     * The same as invoking the `map` function, except any error that may occur
+     * while establishing the mapping is wrapped in a `std::system_error` and is
+     * thrown.
+     */
+    template<typename String>
+    basic_shared_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file)
+    {
+        std::error_code error;
+        map(path, offset, length, error);
+        if(error) { throw std::system_error(error); }
+    }
+
+    /**
+     * The same as invoking the `map` function, except any error that may occur
+     * while establishing the mapping is wrapped in a `std::system_error` and is
+     * thrown.
+     */
+    basic_shared_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file)
+    {
+        std::error_code error;
+        map(handle, offset, length, error);
+        if(error) { throw std::system_error(error); }
+    }
+#endif // __cpp_exceptions
+
+    /**
+     * If this is a read-write mapping and the last reference to the mapping,
+     * the destructor invokes sync. Regardless of the access mode, unmap is
+     * invoked as a final step.
+     */
+    ~basic_shared_mmap() = default;
+
+    /** Returns the underlying `std::shared_ptr` instance that holds the mmap. */
+    std::shared_ptr<mmap_type> get_shared_ptr() { return pimpl_; }
+
+    /**
+     * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows,
+     * however, a mapped region of a file gets its own handle, which is returned by
+     * 'mapping_handle'.
+     */
+    handle_type file_handle() const noexcept
+    {
+        return pimpl_ ? pimpl_->file_handle() : invalid_handle;
+    }
+
+    handle_type mapping_handle() const noexcept
+    {
+        return pimpl_ ? pimpl_->mapping_handle() : invalid_handle;
+    }
+
+    /** Returns whether a valid memory mapping has been created. */
+    bool is_open() const noexcept { return pimpl_ && pimpl_->is_open(); }
+
+    /**
+     * Returns true if no mapping was established, that is, conceptually the
+     * same as though the length that was mapped was 0. This function is
+     * provided so that this class has Container semantics.
+     */
+    bool empty() const noexcept { return !pimpl_ || pimpl_->empty(); }
+
+    /**
+     * `size` and `length` both return the logical length, i.e. the number of bytes
+     * user requested to be mapped, while `mapped_length` returns the actual number of
+     * bytes that were mapped which is a multiple of the underlying operating system's
+     * page allocation granularity.
+     */
+    size_type size() const noexcept { return pimpl_ ? pimpl_->length() : 0; }
+    size_type length() const noexcept { return pimpl_ ? pimpl_->length() : 0; }
+    size_type mapped_length() const noexcept
+    {
+        return pimpl_ ? pimpl_->mapped_length() : 0;
+    }
+
+    /**
+     * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping
+     * exists.
+     */
+    template<
+        access_mode A = AccessMode,
+        typename = typename std::enable_if<A == access_mode::write>::type
+    > pointer data() noexcept { return pimpl_->data(); }
+    const_pointer data() const noexcept { return pimpl_ ? pimpl_->data() : nullptr; }
+
+    /**
+     * Returns an iterator to the first requested byte, if a valid memory mapping
+     * exists, otherwise this function call is undefined behaviour.
+     */
+    iterator begin() noexcept { return pimpl_->begin(); }
+    const_iterator begin() const noexcept { return pimpl_->begin(); }
+    const_iterator cbegin() const noexcept { return pimpl_->cbegin(); }
+
+    /**
+     * Returns an iterator one past the last requested byte, if a valid memory mapping
+     * exists, otherwise this function call is undefined behaviour.
+     */
+    template<
+        access_mode A = AccessMode,
+        typename = typename std::enable_if<A == access_mode::write>::type
+    > iterator end() noexcept { return pimpl_->end(); }
+    const_iterator end() const noexcept { return pimpl_->end(); }
+    const_iterator cend() const noexcept { return pimpl_->cend(); }
+
+    /**
+     * Returns a reverse iterator to the last memory mapped byte, if a valid
+     * memory mapping exists, otherwise this function call is undefined
+     * behaviour.
+     */
+    template<
+        access_mode A = AccessMode,
+        typename = typename std::enable_if<A == access_mode::write>::type
+    > reverse_iterator rbegin() noexcept { return pimpl_->rbegin(); }
+    const_reverse_iterator rbegin() const noexcept { return pimpl_->rbegin(); }
+    const_reverse_iterator crbegin() const noexcept { return pimpl_->crbegin(); }
+
+    /**
+     * Returns a reverse iterator past the first mapped byte, if a valid memory
+     * mapping exists, otherwise this function call is undefined behaviour.
+     */
+    template<
+        access_mode A = AccessMode,
+        typename = typename std::enable_if<A == access_mode::write>::type
+    > reverse_iterator rend() noexcept { return pimpl_->rend(); }
+    const_reverse_iterator rend() const noexcept { return pimpl_->rend(); }
+    const_reverse_iterator crend() const noexcept { return pimpl_->crend(); }
+
+    /**
+     * Returns a reference to the `i`th byte from the first requested byte (as returned
+     * by `data`). If this is invoked when no valid memory mapping has been created
+     * prior to this call, undefined behaviour ensues.
+     */
+    reference operator[](const size_type i) noexcept { return (*pimpl_)[i]; }
+    const_reference operator[](const size_type i) const noexcept { return (*pimpl_)[i]; }
+
+    /**
+     * Establishes a memory mapping with AccessMode. If the mapping is unsuccessful, the
+     * reason is reported via `error` and the object remains in a state as if this
+     * function hadn't been called.
+     *
+     * `path`, which must be a path to an existing file, is used to retrieve a file
+     * handle (which is closed when the object destructs or `unmap` is called), which is
+     * then used to memory map the requested region. Upon failure, `error` is set to
+     * indicate the reason and the object remains in an unmapped state.
+     *
+     * `offset` is the number of bytes, relative to the start of the file, where the
+     * mapping should begin. When specifying it, there is no need to worry about
+     * providing a value that is aligned with the operating system's page allocation
+     * granularity. This is adjusted by the implementation such that the first requested
+     * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at
+     * `offset` from the start of the file.
+     *
+     * `length` is the number of bytes to map. It may be `map_entire_file`, in which
+     * case a mapping of the entire file is created.
+     */
+    template<typename String>
+    void map(const String& path, const size_type offset,
+        const size_type length, std::error_code& error)
+    {
+        map_impl(path, offset, length, error);
+    }
+
+    /**
+     * Establishes a memory mapping with AccessMode. If the mapping is unsuccessful, the
+     * reason is reported via `error` and the object remains in a state as if this
+     * function hadn't been called.
+     *
+     * `path`, which must be a path to an existing file, is used to retrieve a file
+     * handle (which is closed when the object destructs or `unmap` is called), which is
+     * then used to memory map the requested region. Upon failure, `error` is set to
+     * indicate the reason and the object remains in an unmapped state.
+     *
+     * The entire file is mapped.
+     */
+    template<typename String>
+    void map(const String& path, std::error_code& error)
+    {
+        map_impl(path, 0, map_entire_file, error);
+    }
+
+    /**
+     * Establishes a memory mapping with AccessMode. If the mapping is unsuccessful, the
+     * reason is reported via `error` and the object remains in a state as if this
+     * function hadn't been called.
+     *
+     * `handle` must be a valid file handle, which is used to memory map the
+     * requested region. Upon failure, `error` is set to indicate the reason and the
+     * object remains in an unmapped state.
+     *
+     * `offset` is the number of bytes, relative to the start of the file, where the
+     * mapping should begin. When specifying it, there is no need to worry about
+     * providing a value that is aligned with the operating system's page allocation
+     * granularity. This is adjusted by the implementation such that the first requested
+     * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at
+     * `offset` from the start of the file.
+     *
+     * `length` is the number of bytes to map. It may be `map_entire_file`, in which
+     * case a mapping of the entire file is created.
+     */
+    void map(const handle_type handle, const size_type offset,
+        const size_type length, std::error_code& error)
+    {
+        map_impl(handle, offset, length, error);
+    }
+
+    /**
+     * Establishes a memory mapping with AccessMode. If the mapping is unsuccessful, the
+     * reason is reported via `error` and the object remains in a state as if this
+     * function hadn't been called.
+     *
+     * `handle` must be a valid file handle, which is used to memory map the
+     * requested region. Upon failure, `error` is set to indicate the reason and the
+     * object remains in an unmapped state.
+     *
+     * The entire file is mapped.
+     */
+    void map(const handle_type handle, std::error_code& error)
+    {
+        map_impl(handle, 0, map_entire_file, error);
+    }
+
+    /**
+     * If a valid memory mapping has been created prior to this call, this call
+     * instructs the kernel to unmap the memory region and disassociate this object
+     * from the file.
+     *
+     * The file handle associated with the file that is mapped is only closed if the
+     * mapping was created using a file path. If, on the other hand, an existing
+     * file handle was used to create the mapping, the file handle is not closed.
+     */
+    void unmap() { if(pimpl_) pimpl_->unmap(); }
+
+    void swap(basic_shared_mmap& other) { pimpl_.swap(other.pimpl_); }
+
+    /** Flushes the memory mapped page to disk. Errors are reported via `error`. */
+    template<
+        access_mode A = AccessMode,
+        typename = typename std::enable_if<A == access_mode::write>::type
+    > void sync(std::error_code& error) { if(pimpl_) pimpl_->sync(error); }
+
+    /** All operators compare the underlying `basic_mmap`'s addresses. */
+
+    friend bool operator==(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return a.pimpl_ == b.pimpl_;
+    }
+
+    friend bool operator!=(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return !(a == b);
+    }
+
+    friend bool operator<(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return a.pimpl_ < b.pimpl_;
+    }
+
+    friend bool operator<=(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return a.pimpl_ <= b.pimpl_;
+    }
+
+    friend bool operator>(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return a.pimpl_ > b.pimpl_;
+    }
+
+    friend bool operator>=(const basic_shared_mmap& a, const basic_shared_mmap& b)
+    {
+        return a.pimpl_ >= b.pimpl_;
+    }
+
+private:
+    template<typename MappingToken>
+    void map_impl(const MappingToken& token, const size_type offset,
+        const size_type length, std::error_code& error)
+    {
+        if(!pimpl_)
+        {
+            mmap_type mmap = make_mmap<mmap_type>(token, offset, length, error);
+            if(error) { return; }
+            pimpl_ = std::make_shared<mmap_type>(std::move(mmap));
+        }
+        else
+        {
+            pimpl_->map(token, offset, length, error);
+        }
+    }
+};
+
+/**
+ * This is the basis for all read-only mmap objects and should be preferred over
+ * directly using basic_shared_mmap.
+ */
+template<typename ByteT>
+using basic_shared_mmap_source = basic_shared_mmap<access_mode::read, ByteT>;
+
+/**
+ * This is the basis for all read-write mmap objects and should be preferred over
+ * directly using basic_shared_mmap.
+ */
+template<typename ByteT>
+using basic_shared_mmap_sink = basic_shared_mmap<access_mode::write, ByteT>;
+
+/**
+ * These aliases cover the most common use cases, both representing a raw byte stream
+ * (either with a char or an unsigned char/uint8_t).
+ */
+using shared_mmap_source = basic_shared_mmap_source<char>;
+using shared_ummap_source = basic_shared_mmap_source<unsigned char>;
+
+using shared_mmap_sink = basic_shared_mmap_sink<char>;
+using shared_ummap_sink = basic_shared_mmap_sink<unsigned char>;
+
+} // namespace mio
+
+#endif // MIO_SHARED_MMAP_HEADER
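Since `basic_shared_mmap` only wraps `basic_mmap` in a `std::shared_ptr`, copies are cheap and all refer to the same mapping. A minimal sketch, not part of the vendored sources (the file name is illustrative and assumed to exist):

    #include <mio/shared_mmap.hpp>
    #include <system_error>

    void sketch()
    {
        std::error_code error;
        mio::shared_mmap_source a;
        a.map("data.bin", error); // "data.bin" is illustrative
        if (error) { return; }

        mio::shared_mmap_source b = a; // copy: shares the one underlying mapping
        // The region stays mapped until the last copy is destroyed.
    }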
diff --git a/thirdparty/mio/test/CMakeLists.txt b/thirdparty/mio/test/CMakeLists.txt
new file mode 100644
index 0000000000..6bfc47e5a1
--- /dev/null
+++ b/thirdparty/mio/test/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_executable(mio.test test.cpp)
+target_link_libraries(mio.test PRIVATE mio::mio)
+add_test(NAME mio.test COMMAND mio.test)
+
+if(WIN32)
+    add_executable(mio.unicode.test test.cpp)
+    target_link_libraries(mio.unicode.test PRIVATE mio::mio)
+    target_compile_definitions(mio.unicode.test PRIVATE UNICODE)
+    add_test(NAME mio.unicode.test COMMAND mio.test)
+
+    add_executable(mio.fullwinapi.test test.cpp)
+    target_link_libraries(mio.fullwinapi.test
+        PRIVATE mio::mio_full_winapi)
+    add_test(NAME mio.fullwinapi.test COMMAND mio.test)
+endif()
diff --git a/thirdparty/mio/test/example.cpp b/thirdparty/mio/test/example.cpp
new file mode 100644
index 0000000000..841a57fd81
--- /dev/null
+++ b/thirdparty/mio/test/example.cpp
@@ -0,0 +1,75 @@
+#include <mio/mmap.hpp>
+#include <system_error> // for std::error_code
+#include <cstdio> // for std::printf
+#include <cassert>
+#include <algorithm>
+#include <fstream>
+
+int handle_error(const std::error_code& error);
+void allocate_file(const std::string& path, const int size);
+
+int main()
+{
+    const auto path = "file.txt";
+
+    // NOTE: mio does *not* create the file for you if it doesn't exist! You
+    // must ensure that the file exists before establishing a mapping. It
+    // must also be non-empty. So for illustrative purposes the file is
+    // created now.
+    allocate_file(path, 155);
+
+    // Read-write memory map the whole file by using `map_entire_file` where the
+    // length of the mapping is otherwise expected, with the factory method.
+    std::error_code error;
+    mio::mmap_sink rw_mmap = mio::make_mmap_sink(
+            path, 0, mio::map_entire_file, error);
+    if (error) { return handle_error(error); }
+
+    // You can use any iterator based function.
+    std::fill(rw_mmap.begin(), rw_mmap.end(), 'a');
+
+    // Or manually iterate through the mapped region just as if it were any other
+    // container, and change each byte's value (since this is a read-write mapping).
+    for (auto& b : rw_mmap) {
+        b += 10;
+    }
+
+    // Or just change one value with the subscript operator.
+    const int answer_index = rw_mmap.size() / 2;
+    rw_mmap[answer_index] = 42;
+
+    // Don't forget to flush changes to disk before unmapping. However, if
+    // `rw_mmap` were to go out of scope at this point, the destructor would also
+    // automatically invoke `sync` before `unmap`.
+    rw_mmap.sync(error);
+    if (error) { return handle_error(error); }
+
+    // We can then remove the mapping, after which rw_mmap will be in a default
+    // constructed state, i.e. this and the above call to `sync` have the same
+    // effect as if the destructor had been invoked.
+    rw_mmap.unmap();
+
+    // Now create the same mapping, but in read-only mode. Note that calling the
+    // overload without the offset and file length parameters maps the entire
+    // file.
+    mio::mmap_source ro_mmap;
+    ro_mmap.map(path, error);
+    if (error) { return handle_error(error); }
+
+    const int the_answer_to_everything = ro_mmap[answer_index];
+    assert(the_answer_to_everything == 42);
+}
+
+int handle_error(const std::error_code& error)
+{
+    const auto& errmsg = error.message();
+    std::printf("error mapping file: %s, exiting...\n", errmsg.c_str());
+    return error.value();
+}
+
+void allocate_file(const std::string& path, const int size)
+{
+    std::ofstream file(path);
+    std::string s(size, '0');
+    file << s;
+}
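The example above always maps by path; mio can also adopt an existing native handle, as the test file below exercises with a POSIX file descriptor. A minimal sketch under that assumption (the path is illustrative):

    #include <mio/mmap.hpp>
    #include <system_error>
    #include <fcntl.h>
    #include <unistd.h>

    void sketch()
    {
        const int fd = ::open("data.bin", O_RDONLY); // "data.bin" is illustrative
        if (fd == -1) { return; }

        std::error_code error;
        mio::mmap_source m;
        m.map(fd, error); // maps the whole file; the handle stays caller-owned
        // ... read via m.data(), m.size() ...
        m.unmap();
        ::close(fd); // mio does not close externally supplied handles
    }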
diff --git a/thirdparty/mio/test/test.cpp b/thirdparty/mio/test/test.cpp
new file mode 100644
index 0000000000..82827f924c
--- /dev/null
+++ b/thirdparty/mio/test/test.cpp
@@ -0,0 +1,182 @@
+#include <mio/mmap.hpp>
+#include <mio/shared_mmap.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#ifndef _WIN32
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#endif
+
+// Just make sure this compiles.
+#ifdef CXX17
+# include <cstddef>
+using mmap_source = mio::basic_mmap_source<std::byte>;
+#endif
+
+template<class MMap>
+void test_at_offset(const MMap& file_view, const std::string& buffer,
+    const size_t offset);
+void test_at_offset(const std::string& buffer, const char* path,
+    const size_t offset, std::error_code& error);
+int handle_error(const std::error_code& error);
+
+int main()
+{
+    std::error_code error;
+
+    // Make sure mio compiles with non-const char* strings too.
+    const char _path[] = "test-file";
+    const int path_len = sizeof(_path);
+    char* path = new char[path_len];
+    std::copy(_path, _path + path_len, path);
+
+    const auto page_size = mio::page_size();
+    // Fill buffer, then write it to file.
+    const int file_size = 4 * page_size - 250; // 16134, if page size is 4KiB
+    std::string buffer(file_size, 0);
+    // Start at first printable ASCII character.
+    char v = 33;
+    for (auto& b : buffer) {
+        b = v;
+        ++v;
+        // Limit to last printable ASCII character.
+        v %= 126;
+        if(v == 0) {
+            v = 33;
+        }
+    }
+
+    std::ofstream file(path);
+    file << buffer;
+    file.close();
+
+    // Test whole file mapping.
+    test_at_offset(buffer, path, 0, error);
+    if (error) { return handle_error(error); }
+
+    // Test starting from below the page size.
+    test_at_offset(buffer, path, page_size - 3, error);
+    if (error) { return handle_error(error); }
+
+    // Test starting from above the page size.
+    test_at_offset(buffer, path, page_size + 3, error);
+    if (error) { return handle_error(error); }
+
+    // Test starting from above two pages.
+    test_at_offset(buffer, path, 2 * page_size + 3, error);
+    if (error) { return handle_error(error); }
+
+    {
+#define CHECK_INVALID_MMAP(m) do { \
+        assert(error); \
+        assert(m.empty()); \
+        assert(!m.is_open()); \
+        error.clear(); } while(0)
+
+        mio::mmap_source m;
+
+        // See if mapping an invalid file results in an error.
+        m = mio::make_mmap_source("garbage-that-hopefully-doesnt-exist", 0, 0, error);
+        CHECK_INVALID_MMAP(m);
+
+        // Empty path?
+        m = mio::make_mmap_source(static_cast<const char*>(0), 0, 0, error);
+        CHECK_INVALID_MMAP(m);
+        m = mio::make_mmap_source(std::string(), 0, 0, error);
+        CHECK_INVALID_MMAP(m);
+
+        // Invalid handle?
+        m = mio::make_mmap_source(mio::invalid_handle, 0, 0, error);
+        CHECK_INVALID_MMAP(m);
+
+        // Invalid offset?
+        m = mio::make_mmap_source(path, 100 * buffer.size(), buffer.size(), error);
+        CHECK_INVALID_MMAP(m);
+    }
+
+    // Make sure these compile.
+    {
+        mio::ummap_source _1;
+        mio::shared_ummap_source _2;
+        // Make sure shared_mmap mapping compiles as all testing was done on
+        // normal mmaps.
+        mio::shared_mmap_source _3(path, 0, mio::map_entire_file);
+        auto _4 = mio::make_mmap_source(path, error);
+        auto _5 = mio::make_mmap<mio::mmap_source>(path, 0, mio::map_entire_file, error);
+#ifdef _WIN32
+        const wchar_t* wpath1 = L"dasfsf";
+        auto _6 = mio::make_mmap_source(wpath1, error);
+        mio::mmap_source _7;
+        _7.map(wpath1, error);
+        const std::wstring wpath2 = wpath1;
+        auto _8 = mio::make_mmap_source(wpath2, error);
+        mio::mmap_source _9;
+        _9.map(wpath1, error);
+#else
+        const int fd = open(path, O_RDONLY);
+        mio::mmap_source _fdmmap(fd, 0, mio::map_entire_file);
+        _fdmmap.unmap();
+        _fdmmap.map(fd, error);
+#endif
+    }
+
+    std::printf("all tests passed!\n");
+}
+
+void test_at_offset(const std::string& buffer, const char* path,
+    const size_t offset, std::error_code& error)
+{
+    // Sanity check.
+    assert(offset < buffer.size());
+
+    // Map the region of the file to which buffer was written.
+    mio::mmap_source file_view = mio::make_mmap_source(
+            path, offset, mio::map_entire_file, error);
+    if(error) { return; }
+
+    assert(file_view.is_open());
+    const size_t mapped_size = buffer.size() - offset;
+    assert(file_view.size() == mapped_size);
+
+    test_at_offset(file_view, buffer, offset);
+
+    // Turn file_view into a shared mmap.
+    mio::shared_mmap_source shared_file_view(std::move(file_view));
+    assert(!file_view.is_open());
+    assert(shared_file_view.is_open());
+    assert(shared_file_view.size() == mapped_size);
+
+    //test_at_offset(shared_file_view, buffer, offset);
+}
+
+template<class MMap>
+void test_at_offset(const MMap& file_view, const std::string& buffer,
+    const size_t offset)
+{
+    // Then verify that mmap's bytes correspond to that of buffer.
+    for(size_t buf_idx = offset, view_idx = 0;
+        buf_idx < buffer.size() && view_idx < file_view.size();
+        ++buf_idx, ++view_idx) {
+        if(file_view[view_idx] != buffer[buf_idx]) {
+            std::printf("%luth byte mismatch: expected(%d) <> actual(%d)",
+                buf_idx, buffer[buf_idx], file_view[view_idx]);
+            std::cout << std::flush;
+            assert(0);
+        }
+    }
+}
+
+int handle_error(const std::error_code& error)
+{
+    const auto& errmsg = error.message();
+    std::printf("Error mapping file: %s, exiting...\n", errmsg.c_str());
+    return error.value();
+}
diff --git a/thirdparty/mio/third_party/LICENSE.md b/thirdparty/mio/third_party/LICENSE.md
new file mode 100644
index 0000000000..23edca5d15
--- /dev/null
+++ b/thirdparty/mio/third_party/LICENSE.md
@@ -0,0 +1,28 @@
+amalgamate.py - Amalgamate C source and header files
+Copyright (c) 2012, Erik Edlund <erik.edlund@32767.se>
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+  * Redistributions of source code must retain the above copyright notice,
+    this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright notice,
+    this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+  * Neither the name of Erik Edlund, nor the names of its contributors may
+    be used to endorse or promote products derived from this software without
+    specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
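The script added below is what produced the amalgamated single-header `mio.hpp` above. Judging from its argument parser and the `config.json` added further down, a typical invocation, run from `thirdparty/mio/third_party/`, would presumably be `python amalgamate.py -c config.json -s ..`; the `-s` path is an assumption inferred from the relative paths in the config, not something stated in this diff.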
diff --git a/thirdparty/mio/third_party/amalgamate.py b/thirdparty/mio/third_party/amalgamate.py
new file mode 100644
index 0000000000..174e1c5379
--- /dev/null
+++ b/thirdparty/mio/third_party/amalgamate.py
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+# amalgamate.py - Amalgamate C source and header files.
+# Copyright (c) 2012, Erik Edlund <erik.edlund@32767.se>
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of Erik Edlund, nor the names of its contributors may
+# be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import datetime
+import json
+import os
+import re
+
+
+class Amalgamation(object):
+
+    # Prepends self.source_path to file_path if needed.
+    def actual_path(self, file_path):
+        if not os.path.isabs(file_path):
+            file_path = os.path.join(self.source_path, file_path)
+        return file_path
+
+    # Search included file_path in self.include_paths and
+    # in source_dir if specified.
+    def find_included_file(self, file_path, source_dir):
+        search_dirs = self.include_paths[:]
+        if source_dir:
+            search_dirs.insert(0, source_dir)
+
+        for search_dir in search_dirs:
+            search_path = os.path.join(search_dir, file_path)
+            if os.path.isfile(self.actual_path(search_path)):
+                return search_path
+        return None
+
+    def __init__(self, args):
+        with open(args.config, 'r') as f:
+            config = json.loads(f.read())
+            for key in config:
+                setattr(self, key, config[key])
+
+        self.verbose = args.verbose == "yes"
+        self.prologue = args.prologue
+        self.source_path = args.source_path
+        self.included_files = []
+
+    # Generate the amalgamation and write it to the target file.
+    def generate(self):
+        amalgamation = ""
+
+        if self.prologue:
+            with open(self.prologue, 'r') as f:
+                amalgamation += datetime.datetime.now().strftime(f.read())
+
+        if self.verbose:
+            print("Config:")
+            print(" target        = {0}".format(self.target))
+            print(" working_dir   = {0}".format(os.getcwd()))
+            print(" include_paths = {0}".format(self.include_paths))
+        print("Creating amalgamation:")
+        for file_path in self.sources:
+            # Do not check the include paths while processing the source
+            # list, all given source paths must be correct.
+            # actual_path = self.actual_path(file_path)
+            print(" - processing \"{0}\"".format(file_path))
+            t = TranslationUnit(file_path, self, True)
+            amalgamation += t.content
+
+        with open(self.target, 'w') as f:
+            f.write(amalgamation)
+
+        print("...done!\n")
+        if self.verbose:
+            print("Files processed: {0}".format(self.sources))
+            print("Files included: {0}".format(self.included_files))
+            print("")
+
+
+def _is_within(match, matches):
+    for m in matches:
+        if match.start() > m.start() and \
+                match.end() < m.end():
+            return True
+    return False
+
+
+class TranslationUnit(object):
+    # // C++ comment.
+    cpp_comment_pattern = re.compile(r"//.*?\n")
+
+    # /* C comment. */
+    c_comment_pattern = re.compile(r"/\*.*?\*/", re.S)
+
+    # "complex \"stri\\\ng\" value".
+    string_pattern = re.compile("[^']" r'".*?(?<=[^\\])"', re.S)
+
+    # Handle simple include directives. Support for advanced
+    # directives where macros and defines need to be expanded is
+    # not a concern right now.
+    include_pattern = re.compile(
+        r'#\s*include\s+(<|")(?P<path>.*?)("|>)', re.S)
+
+    # #pragma once
+    pragma_once_pattern = re.compile(r'#\s*pragma\s+once', re.S)
+
+    # Search for pattern in self.content, add the match to
+    # contexts if found and update the index accordingly.
+    def _search_content(self, index, pattern, contexts):
+        match = pattern.search(self.content, index)
+        if match:
+            contexts.append(match)
+            return match.end()
+        return index + 2
+
+    # Return all the skippable contexts, i.e., comments and strings
+    def _find_skippable_contexts(self):
+        # Find contexts in the content in which a found include
+        # directive should not be processed.
+        skippable_contexts = []
+
+        # Walk through the content char by char, and try to grab
+        # skippable contexts using regular expressions when found.
+        i = 1
+        content_len = len(self.content)
+        while i < content_len:
+            j = i - 1
+            current = self.content[i]
+            previous = self.content[j]
+
+            if current == '"':
+                # String value.
+                i = self._search_content(j, self.string_pattern,
+                    skippable_contexts)
+            elif current == '*' and previous == '/':
+                # C style comment.
+                i = self._search_content(j, self.c_comment_pattern,
+                    skippable_contexts)
+            elif current == '/' and previous == '/':
+                # C++ style comment.
+                i = self._search_content(j, self.cpp_comment_pattern,
+                    skippable_contexts)
+            else:
+                # Skip to the next char.
+                i += 1
+
+        return skippable_contexts
+
+    # Returns True if the match is within list of other matches
+
+    # Removes pragma once from content
+    def _process_pragma_once(self):
+        content_len = len(self.content)
+        if content_len < len("#include "):
+            return 0
+
+        # Find contexts in the content in which a found include
+        # directive should not be processed.
+        skippable_contexts = self._find_skippable_contexts()
+
+        pragmas = []
+        pragma_once_match = self.pragma_once_pattern.search(self.content)
+        while pragma_once_match:
+            if not _is_within(pragma_once_match, skippable_contexts):
+                pragmas.append(pragma_once_match)
+
+            pragma_once_match = self.pragma_once_pattern.search(self.content,
+                pragma_once_match.end())
+
+        # Handle all collected pragma once directives.
+        prev_end = 0
+        tmp_content = ''
+        for pragma_match in pragmas:
+            tmp_content += self.content[prev_end:pragma_match.start()]
+            prev_end = pragma_match.end()
+        tmp_content += self.content[prev_end:]
+        self.content = tmp_content
+
+    # Include all trivial #include directives into self.content.
+    def _process_includes(self):
+        content_len = len(self.content)
+        if content_len < len("#include "):
+            return 0
+
+        # Find contexts in the content in which a found include
+        # directive should not be processed.
+        skippable_contexts = self._find_skippable_contexts()
+
+        # Search for include directives in the content, collect those
+        # which should be included into the content.
+        includes = []
+        include_match = self.include_pattern.search(self.content)
+        while include_match:
+            if not _is_within(include_match, skippable_contexts):
+                include_path = include_match.group("path")
+                search_same_dir = include_match.group(1) == '"'
+                found_included_path = self.amalgamation.find_included_file(
+                    include_path, self.file_dir if search_same_dir else None)
+                if found_included_path:
+                    includes.append((include_match, found_included_path))
+
+            include_match = self.include_pattern.search(self.content,
+                include_match.end())
+
+        # Handle all collected include directives.
+ prev_end = 0 + tmp_content = '' + for include in includes: + include_match, found_included_path = include + tmp_content += self.content[prev_end:include_match.start()] + tmp_content += "// {0}\n".format(include_match.group(0)) + if found_included_path not in self.amalgamation.included_files: + t = TranslationUnit(found_included_path, self.amalgamation, False) + tmp_content += t.content + prev_end = include_match.end() + tmp_content += self.content[prev_end:] + self.content = tmp_content + + return len(includes) + + # Make all content processing + def _process(self): + if not self.is_root: + self._process_pragma_once() + self._process_includes() + + def __init__(self, file_path, amalgamation, is_root): + self.file_path = file_path + self.file_dir = os.path.dirname(file_path) + self.amalgamation = amalgamation + self.is_root = is_root + + self.amalgamation.included_files.append(self.file_path) + + actual_path = self.amalgamation.actual_path(file_path) + if not os.path.isfile(actual_path): + raise IOError("File not found: \"{0}\"".format(file_path)) + with open(actual_path, 'r') as f: + self.content = f.read() + self._process() + + +def main(): + description = "Amalgamate C source and header files." + usage = " ".join([ + "amalgamate.py", + "[-v]", + "-c path/to/config.json", + "-s path/to/source/dir", + "[-p path/to/prologue.(c|h)]" + ]) + argsparser = argparse.ArgumentParser( + description=description, usage=usage) + + argsparser.add_argument("-v", "--verbose", dest="verbose", + choices=["yes", "no"], metavar="", help="be verbose") + + argsparser.add_argument("-c", "--config", dest="config", + required=True, metavar="", help="path to a JSON config file") + + argsparser.add_argument("-s", "--source", dest="source_path", + required=True, metavar="", help="source code path") + + argsparser.add_argument("-p", "--prologue", dest="prologue", + required=False, metavar="", help="path to a C prologue file") + + amalgamation = Amalgamation(argsparser.parse_args()) + amalgamation.generate() + + +if __name__ == "__main__": + main() + diff --git a/thirdparty/mio/third_party/config.json b/thirdparty/mio/third_party/config.json new file mode 100644 index 0000000000..fa2e71ad93 --- /dev/null +++ b/thirdparty/mio/third_party/config.json @@ -0,0 +1,11 @@ +{ + "project": "Cross-platform C++11 header-only library for memory mapped file IO", + "target": "../single_include/mio/mio.hpp", + "sources": [ + "../include/mio/mmap.hpp", + "../include/mio/page.hpp", + "../include/mio/shared_mmap.hpp" + ], + "include_paths": ["../include"] +} + diff --git a/thirdparty/rocksdb/.gitignore b/thirdparty/rocksdb/.gitignore index 03b805983a..e88ccfc008 100644 --- a/thirdparty/rocksdb/.gitignore +++ b/thirdparty/rocksdb/.gitignore @@ -45,6 +45,8 @@ etags rocksdb_dump rocksdb_undump db_test2 +trace_analyzer +trace_analyzer_test java/out java/target diff --git a/thirdparty/rocksdb/.lgtm.yml b/thirdparty/rocksdb/.lgtm.yml new file mode 100644 index 0000000000..12d6f1d4e5 --- /dev/null +++ b/thirdparty/rocksdb/.lgtm.yml @@ -0,0 +1,4 @@ +extraction: + cpp: + index: + build_command: make static_lib diff --git a/thirdparty/rocksdb/.travis.yml b/thirdparty/rocksdb/.travis.yml index b76973d4e8..e759a642a0 100644 --- a/thirdparty/rocksdb/.travis.yml +++ b/thirdparty/rocksdb/.travis.yml @@ -15,12 +15,23 @@ cache: - apt addons: - apt: - packages: ['zlib1g-dev', 'libbz2-dev', 'libsnappy-dev', 'curl', 'libgflags-dev', 'mingw-w64'] + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - curl + - g++-8 + - libbz2-dev + - libgflags-dev 
+ - libsnappy-dev + - mingw-w64 + - zlib1g-dev env: - TEST_GROUP=platform_dependent # 16-18 minutes - TEST_GROUP=1 # 33-35 minutes - - TEST_GROUP=2 # 30-32 minutes + - TEST_GROUP=2 # 18-20 minutes + - TEST_GROUP=3 # 20-22 minutes + - TEST_GROUP=4 # 12-14 minutes # Run java tests - JOB_NAME=java_test # 4-11 minutes # Build ROCKSDB_LITE @@ -28,6 +39,7 @@ env: # Build examples - JOB_NAME=examples # 5-7 minutes - JOB_NAME=cmake # 3-5 minutes + - JOB_NAME=cmake-gcc8 # 3-5 minutes - JOB_NAME=cmake-mingw # 3 minutes matrix: @@ -36,6 +48,12 @@ matrix: env: TEST_GROUP=1 - os: osx env: TEST_GROUP=2 + - os: osx + env: TEST_GROUP=3 + - os: osx + env: TEST_GROUP=4 + - os: osx + env: JOB_NAME=cmake-gcc8 - os : osx env: JOB_NAME=cmake-mingw - os : linux @@ -46,9 +64,15 @@ matrix: # https://docs.travis-ci.com/user/caching/#ccache-cache install: - if [ "${TRAVIS_OS_NAME}" == osx ]; then - brew install ccache; + brew install ccache zstd lz4 snappy xz; PATH=$PATH:/usr/local/opt/ccache/libexec; fi + - if [ "${JOB_NAME}" == cmake-gcc8 ]; then + CC=gcc-8 && CXX=g++-8; + fi + - if [[ "${JOB_NAME}" == cmake* ]] && [ "${TRAVIS_OS_NAME}" == linux ]; then + mkdir cmake-dist && curl -sfSL https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.tar.gz | tar --strip-components=1 -C cmake-dist -xz && export PATH=$PWD/cmake-dist/bin:$PATH; + fi before_script: # Increase the maximum number of open file descriptors, since some tests use @@ -57,14 +81,41 @@ before_script: script: - ${CXX} --version - - if [ "${TEST_GROUP}" == 'platform_dependent' ]; then ccache -C && OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some; fi - - if [ "${TEST_GROUP}" == '1' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=comparator_db_test make -j4 check_some; fi - - if [ "${TEST_GROUP}" == '2' ]; then OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=comparator_db_test make -j4 check_some; fi - - if [ "${JOB_NAME}" == 'java_test' ]; then OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest; fi - - if [ "${JOB_NAME}" == 'lite_build' ]; then OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib tools; fi - - if [ "${JOB_NAME}" == 'examples' ]; then OPT=-DTRAVIS V=1 make -j4 static_lib; cd examples; make -j4; fi - - if [ "${JOB_NAME}" == 'cmake' ]; then mkdir build && cd build && cmake .. && make -j4 rocksdb; fi - - if [ "${JOB_NAME}" == 'cmake-mingw' ]; then mkdir build && cd build && cmake .. 
-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb; fi + - if [ `command -v ccache` ]; then ccache -C; fi + - case $TEST_GROUP in + platform_dependent) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_END=db_block_cache_test make -j4 all_but_some_tests check_some + ;; + 1) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=db_block_cache_test ROCKSDBTESTS_END=full_filter_block_test make -j4 check_some + ;; + 2) + OPT=-DTRAVIS V=1 make -j4 tools && OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=full_filter_block_test ROCKSDBTESTS_END=write_batch_with_index_test make -j4 check_some + ;; + 3) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_batch_with_index_test ROCKSDBTESTS_END=write_prepared_transaction_test make -j4 check_some + ;; + 4) + OPT=-DTRAVIS V=1 ROCKSDBTESTS_START=write_prepared_transaction_test make -j4 check_some + ;; + esac + - case $JOB_NAME in + java_test) + OPT=-DTRAVIS V=1 make clean jclean && make rocksdbjava jtest + ;; + lite_build) + OPT='-DTRAVIS -DROCKSDB_LITE' V=1 make -j4 static_lib tools + ;; + examples) + OPT=-DTRAVIS V=1 make -j4 static_lib && cd examples && make -j4 + ;; + cmake-mingw) + mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni + ;; + cmake*) + mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release && make -j4 rocksdb rocksdbjni + ;; + esac notifications: email: - leveldb@fb.com diff --git a/thirdparty/rocksdb/AUTHORS b/thirdparty/rocksdb/AUTHORS index e644f5530f..a451875f1a 100644 --- a/thirdparty/rocksdb/AUTHORS +++ b/thirdparty/rocksdb/AUTHORS @@ -9,3 +9,4 @@ Sanjay Ghemawat # Partial list of contributors: Kevin Regan Johan Bilien +Matthew Von-Maszewski (Basho Technologies) diff --git a/thirdparty/rocksdb/CMakeLists.txt b/thirdparty/rocksdb/CMakeLists.txt index f20d09abf2..132d3b04e9 100644 --- a/thirdparty/rocksdb/CMakeLists.txt +++ b/thirdparty/rocksdb/CMakeLists.txt @@ -14,9 +14,9 @@ # cd build # 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries. # See thirdparty.inc for more information. -# sample command: cmake -G "Visual Studio 14 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 -DJNI=1 .. +# sample command: cmake -G "Visual Studio 15 Win64" -DWITH_GFLAGS=1 -DWITH_SNAPPY=1 -DWITH_JEMALLOC=1 -DWITH_JNI=1 .. # 4. Then build the project in debug mode (you may want to add /m[:] flag to run msbuild in parallel threads -# or simply /m ot use all avail cores) +# or simply /m to use all avail cores) # msbuild rocksdb.sln # # rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything @@ -32,21 +32,36 @@ # 3. cmake .. # 4. 
make -j -cmake_minimum_required(VERSION 2.6) +cmake_minimum_required(VERSION 2.8.12) project(rocksdb) +enable_language(CXX) +enable_language(C) +enable_language(ASM) if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) endif() -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/modules/") +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") option(WITH_JEMALLOC "build with JeMalloc" OFF) +option(WITH_SNAPPY "build with SNAPPY" OFF) +option(WITH_LZ4 "build with lz4" OFF) +option(WITH_ZLIB "build with zlib" OFF) +option(WITH_ZSTD "build with zstd" OFF) +option(WITH_WINDOWS_UTF8_FILENAMES "use UTF8 as characterset for opening files, regardles of the system code page" OFF) +if (WITH_WINDOWS_UTF8_FILENAMES) + add_definitions(-DROCKSDB_WINDOWS_UTF8_FILENAMES) +endif() if(MSVC) + # Defaults currently different for GFLAGS. + # We will address find_package work a little later + option(WITH_GFLAGS "build with GFlags" OFF) + option(WITH_XPRESS "build with windows built in compression" OFF) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) else() if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - # FreeBSD has jemaloc as default malloc + # FreeBSD has jemalloc as default malloc # but it does not have all the jemalloc files in include/... set(WITH_JEMALLOC ON) else() @@ -54,10 +69,21 @@ else() find_package(JeMalloc REQUIRED) add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) include_directories(${JEMALLOC_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${JEMALLOC_LIBRARIES}) + endif() + endif() + + # No config file for this + option(WITH_GFLAGS "build with GFlags" ON) + if(WITH_GFLAGS) + find_package(gflags) + if(gflags_FOUND) + add_definitions(-DGFLAGS=1) + include_directories(${gflags_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${gflags_LIBRARIES}) endif() endif() - option(WITH_SNAPPY "build with SNAPPY" OFF) if(WITH_SNAPPY) find_package(snappy REQUIRED) add_definitions(-DSNAPPY) @@ -65,11 +91,16 @@ else() list(APPEND THIRDPARTY_LIBS ${SNAPPY_LIBRARIES}) endif() - option(WITH_ZLIB "build with zlib" OFF) if(WITH_ZLIB) find_package(zlib REQUIRED) add_definitions(-DZLIB) - include_directories(${ZLIB_INCLUDE_DIR}) + if(ZLIB_INCLUDE_DIRS) + # CMake 3 + include_directories(${ZLIB_INCLUDE_DIRS}) + else() + # CMake 2 + include_directories(${ZLIB_INCLUDE_DIR}) + endif() list(APPEND THIRDPARTY_LIBS ${ZLIB_LIBRARIES}) endif() @@ -81,7 +112,6 @@ else() list(APPEND THIRDPARTY_LIBS ${BZIP2_LIBRARIES}) endif() - option(WITH_LZ4 "build with lz4" OFF) if(WITH_LZ4) find_package(lz4 REQUIRED) add_definitions(-DLZ4) @@ -89,7 +119,6 @@ else() list(APPEND THIRDPARTY_LIBS ${LZ4_LIBRARIES}) endif() - option(WITH_ZSTD "build with zstd" OFF) if(WITH_ZSTD) find_package(zstd REQUIRED) add_definitions(-DZSTD) @@ -98,17 +127,7 @@ else() endif() endif() -if(WIN32) - execute_process(COMMAND powershell -noprofile -Command "Get-Date -format MM_dd_yyyy" OUTPUT_VARIABLE DATE) - execute_process(COMMAND powershell -noprofile -Command "Get-Date -format HH:mm:ss" OUTPUT_VARIABLE TIME) - string(REGEX REPLACE "(..)_(..)_..(..).*" "\\1/\\2/\\3" DATE "${DATE}") - string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME "${TIME}") - set(GIT_DATE_TIME "${DATE} ${TIME}") -else() - execute_process(COMMAND date "+%Y/%m/%d %H:%M:%S" OUTPUT_VARIABLE DATETIME) - string(REGEX REPLACE "\n" "" DATETIME ${DATETIME}) - set(GIT_DATE_TIME "${DATETIME}") -endif() +string(TIMESTAMP GIT_DATE_TIME "%Y/%m/%d %H:%M:%S" UTC) find_package(Git) @@ -124,20 +143,17 @@ endif() string(REGEX REPLACE "[^0-9a-f]+" "" GIT_SHA "${GIT_SHA}") -if(NOT WIN32) - 
execute_process(COMMAND - "./build_tools/version.sh" "full" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE ROCKSDB_VERSION - ) - string(STRIP "${ROCKSDB_VERSION}" ROCKSDB_VERSION) - execute_process(COMMAND - "./build_tools/version.sh" "major" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - OUTPUT_VARIABLE ROCKSDB_VERSION_MAJOR - ) - string(STRIP "${ROCKSDB_VERSION_MAJOR}" ROCKSDB_VERSION_MAJOR) -endif() + +# Read rocksdb version from version.h header file. +file(READ include/rocksdb/version.h version_header_file) +string(REGEX MATCH "#define ROCKSDB_MAJOR ([0-9]+)" _ ${version_header_file}) +set(ROCKSDB_VERSION_MAJOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "#define ROCKSDB_MINOR ([0-9]+)" _ ${version_header_file}) +set(ROCKSDB_VERSION_MINOR ${CMAKE_MATCH_1}) +string(REGEX MATCH "#define ROCKSDB_PATCH ([0-9]+)" _ ${version_header_file}) +set(ROCKSDB_VERSION_PATCH ${CMAKE_MATCH_1}) +set(ROCKSDB_VERSION ${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}) + option(WITH_MD_LIBRARY "build with MD" ON) if(WIN32 AND MSVC) @@ -155,16 +171,16 @@ target_include_directories(build_version PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/util) if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W3 /wd4127 /wd4800 /wd4996 /wd4351") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fno-omit-frame-pointer") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") include(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-momit-leaf-frame-pointer" HAVE_OMIT_LEAF_FRAME_POINTER) if(HAVE_OMIT_LEAF_FRAME_POINTER) @@ -173,33 +189,56 @@ else() endif() endif() +include(CheckCCompilerFlag) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") + CHECK_C_COMPILER_FLAG("-maltivec" HAS_ALTIVEC) + if(HAS_ALTIVEC) + message(STATUS " HAS_ALTIVEC yes") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maltivec") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=power8") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=power8") + endif(HAS_ALTIVEC) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le") + option(PORTABLE "build a portable binary" OFF) option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) if(PORTABLE) # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h # is available, it is available by default. 
if(FORCE_SSE42 AND NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") endif() else() if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + if(NOT HAVE_POWER8) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") + endif() endif() endif() -set(CMAKE_REQUIRED_FLAGS ${CMAKE_CXX_FLAGS}) include(CheckCXXSourceCompiles) +if(NOT MSVC) + set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") +endif() CHECK_CXX_SOURCE_COMPILES(" #include <cstdint> #include <nmmintrin.h> +#include <wmmintrin.h> int main() { volatile uint32_t x = _mm_crc32_u32(0, 0); + const auto a = _mm_set_epi64x(0, 0); + const auto b = _mm_set_epi64x(0, 0); + const auto c = _mm_clmulepi64_si128(a, b, 0x00); + auto d = _mm_cvtsi128_si64(c); } " HAVE_SSE42) +unset(CMAKE_REQUIRED_FLAGS) if(HAVE_SSE42) add_definitions(-DHAVE_SSE42) + add_definitions(-DHAVE_PCLMUL) elseif(FORCE_SSE42) message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") endif() @@ -256,21 +295,69 @@ if(WITH_UBSAN) endif() endif() -# Used to run CI build and tests so we can run faster -set(OPTIMIZE_DEBUG_DEFAULT 0) # Debug build is unoptimized by default use -DOPTDBG=1 to optimize +option(WITH_NUMA "build with NUMA policy support" OFF) +if(WITH_NUMA) + find_package(NUMA REQUIRED) + add_definitions(-DNUMA) + include_directories(${NUMA_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${NUMA_LIBRARIES}) +endif() + +option(WITH_TBB "build with Threading Building Blocks (TBB)" OFF) +if(WITH_TBB) + find_package(TBB REQUIRED) + add_definitions(-DTBB) + include_directories(${TBB_INCLUDE_DIR}) + list(APPEND THIRDPARTY_LIBS ${TBB_LIBRARIES}) +endif() -if(DEFINED OPTDBG) - set(OPTIMIZE_DEBUG ${OPTDBG}) +# Stall notifications eat some performance from inserts +option(DISABLE_STALL_NOTIF "Build with stall notifications" OFF) +if(DISABLE_STALL_NOTIF) + add_definitions(-DROCKSDB_DISABLE_STALL_NOTIFICATION) +endif() + + +if(DEFINED USE_RTTI) + if(USE_RTTI) + message(STATUS "Enabling RTTI") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DROCKSDB_USE_RTTI") + else() + if(MSVC) + message(STATUS "Disabling RTTI in Release builds. 
Always on in Debug.") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GR-") + else() + message(STATUS "Disabling RTTI in Release builds") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-rtti") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-rtti") + endif() + endif() else() - set(OPTIMIZE_DEBUG ${OPTIMIZE_DEBUG_DEFAULT}) + message(STATUS "Enabling RTTI in Debug builds only (default)") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DROCKSDB_USE_RTTI") + if(MSVC) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /GR-") + else() + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-rtti") + endif() endif() +# Used to run CI build and tests so we can run faster +option(OPTDBG "Build optimized debug build with MSVC" OFF) +option(WITH_RUNTIME_DEBUG "build with debug version of runtime library" ON) if(MSVC) - if((${OPTIMIZE_DEBUG} EQUAL 1)) + if(OPTDBG) message(STATUS "Debug optimization is enabled") - set(CMAKE_CXX_FLAGS_DEBUG "/Oxt /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "/Oxt") else() - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm /${RUNTIME_LIBRARY}d") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Od /RTC1 /Gm") + endif() + if(WITH_RUNTIME_DEBUG) + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}d") + else() + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /${RUNTIME_LIBRARY}") endif() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /${RUNTIME_LIBRARY}") @@ -285,7 +372,7 @@ endif() option(ROCKSDB_LITE "Build RocksDBLite version" OFF) if(ROCKSDB_LITE) add_definitions(-DROCKSDB_LITE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os") endif() if(CMAKE_SYSTEM_NAME MATCHES "Cygwin") @@ -323,16 +410,13 @@ if(NOT WIN32) endif() option(WITH_FALLOCATE "build with fallocate" ON) - if(WITH_FALLOCATE) - set(CMAKE_REQUIRED_FLAGS ${CMAKE_C_FLAGS}) - include(CheckCSourceCompiles) - CHECK_C_SOURCE_COMPILES(" + CHECK_CXX_SOURCE_COMPILES(" #include #include int main() { int fd = open(\"/dev/null\", 0); - fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); + fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024); } " HAVE_FALLOCATE) if(HAVE_FALLOCATE) @@ -340,16 +424,44 @@ int main() { endif() endif() -include(CheckFunctionExists) -CHECK_FUNCTION_EXISTS(malloc_usable_size HAVE_MALLOC_USABLE_SIZE) +CHECK_CXX_SOURCE_COMPILES(" +#include +int main() { + int fd = open(\"/dev/null\", 0); + sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE); +} +" HAVE_SYNC_FILE_RANGE_WRITE) +if(HAVE_SYNC_FILE_RANGE_WRITE) + add_definitions(-DROCKSDB_RANGESYNC_PRESENT) +endif() + +CHECK_CXX_SOURCE_COMPILES(" +#include +int main() { + (void) PTHREAD_MUTEX_ADAPTIVE_NP; +} +" HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) +if(HAVE_PTHREAD_MUTEX_ADAPTIVE_NP) + add_definitions(-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX) +endif() + +include(CheckCXXSymbolExists) +check_cxx_symbol_exists(malloc_usable_size malloc.h HAVE_MALLOC_USABLE_SIZE) if(HAVE_MALLOC_USABLE_SIZE) add_definitions(-DROCKSDB_MALLOC_USABLE_SIZE) endif() +check_cxx_symbol_exists(sched_getcpu sched.h HAVE_SCHED_GETCPU) +if(HAVE_SCHED_GETCPU) + add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT) +endif() + include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) +include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) 
find_package(Threads REQUIRED) +add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) # Main library source code @@ -365,6 +477,7 @@ set(SOURCES db/compaction_iterator.cc db/compaction_job.cc db/compaction_picker.cc + db/compaction_picker_fifo.cc db/compaction_picker_universal.cc db/convenience.cc db/db_filesnapshot.cc @@ -376,9 +489,11 @@ set(SOURCES db/db_impl_debug.cc db/db_impl_experimental.cc db/db_impl_readonly.cc + db/db_impl_secondary.cc db/db_info_dumper.cc db/db_iter.cc db/dbformat.cc + db/error_handler.cc db/event_helpers.cc db/experimental.cc db/external_sst_file_ingestion_job.cc @@ -387,15 +502,17 @@ set(SOURCES db/flush_scheduler.cc db/forward_iterator.cc db/internal_stats.cc + db/in_memory_stats_history.cc + db/logs_with_prep_tracker.cc db/log_reader.cc db/log_writer.cc db/malloc_stats.cc - db/managed_iterator.cc db/memtable.cc db/memtable_list.cc db/merge_helper.cc db/merge_operator.cc db/range_del_aggregator.cc + db/range_tombstone_fragmenter.cc db/repair.cc db/snapshot_impl.cc db/table_cache.cc @@ -415,7 +532,6 @@ set(SOURCES env/env_hdfs.cc env/mock_env.cc memtable/alloc_tracker.cc - memtable/hash_cuckoo_rep.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc memtable/skiplistrep.cc @@ -446,11 +562,14 @@ set(SOURCES table/block_based_table_factory.cc table/block_based_table_reader.cc table/block_builder.cc + table/block_fetcher.cc table/block_prefix_index.cc table/bloom_block.cc table/cuckoo_table_builder.cc table/cuckoo_table_factory.cc table/cuckoo_table_reader.cc + table/data_block_hash_index.cc + table/data_block_footer.cc table/flush_block_policy.cc table/format.cc table/full_filter_block.cc @@ -466,6 +585,7 @@ set(SOURCES table/plain_table_index.cc table/plain_table_key_coding.cc table/plain_table_reader.cc + table/sst_file_reader.cc table/sst_file_writer.cc table/table_properties.cc table/two_level_iterator.cc @@ -474,13 +594,16 @@ set(SOURCES tools/ldb_cmd.cc tools/ldb_tool.cc tools/sst_dump_tool.cc + tools/trace_analyzer_tool.cc util/arena.cc util/auto_roll_logger.cc util/bloom.cc util/coding.cc util/compaction_job_stats_impl.cc util/comparator.cc + util/compression_context_cache.cc util/concurrent_arena.cc + util/concurrent_task_limiter_impl.cc util/crc32c.cc util/delete_scheduler.cc util/dynamic_bloom.cc @@ -490,6 +613,7 @@ set(SOURCES util/filename.cc util/filter_policy.cc util/hash.cc + util/jemalloc_nodump_allocator.cc util/log_buffer.cc util/murmurhash.cc util/random.cc @@ -497,40 +621,36 @@ set(SOURCES util/slice.cc util/sst_file_manager_impl.cc util/status.cc - util/status_message.cc util/string_util.cc util/sync_point.cc + util/sync_point_impl.cc + util/testutil.cc util/thread_local.cc util/threadpool_imp.cc + util/trace_replay.cc + util/transaction_test_util.cc util/xxhash.cc utilities/backupable/backupable_db.cc + utilities/blob_db/blob_compaction_filter.cc utilities/blob_db/blob_db.cc utilities/blob_db/blob_db_impl.cc + utilities/blob_db/blob_db_impl_filesnapshot.cc utilities/blob_db/blob_dump_tool.cc utilities/blob_db/blob_file.cc utilities/blob_db/blob_log_reader.cc utilities/blob_db/blob_log_writer.cc utilities/blob_db/blob_log_format.cc - utilities/blob_db/ttl_extractor.cc utilities/cassandra/cassandra_compaction_filter.cc utilities/cassandra/format.cc utilities/cassandra/merge_operator.cc utilities/checkpoint/checkpoint_impl.cc - utilities/col_buf_decoder.cc - utilities/col_buf_encoder.cc - utilities/column_aware_encoding_util.cc utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc - 
utilities/date_tiered/date_tiered_db_impl.cc utilities/debug.cc - utilities/document/document_db.cc - utilities/document/json_document.cc - utilities/document/json_document_builder.cc utilities/env_mirror.cc utilities/env_timed.cc - utilities/geodb/geodb_impl.cc utilities/leveldb_options/leveldb_options.cc - utilities/lua/rocks_lua_compaction_filter.cc utilities/memory/memory_util.cc + utilities/merge_operators/bytesxor.cc utilities/merge_operators/max.cc utilities/merge_operators/put.cc utilities/merge_operators/string_append/stringappend.cc @@ -543,24 +663,39 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc - utilities/redis/redis_lists.cc utilities/simulator_cache/sim_cache.cc - utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc + utilities/trace/file_trace_reader_writer.cc utilities/transactions/optimistic_transaction_db_impl.cc utilities/transactions/optimistic_transaction.cc - utilities/transactions/transaction_base.cc + utilities/transactions/pessimistic_transaction.cc utilities/transactions/pessimistic_transaction_db.cc + utilities/transactions/snapshot_checker.cc + utilities/transactions/transaction_base.cc utilities/transactions/transaction_db_mutex_impl.cc - utilities/transactions/pessimistic_transaction.cc utilities/transactions/transaction_lock_mgr.cc utilities/transactions/transaction_util.cc utilities/transactions/write_prepared_txn.cc + utilities/transactions/write_prepared_txn_db.cc + utilities/transactions/write_unprepared_txn.cc + utilities/transactions/write_unprepared_txn_db.cc utilities/ttl/db_ttl_impl.cc utilities/write_batch_with_index/write_batch_with_index.cc utilities/write_batch_with_index/write_batch_with_index_internal.cc $<TARGET_OBJECTS:build_version>) +if(HAVE_SSE42 AND NOT MSVC) + set_source_files_properties( + util/crc32c.cc + PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") +endif() + +if(HAVE_POWER8) + list(APPEND SOURCES + util/crc32c_ppc.c + util/crc32c_ppc_asm.S) +endif(HAVE_POWER8) + if(WIN32) list(APPEND SOURCES port/win/io_win.cc @@ -568,14 +703,18 @@ if(WIN32) port/win/env_default.cc port/win/port_win.cc port/win/win_logger.cc - port/win/win_thread.cc + port/win/win_thread.cc) + +if(WITH_XPRESS) + list(APPEND SOURCES port/win/xpress_win.cc) - +endif() + if(WITH_JEMALLOC) list(APPEND SOURCES port/win/win_jemalloc.cc) endif() - + else() list(APPEND SOURCES port/port_posix.cc @@ -584,29 +723,8 @@ else() endif() set(ROCKSDB_STATIC_LIB rocksdb${ARTIFACT_SUFFIX}) -# commented out to avoid building the shared lib -#set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) +set(ROCKSDB_SHARED_LIB rocksdb-shared${ARTIFACT_SUFFIX}) set(ROCKSDB_IMPORT_LIB ${ROCKSDB_SHARED_LIB}) -if(WIN32) - #set(SYSTEM_LIBS ${SYSTEM_LIBS} shlwapi.lib Rpcrt4.lib) - set(SYSTEM_LIBS ${SYSTEM_LIBS} Rpcrt4.lib) - set(LIBS ${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) -else() - set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) - set(LIBS ${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) -# commented out to avoid building the shared lib -# as there is no reason -#add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) - -# target_link_libraries(${ROCKSDB_SHARED_LIB} -# ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) -# set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES -# LINKER_LANGUAGE CXX -# VERSION ${ROCKSDB_VERSION} -# SOVERSION ${ROCKSDB_VERSION_MAJOR} -# CXX_STANDARD 11 -# OUTPUT_NAME "rocksdb") -endif() option(WITH_LIBRADOS "Build 
with librados" OFF) if(WITH_LIBRADOS) @@ -615,13 +733,32 @@ if(WITH_LIBRADOS) list(APPEND THIRDPARTY_LIBS rados) endif() +if(WIN32) + set(SYSTEM_LIBS ${SYSTEM_LIBS} Shlwapi.lib Rpcrt4.lib) + set(LIBS ${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) +else() + set(SYSTEM_LIBS ${CMAKE_THREAD_LIBS_INIT}) + set(LIBS ${ROCKSDB_SHARED_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) + + add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) + target_link_libraries(${ROCKSDB_SHARED_LIB} + ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) + set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES + LINKER_LANGUAGE CXX + VERSION ${ROCKSDB_VERSION} + SOVERSION ${ROCKSDB_VERSION_MAJOR} + CXX_STANDARD 11 + OUTPUT_NAME "rocksdb") +endif() + add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) target_link_libraries(${ROCKSDB_STATIC_LIB} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(WIN32) add_library(${ROCKSDB_IMPORT_LIB} SHARED ${SOURCES}) - #target_link_libraries(${ROCKSDB_IMPORT_LIB} ${SYSTEM_LIBS}) + target_link_libraries(${ROCKSDB_IMPORT_LIB} + ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) set_target_properties(${ROCKSDB_IMPORT_LIB} PROPERTIES COMPILE_DEFINITIONS "ROCKSDB_DLL;ROCKSDB_LIBRARY_EXPORTS") if(MSVC) @@ -658,7 +795,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) set(package_config_destination ${CMAKE_INSTALL_LIBDIR}/cmake/rocksdb) configure_package_config_file( - ${CMAKE_SOURCE_DIR}/cmake/RocksDBConfig.cmake.in RocksDBConfig.cmake + ${CMAKE_CURRENT_LIST_DIR}/cmake/RocksDBConfig.cmake.in RocksDBConfig.cmake INSTALL_DESTINATION ${package_config_destination} ) @@ -682,6 +819,7 @@ if(NOT WIN32 OR ROCKSDB_INSTALL_ON_WINDOWS) TARGETS ${ROCKSDB_SHARED_LIB} EXPORT RocksDBTargets COMPONENT runtime + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" @@ -728,6 +866,7 @@ if(WITH_TESTS) db/db_inplace_update_test.cc db/db_io_failure_test.cc db/db_iter_test.cc + db/db_iter_stress_test.cc db/db_iterator_test.cc db/db_log_iter_test.cc db/db_memtable_test.cc @@ -735,6 +874,7 @@ if(WITH_TESTS) db/db_options_test.cc db/db_properties_test.cc db/db_range_del_test.cc + db/db_secondary_test.cc db/db_sst_test.cc db/db_statistics_test.cc db/db_table_properties_test.cc @@ -746,6 +886,8 @@ if(WITH_TESTS) db/db_write_test.cc db/dbformat_test.cc db/deletefile_test.cc + db/error_handler_test.cc + db/obsolete_files_test.cc db/external_sst_file_basic_test.cc db/external_sst_file_test.cc db/fault_injection_test.cc @@ -762,6 +904,8 @@ if(WITH_TESTS) db/perf_context_test.cc db/plain_table_db_test.cc db/prefix_test.cc + db/range_del_aggregator_test.cc + db/range_tombstone_fragmenter_test.cc db/repair_test.cc db/table_properties_collector_test.cc db/version_builder_test.cc @@ -787,12 +931,15 @@ if(WITH_TESTS) table/cleanable_test.cc table/cuckoo_table_builder_test.cc table/cuckoo_table_reader_test.cc + table/data_block_hash_index_test.cc table/full_filter_block_test.cc table/merger_test.cc + table/sst_file_reader_test.cc table/table_test.cc tools/ldb_cmd_test.cc tools/reduce_levels_test.cc tools/sst_dump_test.cc + tools/trace_analyzer_test.cc util/arena_test.cc util/auto_roll_logger_test.cc util/autovector_test.cc @@ -805,7 +952,9 @@ if(WITH_TESTS) util/file_reader_writer_test.cc util/filelock_test.cc util/hash_test.cc + util/heap_test.cc util/rate_limiter_test.cc + util/repeatable_thread_test.cc util/slice_transform_test.cc util/timer_queue_test.cc util/thread_list_test.cc @@ -817,12 +966,6 @@ if(WITH_TESTS) 
utilities/cassandra/cassandra_row_merge_test.cc utilities/cassandra/cassandra_serialize_test.cc utilities/checkpoint/checkpoint_test.cc - utilities/column_aware_encoding_test.cc - utilities/date_tiered/date_tiered_test.cc - utilities/document/document_db_test.cc - utilities/document/json_document_test.cc - utilities/geodb/geodb_test.cc - utilities/lua/rocks_lua_test.cc utilities/memory/memory_test.cc utilities/merge_operators/string_append/stringappend_test.cc utilities/object_registry_test.cc @@ -830,12 +973,12 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc - utilities/redis/redis_lists_test.cc - utilities/spatialdb/spatial_db_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc utilities/transactions/optimistic_transaction_test.cc utilities/transactions/transaction_test.cc + utilities/transactions/write_prepared_transaction_test.cc + utilities/transactions/write_unprepared_transaction_test.cc utilities/ttl/ttl_test.cc utilities/write_batch_with_index/write_batch_with_index_test.cc ) @@ -843,7 +986,21 @@ if(WITH_TESTS) list(APPEND TESTS utilities/env_librados_test.cc) endif() - + set(BENCHMARKS + cache/cache_bench.cc + memtable/memtablerep_bench.cc + db/range_del_aggregator_bench.cc + tools/db_bench.cc + table/table_reader_bench.cc + utilities/persistent_cache/hash_table_bench.cc) + add_library(testharness OBJECT util/testharness.cc) + foreach(sourcefile ${BENCHMARKS}) + get_filename_component(exename ${sourcefile} NAME_WE) + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} + $<TARGET_OBJECTS:testharness>) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} gtest ${LIBS}) + endforeach(sourcefile ${BENCHMARKS}) + # For test util library that is built only in DEBUG mode # and linked to tests. Add test only code that is not #ifdefed for Release here. 
set(TESTUTIL_SOURCE @@ -868,25 +1025,24 @@ if(WITH_TESTS) ) # Tests are excluded from Release builds - #set(TEST_EXES ${TESTS}) - - # while tests are not built, we want to ensure that any reference to gtest is removed in case the user - # builds rocksdb manually from our third party directory - #foreach(sourcefile ${TEST_EXES}) - # get_filename_component(exename ${sourcefile} NAME_WE) - # add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile} - # $<TARGET_OBJECTS:testharness>) - # set_target_properties(${exename}${ARTIFACT_SUFFIX} - # PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 - # EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 - # EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 - # ) - # target_link_libraries(${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} gtest ${LIBS}) - # if(NOT "${exename}" MATCHES "db_sanity_test") - # add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) - # add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) - # endif() - #endforeach(sourcefile ${TEST_EXES}) + set(TEST_EXES ${TESTS}) + + foreach(sourcefile ${TEST_EXES}) + get_filename_component(exename ${sourcefile} NAME_WE) + add_executable(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} ${sourcefile} + $<TARGET_OBJECTS:testharness>) + set_target_properties(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + OUTPUT_NAME ${exename}${ARTIFACT_SUFFIX} + ) + target_link_libraries(${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX} gtest ${LIBS}) + if(NOT "${exename}" MATCHES "db_sanity_test") + add_test(NAME ${exename} COMMAND ${exename}${ARTIFACT_SUFFIX}) + add_dependencies(check ${CMAKE_PROJECT_NAME}_${exename}${ARTIFACT_SUFFIX}) + endif() + endforeach(sourcefile ${TEST_EXES}) # C executables must link to a shared object set(C_TESTS db/c_test.c) @@ -906,3 +1062,8 @@ if(WITH_TESTS) add_dependencies(check ${exename}${ARTIFACT_SUFFIX}) endforeach(sourcefile ${C_TEST_EXES}) endif() + +option(WITH_TOOLS "build with tools" ON) +if(WITH_TOOLS) + add_subdirectory(tools) +endif() diff --git a/thirdparty/rocksdb/CODE_OF_CONDUCT.md b/thirdparty/rocksdb/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..0a45f9bd5f --- /dev/null +++ b/thirdparty/rocksdb/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. Please [read the full text](https://code.facebook.com/codeofconduct) so that you can understand what actions will and will not be tolerated. diff --git a/thirdparty/rocksdb/CONTRIBUTING.md b/thirdparty/rocksdb/CONTRIBUTING.md index b8b1a412e3..190100b429 100644 --- a/thirdparty/rocksdb/CONTRIBUTING.md +++ b/thirdparty/rocksdb/CONTRIBUTING.md @@ -1,5 +1,8 @@ # Contributing to RocksDB +## Code of Conduct +The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md) + ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You diff --git a/thirdparty/rocksdb/HISTORY.md b/thirdparty/rocksdb/HISTORY.md index 9156290e0c..6c281cc241 100644 --- a/thirdparty/rocksdb/HISTORY.md +++ b/thirdparty/rocksdb/HISTORY.md @@ -1,11 +1,301 @@ # Rocksdb Change Log -## 5.8.6 (11/20/2017) +## 6.1.2 (6/4/2019) ### Bug Fixes -* Fixed aligned_alloc issues with Windows. +* Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. 
Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. -## 5.8.1 (10/23/2017) +## 6.1.1 (4/9/2019) ### New Features +* When reading from option file/string/map, customized comparators and/or merge operators can be filled according to object registry. +### Public API Change +### Bug Fixes +* Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction. +* Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries. + +## 6.1.0 (3/27/2019) ### New Features +* Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. +* Added a feature to perform data-block sampling for compressibility, and report stats to the user. +* Add support for trace filtering. +* Add DBOptions.avoid_unnecessary_blocking_io. If true, we avoid file deletion when destroying ColumnFamilyHandle and Iterator. Instead, a job is scheduled to delete the files in the background. + +### Public API Change +* Remove bundled fbson library. +* statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it. +* Introduce a new IOError subcode, PathNotFound, to indicate trying to open a nonexistent file or directory for read. +* Add initial support for multiple db instances sharing the same data in single-writer, multi-reader mode. +* Removed some "using std::xxx" from public headers. + +### Bug Fixes +* Fix JEMALLOC_CXX_THROW macro missing from older Jemalloc versions, causing build failures on some platforms. +* Fix SstFileReader not able to open file ingested with write_global_seqno=true. + +## 6.0.0 (2/19/2019) ### New Features +* Enabled checkpoint on readonly db (DBImplReadOnly). +* Make DB ignore dropped column families while committing results of atomic flush. +* RocksDB may choose to preopen some files even if options.max_open_files != -1. This may make DB open take slightly longer. +* For users of dictionary compression with ZSTD v0.7.0+, we now reuse the same digested dictionary when compressing each of an SST file's data blocks for faster compression speeds. +* For all users of dictionary compression who set `cache_index_and_filter_blocks == true`, we now store dictionary data used for decompression in the block cache for better control over memory usage. For users of ZSTD v1.1.4+ who compile with -DZSTD_STATIC_LINKING_ONLY, this includes a digested dictionary, which is used to increase decompression speed. +* Add support for block checksum verification for external SST files before ingestion. +* Introduce stats history which periodically saves Statistics snapshots and added `GetStatsHistory` API to retrieve these snapshots. +* Add a placeholder in the manifest which indicates a record from the future that can be safely ignored. +* Add support for trace sampling. +* Enable properties block checksum verification for block-based tables. +* For all users of dictionary compression, we now generate a separate dictionary for compressing each bottom-level SST file. Previously we reused a single dictionary for a whole compaction to bottom level. The new approach achieves better compression ratios; however, it uses more memory and CPU for buffering/sampling data blocks and training dictionaries. +* Add whole key bloom filter support in memtable. +* Files written by `SstFileWriter` will now use dictionary compression if it is configured in the file writer's `CompressionOptions`. 
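As a hedged illustration of the last bullet above: a minimal sketch of configuring dictionary compression for an `SstFileWriter`, assuming a ZSTD-enabled build; the path and byte sizes are arbitrary example values, not RocksDB defaults.

```cpp
#include <string>

#include <rocksdb/options.h>
#include <rocksdb/sst_file_writer.h>

rocksdb::Status WriteDictCompressedSst(const std::string& path) {
  rocksdb::Options options;
  options.compression = rocksdb::kZSTD;                       // assumes ZSTD is compiled in
  options.compression_opts.max_dict_bytes = 16 * 1024;        // nonzero enables dictionary compression
  options.compression_opts.zstd_max_train_bytes = 64 * 1024;  // optional: input budget for the ZSTD trainer
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  rocksdb::Status s = writer.Open(path);
  if (!s.ok()) return s;
  s = writer.Put("key1", "value1");  // keys must be added in sorted order
  if (!s.ok()) return s;
  return writer.Finish();
}
```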
+ +### Public API Change +* Disallow CompactionFilter::IgnoreSnapshots() = false, because it is not very useful and the behavior is confusing. The filter will filter everything if there is no snapshot declared by the time the compaction starts. However, users can define a snapshot after the compaction starts and before it finishes and this new snapshot won't be repeatable, because after the compaction finishes, some keys may be dropped. +* CompactionPri = kMinOverlappingRatio also uses compensated file size, which boosts files with lots of tombstones to be compacted first. +* Transaction::GetForUpdate is extended with a do_validate parameter with default value of true. If false it skips validating the snapshot before doing the read. Similarly ::Merge, ::Put, ::Delete, and ::SingleDelete are extended with assume_tracked with default value of false. If true it indicates that the call is assumed to be after a ::GetForUpdate. +* `TableProperties::num_entries` and `TableProperties::num_deletions` now also account for number of range tombstones. +* Remove geodb, spatial_db, document_db, json_document, date_tiered_db, and redis_lists. +* With "ldb --try_load_options", when wal_dir specified by the option file doesn't exist, ignore it. +* Change time resolution in FileOperationInfo. +* Deleting blob files also goes through SstFileManager. +* Remove CuckooHash memtable. +* The counter stat `number.block.not_compressed` now also counts blocks not compressed due to poor compression ratio. +* Remove ttl option from `CompactionOptionsFIFO`. The option has been deprecated and ttl in `ColumnFamilyOptions` is used instead. +* Support SST file ingestion across multiple column families via DB::IngestExternalFiles. See the function's comment about atomicity. +* Remove Lua compaction filter. + +### Bug Fixes +* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. +* Fix a memory leak when files with range tombstones are read in mmap mode and block cache is enabled. +* Fix handling of corrupt range tombstone blocks such that corruptions cannot cause deleted keys to reappear. +* Lock-free MultiGet. +* Fix incorrect `NotFound` point lookup result when querying the endpoint of a file that has been extended by a range tombstone. +* Fix a bug with pipelined writes where a write leader's callback failure could cause the whole write group to fail. + +### Change Default Options +* Change options.compaction_pri's default to kMinOverlappingRatio. + +## 5.18.0 (11/30/2018) +### New Features +* Introduced `JemallocNodumpAllocator` memory allocator. When in use, the block cache will be excluded from core dumps. +* Introduced `PerfContextByLevel` as part of `PerfContext` which allows storing perf context at each level. Also replaced `__thread` with `thread_local` keyword for perf_context. Added per-level perf context for bloom filter and `Get` query. +* With level_compaction_dynamic_level_bytes = true, level multiplier may be adjusted automatically when Level 0 to 1 compaction lags behind. +* Introduced DB option `atomic_flush`. If true, RocksDB supports flushing multiple column families and atomically committing the result to MANIFEST. Useful when WAL is disabled. +* Added `num_deletions` and `num_merge_operands` members to `TableProperties`. +* Added "rocksdb.min-obsolete-sst-number-to-keep" DB property that reports the lower bound on SST file numbers that are being kept from deletion, even if the SSTs are obsolete. 
+* Add xxhash64 checksum support. +* Introduced `MemoryAllocator`, which lets the user specify a custom memory allocator for block-based tables. +* Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental. + +### Public API Change +* `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs. +* `NO_ITERATORS` is divided into two counters `NO_ITERATOR_CREATED` and `NO_ITERATOR_DELETE`. Both of them are only increasing now, just as other counters. + +### Bug Fixes +* Fix a corner case where a write group leader blocked due to write stall blocks other writers in queue with WriteOptions::no_slowdown set. +* Fix in-memory range tombstone truncation to avoid erroneously covering newer keys at a lower level, and include range tombstones in compacted files whose largest key is the range tombstone's start key. +* Properly set the stop key for a truncated manual CompactRange. +* Fix slow flush/compaction when DB contains many snapshots. The problem became noticeable to us in DBs with 100,000+ snapshots, though it will affect others at different thresholds. +* Fix the bug where WriteBatchWithIndex's SeekForPrev() didn't see entries with the same key. +* Fix the bug where the user comparator was sometimes fed with InternalKey instead of the user key. The bug manifests during GenerateBottommostFiles. +* Fix a bug in WritePrepared txns where if the number of old snapshots goes beyond the snapshot cache size (128 default) the rest will not be checked when evicting a commit entry from the commit cache. +* Fixed a Get correctness bug in the presence of range tombstones where merge operands covered by a range tombstone always result in NotFound. +* Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. +* The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. + +## 5.17.0 (10/05/2018) +### Public API Change +* `OnTableFileCreated` will now be called for empty files generated during compaction. In that case, `TableFileCreationInfo::file_path` will be "(nil)" and `TableFileCreationInfo::file_size` will be zero. +* Add `FlushOptions::allow_write_stall`, which controls whether Flush calls start working immediately, even if it causes user writes to stall, or will wait until flush can be performed without causing a write stall (similar to `CompactRangeOptions::allow_write_stall`). Note that the default value is false, meaning we add delay to Flush calls until stalling can be avoided when possible. This is a behavior change compared to previous RocksDB versions, where Flush calls didn't check if they might cause a stall or not. (See the sketch after this section.) +* Applications using PessimisticTransactionDB are expected to roll back/commit recovered transactions before starting new ones. This assumption is used to skip concurrency control during recovery. +* Expose column family id to `OnCompactionCompleted`. + +### New Features +* TransactionOptions::skip_concurrency_control allows pessimistic transactions to skip the overhead of concurrency control. Could be used for optimizing certain transactions or during recovery. + +### Bug Fixes +* Avoid creating empty SSTs and subsequently deleting them in certain cases during compaction. +* Sync CURRENT file contents during checkpoint. 
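A short sketch of the `FlushOptions::allow_write_stall` change noted in the 5.17.0 Public API Change list above: opting back into the old behavior where Flush starts immediately even if it stalls writers. Here `db` is assumed to be an already-open `rocksdb::DB*`.

```cpp
#include <rocksdb/db.h>

rocksdb::Status FlushNow(rocksdb::DB* db) {
  rocksdb::FlushOptions flush_opts;
  flush_opts.wait = true;               // block until the flush completes
  flush_opts.allow_write_stall = true;  // start immediately, even if writers stall
  return db->Flush(flush_opts);
}
```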
+ +## 5.16.3 (10/1/2018) +### Bug Fixes +* Fix a crash caused when `CompactFiles` is run with `CompactionOptions::compression == CompressionType::kDisableCompressionOption`. Now that setting causes the compression type to be chosen according to the column family-wide compression options. + +## 5.16.2 (9/21/2018) +### Bug Fixes +* Fix bug in partition filters with format_version=4. + +## 5.16.1 (9/17/2018) +### Bug Fixes +* Remove trace_analyzer_tool from rocksdb_lib target in TARGETS file. +* Fix RocksDB Java build and tests. +* Remove sync point in Block destructor. + +## 5.16.0 (8/21/2018) +### Public API Change +* The merge operands are passed to `MergeOperator::ShouldMerge` in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) for performance reasons. +* GetAllKeyVersions() now takes an extra argument, `max_num_ikeys`. +* Using ZSTD dictionary trainer (i.e., setting `CompressionOptions::zstd_max_train_bytes` to a nonzero value) now requires ZSTD version 1.1.3 or later. + +### New Features +* Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of BlockHandle::offset of the non-head index entries in each restart interval. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 4 or above is used. +* Add a new tool: trace_analyzer. trace_analyzer analyzes the trace file generated by using the trace_replay API. It can convert the binary-format trace file to a human-readable txt file, output statistics of the analyzed query types such as access statistics and size statistics, combine the dumped whole-key-space file for analysis, support query correlation analysis, etc. Currently supported query types are: Get, Put, Delete, SingleDelete, DeleteRange, Merge, Iterator (Seek, SeekForPrev only). +* Add hash index support to data blocks, which helps reduce the CPU utilization of point-lookup operations. This feature is backward compatible with data blocks created without the hash index. It is disabled by default unless BlockBasedTableOptions::data_block_index_type is set to data_block_index_type = kDataBlockBinaryAndHash. + +### Bug Fixes +* Fix a bug in misreporting the estimated partition index size in properties block. + +## 5.15.0 (7/17/2018) +### Public API Change +* Remove managed iterator. ReadOptions.managed is not effective anymore. +* For bottommost_compression, a compatible CompressionOptions is added via `bottommost_compression_opts`. To keep backward compatibility, a new boolean `enabled` is added to CompressionOptions. For compression_opts, it will always be used no matter what the value of `enabled` is. For bottommost_compression_opts, it will only be used when the user sets `enabled=true`; otherwise, compression_opts will be used for bottommost_compression as default. +* With LRUCache, when high_pri_pool_ratio > 0, midpoint insertion strategy will be enabled to put low-pri items to the tail of the low-pri list (the midpoint) when they are first inserted into the cache. This is to make cache entries that never get hit age out faster, improving cache efficiency when a large background scan is present. (See the sketch below.) +* For users of `Statistics` objects created via `CreateDBStatistics()`, the format of the string returned by its `ToString()` method has changed. +* The "rocksdb.num.entries" table property no longer counts range deletion tombstones as entries. 
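To illustrate the LRUCache midpoint-insertion bullet above: a sketch of a block cache created with `high_pri_pool_ratio > 0`. The capacity and shard count are arbitrary example values.

```cpp
#include <rocksdb/cache.h>
#include <rocksdb/table.h>

rocksdb::BlockBasedTableOptions MakeTableOptions() {
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.block_cache = rocksdb::NewLRUCache(
      /*capacity=*/512 << 20,        // 512 MiB
      /*num_shard_bits=*/6,
      /*strict_capacity_limit=*/false,
      /*high_pri_pool_ratio=*/0.5);  // > 0 enables midpoint insertion
  return table_opts;
}
```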
+ +### New Features +* Changes the format of index blocks by storing keys in their raw form rather than converting them to InternalKey. This saves 8 bytes per index key. The feature is backward compatible but not forward compatible. It is disabled by default unless format_version 3 or above is used. +* Avoid memcpy when reading mmap files with OpenReadOnly and max_open_files==-1. +* Support dynamically changing `ColumnFamilyOptions::ttl` via `SetOptions()`. +* Add a new table property, "rocksdb.num.range-deletions", which counts the number of range deletion tombstones in the table. +* Improve the performance of iterators doing long range scans by using readahead, when using direct IO. +* pin_top_level_index_and_filter (default true) in BlockBasedTableOptions can be used in combination with cache_index_and_filter_blocks to prefetch and pin the top-level index of partitioned index and filter blocks in cache. It has no impact when cache_index_and_filter_blocks is false. +* Write properties meta-block at the end of block-based table to save read-ahead IO. + +### Bug Fixes +* Fix deadlock with enable_pipelined_write=true and max_successive_merges > 0. +* Check conflict at output level in CompactFiles. +* Fix corruption in non-iterator reads when mmap is used for file reads. +* Fix bug with prefix search in partition filters where a shared prefix would be ignored from the later partitions. The bug could report an existent key as missing. The bug could be triggered if prefix_extractor is set and partition filters is enabled. +* Change default value of `bytes_max_delete_chunk` to 0 in NewSstFileManager() as it doesn't work well with checkpoints. +* Fix a bug caused by not copying the block trailer with compressed SST file, direct IO, prefetcher and no compressed block cache. +* Fix a bug where writes could get stuck indefinitely if enable_pipelined_write=true. The issue has existed since pipelined write was introduced in 5.5.0. + +## 5.14.0 (5/16/2018) +### Public API Change +* Add a BlockBasedTableOption to align uncompressed data blocks on the smaller of block size or page size boundary, to reduce flash reads by avoiding reads spanning 4K pages. +* The background thread naming convention changed (on supporting platforms) to "rocksdb:<thread pool priority><thread number>", e.g., "rocksdb:low0". +* Add a new ticker stat rocksdb.number.multiget.keys.found to count the number of keys successfully read in MultiGet calls. +* Touch-up to write-related counters in PerfContext. New counters added: write_scheduling_flushes_compactions_time, write_thread_wait_nanos. Counters whose behavior was fixed or modified: write_memtable_time, write_pre_and_post_process_time, write_delay_time. +* Posix Env's NewRandomRWFile() will fail if the file doesn't exist. +* Now, `DBOptions::use_direct_io_for_flush_and_compaction` only applies to background writes, and `DBOptions::use_direct_reads` applies to both user reads and background reads. This conforms with Linux's `open(2)` manpage, which advises against simultaneously reading a file in buffered and direct modes, due to possibly undefined behavior and degraded performance. +* Iterator::Valid() always returns false if !status().ok(). So, now when doing a Seek() followed by some Next()s, there's no need to check status() after every operation. +* Iterator::Seek()/SeekForPrev()/SeekToFirst()/SeekToLast() always reset status(). +* Introduced `CompressionOptions::kDefaultCompressionLevel`, which is a generic way to tell RocksDB to use the compression library's default level. It is now the default value for `CompressionOptions::level`. 
Previously the level defaulted to -1, which gave poor compression ratios in ZSTD. + +### New Features +* Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data. +* TransactionDBOptions::write_policy can be configured to enable WritePrepared 2PC transactions. Read more about them in the wiki. +* Add DB properties "rocksdb.block-cache-capacity", "rocksdb.block-cache-usage", "rocksdb.block-cache-pinned-usage" to show block cache usage. +* Add `Env::LowerThreadPoolCPUPriority(Priority)` method, which lowers the CPU priority of background (esp. compaction) threads to minimize interference with foreground tasks. +* Fsync parent directory after deleting a file in delete scheduler. +* In level-based compaction, if the bottom-pri thread pool was set up via `Env::SetBackgroundThreads()`, compactions to the bottom level will be delegated to that thread pool. +* `prefix_extractor` has been moved from ImmutableCFOptions to MutableCFOptions, meaning it can be dynamically changed without a DB restart. + +### Bug Fixes +* Fsync after writing global seq number to the ingestion file in ExternalSstFileIngestionJob. +* Fix WAL corruption caused by race condition between user write thread and FlushWAL when two_write_queue is not set. +* Fix `BackupableDBOptions::max_valid_backups_to_open` to not delete backup files when refcount cannot be accurately determined. +* Fix a memory leak when pin_l0_filter_and_index_blocks_in_cache is used with partitioned filters. +* Disable rollback of merge operands in WritePrepared transactions to work around an issue in MyRocks. It can be enabled back by setting TransactionDBOptions::rollback_merge_operands to true. +* Fix wrong results by ReverseBytewiseComparator::FindShortSuccessor(). + +### Java API Changes +* Add `BlockBasedTableConfig.setBlockCache` to allow sharing a block cache across DB instances. +* Added SstFileManager to the Java API to allow managing SST files across DB instances. + +## 5.13.0 (3/20/2018) +### Public API Change +* RocksDBOptionsParser::Parse()'s `ignore_unknown_options` argument will only be effective if the option file shows it is generated using a higher version of RocksDB than the current version. +* Remove CompactionEventListener. + +### New Features +* SstFileManager now can cancel compactions if they will result in max space errors. SstFileManager users can also use SetCompactionBufferSize to specify how much space must be left over during a compaction for auxiliary file functions such as logging and flushing. +* Avoid unnecessarily flushing in `CompactRange()` when the range specified by the user does not overlap unflushed memtables. +* If `ColumnFamilyOptions::max_subcompactions` is set greater than one, we now parallelize large manual level-based compactions. +* Add "rocksdb.live-sst-files-size" DB property to return total bytes of all SST files belonging to the latest LSM tree. +* NewSstFileManager now takes an argument bytes_max_delete_chunk with default 64MB. With this argument, a file larger than 64MB will be ftruncated multiple times based on this size. + +### Bug Fixes +* Fix a leak in prepared_section_completed_ where the zeroed entries would not be removed from the map. +* Fix WAL corruption caused by race condition between user write thread and backup/checkpoint thread. + +## 5.12.0 (2/14/2018) +### Public API Change +* Iterator::SeekForPrev is now a pure virtual method. This is to prevent users who implement the Iterator interface from failing to implement SeekForPrev by mistake. 
+* Add `include_end` option to make the range end exclusive when `include_end == false` in `DeleteFilesInRange()`. +* Add `CompactRangeOptions::allow_write_stall`, which makes `CompactRange` start working immediately, even if it causes user writes to stall. The default value is false, meaning we add delay to `CompactRange` calls until stalling can be avoided when possible. Note this delay is not present in previous RocksDB versions. +* Creating a checkpoint with an empty directory now returns `Status::InvalidArgument`; previously, it returned `Status::IOError`. +* Adds a BlockBasedTableOption to turn off index block compression. +* Close() method now returns a status when closing a db. + +### New Features +* Improve the performance of iterators doing long range scans by using readahead. +* Add new function `DeleteFilesInRanges()` to delete files in multiple ranges at once for better performance. +* FreeBSD build support for RocksDB and RocksJava. +* Improved performance of long range scans with readahead. +* Updated to and now continuously tested in Visual Studio 2017. + +### Bug Fixes +* Fix `DisableFileDeletions()` followed by `GetSortedWalFiles()` to not return obsolete WAL files that `PurgeObsoleteFiles()` is going to delete. +* Fix handling of error return from WriteBuffer() during WAL file close and DB close. +* Fix advance reservation of arena block addresses. +* Fix handling of empty string as checkpoint directory. + +## 5.11.0 (01/08/2018) +### Public API Change +* Add `autoTune` and `getBytesPerSecond()` to RocksJava RateLimiter. + +### New Features +* Add a new histogram stat called rocksdb.db.flush.micros for memtable flush. +* Add "--use_txn" option to use transactional API in db_stress. +* Disable onboard cache for compaction output on Windows. +* Improve the performance of iterators doing long range scans by using readahead. + +### Bug Fixes +* Fix a stack-use-after-scope bug in ForwardIterator. +* Fix builds on platforms including Linux, Windows, and PowerPC. +* Fix buffer overrun in backup engine for DBs with a huge number of files. +* Fix a mislabel bug for bottom-pri compaction threads. +* Fix DB::Flush() continuing to wait after the flush finishes under certain conditions. + +## 5.10.0 (12/11/2017) +### Public API Change +* When running `make` with environment variable `USE_SSE` set and `PORTABLE` unset, will use all machine features available locally. Previously this combination only compiled SSE-related features. + +### New Features +* Provide lifetime hints when writing files on Linux. This reduces hardware write-amp on storage devices supporting multiple streams. +* Add a DB stat, `NUMBER_ITER_SKIP`, which returns how many internal keys were skipped during iterations (e.g., due to being tombstones or duplicate versions of a key). +* Add PerfContext counters, `key_lock_wait_count` and `key_lock_wait_time`, which measure the number of times transactions wait on key locks and total amount of time waiting. + +### Bug Fixes +* Fix IOError on WAL write not propagating to write group followers. +* Make iterator invalid on merge error. +* Fix performance issue in `IngestExternalFile()` affecting databases with a large number of SST files. +* Fix possible corruption to LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. + +## 5.9.0 (11/1/2017) +### Public API Change +* `BackupableDBOptions::max_valid_backups_to_open == 0` now means no backups will be opened during BackupEngine initialization. 
Previously this condition disabled limiting backups opened. +* `DBOptions::preserve_deletes` is a new option that allows one to specify that DB should not drop tombstones for regular deletes if they have a sequence number larger than what was set by the new API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)`. Disabled by default. +* API call `DB::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum)` was added; users who wish to preserve deletes are expected to periodically call this function to advance the cutoff seqnum (all deletes made before this seqnum can be dropped by DB). It's the user's responsibility to advance the seqnum so that tombstones are kept for the desired period of time, yet are eventually processed in time and don't eat up too much space. +* `ReadOptions::iter_start_seqnum` was added; +if set to something > 0 the user will see two changes in iterator behavior: 1) only keys written with a sequence larger than this parameter would be returned and 2) the `Slice` returned by iter->key() now points to memory that keeps a user-oriented representation of the internal key, rather than the user key. New struct `FullKey` was added to represent internal keys, along with a new helper function `ParseFullKey(const Slice& internal_key, FullKey* result);`. +* Deprecate trash_dir param in NewSstFileManager; right now we will rename deleted files to .trash instead of moving them to a trash directory. +* Allow setting a custom trash/DB size ratio limit in the SstFileManager, after which files that are to be scheduled for deletion are deleted immediately, regardless of any delete ratelimit. +* Return an error on write if write_options.sync = true and write_options.disableWAL = true to warn the user of inconsistent options. Previously we would not write to the WAL and would not respect the sync option in this case. + +### New Features +* CRC32C is now using the 3-way pipelined SSE algorithm `crc32c_3way` on supported platforms to improve performance. The system will choose to use this algorithm on supported platforms automatically whenever possible. If PCLMULQDQ is not supported it will fall back to the old Fast_CRC32 algorithm. +* `DBOptions::writable_file_max_buffer_size` can now be changed dynamically. +* `DBOptions::bytes_per_sync`, `DBOptions::compaction_readahead_size`, and `DBOptions::wal_bytes_per_sync` can now be changed dynamically; changing `DBOptions::wal_bytes_per_sync` will flush all memtables and switch to a new WAL file. +* Support dynamic adjustment of rate limit according to demand for background I/O. It can be enabled by passing `true` to the `auto_tuned` parameter in `NewGenericRateLimiter()`. The value passed as `rate_bytes_per_sec` will still be respected as an upper-bound. +* Support dynamically changing `ColumnFamilyOptions::compaction_options_fifo`. +* Introduce `EventListener::OnStallConditionsChanged()` callback. Users can implement it to be notified when user writes are stalled, stopped, or resumed. * Add a new db property "rocksdb.estimate-oldest-key-time" to return oldest data timestamp. The property is available only for FIFO compaction with compaction_options_fifo.allow_compaction = false. +* Upon snapshot release, recompact bottommost files containing deleted/overwritten keys that previously could not be dropped due to the snapshot. This alleviates space-amp caused by long-held snapshots. +* Support lower bound on iterators specified via `ReadOptions::iterate_lower_bound`. 
+* Support for differential snapshots (via iterator emitting the sequence of key-values representing the difference between DB state at two different sequence numbers). Supports preserving and emitting puts and regular deletes, doesn't support SingleDeletes, MergeOperator, Blobs and Range Deletes. + +### Bug Fixes +* Fix a potential data inconsistency issue during point-in-time recovery. `DB::Open()` will abort if column family inconsistency is found during PIT recovery. +* Fix possible metadata corruption in databases using `DeleteRange()`. ## 5.8.0 (08/30/2017) ### Public API Change diff --git a/thirdparty/rocksdb/INSTALL.md b/thirdparty/rocksdb/INSTALL.md index 04f0eb2797..91a0935b27 100644 --- a/thirdparty/rocksdb/INSTALL.md +++ b/thirdparty/rocksdb/INSTALL.md @@ -43,6 +43,8 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like this command line flags processing. You can compile rocksdb library even if you don't have gflags installed. +* If you wish to build the RocksJava static target, then cmake is required for building Snappy. + ## Supported platforms * **Linux - Ubuntu** @@ -107,6 +109,62 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like this * run `brew tap homebrew/versions; brew install gcc48 --use-llvm` to install gcc 4.8 (or higher). * run `brew install rocksdb` +* **FreeBSD** (11.01): + + * You can either install RocksDB from the Ports system using `cd /usr/ports/databases/rocksdb && make install`, or you can follow the details below to install dependencies and compile from source code: + + * Install the dependencies for RocksDB: + + export BATCH=YES + cd /usr/ports/devel/gmake && make install + cd /usr/ports/devel/gflags && make install + + cd /usr/ports/archivers/snappy && make install + cd /usr/ports/archivers/bzip2 && make install + cd /usr/ports/archivers/liblz4 && make install + cd /usr/ports/archivers/zstd && make install + + cd /usr/ports/devel/git && make install + + + * Install the dependencies for RocksJava (optional): + + export BATCH=yes + cd /usr/ports/java/openjdk7 && make install + + * Build RocksDB from source: + cd ~ + git clone https://github.com/facebook/rocksdb.git + cd rocksdb + gmake static_lib + + * Build RocksJava from source (optional): + cd rocksdb + export JAVA_HOME=/usr/local/openjdk7 + gmake rocksdbjava + +* **OpenBSD** (6.3/-current): + + * As RocksDB is not available in the ports yet you have to build it on your own: + + * Install the dependencies for RocksDB: + + pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch + + * Build RocksDB from source: + + cd ~ + git clone https://github.com/facebook/rocksdb.git + cd rocksdb + gmake static_lib + + * Build RocksJava from source (optional): + + cd rocksdb + export JAVA_HOME=/usr/local/jdk-1.8.0 + export PATH=$PATH:/usr/local/jdk-1.8.0/bin + gmake rocksdbjava + +* **iOS**: * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. @@ -114,7 +172,7 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like this * For building with MS Visual Studio 13 you will need Update 4 installed. 
* Read and follow the instructions at CMakeLists.txt * Or install via [vcpkg](https://github.com/microsoft/vcpkg) - * run `vcpkg install rocksdb` + * run `vcpkg install rocksdb:x64-windows` * **AIX 6.1** * Install AIX Toolbox rpms with gcc diff --git a/thirdparty/rocksdb/LANGUAGE-BINDINGS.md b/thirdparty/rocksdb/LANGUAGE-BINDINGS.md index ffeed98f28..73c2355a5d 100644 --- a/thirdparty/rocksdb/LANGUAGE-BINDINGS.md +++ b/thirdparty/rocksdb/LANGUAGE-BINDINGS.md @@ -1,7 +1,9 @@ This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it. * Java - https://github.com/facebook/rocksdb/tree/master/java -* Python - http://pyrocksdb.readthedocs.org/en/latest/ +* Python + * http://python-rocksdb.readthedocs.io/en/latest/ + * http://pyrocksdb.readthedocs.org/en/latest/ (unmaintained) * Perl - https://metacpan.org/pod/RocksDB * Node.js - https://npmjs.org/package/rocksdb * Go - https://github.com/tecbot/gorocksdb @@ -10,7 +12,11 @@ This is the list of all known third-party language bindings for RocksDB. If some * PHP - https://github.com/Photonios/rocksdb-php * C# - https://github.com/warrenfalk/rocksdb-sharp * Rust + * https://github.com/pingcap/rust-rocksdb (used in production fork of https://github.com/spacejam/rust-rocksdb) * https://github.com/spacejam/rust-rocksdb * https://github.com/bh1xuw/rust-rocks * D programming language - https://github.com/b1naryth1ef/rocksdb * Erlang - https://gitlab.com/barrel-db/erlang-rocksdb +* Elixir - https://github.com/urbint/rox +* Nim - https://github.com/status-im/nim-rocksdb +* Swift and Objective-C (iOS/OSX) - https://github.com/iabudiab/ObjectiveRocks diff --git a/thirdparty/rocksdb/Makefile b/thirdparty/rocksdb/Makefile index 5a89f6bf79..eee0f9fba0 100644 --- a/thirdparty/rocksdb/Makefile +++ b/thirdparty/rocksdb/Makefile @@ -76,31 +76,70 @@ ifeq ($(MAKECMDGOALS),install) endif ifeq ($(MAKECMDGOALS),rocksdbjavastatic) - DEBUG_LEVEL=0 + ifneq ($(DEBUG_LEVEL),2) + DEBUG_LEVEL=0 + endif endif ifeq ($(MAKECMDGOALS),rocksdbjavastaticrelease) DEBUG_LEVEL=0 endif +ifeq ($(MAKECMDGOALS),rocksdbjavastaticreleasedocker) + DEBUG_LEVEL=0 +endif + ifeq ($(MAKECMDGOALS),rocksdbjavastaticpublish) DEBUG_LEVEL=0 endif +# Lite build flag. +LITE ?= 0 +ifeq ($(LITE), 0) +ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) + # Be backward compatible and support older format where OPT=-DROCKSDB_LITE is + # specified instead of LITE=1 on the command line. + LITE=1 +endif +else ifeq ($(LITE), 1) +ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) + OPT += -DROCKSDB_LITE +endif +endif + +# Figure out optimize level. 
+ifneq ($(DEBUG_LEVEL), 2) +ifeq ($(LITE), 0) + OPT += -O2 +else + OPT += -Os +endif +endif + # compile with -O2 if debug level is not 2 ifneq ($(DEBUG_LEVEL), 2) -OPT += -O2 -fno-omit-frame-pointer +OPT += -fno-omit-frame-pointer # Skip for archs that don't support -momit-leaf-frame-pointer ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) OPT += -momit-leaf-frame-pointer endif endif -# if we're compiling for release, compile without debug code (-DNDEBUG) and -# don't treat warnings as errors +ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1)) +CXXFLAGS += -DHAS_ALTIVEC +CFLAGS += -DHAS_ALTIVEC +HAS_ALTIVEC=1 +endif + +ifeq (,$(shell $(CXX) -fsyntax-only -mcpu=power8 -xc /dev/null 2>&1)) +CXXFLAGS += -DHAVE_POWER8 +CFLAGS += -DHAVE_POWER8 +HAVE_POWER8=1 +endif + +# if we're compiling for release, compile without debug code (-DNDEBUG) ifeq ($(DEBUG_LEVEL),0) OPT += -DNDEBUG -DISABLE_WARNING_AS_ERROR=1 ifneq ($(USE_RTTI), 1) CXXFLAGS += -fno-rtti @@ -159,7 +198,7 @@ include make_config.mk CLEAN_FILES += make_config.mk missing_make_config_paths := $(shell \ - grep "\/\S*" -o $(CURDIR)/make_config.mk | \ + grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | \ while read path; \ do [ -e $$path ] || echo $$path; \ done | sort | uniq) @@ -212,6 +251,9 @@ ifdef COMPILE_WITH_TSAN PROFILING_FLAGS = # LUA is not supported under TSAN LUA_PATH = + # Limit keys for crash test under TSAN to avoid error: + # "ThreadSanitizer: DenseSlabAllocator overflow. Dying." + CRASH_TEST_EXT_ARGS += --max_key=1000000 endif # AIX doesn't work with -pg @@ -222,9 +264,18 @@ endif # USAN doesn't work well with jemalloc. If we're compiling with USAN, we should use regular malloc. ifdef COMPILE_WITH_UBSAN DISABLE_JEMALLOC=1 - EXEC_LDFLAGS += -fsanitize=undefined - PLATFORM_CCFLAGS += -fsanitize=undefined -DROCKSDB_UBSAN_RUN - PLATFORM_CXXFLAGS += -fsanitize=undefined -DROCKSDB_UBSAN_RUN + # Suppress alignment warning because murmurhash relies on casting unaligned + # memory to integer. Fixing it may cause performance regression. 3-way crc32 + # relies on it too, although it can be rewritten to eliminate with minimal + # performance regression. + EXEC_LDFLAGS += -fsanitize=undefined -fno-sanitize-recover=all + PLATFORM_CCFLAGS += -fsanitize=undefined -fno-sanitize-recover=all -DROCKSDB_UBSAN_RUN + PLATFORM_CXXFLAGS += -fsanitize=undefined -fno-sanitize-recover=all -DROCKSDB_UBSAN_RUN +endif + +ifdef ROCKSDB_VALGRIND_RUN + PLATFORM_CCFLAGS += -DROCKSDB_VALGRIND_RUN + PLATFORM_CXXFLAGS += -DROCKSDB_VALGRIND_RUN endif ifndef DISABLE_JEMALLOC @@ -257,7 +308,11 @@ endif default: all WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare -Wshadow \ - -Wno-unused-parameter + -Wunused-parameter + +ifeq ($(PLATFORM), OS_OPENBSD) + WARNING_FLAGS += -Wno-unused-lambda-capture +endif ifndef DISABLE_WARNING_AS_ERROR WARNING_FLAGS += -Werror @@ -286,10 +341,13 @@ endif ifeq ("$(wildcard $(LUA_LIB))", "") # LUA_LIB does not exist $(error $(LUA_LIB) does not exist. Try to specify both LUA_PATH and LUA_LIB manually) endif -LDFLAGS += $(LUA_LIB) +EXEC_LDFLAGS += $(LUA_LIB) endif +ifeq ($(NO_THREEWAY_CRC32C), 1) + CXXFLAGS += -DNO_THREEWAY_CRC32C +endif CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers @@ -322,6 +380,14 @@ util/build_version.cc: FORCE endif LIBOBJECTS = $(LIB_SOURCES:.cc=.o) +ifeq ($(HAVE_POWER8),1) +LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) +LIBOBJECTS += $(LIB_SOURCES_C:.c=.o) +LIBOBJECTS += $(LIB_SOURCES_ASM:.S=.o) +else +LIB_CC_OBJECTS = $(LIB_SOURCES:.cc=.o) +endif + LIBOBJECTS += $(TOOL_LIB_SOURCES:.cc=.o) MOCKOBJECTS = $(MOCK_LIB_SOURCES:.cc=.o) @@ -335,7 +401,9 @@ VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full BENCHTOOLOBJECTS = $(BENCH_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) -EXPOBJECTS = $(EXP_LIB_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) +ANALYZETOOLOBJECTS = $(ANALYZER_LIB_SOURCES:.cc=.o) + +EXPOBJECTS = $(LIBOBJECTS) $(TESTUTIL) TESTS = \ db_basic_test \ @@ -363,6 +431,7 @@ TESTS = \ db_blob_index_test \ db_bloom_filter_test \ db_iter_test \ + db_iter_stress_test \ db_log_iter_test \ db_compaction_filter_test \ db_compaction_test \ @@ -374,14 +443,15 @@ TESTS = \ db_merge_operator_test \ db_options_test \ db_range_del_test \ + db_secondary_test \ db_sst_test \ db_tailing_iter_test \ - db_universal_compaction_test \ db_io_failure_test \ db_properties_test \ db_table_properties_test \ db_statistics_test \ db_write_test \ + error_handler_test \ autovector_test \ blob_db_test \ cleanable_test \ @@ -389,6 +459,7 @@ TESTS = \ table_properties_collector_test \ arena_test \ block_test \ + data_block_hash_index_test \ cache_test \ corruption_test \ slice_transform_test \ @@ -412,7 +483,6 @@ TESTS = \ merger_test \ util_merge_operators_test \ options_file_test \ - redis_test \ reduce_levels_test \ plain_table_db_test \ comparator_db_test \ @@ -426,12 +496,8 @@ TESTS = \ cassandra_row_merge_test \ cassandra_serialize_test \ ttl_test \ - date_tiered_test \ backupable_db_test \ - document_db_test \ - json_document_test \ sim_cache_test \ - spatial_db_test \ version_edit_test \ version_set_test \ compaction_picker_test \ @@ -441,8 +507,8 @@ TESTS = \ write_batch_with_index_test \ write_controller_test\ deletefile_test \ + obsolete_files_test \ table_test \ - geodb_test \ delete_scheduler_test \ options_test \ options_settable_test \ @@ -459,7 +525,6 @@ TESTS = \ compaction_job_test \ thread_list_test \ sst_dump_test \ - column_aware_encoding_test \ compact_files_test \ optimistic_transaction_test \ write_callback_test \ @@ -471,17 +536,25 @@ TESTS = \ ldb_cmd_test \ persistent_cache_test \ statistics_test \ - lua_test \ - range_del_aggregator_test \ lru_cache_test \ object_registry_test \ repair_test \ env_timed_test \ + write_prepared_transaction_test \ + write_unprepared_transaction_test \ + db_universal_compaction_test \ + trace_analyzer_test \ + repeatable_thread_test \ + range_tombstone_fragmenter_test \ + range_del_aggregator_test \ + sst_file_reader_test \ + db_secondary_test \ PARALLEL_TEST = \ backupable_db_test \ db_compaction_filter_test \ db_compaction_test \ + db_merge_operator_test \ db_sst_test \ db_test \ db_universal_compaction_test \ @@ -492,8 +565,14 @@ PARALLEL_TEST = \ manual_compaction_test \ persistent_cache_test \ table_test \ - transaction_test + transaction_test \ + write_prepared_transaction_test \ + write_unprepared_transaction_test \ +# options_settable_test doesn't pass with UBSAN as we use hack in the test +ifdef COMPILE_WITH_UBSAN + TESTS := $(shell echo $(TESTS) | sed 's/\boptions_settable_test\b//g') +endif SUBSET := $(TESTS) ifdef ROCKSDBTESTS_START SUBSET := $(shell echo $(SUBSET) | sed 
's/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/') @@ -513,12 +592,13 @@ TOOLS = \ rocksdb_dump \ rocksdb_undump \ blob_dump \ + trace_analyzer \ TEST_LIBS = \ librocksdb_env_basic_test.a # TODO: add back forward_iterator_bench, after making it build in all environemnts. -BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench column_aware_encoding_exp persistent_cache_bench +BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench persistent_cache_bench range_del_aggregator_bench # if user didn't config LIBNAME, set the default ifeq ($(LIBNAME),) @@ -572,22 +652,44 @@ $(SHARED2): $(SHARED4) $(SHARED3): $(SHARED4) ln -fs $(SHARED4) $(SHARED3) endif - +ifeq ($(HAVE_POWER8),1) +SHARED_C_OBJECTS = $(LIB_SOURCES_C:.c=.o) +SHARED_ASM_OBJECTS = $(LIB_SOURCES_ASM:.S=.o) +SHARED_C_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_C_OBJECTS)) +SHARED_ASM_LIBOBJECTS = $(patsubst %.o,shared-objects/%.o,$(SHARED_ASM_OBJECTS)) +shared_libobjects = $(patsubst %,shared-objects/%,$(LIB_CC_OBJECTS)) +else shared_libobjects = $(patsubst %,shared-objects/%,$(LIBOBJECTS)) +endif + CLEAN_FILES += shared-objects +shared_all_libobjects = $(shared_libobjects) + +ifeq ($(HAVE_POWER8),1) +shared-ppc-objects = $(SHARED_C_LIBOBJECTS) $(SHARED_ASM_LIBOBJECTS) +shared-objects/util/crc32c_ppc.o: util/crc32c_ppc.c + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ + +shared-objects/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ +endif $(shared_libobjects): shared-objects/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) -c $< -o $@ -$(SHARED4): $(shared_libobjects) - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(shared_libobjects) $(LDFLAGS) -o $@ +ifeq ($(HAVE_POWER8),1) +shared_all_libobjects = $(shared_libobjects) $(shared-ppc-objects) +endif +$(SHARED4): $(shared_all_libobjects) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(shared_all_libobjects) $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ - release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ + release tags tags0 valgrind_check whitebox_crash_test format static_lib shared_lib all \ dbg rocksdbjavastatic rocksdbjava install install-static install-shared uninstall \ - analyze tools tools_lib + analyze tools tools_lib \ + blackbox_crash_test_with_atomic_flush whitebox_crash_test_with_atomic_flush all: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(TESTS) @@ -616,7 +718,7 @@ coverage: COVERAGEFLAGS="-fprofile-arcs -ftest-coverage" LDFLAGS+="-lgcov" $(MAKE) J=1 all check cd coverage && ./coverage_test.sh # Delete intermediate files - find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . 
-type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) # Use /dev/shm if it has the sticky bit set (otherwise, /tmp), @@ -701,7 +803,7 @@ gen_parallel_tests: # 107.816 PASS t/DBTest.EncodeDecompressedBlockSizeTest # slow_test_regexp = \ - ^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ + ^.*SnapshotConcurrentAccessTest.*$$|^t/run-table_test-HarnessTest.Randomized$$|^t/run-db_test-.*(?:FileCreationRandomFailure|EncodeDecompressedBlockSizeTest)$$|^.*RecoverFromCorruptedWALWithoutFlush$$ prioritize_long_running_tests = \ perl -pe 's,($(slow_test_regexp)),100 $$1,' \ | sort -k1,1gr \ @@ -717,7 +819,6 @@ J ?= 100% # Use this regexp to select the subset of tests whose names match. tests-regexp = . -t_run = $(wildcard t/run-*) .PHONY: check_0 check_0: $(AM_V_GEN)export TEST_TMPDIR=$(TMPD); \ @@ -727,13 +828,13 @@ check_0: test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST),$(TESTS)); \ - printf '%s\n' $(t_run); \ + find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ | build_tools/gnu_parallel -j$(J) --plain --joblog=LOG $$eta --gnu '{} >& t/log-{/}' -valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized +valgrind-blacklist-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest.DeadlockStress|DBCompactionTest.SuggestCompactRangeNoTwoLevel0Compactions|BackupableDBTest.RateLimiting|DBTest.CloseSpeedup|DBTest.ThreadStatusFlush|DBTest.RateLimitingTest|DBTest.EncodeDecompressedBlockSizeTest|FaultInjectionTest.UninstalledCompaction|HarnessTest.Randomized|ExternalSSTFileTest.CompactDuringAddFileRandom|ExternalSSTFileTest.IngestFileWithGlobalSeqnoRandomized|MySQLStyleTransactionTest.TransactionStressTest .PHONY: valgrind_check_0 valgrind_check_0: @@ -744,7 +845,7 @@ valgrind_check_0: test -t 1 && eta=--eta || eta=; \ { \ printf './%s\n' $(filter-out $(PARALLEL_TEST) %skiplist_test options_settable_test, $(TESTS)); \ - printf '%s\n' $(t_run); \ + find t -name 'run-*' -print; \ } \ | $(prioritize_long_running_tests) \ | grep -E '$(tests-regexp)' \ @@ -763,7 +864,7 @@ CLEAN_FILES += t LOG $(TMPD) # regardless of their duration. As with any use of "watch", hit ^C to # interrupt. watch-log: - watch --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' + $(WATCH) --interval=0 'sort -k7,7nr -k4,4gr LOG|$(quoted_perl_command)' # If J != 1 and GNU parallel is installed, run the tests in parallel, # via the check_0 rule above. Otherwise, run them sequentially. 
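The `check_0` machinery above is driven by a handful of make variables defined in this hunk (`J`, `tests-regexp`) that can be overridden on the command line. A minimal usage sketch, assuming GNU parallel is available under build_tools/ as the Makefile expects; the test-name pattern and job count here are illustrative, not prescribed by the patch:

    # Run only tests whose names match 'db_iter', 8 at a time via GNU parallel;
    # the prioritize_long_running_tests pipeline starts the known slow tests first.
    make J=8 tests-regexp='db_iter' check

    # In another terminal, tail the parallel joblog to see which tests are still running.
    make watch-log
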
@@ -796,10 +897,15 @@ ldb_tests: ldb crash_test: whitebox_crash_test blackbox_crash_test +crash_test_with_atomic_flush: whitebox_crash_test_with_atomic_flush blackbox_crash_test_with_atomic_flush + blackbox_crash_test: db_stress python -u tools/db_crashtest.py --simple blackbox $(CRASH_TEST_EXT_ARGS) python -u tools/db_crashtest.py blackbox $(CRASH_TEST_EXT_ARGS) +blackbox_crash_test_with_atomic_flush: db_stress + python -u tools/db_crashtest.py --enable_atomic_flush blackbox $(CRASH_TEST_EXT_ARGS) + ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif @@ -810,6 +916,10 @@ whitebox_crash_test: db_stress python -u tools/db_crashtest.py whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) +whitebox_crash_test_with_atomic_flush: db_stress + python -u tools/db_crashtest.py --enable_atomic_flush whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + asan_check: $(MAKE) clean COMPILE_WITH_ASAN=1 $(MAKE) check -j32 @@ -820,6 +930,11 @@ asan_crash_test: COMPILE_WITH_ASAN=1 $(MAKE) crash_test $(MAKE) clean +asan_crash_test_with_atomic_flush: + $(MAKE) clean + COMPILE_WITH_ASAN=1 $(MAKE) crash_test_with_atomic_flush + $(MAKE) clean + ubsan_check: $(MAKE) clean COMPILE_WITH_UBSAN=1 $(MAKE) check -j32 @@ -830,8 +945,13 @@ ubsan_crash_test: COMPILE_WITH_UBSAN=1 $(MAKE) crash_test $(MAKE) clean +ubsan_crash_test_with_atomic_flush: + $(MAKE) clean + COMPILE_WITH_UBSAN=1 $(MAKE) crash_test_with_atomic_flush + $(MAKE) clean + valgrind_test: - DISABLE_JEMALLOC=1 $(MAKE) valgrind_check + ROCKSDB_VALGRIND_RUN=1 DISABLE_JEMALLOC=1 $(MAKE) valgrind_check valgrind_check: $(TESTS) $(MAKE) DRIVER="$(VALGRIND_VER) $(VALGRIND_OPTS)" gen_parallel_tests @@ -917,8 +1037,10 @@ unity.a: unity.o $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o + +TOOLLIBOBJECTS = $(TOOL_LIB_SOURCES:.cc=.o) # try compiling db_test with unity -unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a +unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) $(TOOLLIBOBJECTS) unity.a $(AM_LINK) ./unity_test @@ -928,14 +1050,21 @@ rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc clean: rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(LIBRARY) $(SHARED) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report - find . -name "*.[oda]" -exec rm -f {} \; - find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + $(FIND) . -name "*.[oda]" -exec rm -f {} \; + $(FIND) . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; rm -rf bzip2* snappy* zlib* lz4* zstd* cd java; $(MAKE) clean tags: - ctags * -R - cscope -b `find . -name '*.cc'` `find . -name '*.h'` `find . -name '*.c'` + ctags -R . + cscope -b `$(FIND) . -name '*.cc'` `$(FIND) . -name '*.h'` `$(FIND) . -name '*.c'` + ctags -e -R -o etags * + +tags0: + ctags -R . + cscope -b `$(FIND) . -name '*.cc' -and ! -name '*_test.cc'` \ + `$(FIND) . -name '*.c' -and ! -name '*_test.c'` \ + `$(FIND) . -name '*.h' -and ! 
-name '*_test.h'` ctags -e -R -o etags * format: @@ -951,7 +1080,7 @@ $(LIBRARY): $(LIBOBJECTS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) -$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) +$(TOOLS_LIBRARY): $(BENCH_LIB_SOURCES:.cc=.o) $(TOOL_LIB_SOURCES:.cc=.o) $(LIB_SOURCES:.cc=.o) $(TESTUTIL) $(ANALYZER_LIB_SOURCES:.cc=.o) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $^ @@ -962,6 +1091,9 @@ librocksdb_env_basic_test.a: env/env_basic_test.o $(LIBOBJECTS) $(TESTHARNESS) db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) $(AM_LINK) +trace_analyzer: tools/trace_analyzer.o $(ANALYZETOOLOBJECTS) $(LIBOBJECTS) + $(AM_LINK) + cache_bench: cache/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) @@ -1031,9 +1163,6 @@ cassandra_row_merge_test: utilities/cassandra/cassandra_row_merge_test.o utiliti cassandra_serialize_test: utilities/cassandra/cassandra_serialize_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -redis_test: utilities/redis/redis_lists_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - hash_table_test: utilities/persistent_cache/hash_table_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1115,6 +1244,9 @@ db_statistics_test: db/db_statistics_test.o db/db_test_util.o $(LIBOBJECTS) $(TE db_write_test: db/db_write_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +error_handler_test: db/error_handler_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + external_sst_file_basic_test: db/external_sst_file_basic_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1127,6 +1259,9 @@ db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_iter_stress_test: db/db_iter_stress_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1166,18 +1301,9 @@ backupable_db_test: utilities/backupable/backupable_db_test.o $(LIBOBJECTS) $(TE checkpoint_test: utilities/checkpoint/checkpoint_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -document_db_test: utilities/document/document_db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - -json_document_test: utilities/document/json_document_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1195,9 +1321,6 @@ object_registry_test: utilities/object_registry_test.o $(LIBOBJECTS) $(TESTHARNE ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -date_tiered_test: utilities/date_tiered/date_tiered_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1231,7 +1354,7 @@ env_test: env/env_test.o $(LIBOBJECTS) $(TESTHARNESS) fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) +rate_limiter_test: util/rate_limiter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) delete_scheduler_test: util/delete_scheduler_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1264,6 
+1387,9 @@ table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) block_test: table/block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +data_block_hash_index_test: table/data_block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + inlineskiplist_test: memtable/inlineskiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1318,7 +1444,7 @@ options_file_test: db/options_file_test.o $(LIBOBJECTS) $(TESTHARNESS) deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS) +obsolete_files_test: db/obsolete_files_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) rocksdb_dump: tools/dump/rocksdb_dump.o $(LIBOBJECTS) @@ -1357,6 +1483,9 @@ options_util_test: utilities/options/options_util_test.o $(LIBOBJECTS) $(TESTHAR db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS) $(AM_LINK) +trace_analyzer_test: tools/trace_analyzer_test.o $(LIBOBJECTS) $(ANALYZETOOLOBJECTS) $(TESTHARNESS) + $(AM_LINK) + event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1366,9 +1495,6 @@ timer_queue_test: util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -column_aware_encoding_test: utilities/column_aware_encoding_test.o $(TESTHARNESS) $(EXPOBJECTS) - $(AM_LINK) - optimistic_transaction_test: utilities/transactions/optimistic_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1396,13 +1522,16 @@ heap_test: util/heap_test.o $(GTEST) transaction_test: utilities/transactions/transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -sst_dump: tools/sst_dump.o $(LIBOBJECTS) +write_prepared_transaction_test: utilities/transactions/write_prepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -blob_dump: tools/blob_dump.o $(LIBOBJECTS) +write_unprepared_transaction_test: utilities/transactions/write_unprepared_transaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -column_aware_encoding_exp: utilities/column_aware_encoding_exp.o $(EXPOBJECTS) +sst_dump: tools/sst_dump.o $(LIBOBJECTS) + $(AM_LINK) + +blob_dump: tools/blob_dump.o $(LIBOBJECTS) $(AM_LINK) repair_test: db/repair_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) @@ -1426,15 +1555,27 @@ statistics_test: monitoring/statistics_test.o $(LIBOBJECTS) $(TESTHARNESS) lru_cache_test: cache/lru_cache_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -lua_test: utilities/lua/rocks_lua_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -range_del_aggregator_test: db/range_del_aggregator_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) +range_del_aggregator_bench: db/range_del_aggregator_bench.o $(LIBOBJECTS) $(TESTUTIL) $(AM_LINK) blob_db_test: utilities/blob_db/blob_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_secondary_test: db/db_secondary_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= /usr/local @@ -1449,10 +1590,10 @@ 
uninstall: install-headers: install -d $(INSTALL_PATH)/lib - for header_dir in `find "include/rocksdb" -type d`; do \ + for header_dir in `$(FIND) "include/rocksdb" -type d`; do \ install -d $(INSTALL_PATH)/$$header_dir; \ done - for header in `find "include/rocksdb" -type f -name *.h`; do \ + for header in `$(FIND) "include/rocksdb" -type f -name *.h`; do \ install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ done @@ -1479,6 +1620,12 @@ install: install-static JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux ifeq ($(PLATFORM), OS_SOLARIS) ARCH := $(shell isainfo -b) +else ifeq ($(PLATFORM), OS_OPENBSD) + ifneq (,$(filter $(MACHINE), amd64 arm64 sparc64)) + ARCH := 64 + else + ARCH := 32 + endif else ARCH := $(shell getconf LONG_BIT) endif @@ -1499,16 +1646,17 @@ ZLIB_SHA256 ?= c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1 ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.6 BZIP2_SHA256 ?= a2848f34fcd5d6cf47def00461fcb528a0484d8edef8208d6d2e2909dc61d9cd -BZIP2_DOWNLOAD_BASE ?= http://www.bzip.org -SNAPPY_VER ?= 1.1.4 -SNAPPY_SHA256 ?= 134bfe122fd25599bb807bb8130e7ba6d9bdb851e0b16efcb83ac4f5d0b70057 -SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/releases/download -LZ4_VER ?= 1.7.5 -LZ4_SHA256 ?= 0190cacd63022ccb86f44fa5041dc6c3804407ad61550ca21c382827319e7e7e +BZIP2_DOWNLOAD_BASE ?= https://web.archive.org/web/20180624184835/http://www.bzip.org +SNAPPY_VER ?= 1.1.7 +SNAPPY_SHA256 ?= 3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4 +SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive +LZ4_VER ?= 1.8.3 +LZ4_SHA256 ?= 33af5936ac06536805f9745e0b6d61da606a1f8b4cc5c04dd3cbaca3b9b4fc43 LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.2.0 -ZSTD_SHA256 ?= 4a7e4593a3638276ca7f2a09dc4f38e674d8317bbea51626393ca73fc047cbfb +ZSTD_VER ?= 1.3.7 +ZSTD_SHA256 ?= 5dd1e90eb16c25425880c8a91327f63de22891ffed082fcc17e5ae84fce0d5fb ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive +CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) ROCKSDBJNILIB = librocksdbjni-osx.jnilib @@ -1521,7 +1669,7 @@ else endif endif ifeq ($(PLATFORM), OS_FREEBSD) - JAVA_INCLUDE += -I$(JAVA_HOME)/include/freebsd + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/freebsd ROCKSDBJNILIB = librocksdbjni-freebsd$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-freebsd$(ARCH).jar endif @@ -1537,92 +1685,123 @@ ifeq ($(PLATFORM), OS_AIX) EXTRACT_SOURCES = gunzip < TAR_GZ | tar xvf - SNAPPY_MAKE_TARGET = libsnappy.la endif +ifeq ($(PLATFORM), OS_OPENBSD) + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/openbsd + ROCKSDBJNILIB = librocksdbjni-openbsd$(ARCH).so + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-openbsd$(ARCH).jar +endif libz.a: -rm -rf zlib-$(ZLIB_VER) - curl -O -L ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz +ifeq (,$(wildcard ./zlib-$(ZLIB_VER).tar.gz)) + curl --output zlib-$(ZLIB_VER).tar.gz -L ${ZLIB_DOWNLOAD_BASE}/zlib-$(ZLIB_VER).tar.gz +endif ZLIB_SHA256_ACTUAL=`$(SHA256_CMD) zlib-$(ZLIB_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZLIB_SHA256)" != "$$ZLIB_SHA256_ACTUAL" ]; then \ echo zlib-$(ZLIB_VER).tar.gz checksum mismatch, expected=\"$(ZLIB_SHA256)\" actual=\"$$ZLIB_SHA256_ACTUAL\"; \ exit 1; \ fi tar xvzf zlib-$(ZLIB_VER).tar.gz - cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && make + cd zlib-$(ZLIB_VER) && CFLAGS='-fPIC ${EXTRA_CFLAGS}' 
LDFLAGS='${EXTRA_LDFLAGS}' ./configure --static && $(MAKE) cp zlib-$(ZLIB_VER)/libz.a . libbz2.a: -rm -rf bzip2-$(BZIP2_VER) - curl -O -L ${BZIP2_DOWNLOAD_BASE}/$(BZIP2_VER)/bzip2-$(BZIP2_VER).tar.gz +ifeq (,$(wildcard ./bzip2-$(BZIP2_VER).tar.gz)) + curl --output bzip2-$(BZIP2_VER).tar.gz -L ${BZIP2_DOWNLOAD_BASE}/$(BZIP2_VER)/bzip2-$(BZIP2_VER).tar.gz +endif BZIP2_SHA256_ACTUAL=`$(SHA256_CMD) bzip2-$(BZIP2_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(BZIP2_SHA256)" != "$$BZIP2_SHA256_ACTUAL" ]; then \ echo bzip2-$(BZIP2_VER).tar.gz checksum mismatch, expected=\"$(BZIP2_SHA256)\" actual=\"$$BZIP2_SHA256_ACTUAL\"; \ exit 1; \ fi tar xvzf bzip2-$(BZIP2_VER).tar.gz - cd bzip2-$(BZIP2_VER) && make CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' + cd bzip2-$(BZIP2_VER) && $(MAKE) CFLAGS='-fPIC -O2 -g -D_FILE_OFFSET_BITS=64 ${EXTRA_CFLAGS}' AR='ar ${EXTRA_ARFLAGS}' cp bzip2-$(BZIP2_VER)/libbz2.a . libsnappy.a: -rm -rf snappy-$(SNAPPY_VER) - curl -O -L ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER)/snappy-$(SNAPPY_VER).tar.gz +ifeq (,$(wildcard ./snappy-$(SNAPPY_VER).tar.gz)) + curl --output snappy-$(SNAPPY_VER).tar.gz -L ${CURL_SSL_OPTS} ${SNAPPY_DOWNLOAD_BASE}/$(SNAPPY_VER).tar.gz +endif SNAPPY_SHA256_ACTUAL=`$(SHA256_CMD) snappy-$(SNAPPY_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(SNAPPY_SHA256)" != "$$SNAPPY_SHA256_ACTUAL" ]; then \ echo snappy-$(SNAPPY_VER).tar.gz checksum mismatch, expected=\"$(SNAPPY_SHA256)\" actual=\"$$SNAPPY_SHA256_ACTUAL\"; \ exit 1; \ fi tar xvzf snappy-$(SNAPPY_VER).tar.gz - cd snappy-$(SNAPPY_VER) && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' ./configure --with-pic --enable-static --disable-shared - cd snappy-$(SNAPPY_VER) && make ${SNAPPY_MAKE_TARGET} - cp snappy-$(SNAPPY_VER)/.libs/libsnappy.a . + mkdir snappy-$(SNAPPY_VER)/build + cd snappy-$(SNAPPY_VER)/build && CFLAGS='${EXTRA_CFLAGS}' CXXFLAGS='${EXTRA_CXXFLAGS}' LDFLAGS='${EXTRA_LDFLAGS}' cmake -DCMAKE_POSITION_INDEPENDENT_CODE=ON .. && $(MAKE) ${SNAPPY_MAKE_TARGET} + cp snappy-$(SNAPPY_VER)/build/libsnappy.a . liblz4.a: -rm -rf lz4-$(LZ4_VER) - curl -O -L ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz - mv v$(LZ4_VER).tar.gz lz4-$(LZ4_VER).tar.gz +ifeq (,$(wildcard ./lz4-$(LZ4_VER).tar.gz)) + curl --output lz4-$(LZ4_VER).tar.gz -L ${CURL_SSL_OPTS} ${LZ4_DOWNLOAD_BASE}/v$(LZ4_VER).tar.gz +endif LZ4_SHA256_ACTUAL=`$(SHA256_CMD) lz4-$(LZ4_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(LZ4_SHA256)" != "$$LZ4_SHA256_ACTUAL" ]; then \ echo lz4-$(LZ4_VER).tar.gz checksum mismatch, expected=\"$(LZ4_SHA256)\" actual=\"$$LZ4_SHA256_ACTUAL\"; \ exit 1; \ fi tar xvzf lz4-$(LZ4_VER).tar.gz - cd lz4-$(LZ4_VER)/lib && make CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd lz4-$(LZ4_VER)/lib && $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all cp lz4-$(LZ4_VER)/lib/liblz4.a . libzstd.a: -rm -rf zstd-$(ZSTD_VER) - curl -O -L ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz - mv v$(ZSTD_VER).tar.gz zstd-$(ZSTD_VER).tar.gz +ifeq (,$(wildcard ./zstd-$(ZSTD_VER).tar.gz)) + curl --output zstd-$(ZSTD_VER).tar.gz -L ${CURL_SSL_OPTS} ${ZSTD_DOWNLOAD_BASE}/v$(ZSTD_VER).tar.gz +endif ZSTD_SHA256_ACTUAL=`$(SHA256_CMD) zstd-$(ZSTD_VER).tar.gz | cut -d ' ' -f 1`; \ if [ "$(ZSTD_SHA256)" != "$$ZSTD_SHA256_ACTUAL" ]; then \ echo zstd-$(ZSTD_VER).tar.gz checksum mismatch, expected=\"$(ZSTD_SHA256)\" actual=\"$$ZSTD_SHA256_ACTUAL\"; \ exit 1; \ fi tar xvzf zstd-$(ZSTD_VER).tar.gz - cd zstd-$(ZSTD_VER)/lib && make CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' all + cd zstd-$(ZSTD_VER)/lib && DESTDIR=. 
PREFIX= $(MAKE) CFLAGS='-fPIC -O2 ${EXTRA_CFLAGS}' install cp zstd-$(ZSTD_VER)/lib/libzstd.a . # A version of each $(LIBOBJECTS) compiled with -fPIC and a fixed set of static compression libraries -java_static_libobjects = $(patsubst %,jls/%,$(LIBOBJECTS)) +java_static_libobjects = $(patsubst %,jls/%,$(LIB_CC_OBJECTS)) CLEAN_FILES += jls +java_static_all_libobjects = $(java_static_libobjects) ifneq ($(ROCKSDB_JAVA_NO_COMPRESSION), 1) JAVA_COMPRESSIONS = libz.a libbz2.a libsnappy.a liblz4.a libzstd.a endif JAVA_STATIC_FLAGS = -DZLIB -DBZIP2 -DSNAPPY -DLZ4 -DZSTD -JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib +JAVA_STATIC_INCLUDES = -I./zlib-$(ZLIB_VER) -I./bzip2-$(BZIP2_VER) -I./snappy-$(SNAPPY_VER) -I./lz4-$(LZ4_VER)/lib -I./zstd-$(ZSTD_VER)/lib/include + +ifeq ($(HAVE_POWER8),1) +JAVA_STATIC_C_LIBOBJECTS = $(patsubst %.c.o,jls/%.c.o,$(LIB_SOURCES_C:.c=.o)) +JAVA_STATIC_ASM_LIBOBJECTS = $(patsubst %.S.o,jls/%.S.o,$(LIB_SOURCES_ASM:.S=.o)) + +java_static_ppc_libobjects = $(JAVA_STATIC_C_LIBOBJECTS) $(JAVA_STATIC_ASM_LIBOBJECTS) + +jls/util/crc32c_ppc.o: util/crc32c_ppc.c + $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ + +jls/util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S + $(AM_V_CC)$(CC) $(CFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -c $< -o $@ + +java_static_all_libobjects += $(java_static_ppc_libobjects) +endif $(java_static_libobjects): jls/%.o: %.cc $(JAVA_COMPRESSIONS) $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(JAVA_STATIC_FLAGS) $(JAVA_STATIC_INCLUDES) -fPIC -c $< -o $@ $(COVERAGEFLAGS) -rocksdbjavastatic: $(java_static_libobjects) +rocksdbjavastatic: $(java_static_all_libobjects) cd java;$(MAKE) javalib; rm -f ./java/target/$(ROCKSDBJNILIB) $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC \ -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \ - $(java_static_libobjects) $(COVERAGEFLAGS) \ + $(java_static_all_libobjects) $(COVERAGEFLAGS) \ $(JAVA_COMPRESSIONS) $(JAVA_STATIC_LDFLAGS) - cd java/target;strip $(STRIPFLAGS) $(ROCKSDBJNILIB) + cd java/target;if [ "$(DEBUG_LEVEL)" == "0" ]; then \ + strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \ + fi cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class @@ -1635,20 +1814,34 @@ rocksdbjavastaticrelease: rocksdbjavastatic cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class -rocksdbjavastaticreleasedocker: rocksdbjavastatic +rocksdbjavastaticreleasedocker: rocksdbjavastatic rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 + cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + +rocksdbjavastaticdockerx86: + mkdir -p java/target + DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ + if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ + docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ + fi + docker start -a rocksdb_linux_x86-be + +rocksdbjavastaticdockerx86_64: + mkdir -p java/target DOCKER_LINUX_X64_CONTAINER=`docker ps -aqf name=rocksdb_linux_x64-be`; \ if [ -z "$$DOCKER_LINUX_X64_CONTAINER" ]; then \ docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x64-be evolvedbinary/rocksjava:centos6_x64-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ fi docker start -a rocksdb_linux_x64-be - DOCKER_LINUX_X86_CONTAINER=`docker ps -aqf name=rocksdb_linux_x86-be`; \ - if [ -z "$$DOCKER_LINUX_X86_CONTAINER" ]; then \ - docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_x86-be evolvedbinary/rocksjava:centos6_x86-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ + +rocksdbjavastaticdockerppc64le: + mkdir -p java/target + DOCKER_LINUX_PPC64LE_CONTAINER=`docker ps -aqf name=rocksdb_linux_ppc64le-be`; \ + if [ -z "$$DOCKER_LINUX_PPC64LE_CONTAINER" ]; then \ + docker container create --attach stdin --attach stdout --attach stderr --volume `pwd`:/rocksdb-host --name rocksdb_linux_ppc64le-be evolvedbinary/rocksjava:centos7_ppc64le-be /rocksdb-host/java/crossbuild/docker-build-linux-centos.sh; \ fi - docker start -a rocksdb_linux_x86-be - cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md - cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib - cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + docker start -a rocksdb_linux_ppc64le-be rocksdbjavastaticpublish: rocksdbjavastaticrelease rocksdbjavastaticpublishcentral @@ -1664,16 +1857,39 @@ rocksdbjavastaticpublishcentral: mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging 
-DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar # A version of each $(LIBOBJECTS) compiled with -fPIC -java_libobjects = $(patsubst %,jl/%,$(LIBOBJECTS)) +ifeq ($(HAVE_POWER8),1) +JAVA_CC_OBJECTS = $(SHARED_CC_OBJECTS) +JAVA_C_OBJECTS = $(SHARED_C_OBJECTS) +JAVA_ASM_OBJECTS = $(SHARED_ASM_OBJECTS) + +JAVA_C_LIBOBJECTS = $(patsubst %.c.o,jl/%.c.o,$(JAVA_C_OBJECTS)) +JAVA_ASM_LIBOBJECTS = $(patsubst %.S.o,jl/%.S.o,$(JAVA_ASM_OBJECTS)) +endif + +java_libobjects = $(patsubst %,jl/%,$(LIB_CC_OBJECTS)) CLEAN_FILES += jl +java_all_libobjects = $(java_libobjects) + +ifeq ($(HAVE_POWER8),1) +java_ppc_libobjects = $(JAVA_C_LIBOBJECTS) $(JAVA_ASM_LIBOBJECTS) + +jl/crc32c_ppc.o: util/crc32c_ppc.c + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ + +jl/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ +java_all_libobjects += $(java_ppc_libobjects) +endif $(java_libobjects): jl/%.o: %.cc $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS) -rocksdbjava: $(java_libobjects) + + +rocksdbjava: $(java_all_libobjects) $(AM_V_GEN)cd java;$(MAKE) javalib; $(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB) - $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(java_libobjects) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(java_all_libobjects) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) $(AM_V_at)cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md $(AM_V_at)cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) $(AM_V_at)cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class @@ -1705,7 +1921,8 @@ commit_prereq: build_tools/rocksdb-lego-determinator \ ifeq ($(PLATFORM), IOS) # For iOS, create universal object files to be used on both the simulator and # a device. -PLATFORMSROOT=/Applications/Xcode.app/Contents/Developer/Platforms +XCODEROOT=$(shell xcode-select -print-path) +PLATFORMSROOT=$(XCODEROOT)/Platforms SIMULATORROOT=$(PLATFORMSROOT)/iPhoneSimulator.platform/Developer DEVICEROOT=$(PLATFORMSROOT)/iPhoneOS.platform/Developer IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBundleShortVersionString) @@ -1725,30 +1942,54 @@ IOSVERSION=$(shell defaults read $(PLATFORMSROOT)/iPhoneOS.platform/version CFBu lipo ios-x86/$@ ios-arm/$@ -create -output $@ else +ifeq ($(HAVE_POWER8),1) +util/crc32c_ppc.o: util/crc32c_ppc.c + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ + +util/crc32c_ppc_asm.o: util/crc32c_ppc_asm.S + $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ +endif .cc.o: $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS) .c.o: $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@ endif - # --------------------------------------------------------------------------- # Source files dependencies detection # --------------------------------------------------------------------------- -all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(EXP_LIB_SOURCES) -DEPFILES = $(all_sources:.cc=.d) +all_sources = $(LIB_SOURCES) $(MAIN_SOURCES) $(MOCK_LIB_SOURCES) $(TOOL_LIB_SOURCES) $(BENCH_LIB_SOURCES) $(TEST_LIB_SOURCES) $(ANALYZER_LIB_SOURCES) +DEPFILES = $(all_sources:.cc=.cc.d) # Add proper dependency support so changing a .h file forces a .cc file to # rebuild. # The .d file indicates .cc file's dependencies on .h files. 
We generate such # dependency by g++'s -MM option, whose output is a make dependency rule. -$(DEPFILES): %.d: %.cc +%.cc.d: %.cc @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ -MM -MT'$@' -MT'$(<:.cc=.o)' "$<" -o '$@' +ifeq ($(HAVE_POWER8),1) +DEPFILES_C = $(LIB_SOURCES_C:.c=.c.d) +DEPFILES_ASM = $(LIB_SOURCES_ASM:.S=.S.d) + +%.c.d: %.c + @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.c=.o)' "$<" -o '$@' + +%.S.d: %.S + @$(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \ + -MM -MT'$@' -MT'$(<:.S=.o)' "$<" -o '$@' + +$(DEPFILES_C): %.c.d + +$(DEPFILES_ASM): %.S.d +depend: $(DEPFILES) $(DEPFILES_C) $(DEPFILES_ASM) +else depend: $(DEPFILES) +endif # if the make goal is either "clean" or "format", we shouldn't # try to import the *.d files. diff --git a/thirdparty/rocksdb/README.md b/thirdparty/rocksdb/README.md index 550c352b88..f1bc0c05f3 100644 --- a/thirdparty/rocksdb/README.md +++ b/thirdparty/rocksdb/README.md @@ -1,11 +1,11 @@ ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage -[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) -[![Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/master?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/master) - +[![Linux/Mac Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) +[![Windows Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/master?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/master) +[![PPC64le Build Status](http://140.211.168.68:8080/buildStatus/icon?job=Rocksdb)](http://140.211.168.68:8080/job/Rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast @@ -25,3 +25,7 @@ rely on the details of any other header files in this package. Those internal APIs may be changed without warning. Design discussions are conducted in https://www.facebook.com/groups/rocksdb.dev/ + +## License + +RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses. 
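Stepping back to the dependency-detection rules in the Makefile hunk above: each `%.cc.d` pattern rule shells out to the compiler's `-MM` preprocessor mode, which emits a make rule naming every non-system header the source file includes, so editing a header forces the objects that use it to rebuild. A sketch of what the recipe expands to for one source file (file names chosen for illustration, flags elided):

    # What the %.cc.d recipe effectively runs for db/db_iter.cc:
    g++ -I. -I./include -MM -MT'db/db_iter.cc.d' -MT'db/db_iter.o' db/db_iter.cc -o db/db_iter.cc.d

    # db/db_iter.cc.d then holds a dependency rule along the lines of:
    #   db/db_iter.cc.d db/db_iter.o: db/db_iter.cc db/db_iter.h \
    #     include/rocksdb/iterator.h ...
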
diff --git a/thirdparty/rocksdb/ROCKSDB_LITE.md b/thirdparty/rocksdb/ROCKSDB_LITE.md index 41cfbecc2c..8991b95063 100644 --- a/thirdparty/rocksdb/ROCKSDB_LITE.md +++ b/thirdparty/rocksdb/ROCKSDB_LITE.md @@ -5,7 +5,7 @@ RocksDBLite is a project focused on mobile use cases, which don't need a lot of Some examples of the features disabled by ROCKSDB_LITE: * compiled-in support for LDB tool * No backupable DB -* No support for replication (which we provide in form of TrasactionalIterator) +* No support for replication (which we provide in form of TransactionalIterator) * No advanced monitoring tools * No special-purpose memtables that are highly optimized for specific use cases * No Transactions diff --git a/thirdparty/rocksdb/TARGETS b/thirdparty/rocksdb/TARGETS index ac85eab93c..073c977e5a 100644 --- a/thirdparty/rocksdb/TARGETS +++ b/thirdparty/rocksdb/TARGETS @@ -1,534 +1,1081 @@ +load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load(":defs.bzl", "test_binary") -import os +REPO_PATH = package_name() + "/" -TARGETS_PATH = os.path.dirname(__file__) -REPO_PATH = TARGETS_PATH[(TARGETS_PATH.find('fbcode/') + len('fbcode/')):] + "/" -BUCK_BINS = "buck-out/gen/" + REPO_PATH -TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh" -rocksdb_compiler_flags = [ - "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - "-DROCKSDB_UBSAN_RUN", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", - # Needed to compile in fbcode - "-Wno-expansion-to-defined", +ROCKSDB_COMPILER_FLAGS = [ + "-fno-builtin-memcmp", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + "-DOS_LINUX", + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DNUMA", + "-DTBB", + # Needed to compile in fbcode + "-Wno-expansion-to-defined", + # Added missing flags from output of build_detect_platform + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_BACKTRACE", + "-Wnarrowing", ] -rocksdb_external_deps = [ - ('bzip2', None, 'bz2'), - ('snappy', None, "snappy"), - ('zlib', None, 'z'), - ('gflags', None, 'gflags'), - ('lz4', None, 'lz4'), - ('zstd', None), - ('tbb', None), - ("numa", None, "numa"), - ("googletest", None, "gtest"), +ROCKSDB_EXTERNAL_DEPS = [ + ("bzip2", None, "bz2"), + ("snappy", None, "snappy"), + ("zlib", None, "z"), + ("gflags", None, "gflags"), + ("lz4", None, "lz4"), + ("zstd", None), + ("tbb", None), + ("numa", None, "numa"), + ("googletest", None, "gtest"), ] -rocksdb_preprocessor_flags = [ - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +ROCKSDB_PREPROCESSOR_FLAGS = [ + # Directories with files for #include + "-I" + REPO_PATH + "include/", + "-I" + REPO_PATH, ] -rocksdb_arch_preprocessor_flags = { - "x86_64": ["-DHAVE_SSE42"], +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { + "x86_64": [ + "-DHAVE_SSE42", + "-DHAVE_PCLMUL", + ], } +build_mode = read_config("fbcode", "build_mode") + +is_opt_mode = 
build_mode.startswith("opt") + +# -DNDEBUG is added by default in opt mode in fbcode. But adding it twice +# doesn't harm and avoid forgetting to add it. +ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) + +sanitizer = read_config("fbcode", "sanitizer") + +# Do not enable jemalloc if sanitizer presents. RocksDB will further detect +# whether the binary is linked with jemalloc at runtime. +ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) + +ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) + cpp_library( name = "rocksdb_lib", - headers = AutoHeaders.RECURSIVE_GLOB, srcs = [ - "cache/clock_cache.cc", - "cache/lru_cache.cc", - "cache/sharded_cache.cc", - "db/builder.cc", - "db/c.cc", - "db/column_family.cc", - "db/compacted_db_impl.cc", - "db/compaction.cc", - "db/compaction_iterator.cc", - "db/compaction_job.cc", - "db/compaction_picker.cc", - "db/compaction_picker_universal.cc", - "db/convenience.cc", - "db/db_filesnapshot.cc", - "db/db_impl.cc", - "db/db_impl_write.cc", - "db/db_impl_compaction_flush.cc", - "db/db_impl_files.cc", - "db/db_impl_open.cc", - "db/db_impl_debug.cc", - "db/db_impl_experimental.cc", - "db/db_impl_readonly.cc", - "db/db_info_dumper.cc", - "db/db_iter.cc", - "db/dbformat.cc", - "db/event_helpers.cc", - "db/experimental.cc", - "db/external_sst_file_ingestion_job.cc", - "db/file_indexer.cc", - "db/flush_job.cc", - "db/flush_scheduler.cc", - "db/forward_iterator.cc", - "db/internal_stats.cc", - "db/log_reader.cc", - "db/log_writer.cc", - "db/malloc_stats.cc", - "db/managed_iterator.cc", - "db/memtable.cc", - "db/memtable_list.cc", - "db/merge_helper.cc", - "db/merge_operator.cc", - "db/range_del_aggregator.cc", - "db/repair.cc", - "db/snapshot_impl.cc", - "db/table_cache.cc", - "db/table_properties_collector.cc", - "db/transaction_log_impl.cc", - "db/version_builder.cc", - "db/version_edit.cc", - "db/version_set.cc", - "db/wal_manager.cc", - "db/write_batch.cc", - "db/write_batch_base.cc", - "db/write_controller.cc", - "db/write_thread.cc", - "env/env.cc", - "env/env_chroot.cc", - "env/env_encryption.cc", - "env/env_hdfs.cc", - "env/env_posix.cc", - "env/io_posix.cc", - "env/mock_env.cc", - "memtable/alloc_tracker.cc", - "memtable/hash_cuckoo_rep.cc", - "memtable/hash_linklist_rep.cc", - "memtable/hash_skiplist_rep.cc", - "memtable/skiplistrep.cc", - "memtable/vectorrep.cc", - "memtable/write_buffer_manager.cc", - "monitoring/histogram.cc", - "monitoring/histogram_windowing.cc", - "monitoring/instrumented_mutex.cc", - "monitoring/iostats_context.cc", - "monitoring/perf_context.cc", - "monitoring/perf_level.cc", - "monitoring/statistics.cc", - "monitoring/thread_status_impl.cc", - "monitoring/thread_status_updater.cc", - "monitoring/thread_status_updater_debug.cc", - "monitoring/thread_status_util.cc", - "monitoring/thread_status_util_debug.cc", - "options/cf_options.cc", - "options/db_options.cc", - "options/options.cc", - "options/options_helper.cc", - "options/options_parser.cc", - "options/options_sanity_check.cc", - "port/port_posix.cc", - "port/stack_trace.cc", - "table/adaptive_table_factory.cc", - "table/block.cc", - "table/block_based_filter_block.cc", - "table/block_based_table_builder.cc", - "table/block_based_table_factory.cc", - "table/block_based_table_reader.cc", - "table/block_builder.cc", - "table/block_prefix_index.cc", - "table/bloom_block.cc", - "table/cuckoo_table_builder.cc", - "table/cuckoo_table_factory.cc", - "table/cuckoo_table_reader.cc", - 
"table/flush_block_policy.cc", - "table/format.cc", - "table/full_filter_block.cc", - "table/get_context.cc", - "table/index_builder.cc", - "table/iterator.cc", - "table/merging_iterator.cc", - "table/meta_blocks.cc", - "table/partitioned_filter_block.cc", - "table/persistent_cache_helper.cc", - "table/plain_table_builder.cc", - "table/plain_table_factory.cc", - "table/plain_table_index.cc", - "table/plain_table_key_coding.cc", - "table/plain_table_reader.cc", - "table/sst_file_writer.cc", - "table/table_properties.cc", - "table/two_level_iterator.cc", - "tools/dump/db_dump_tool.cc", - "util/arena.cc", - "util/auto_roll_logger.cc", - "util/bloom.cc", - "util/build_version.cc", - "util/coding.cc", - "util/compaction_job_stats_impl.cc", - "util/comparator.cc", - "util/concurrent_arena.cc", - "util/crc32c.cc", - "util/delete_scheduler.cc", - "util/dynamic_bloom.cc", - "util/event_logger.cc", - "util/file_reader_writer.cc", - "util/file_util.cc", - "util/filename.cc", - "util/filter_policy.cc", - "util/hash.cc", - "util/log_buffer.cc", - "util/murmurhash.cc", - "util/random.cc", - "util/rate_limiter.cc", - "util/slice.cc", - "util/sst_file_manager_impl.cc", - "util/status.cc", - "util/status_message.cc", - "util/string_util.cc", - "util/sync_point.cc", - "util/thread_local.cc", - "util/threadpool_imp.cc", - "util/transaction_test_util.cc", - "util/xxhash.cc", - "utilities/backupable/backupable_db.cc", - "utilities/blob_db/blob_db.cc", - "utilities/blob_db/blob_db_impl.cc", - "utilities/blob_db/blob_file.cc", - "utilities/blob_db/blob_log_reader.cc", - "utilities/blob_db/blob_log_writer.cc", - "utilities/blob_db/blob_log_format.cc", - "utilities/blob_db/ttl_extractor.cc", - "utilities/cassandra/cassandra_compaction_filter.cc", - "utilities/cassandra/format.cc", - "utilities/cassandra/merge_operator.cc", - "utilities/checkpoint/checkpoint_impl.cc", - "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", - "utilities/convenience/info_log_finder.cc", - "utilities/date_tiered/date_tiered_db_impl.cc", - "utilities/debug.cc", - "utilities/document/document_db.cc", - "utilities/document/json_document.cc", - "utilities/document/json_document_builder.cc", - "utilities/env_mirror.cc", - "utilities/env_timed.cc", - "utilities/geodb/geodb_impl.cc", - "utilities/leveldb_options/leveldb_options.cc", - "utilities/lua/rocks_lua_compaction_filter.cc", - "utilities/memory/memory_util.cc", - "utilities/merge_operators/max.cc", - "utilities/merge_operators/put.cc", - "utilities/merge_operators/string_append/stringappend.cc", - "utilities/merge_operators/string_append/stringappend2.cc", - "utilities/merge_operators/uint64add.cc", - "utilities/option_change_migration/option_change_migration.cc", - "utilities/options/options_util.cc", - "utilities/persistent_cache/block_cache_tier.cc", - "utilities/persistent_cache/block_cache_tier_file.cc", - "utilities/persistent_cache/block_cache_tier_metadata.cc", - "utilities/persistent_cache/persistent_cache_tier.cc", - "utilities/persistent_cache/volatile_tier_impl.cc", - "utilities/redis/redis_lists.cc", - "utilities/simulator_cache/sim_cache.cc", - "utilities/spatialdb/spatial_db.cc", - "utilities/table_properties_collectors/compact_on_deletion_collector.cc", - "utilities/transactions/optimistic_transaction_db_impl.cc", - "utilities/transactions/optimistic_transaction.cc", - "utilities/transactions/transaction_base.cc", - "utilities/transactions/pessimistic_transaction_db.cc", - "utilities/transactions/transaction_db_mutex_impl.cc", - 
"utilities/transactions/pessimistic_transaction.cc", - "utilities/transactions/transaction_lock_mgr.cc", - "utilities/transactions/transaction_util.cc", - "utilities/transactions/write_prepared_txn.cc", - "utilities/ttl/db_ttl_impl.cc", - "utilities/write_batch_with_index/write_batch_with_index.cc", - "utilities/write_batch_with_index/write_batch_with_index_internal.cc", - "tools/ldb_cmd.cc", - "tools/ldb_tool.cc", - "tools/sst_dump_tool.cc", - "utilities/blob_db/blob_dump_tool.cc", + "cache/clock_cache.cc", + "cache/lru_cache.cc", + "cache/sharded_cache.cc", + "db/builder.cc", + "db/c.cc", + "db/column_family.cc", + "db/compacted_db_impl.cc", + "db/compaction.cc", + "db/compaction_iterator.cc", + "db/compaction_job.cc", + "db/compaction_picker.cc", + "db/compaction_picker_fifo.cc", + "db/compaction_picker_universal.cc", + "db/convenience.cc", + "db/db_filesnapshot.cc", + "db/db_impl.cc", + "db/db_impl_compaction_flush.cc", + "db/db_impl_debug.cc", + "db/db_impl_experimental.cc", + "db/db_impl_files.cc", + "db/db_impl_open.cc", + "db/db_impl_readonly.cc", + "db/db_impl_secondary.cc", + "db/db_impl_write.cc", + "db/db_info_dumper.cc", + "db/db_iter.cc", + "db/dbformat.cc", + "db/error_handler.cc", + "db/event_helpers.cc", + "db/experimental.cc", + "db/external_sst_file_ingestion_job.cc", + "db/file_indexer.cc", + "db/flush_job.cc", + "db/flush_scheduler.cc", + "db/forward_iterator.cc", + "db/in_memory_stats_history.cc", + "db/internal_stats.cc", + "db/log_reader.cc", + "db/log_writer.cc", + "db/logs_with_prep_tracker.cc", + "db/malloc_stats.cc", + "db/memtable.cc", + "db/memtable_list.cc", + "db/merge_helper.cc", + "db/merge_operator.cc", + "db/range_del_aggregator.cc", + "db/range_tombstone_fragmenter.cc", + "db/repair.cc", + "db/snapshot_impl.cc", + "db/table_cache.cc", + "db/table_properties_collector.cc", + "db/transaction_log_impl.cc", + "db/version_builder.cc", + "db/version_edit.cc", + "db/version_set.cc", + "db/wal_manager.cc", + "db/write_batch.cc", + "db/write_batch_base.cc", + "db/write_controller.cc", + "db/write_thread.cc", + "env/env.cc", + "env/env_chroot.cc", + "env/env_encryption.cc", + "env/env_hdfs.cc", + "env/env_posix.cc", + "env/io_posix.cc", + "env/mock_env.cc", + "memtable/alloc_tracker.cc", + "memtable/hash_linklist_rep.cc", + "memtable/hash_skiplist_rep.cc", + "memtable/skiplistrep.cc", + "memtable/vectorrep.cc", + "memtable/write_buffer_manager.cc", + "monitoring/histogram.cc", + "monitoring/histogram_windowing.cc", + "monitoring/instrumented_mutex.cc", + "monitoring/iostats_context.cc", + "monitoring/perf_context.cc", + "monitoring/perf_level.cc", + "monitoring/statistics.cc", + "monitoring/thread_status_impl.cc", + "monitoring/thread_status_updater.cc", + "monitoring/thread_status_updater_debug.cc", + "monitoring/thread_status_util.cc", + "monitoring/thread_status_util_debug.cc", + "options/cf_options.cc", + "options/db_options.cc", + "options/options.cc", + "options/options_helper.cc", + "options/options_parser.cc", + "options/options_sanity_check.cc", + "port/port_posix.cc", + "port/stack_trace.cc", + "table/adaptive_table_factory.cc", + "table/block.cc", + "table/block_based_filter_block.cc", + "table/block_based_table_builder.cc", + "table/block_based_table_factory.cc", + "table/block_based_table_reader.cc", + "table/block_builder.cc", + "table/block_fetcher.cc", + "table/block_prefix_index.cc", + "table/bloom_block.cc", + "table/cuckoo_table_builder.cc", + "table/cuckoo_table_factory.cc", + "table/cuckoo_table_reader.cc", + "table/data_block_footer.cc", + 
"table/data_block_hash_index.cc", + "table/flush_block_policy.cc", + "table/format.cc", + "table/full_filter_block.cc", + "table/get_context.cc", + "table/index_builder.cc", + "table/iterator.cc", + "table/merging_iterator.cc", + "table/meta_blocks.cc", + "table/partitioned_filter_block.cc", + "table/persistent_cache_helper.cc", + "table/plain_table_builder.cc", + "table/plain_table_factory.cc", + "table/plain_table_index.cc", + "table/plain_table_key_coding.cc", + "table/plain_table_reader.cc", + "table/sst_file_reader.cc", + "table/sst_file_writer.cc", + "table/table_properties.cc", + "table/two_level_iterator.cc", + "tools/dump/db_dump_tool.cc", + "tools/ldb_cmd.cc", + "tools/ldb_tool.cc", + "tools/sst_dump_tool.cc", + "util/arena.cc", + "util/auto_roll_logger.cc", + "util/bloom.cc", + "util/build_version.cc", + "util/coding.cc", + "util/compaction_job_stats_impl.cc", + "util/comparator.cc", + "util/compression_context_cache.cc", + "util/concurrent_arena.cc", + "util/concurrent_task_limiter_impl.cc", + "util/crc32c.cc", + "util/delete_scheduler.cc", + "util/dynamic_bloom.cc", + "util/event_logger.cc", + "util/file_reader_writer.cc", + "util/file_util.cc", + "util/filename.cc", + "util/filter_policy.cc", + "util/hash.cc", + "util/jemalloc_nodump_allocator.cc", + "util/log_buffer.cc", + "util/murmurhash.cc", + "util/random.cc", + "util/rate_limiter.cc", + "util/slice.cc", + "util/sst_file_manager_impl.cc", + "util/status.cc", + "util/string_util.cc", + "util/sync_point.cc", + "util/sync_point_impl.cc", + "util/thread_local.cc", + "util/threadpool_imp.cc", + "util/trace_replay.cc", + "util/transaction_test_util.cc", + "util/xxhash.cc", + "utilities/backupable/backupable_db.cc", + "utilities/blob_db/blob_compaction_filter.cc", + "utilities/blob_db/blob_db.cc", + "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_impl_filesnapshot.cc", + "utilities/blob_db/blob_dump_tool.cc", + "utilities/blob_db/blob_file.cc", + "utilities/blob_db/blob_log_format.cc", + "utilities/blob_db/blob_log_reader.cc", + "utilities/blob_db/blob_log_writer.cc", + "utilities/cassandra/cassandra_compaction_filter.cc", + "utilities/cassandra/format.cc", + "utilities/cassandra/merge_operator.cc", + "utilities/checkpoint/checkpoint_impl.cc", + "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", + "utilities/convenience/info_log_finder.cc", + "utilities/debug.cc", + "utilities/env_mirror.cc", + "utilities/env_timed.cc", + "utilities/leveldb_options/leveldb_options.cc", + "utilities/memory/memory_util.cc", + "utilities/merge_operators/bytesxor.cc", + "utilities/merge_operators/max.cc", + "utilities/merge_operators/put.cc", + "utilities/merge_operators/string_append/stringappend.cc", + "utilities/merge_operators/string_append/stringappend2.cc", + "utilities/merge_operators/uint64add.cc", + "utilities/option_change_migration/option_change_migration.cc", + "utilities/options/options_util.cc", + "utilities/persistent_cache/block_cache_tier.cc", + "utilities/persistent_cache/block_cache_tier_file.cc", + "utilities/persistent_cache/block_cache_tier_metadata.cc", + "utilities/persistent_cache/persistent_cache_tier.cc", + "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/simulator_cache/sim_cache.cc", + "utilities/table_properties_collectors/compact_on_deletion_collector.cc", + "utilities/trace/file_trace_reader_writer.cc", + "utilities/transactions/optimistic_transaction.cc", + "utilities/transactions/optimistic_transaction_db_impl.cc", + 
"utilities/transactions/pessimistic_transaction.cc", + "utilities/transactions/pessimistic_transaction_db.cc", + "utilities/transactions/snapshot_checker.cc", + "utilities/transactions/transaction_base.cc", + "utilities/transactions/transaction_db_mutex_impl.cc", + "utilities/transactions/transaction_lock_mgr.cc", + "utilities/transactions/transaction_util.cc", + "utilities/transactions/write_prepared_txn.cc", + "utilities/transactions/write_prepared_txn_db.cc", + "utilities/transactions/write_unprepared_txn.cc", + "utilities/transactions/write_unprepared_txn_db.cc", + "utilities/ttl/db_ttl_impl.cc", + "utilities/write_batch_with_index/write_batch_with_index.cc", + "utilities/write_batch_with_index/write_batch_with_index_internal.cc", ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( name = "rocksdb_test_lib", - headers = AutoHeaders.RECURSIVE_GLOB, srcs = [ - "table/mock_table.cc", - "util/fault_injection_test_env.cc", - "util/testharness.cc", - "util/testutil.cc", - "db/db_test_util.cc", - "utilities/cassandra/test_utils.cc", - "utilities/col_buf_encoder.cc", - "utilities/col_buf_decoder.cc", - "utilities/column_aware_encoding_util.cc", + "db/db_test_util.cc", + "table/mock_table.cc", + "tools/trace_analyzer_tool.cc", + "util/fault_injection_test_env.cc", + "util/testharness.cc", + "util/testutil.cc", + "utilities/cassandra/test_utils.cc", ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( name = "rocksdb_tools_lib", - headers = AutoHeaders.RECURSIVE_GLOB, srcs = [ - "tools/db_bench_tool.cc", - "util/testutil.cc", + "tools/db_bench_tool.cc", + "tools/trace_analyzer_tool.cc", + "util/testutil.cc", ], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS, ) cpp_library( name = "env_basic_test_lib", - headers = AutoHeaders.RECURSIVE_GLOB, srcs = ["env/env_basic_test.cc"], + auto_headers = AutoHeaders.RECURSIVE_GLOB, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + external_deps = ROCKSDB_EXTERNAL_DEPS, ) # [test_name, test_src, 
test_type] -ROCKS_TESTS = [['arena_test', 'util/arena_test.cc', 'serial'], - ['auto_roll_logger_test', 'util/auto_roll_logger_test.cc', 'serial'], - ['autovector_test', 'util/autovector_test.cc', 'serial'], - ['backupable_db_test', - 'utilities/backupable/backupable_db_test.cc', - 'parallel'], - ['blob_db_test', 'utilities/blob_db/blob_db_test.cc', 'serial'], - ['block_based_filter_block_test', - 'table/block_based_filter_block_test.cc', - 'serial'], - ['block_test', 'table/block_test.cc', 'serial'], - ['bloom_test', 'util/bloom_test.cc', 'serial'], - ['c_test', 'db/c_test.c', 'serial'], - ['cache_test', 'cache/cache_test.cc', 'serial'], - ['cassandra_format_test', - 'utilities/cassandra/cassandra_format_test.cc', - 'serial'], - ['cassandra_functional_test', - 'utilities/cassandra/cassandra_functional_test.cc', - 'serial'], - ['cassandra_row_merge_test', - 'utilities/cassandra/cassandra_row_merge_test.cc', - 'serial'], - ['cassandra_serialize_test', - 'utilities/cassandra/cassandra_serialize_test.cc', - 'serial'], - ['checkpoint_test', 'utilities/checkpoint/checkpoint_test.cc', 'serial'], - ['cleanable_test', 'table/cleanable_test.cc', 'serial'], - ['coding_test', 'util/coding_test.cc', 'serial'], - ['column_aware_encoding_test', - 'utilities/column_aware_encoding_test.cc', - 'serial'], - ['column_family_test', 'db/column_family_test.cc', 'serial'], - ['compact_files_test', 'db/compact_files_test.cc', 'serial'], - ['compact_on_deletion_collector_test', - 'utilities/table_properties_collectors/compact_on_deletion_collector_test.cc', - 'serial'], - ['compaction_iterator_test', 'db/compaction_iterator_test.cc', 'serial'], - ['compaction_job_stats_test', 'db/compaction_job_stats_test.cc', 'serial'], - ['compaction_job_test', 'db/compaction_job_test.cc', 'serial'], - ['compaction_picker_test', 'db/compaction_picker_test.cc', 'serial'], - ['comparator_db_test', 'db/comparator_db_test.cc', 'serial'], - ['corruption_test', 'db/corruption_test.cc', 'serial'], - ['crc32c_test', 'util/crc32c_test.cc', 'serial'], - ['cuckoo_table_builder_test', 'table/cuckoo_table_builder_test.cc', 'serial'], - ['cuckoo_table_db_test', 'db/cuckoo_table_db_test.cc', 'serial'], - ['cuckoo_table_reader_test', 'table/cuckoo_table_reader_test.cc', 'serial'], - ['date_tiered_test', 'utilities/date_tiered/date_tiered_test.cc', 'serial'], - ['db_basic_test', 'db/db_basic_test.cc', 'serial'], - ['db_blob_index_test', 'db/db_blob_index_test.cc', 'serial'], - ['db_block_cache_test', 'db/db_block_cache_test.cc', 'serial'], - ['db_bloom_filter_test', 'db/db_bloom_filter_test.cc', 'serial'], - ['db_compaction_filter_test', 'db/db_compaction_filter_test.cc', 'parallel'], - ['db_compaction_test', 'db/db_compaction_test.cc', 'parallel'], - ['db_dynamic_level_test', 'db/db_dynamic_level_test.cc', 'serial'], - ['db_encryption_test', 'db/db_encryption_test.cc', 'serial'], - ['db_flush_test', 'db/db_flush_test.cc', 'serial'], - ['db_inplace_update_test', 'db/db_inplace_update_test.cc', 'serial'], - ['db_io_failure_test', 'db/db_io_failure_test.cc', 'serial'], - ['db_iter_test', 'db/db_iter_test.cc', 'serial'], - ['db_iterator_test', 'db/db_iterator_test.cc', 'serial'], - ['db_log_iter_test', 'db/db_log_iter_test.cc', 'serial'], - ['db_memtable_test', 'db/db_memtable_test.cc', 'serial'], - ['db_merge_operator_test', 'db/db_merge_operator_test.cc', 'serial'], - ['db_options_test', 'db/db_options_test.cc', 'serial'], - ['db_properties_test', 'db/db_properties_test.cc', 'serial'], - ['db_range_del_test', 'db/db_range_del_test.cc', 'serial'], 
- ['db_sst_test', 'db/db_sst_test.cc', 'parallel'], - ['db_statistics_test', 'db/db_statistics_test.cc', 'serial'], - ['db_table_properties_test', 'db/db_table_properties_test.cc', 'serial'], - ['db_tailing_iter_test', 'db/db_tailing_iter_test.cc', 'serial'], - ['db_test', 'db/db_test.cc', 'parallel'], - ['db_test2', 'db/db_test2.cc', 'serial'], - ['db_universal_compaction_test', - 'db/db_universal_compaction_test.cc', - 'parallel'], - ['db_wal_test', 'db/db_wal_test.cc', 'parallel'], - ['db_write_test', 'db/db_write_test.cc', 'serial'], - ['dbformat_test', 'db/dbformat_test.cc', 'serial'], - ['delete_scheduler_test', 'util/delete_scheduler_test.cc', 'serial'], - ['deletefile_test', 'db/deletefile_test.cc', 'serial'], - ['document_db_test', 'utilities/document/document_db_test.cc', 'serial'], - ['dynamic_bloom_test', 'util/dynamic_bloom_test.cc', 'serial'], - ['env_basic_test', 'env/env_basic_test.cc', 'serial'], - ['env_test', 'env/env_test.cc', 'serial'], - ['env_timed_test', 'utilities/env_timed_test.cc', 'serial'], - ['event_logger_test', 'util/event_logger_test.cc', 'serial'], - ['external_sst_file_basic_test', - 'db/external_sst_file_basic_test.cc', - 'serial'], - ['external_sst_file_test', 'db/external_sst_file_test.cc', 'parallel'], - ['fault_injection_test', 'db/fault_injection_test.cc', 'parallel'], - ['file_indexer_test', 'db/file_indexer_test.cc', 'serial'], - ['file_reader_writer_test', 'util/file_reader_writer_test.cc', 'serial'], - ['filelock_test', 'util/filelock_test.cc', 'serial'], - ['filename_test', 'db/filename_test.cc', 'serial'], - ['flush_job_test', 'db/flush_job_test.cc', 'serial'], - ['full_filter_block_test', 'table/full_filter_block_test.cc', 'serial'], - ['geodb_test', 'utilities/geodb/geodb_test.cc', 'serial'], - ['hash_table_test', - 'utilities/persistent_cache/hash_table_test.cc', - 'serial'], - ['hash_test', 'util/hash_test.cc', 'serial'], - ['heap_test', 'util/heap_test.cc', 'serial'], - ['histogram_test', 'monitoring/histogram_test.cc', 'serial'], - ['inlineskiplist_test', 'memtable/inlineskiplist_test.cc', 'parallel'], - ['iostats_context_test', 'monitoring/iostats_context_test.cc', 'serial'], - ['json_document_test', 'utilities/document/json_document_test.cc', 'serial'], - ['ldb_cmd_test', 'tools/ldb_cmd_test.cc', 'serial'], - ['listener_test', 'db/listener_test.cc', 'serial'], - ['log_test', 'db/log_test.cc', 'serial'], - ['lru_cache_test', 'cache/lru_cache_test.cc', 'serial'], - ['manual_compaction_test', 'db/manual_compaction_test.cc', 'parallel'], - ['memory_test', 'utilities/memory/memory_test.cc', 'serial'], - ['memtable_list_test', 'db/memtable_list_test.cc', 'serial'], - ['merge_helper_test', 'db/merge_helper_test.cc', 'serial'], - ['merge_test', 'db/merge_test.cc', 'serial'], - ['merger_test', 'table/merger_test.cc', 'serial'], - ['mock_env_test', 'env/mock_env_test.cc', 'serial'], - ['object_registry_test', 'utilities/object_registry_test.cc', 'serial'], - ['optimistic_transaction_test', - 'utilities/transactions/optimistic_transaction_test.cc', - 'serial'], - ['option_change_migration_test', - 'utilities/option_change_migration/option_change_migration_test.cc', - 'serial'], - ['options_file_test', 'db/options_file_test.cc', 'serial'], - ['options_settable_test', 'options/options_settable_test.cc', 'serial'], - ['options_test', 'options/options_test.cc', 'serial'], - ['options_util_test', 'utilities/options/options_util_test.cc', 'serial'], - ['partitioned_filter_block_test', - 'table/partitioned_filter_block_test.cc', - 'serial'], - 
['perf_context_test', 'db/perf_context_test.cc', 'serial'], - ['persistent_cache_test', - 'utilities/persistent_cache/persistent_cache_test.cc', - 'parallel'], - ['plain_table_db_test', 'db/plain_table_db_test.cc', 'serial'], - ['prefix_test', 'db/prefix_test.cc', 'serial'], - ['range_del_aggregator_test', 'db/range_del_aggregator_test.cc', 'serial'], - ['rate_limiter_test', 'util/rate_limiter_test.cc', 'serial'], - ['reduce_levels_test', 'tools/reduce_levels_test.cc', 'serial'], - ['repair_test', 'db/repair_test.cc', 'serial'], - ['sim_cache_test', 'utilities/simulator_cache/sim_cache_test.cc', 'serial'], - ['skiplist_test', 'memtable/skiplist_test.cc', 'serial'], - ['slice_transform_test', 'util/slice_transform_test.cc', 'serial'], - ['spatial_db_test', 'utilities/spatialdb/spatial_db_test.cc', 'serial'], - ['sst_dump_test', 'tools/sst_dump_test.cc', 'serial'], - ['statistics_test', 'monitoring/statistics_test.cc', 'serial'], - ['stringappend_test', - 'utilities/merge_operators/string_append/stringappend_test.cc', - 'serial'], - ['table_properties_collector_test', - 'db/table_properties_collector_test.cc', - 'serial'], - ['table_test', 'table/table_test.cc', 'parallel'], - ['thread_list_test', 'util/thread_list_test.cc', 'serial'], - ['thread_local_test', 'util/thread_local_test.cc', 'serial'], - ['timer_queue_test', 'util/timer_queue_test.cc', 'serial'], - ['transaction_test', 'utilities/transactions/transaction_test.cc', 'serial'], - ['ttl_test', 'utilities/ttl/ttl_test.cc', 'serial'], - ['util_merge_operators_test', - 'utilities/util_merge_operators_test.cc', - 'serial'], - ['version_builder_test', 'db/version_builder_test.cc', 'serial'], - ['version_edit_test', 'db/version_edit_test.cc', 'serial'], - ['version_set_test', 'db/version_set_test.cc', 'serial'], - ['wal_manager_test', 'db/wal_manager_test.cc', 'serial'], - ['write_batch_test', 'db/write_batch_test.cc', 'serial'], - ['write_batch_with_index_test', - 'utilities/write_batch_with_index/write_batch_with_index_test.cc', - 'serial'], - ['write_buffer_manager_test', - 'memtable/write_buffer_manager_test.cc', - 'serial'], - ['write_callback_test', 'db/write_callback_test.cc', 'serial'], - ['write_controller_test', 'db/write_controller_test.cc', 'serial']] - +ROCKS_TESTS = [ + [ + "arena_test", + "util/arena_test.cc", + "serial", + ], + [ + "auto_roll_logger_test", + "util/auto_roll_logger_test.cc", + "serial", + ], + [ + "autovector_test", + "util/autovector_test.cc", + "serial", + ], + [ + "backupable_db_test", + "utilities/backupable/backupable_db_test.cc", + "parallel", + ], + [ + "blob_db_test", + "utilities/blob_db/blob_db_test.cc", + "serial", + ], + [ + "block_based_filter_block_test", + "table/block_based_filter_block_test.cc", + "serial", + ], + [ + "block_test", + "table/block_test.cc", + "serial", + ], + [ + "bloom_test", + "util/bloom_test.cc", + "serial", + ], + [ + "c_test", + "db/c_test.c", + "serial", + ], + [ + "cache_test", + "cache/cache_test.cc", + "serial", + ], + [ + "cassandra_format_test", + "utilities/cassandra/cassandra_format_test.cc", + "serial", + ], + [ + "cassandra_functional_test", + "utilities/cassandra/cassandra_functional_test.cc", + "serial", + ], + [ + "cassandra_row_merge_test", + "utilities/cassandra/cassandra_row_merge_test.cc", + "serial", + ], + [ + "cassandra_serialize_test", + "utilities/cassandra/cassandra_serialize_test.cc", + "serial", + ], + [ + "checkpoint_test", + "utilities/checkpoint/checkpoint_test.cc", + "serial", + ], + [ + "cleanable_test", + "table/cleanable_test.cc", + 
"serial", + ], + [ + "coding_test", + "util/coding_test.cc", + "serial", + ], + [ + "column_family_test", + "db/column_family_test.cc", + "serial", + ], + [ + "compact_files_test", + "db/compact_files_test.cc", + "serial", + ], + [ + "compact_on_deletion_collector_test", + "utilities/table_properties_collectors/compact_on_deletion_collector_test.cc", + "serial", + ], + [ + "compaction_iterator_test", + "db/compaction_iterator_test.cc", + "serial", + ], + [ + "compaction_job_stats_test", + "db/compaction_job_stats_test.cc", + "serial", + ], + [ + "compaction_job_test", + "db/compaction_job_test.cc", + "serial", + ], + [ + "compaction_picker_test", + "db/compaction_picker_test.cc", + "serial", + ], + [ + "comparator_db_test", + "db/comparator_db_test.cc", + "serial", + ], + [ + "corruption_test", + "db/corruption_test.cc", + "serial", + ], + [ + "crc32c_test", + "util/crc32c_test.cc", + "serial", + ], + [ + "cuckoo_table_builder_test", + "table/cuckoo_table_builder_test.cc", + "serial", + ], + [ + "cuckoo_table_db_test", + "db/cuckoo_table_db_test.cc", + "serial", + ], + [ + "cuckoo_table_reader_test", + "table/cuckoo_table_reader_test.cc", + "serial", + ], + [ + "data_block_hash_index_test", + "table/data_block_hash_index_test.cc", + "serial", + ], + [ + "db_basic_test", + "db/db_basic_test.cc", + "serial", + ], + [ + "db_blob_index_test", + "db/db_blob_index_test.cc", + "serial", + ], + [ + "db_block_cache_test", + "db/db_block_cache_test.cc", + "serial", + ], + [ + "db_bloom_filter_test", + "db/db_bloom_filter_test.cc", + "serial", + ], + [ + "db_compaction_filter_test", + "db/db_compaction_filter_test.cc", + "parallel", + ], + [ + "db_compaction_test", + "db/db_compaction_test.cc", + "parallel", + ], + [ + "db_dynamic_level_test", + "db/db_dynamic_level_test.cc", + "serial", + ], + [ + "db_encryption_test", + "db/db_encryption_test.cc", + "serial", + ], + [ + "db_flush_test", + "db/db_flush_test.cc", + "serial", + ], + [ + "db_inplace_update_test", + "db/db_inplace_update_test.cc", + "serial", + ], + [ + "db_io_failure_test", + "db/db_io_failure_test.cc", + "serial", + ], + [ + "db_iter_stress_test", + "db/db_iter_stress_test.cc", + "serial", + ], + [ + "db_iter_test", + "db/db_iter_test.cc", + "serial", + ], + [ + "db_iterator_test", + "db/db_iterator_test.cc", + "serial", + ], + [ + "db_log_iter_test", + "db/db_log_iter_test.cc", + "serial", + ], + [ + "db_memtable_test", + "db/db_memtable_test.cc", + "serial", + ], + [ + "db_merge_operator_test", + "db/db_merge_operator_test.cc", + "parallel", + ], + [ + "db_options_test", + "db/db_options_test.cc", + "serial", + ], + [ + "db_properties_test", + "db/db_properties_test.cc", + "serial", + ], + [ + "db_range_del_test", + "db/db_range_del_test.cc", + "serial", + ], + [ + "db_secondary_test", + "db/db_secondary_test.cc", + "serial", + ], + [ + "db_sst_test", + "db/db_sst_test.cc", + "parallel", + ], + [ + "db_statistics_test", + "db/db_statistics_test.cc", + "serial", + ], + [ + "db_table_properties_test", + "db/db_table_properties_test.cc", + "serial", + ], + [ + "db_tailing_iter_test", + "db/db_tailing_iter_test.cc", + "serial", + ], + [ + "db_test", + "db/db_test.cc", + "parallel", + ], + [ + "db_test2", + "db/db_test2.cc", + "serial", + ], + [ + "db_universal_compaction_test", + "db/db_universal_compaction_test.cc", + "parallel", + ], + [ + "db_wal_test", + "db/db_wal_test.cc", + "parallel", + ], + [ + "db_write_test", + "db/db_write_test.cc", + "serial", + ], + [ + "dbformat_test", + "db/dbformat_test.cc", + "serial", + ], + [ + 
"delete_scheduler_test", + "util/delete_scheduler_test.cc", + "serial", + ], + [ + "deletefile_test", + "db/deletefile_test.cc", + "serial", + ], + [ + "dynamic_bloom_test", + "util/dynamic_bloom_test.cc", + "serial", + ], + [ + "env_basic_test", + "env/env_basic_test.cc", + "serial", + ], + [ + "env_test", + "env/env_test.cc", + "serial", + ], + [ + "env_timed_test", + "utilities/env_timed_test.cc", + "serial", + ], + [ + "error_handler_test", + "db/error_handler_test.cc", + "serial", + ], + [ + "event_logger_test", + "util/event_logger_test.cc", + "serial", + ], + [ + "external_sst_file_basic_test", + "db/external_sst_file_basic_test.cc", + "serial", + ], + [ + "external_sst_file_test", + "db/external_sst_file_test.cc", + "parallel", + ], + [ + "fault_injection_test", + "db/fault_injection_test.cc", + "parallel", + ], + [ + "file_indexer_test", + "db/file_indexer_test.cc", + "serial", + ], + [ + "file_reader_writer_test", + "util/file_reader_writer_test.cc", + "serial", + ], + [ + "filelock_test", + "util/filelock_test.cc", + "serial", + ], + [ + "filename_test", + "db/filename_test.cc", + "serial", + ], + [ + "flush_job_test", + "db/flush_job_test.cc", + "serial", + ], + [ + "full_filter_block_test", + "table/full_filter_block_test.cc", + "serial", + ], + [ + "hash_table_test", + "utilities/persistent_cache/hash_table_test.cc", + "serial", + ], + [ + "hash_test", + "util/hash_test.cc", + "serial", + ], + [ + "heap_test", + "util/heap_test.cc", + "serial", + ], + [ + "histogram_test", + "monitoring/histogram_test.cc", + "serial", + ], + [ + "inlineskiplist_test", + "memtable/inlineskiplist_test.cc", + "parallel", + ], + [ + "iostats_context_test", + "monitoring/iostats_context_test.cc", + "serial", + ], + [ + "ldb_cmd_test", + "tools/ldb_cmd_test.cc", + "serial", + ], + [ + "listener_test", + "db/listener_test.cc", + "serial", + ], + [ + "log_test", + "db/log_test.cc", + "serial", + ], + [ + "lru_cache_test", + "cache/lru_cache_test.cc", + "serial", + ], + [ + "manual_compaction_test", + "db/manual_compaction_test.cc", + "parallel", + ], + [ + "memory_test", + "utilities/memory/memory_test.cc", + "serial", + ], + [ + "memtable_list_test", + "db/memtable_list_test.cc", + "serial", + ], + [ + "merge_helper_test", + "db/merge_helper_test.cc", + "serial", + ], + [ + "merge_test", + "db/merge_test.cc", + "serial", + ], + [ + "merger_test", + "table/merger_test.cc", + "serial", + ], + [ + "mock_env_test", + "env/mock_env_test.cc", + "serial", + ], + [ + "object_registry_test", + "utilities/object_registry_test.cc", + "serial", + ], + [ + "obsolete_files_test", + "db/obsolete_files_test.cc", + "serial", + ], + [ + "optimistic_transaction_test", + "utilities/transactions/optimistic_transaction_test.cc", + "serial", + ], + [ + "option_change_migration_test", + "utilities/option_change_migration/option_change_migration_test.cc", + "serial", + ], + [ + "options_file_test", + "db/options_file_test.cc", + "serial", + ], + [ + "options_settable_test", + "options/options_settable_test.cc", + "serial", + ], + [ + "options_test", + "options/options_test.cc", + "serial", + ], + [ + "options_util_test", + "utilities/options/options_util_test.cc", + "serial", + ], + [ + "partitioned_filter_block_test", + "table/partitioned_filter_block_test.cc", + "serial", + ], + [ + "perf_context_test", + "db/perf_context_test.cc", + "serial", + ], + [ + "persistent_cache_test", + "utilities/persistent_cache/persistent_cache_test.cc", + "parallel", + ], + [ + "plain_table_db_test", + "db/plain_table_db_test.cc", + 
"serial", + ], + [ + "prefix_test", + "db/prefix_test.cc", + "serial", + ], + [ + "range_del_aggregator_test", + "db/range_del_aggregator_test.cc", + "serial", + ], + [ + "range_tombstone_fragmenter_test", + "db/range_tombstone_fragmenter_test.cc", + "serial", + ], + [ + "rate_limiter_test", + "util/rate_limiter_test.cc", + "serial", + ], + [ + "reduce_levels_test", + "tools/reduce_levels_test.cc", + "serial", + ], + [ + "repair_test", + "db/repair_test.cc", + "serial", + ], + [ + "repeatable_thread_test", + "util/repeatable_thread_test.cc", + "serial", + ], + [ + "sim_cache_test", + "utilities/simulator_cache/sim_cache_test.cc", + "serial", + ], + [ + "skiplist_test", + "memtable/skiplist_test.cc", + "serial", + ], + [ + "slice_transform_test", + "util/slice_transform_test.cc", + "serial", + ], + [ + "sst_dump_test", + "tools/sst_dump_test.cc", + "serial", + ], + [ + "sst_file_reader_test", + "table/sst_file_reader_test.cc", + "serial", + ], + [ + "statistics_test", + "monitoring/statistics_test.cc", + "serial", + ], + [ + "stringappend_test", + "utilities/merge_operators/string_append/stringappend_test.cc", + "serial", + ], + [ + "table_properties_collector_test", + "db/table_properties_collector_test.cc", + "serial", + ], + [ + "table_test", + "table/table_test.cc", + "parallel", + ], + [ + "thread_list_test", + "util/thread_list_test.cc", + "serial", + ], + [ + "thread_local_test", + "util/thread_local_test.cc", + "serial", + ], + [ + "timer_queue_test", + "util/timer_queue_test.cc", + "serial", + ], + [ + "trace_analyzer_test", + "tools/trace_analyzer_test.cc", + "serial", + ], + [ + "transaction_test", + "utilities/transactions/transaction_test.cc", + "parallel", + ], + [ + "ttl_test", + "utilities/ttl/ttl_test.cc", + "serial", + ], + [ + "util_merge_operators_test", + "utilities/util_merge_operators_test.cc", + "serial", + ], + [ + "version_builder_test", + "db/version_builder_test.cc", + "serial", + ], + [ + "version_edit_test", + "db/version_edit_test.cc", + "serial", + ], + [ + "version_set_test", + "db/version_set_test.cc", + "serial", + ], + [ + "wal_manager_test", + "db/wal_manager_test.cc", + "serial", + ], + [ + "write_batch_test", + "db/write_batch_test.cc", + "serial", + ], + [ + "write_batch_with_index_test", + "utilities/write_batch_with_index/write_batch_with_index_test.cc", + "serial", + ], + [ + "write_buffer_manager_test", + "memtable/write_buffer_manager_test.cc", + "serial", + ], + [ + "write_callback_test", + "db/write_callback_test.cc", + "serial", + ], + [ + "write_controller_test", + "db/write_controller_test.cc", + "serial", + ], + [ + "write_prepared_transaction_test", + "utilities/transactions/write_prepared_transaction_test.cc", + "parallel", + ], + [ + "write_unprepared_transaction_test", + "utilities/transactions/write_unprepared_transaction_test.cc", + "parallel", + ], +] # Generate a test rule for each entry in ROCKS_TESTS -for test_cfg in ROCKS_TESTS: - test_name = test_cfg[0] - test_cc = test_cfg[1] - ttype = "gtest" if test_cfg[2] == "parallel" else "simple" - test_bin = test_name + "_bin" - - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, +# Do not build the tests in opt mode, since SyncPoint and other test code +# will not be included. 
+[ + test_binary( + parallelism = parallelism, + rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, + rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + test_cc = test_cc, + test_name = test_name, ) - - custom_unittest( - name = test_name, - type = ttype, - deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] - ) - -custom_unittest( - name = "make_rocksdbjavastatic", - type = "simple", - command = ["internal_repo_rocksdb/make_rocksdbjavastatic.sh"], -) - -custom_unittest( - name = "make_rocksdb_lite_release", - type = "simple", - command = ["internal_repo_rocksdb/make_rocksdb_lite_release.sh"], -) + for test_name, test_cc, parallelism in ROCKS_TESTS + if not is_opt_mode +] diff --git a/thirdparty/rocksdb/USERS.md b/thirdparty/rocksdb/USERS.md index 7be093f958..a95903f066 100644 --- a/thirdparty/rocksdb/USERS.md +++ b/thirdparty/rocksdb/USERS.md @@ -5,12 +5,15 @@ At Facebook, we use RocksDB as storage engines in multiple data management servi 1. MyRocks -- https://github.com/MySQLOnRocksDB/mysql-5.6 2. MongoRocks -- https://github.com/mongodb-partners/mongo-rocks -3. ZippyDB -- Facebook's distributed key-value store with Paxos-style replication, built on top of RocksDB.[*] https://www.youtube.com/watch?v=DfiN7pG0D0khtt -4. Laser -- Laser is a high query throughput, low (millisecond) latency, key-value storage service built on top of RocksDB.[*] +3. ZippyDB -- Facebook's distributed key-value store with Paxos-style replication, built on top of RocksDB.[1] https://www.youtube.com/watch?v=DfiN7pG0D0khtt +4. Laser -- Laser is a high query throughput, low (millisecond) latency, key-value storage service built on top of RocksDB.[1] 4. Dragon -- a distributed graph query engine. https://code.facebook.com/posts/1737605303120405/dragon-a-distributed-graph-query-engine/ -5. Stylus -- a low-level stream processing framework writtenin C++.[*] +5. Stylus -- a low-level stream processing framework written in C++.[1] +6. LogDevice -- a distributed data store for logs [2] -[*] https://research.facebook.com/publications/realtime-data-processing-at-facebook/ +[1] https://research.facebook.com/publications/realtime-data-processing-at-facebook/ + +[2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/ ## LinkedIn Two different use cases at Linkedin are using RocksDB as a storage engine: @@ -24,7 +27,7 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights ## CockroachDB -CockroachDB is an open-source geo-replicated transactional database (still in development). They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach +CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach ## DNANexus DNANexus is using RocksDB to speed up processing of genomics data. @@ -83,3 +86,9 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. ## LzLabs LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data.
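Stepping back to the generated TARGETS file for a moment: everything in those hunks, including the reformatted ROCKS_TESTS block, is machine-generated text. The buckifier scripts diffed further below (targets_builder.py and targets_cfg.py) render it from string templates. Here is a minimal, self-contained Python sketch of that rendering step; `test_cfg_template` and the new `pretty_list` are copied verbatim from the diff, while the driver lines under `__main__` are illustrative scaffolding:

    # Minimal sketch of the buckifier rendering step. test_cfg_template and
    # pretty_list are taken from the diffed files; the rest is scaffolding.

    test_cfg_template = """    [
            "%s",
            "%s",
            "%s",
        ],
    """

    def pretty_list(lst, indent=8):
        if lst is None or len(lst) == 0:
            return ""
        if len(lst) == 1:
            return "\"%s\"" % lst[0]
        separator = "\",\n%s\"" % (" " * indent)
        res = separator.join(sorted(lst))
        res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4))
        return res

    if __name__ == "__main__":
        # Renders one ROCKS_TESTS entry in the new one-field-per-line style.
        print(test_cfg_template % ("arena_test", "util/arena_test.cc", "serial"))
        # Renders a srcs list in the sorted, eight-space-indented style used
        # by the cpp_library rules.
        print("srcs = [%s]," % pretty_list(["db/c.cc", "db/builder.cc"]))

The `sorted(lst)` call is also why the srcs hunks earlier in this diff reorder so many entries: every generated source list is now emitted in alphabetical order.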
+ +## ProfaneDB +[ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined directly using `.proto` files. + +## IOTA Foundation + [IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things. \ No newline at end of file diff --git a/thirdparty/rocksdb/Vagrantfile b/thirdparty/rocksdb/Vagrantfile index d7c2991d79..07f2e99fdd 100644 --- a/thirdparty/rocksdb/Vagrantfile +++ b/thirdparty/rocksdb/Vagrantfile @@ -14,6 +14,11 @@ Vagrant.configure("2") do |config| box.vm.box = "chef/centos-6.5" end + config.vm.define "centos7" do |box| + box.vm.box = "centos/7" + box.vm.provision "shell", path: "build_tools/setup_centos7.sh" + end + config.vm.define "FreeBSD10" do |box| box.vm.guest = :freebsd box.vm.box = "robin/freebsd-10" diff --git a/thirdparty/rocksdb/WINDOWS_PORT.md b/thirdparty/rocksdb/WINDOWS_PORT.md index a0fe1fe11f..57293c97c9 100644 --- a/thirdparty/rocksdb/WINDOWS_PORT.md +++ b/thirdparty/rocksdb/WINDOWS_PORT.md @@ -43,9 +43,9 @@ We plan to use this port for our business purposes here at Bing and this provide * Certain headers that are not present and not necessary on Windows were simply `#ifndef OS_WIN` in a few places (`unistd.h`) * All posix specific headers were replaced to port/port.h which worked well -* Replaced `dirent.h` for `port/dirent.h` (very few places) with the implementation of the relevant interfaces within `rocksdb::port` namespace +* Replaced `dirent.h` for `port/port_dirent.h` (very few places) with the implementation of the relevant interfaces within `rocksdb::port` namespace * Replaced `sys/time.h` to `port/sys_time.h` (few places) implemented equivalents within `rocksdb::port` -* `printf %z` specification is not supported on Windows. To imitate existing standards we came up with a string macro `ROCKSDB_PRIszt` which expands to `%z` on posix systems and to Iu on windows. +* `printf %z` specification is not supported on Windows. To imitate existing standards we came up with a string macro `ROCKSDB_PRIszt` which expands to `zu` on posix systems and to `Iu` on windows. * in class member initialization were moved to a __ctors in some cases * `constexpr` is not supported. We had to replace `std::numeric_limits<>::max/min()` to its C macros for constants. Sometimes we had to make class members `static const` and place a definition within a .cc file. * `constexpr` for functions was replaced to a template specialization (1 place) diff --git a/thirdparty/rocksdb/appveyor.yml b/thirdparty/rocksdb/appveyor.yml index be9b66b45c..9dae40af8f 100644 --- a/thirdparty/rocksdb/appveyor.yml +++ b/thirdparty/rocksdb/appveyor.yml @@ -1,14 +1,14 @@ version: 1.0.{build} -image: Visual Studio 2015 +image: Visual Studio 2017 before_build: - md %APPVEYOR_BUILD_FOLDER%\build - cd %APPVEYOR_BUILD_FOLDER%\build -- cmake -G "Visual Studio 14 2015 Win64" -DOPTDBG=1 -DXPRESS=1 -DPORTABLE=1 .. +- cmake -G "Visual Studio 15 Win64" -DOPTDBG=1 -DWITH_XPRESS=1 -DPORTABLE=1 -DJNI=1 .. - cd ..
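The tools/trace_analyzer_tool.cc entries that newly appear in the generated srcs lists above originate one level up, in buckify_rocksdb.py (diffed just below): generate_targets now folds src.mk's ANALYZER_LIB_SOURCES into both the test library and the tools library. A condensed, runnable sketch of that aggregation, with the src.mk variable names taken from the diff and the dict plus print-based builder serving as stand-ins for parse_src_mk and the TARGETSBuilder API:

    # Condensed sketch of the aggregation done in generate_targets inside
    # buckify_rocksdb.py. Variable names come from the diff; the sample dict
    # and the print-based add_library are illustrative stand-ins.

    src_mk = {
        "MOCK_LIB_SOURCES": ["table/mock_table.cc"],
        "TEST_LIB_SOURCES": ["db/db_test_util.cc", "util/testharness.cc", "util/testutil.cc"],
        "ANALYZER_LIB_SOURCES": ["tools/trace_analyzer_tool.cc"],  # new in this diff
        "BENCH_LIB_SOURCES": ["tools/db_bench_tool.cc"],
    }

    def add_library(name, srcs, deps):
        print(name, sorted(srcs), deps)

    # ANALYZER_LIB_SOURCES is folded into BOTH libraries, which is why
    # trace_analyzer_tool.cc shows up in two srcs lists in the TARGETS hunks.
    add_library(
        "rocksdb_test_lib",
        src_mk.get("MOCK_LIB_SOURCES", []) +
        src_mk.get("TEST_LIB_SOURCES", []) +
        src_mk.get("EXP_LIB_SOURCES", []) +
        src_mk.get("ANALYZER_LIB_SOURCES", []),
        [":rocksdb_lib"])

    add_library(
        "rocksdb_tools_lib",
        src_mk.get("BENCH_LIB_SOURCES", []) +
        src_mk.get("ANALYZER_LIB_SOURCES", []) +
        ["util/testutil.cc"],
        [":rocksdb_lib"])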
build: project: build\rocksdb.sln parallel: true - verbosity: minimal + verbosity: normal test: test_script: - ps: build_tools\run_ci_db_test.ps1 -SuiteRun db_basic_test,db_test2,db_test,env_basic_test,env_test -Concurrency 8 diff --git a/thirdparty/rocksdb/buckifier/buckify_rocksdb.py b/thirdparty/rocksdb/buckifier/buckify_rocksdb.py index a3c8be3b17..96903af682 100644 --- a/thirdparty/rocksdb/buckifier/buckify_rocksdb.py +++ b/thirdparty/rocksdb/buckifier/buckify_rocksdb.py @@ -3,14 +3,11 @@ from __future__ import print_function from __future__ import unicode_literals from targets_builder import TARGETSBuilder -from optparse import OptionParser import os import fnmatch import sys -import tempfile from util import ColorString -import util # tests to export as libraries for inclusion in other projects _EXPORTED_TEST_LIBS = ["env_basic_test"] @@ -36,7 +33,7 @@ def parse_src_mk(repo_path): # get all .cc / .c files def get_cc_files(repo_path): cc_files = [] - for root, dirnames, filenames in os.walk(repo_path): + for root, dirnames, filenames in os.walk(repo_path): # noqa: B007 T25377293 Grandfathered in root = root[(len(repo_path) + 1):] if "java" in root: # Skip java @@ -112,12 +109,14 @@ def generate_targets(repo_path): "rocksdb_test_lib", src_mk.get("MOCK_LIB_SOURCES", []) + src_mk.get("TEST_LIB_SOURCES", []) + - src_mk.get("EXP_LIB_SOURCES", []), + src_mk.get("EXP_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []), [":rocksdb_lib"]) # rocksdb_tools_lib TARGETS.add_library( "rocksdb_tools_lib", src_mk.get("BENCH_LIB_SOURCES", []) + + src_mk.get("ANALYZER_LIB_SOURCES", []) + ["util/testutil.cc"], [":rocksdb_lib"]) diff --git a/thirdparty/rocksdb/buckifier/rocks_test_runner.sh b/thirdparty/rocksdb/buckifier/rocks_test_runner.sh index e1f48a760d..baca6c2e40 100755 --- a/thirdparty/rocksdb/buckifier/rocks_test_runner.sh +++ b/thirdparty/rocksdb/buckifier/rocks_test_runner.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash # Create a tmp directory for the test to use TEST_DIR=$(mktemp -d /dev/shm/fbcode_rocksdb_XXXXXXX) +# shellcheck disable=SC2068 TEST_TMPDIR="$TEST_DIR" $@ && rm -rf "$TEST_DIR" diff --git a/thirdparty/rocksdb/buckifier/targets_builder.py b/thirdparty/rocksdb/buckifier/targets_builder.py index 7d47d2d1f9..3d5822d3cb 100644 --- a/thirdparty/rocksdb/buckifier/targets_builder.py +++ b/thirdparty/rocksdb/buckifier/targets_builder.py @@ -3,10 +3,8 @@ from __future__ import print_function from __future__ import unicode_literals import targets_cfg -import pprint -# TODO(tec): replace this with PrettyPrinter -def pretty_list(lst, indent=6): +def pretty_list(lst, indent=8): if lst is None or len(lst) == 0: return "" @@ -14,8 +12,8 @@ def pretty_list(lst, indent=6): return "\"%s\"" % lst[0] separator = "\",\n%s\"" % (" " * indent) - res = separator.join(lst) - res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 2)) + res = separator.join(sorted(lst)) + res = "\n" + (" " * indent) + "\"" + res + "\",\n" + (" " * (indent - 4)) return res @@ -27,19 +25,22 @@ def __init__(self, path): self.total_lib = 0 self.total_bin = 0 self.total_test = 0 - self.tests_cfg = [] + self.tests_cfg = "" def __del__(self): self.targets_file.close() def add_library(self, name, srcs, deps=None, headers=None): + headers_attr_prefix = "" if headers is None: + headers_attr_prefix = "auto_" headers = "AutoHeaders.RECURSIVE_GLOB" - self.targets_file.write(targets_cfg.library_template % ( - name, - headers, - pretty_list(srcs), - pretty_list(deps))) + 
self.targets_file.write(targets_cfg.library_template.format( + name=name, + srcs=pretty_list(srcs), + headers_attr_prefix=headers_attr_prefix, + headers=headers, + deps=pretty_list(deps))) self.total_lib = self.total_lib + 1 def add_binary(self, name, srcs, deps=None): @@ -53,13 +54,13 @@ def register_test(self, test_name, src, is_parallel): exec_mode = "serial" if is_parallel: exec_mode = "parallel" - self.tests_cfg.append([test_name, str(src), str(exec_mode)]) + self.tests_cfg += targets_cfg.test_cfg_template % ( + test_name, + str(src), + str(exec_mode)) self.total_test = self.total_test + 1 def flush_tests(self): - self.targets_file.write(targets_cfg.unittests_template % ( - pprint.PrettyPrinter().pformat(self.tests_cfg) - )) - - self.tests_cfg = [] + self.targets_file.write(targets_cfg.unittests_template % self.tests_cfg) + self.tests_cfg = "" diff --git a/thirdparty/rocksdb/buckifier/targets_cfg.py b/thirdparty/rocksdb/buckifier/targets_cfg.py index 33023a589f..f881588c59 100644 --- a/thirdparty/rocksdb/buckifier/targets_cfg.py +++ b/thirdparty/rocksdb/buckifier/targets_cfg.py @@ -2,123 +2,134 @@ from __future__ import division from __future__ import print_function from __future__ import unicode_literals -rocksdb_target_header = """ -import os - -TARGETS_PATH = os.path.dirname(__file__) -REPO_PATH = "rocksdb/src/" -BUCK_BINS = "buck-out/gen/" + REPO_PATH -TEST_RUNNER = REPO_PATH + "buckifier/rocks_test_runner.sh" -rocksdb_compiler_flags = [ - "-fno-builtin-memcmp", - "-DROCKSDB_PLATFORM_POSIX", - "-DROCKSDB_LIB_IO_POSIX", - "-DROCKSDB_FALLOCATE_PRESENT", - "-DROCKSDB_MALLOC_USABLE_SIZE", - "-DROCKSDB_RANGESYNC_PRESENT", - "-DROCKSDB_SCHED_GETCPU_PRESENT", - "-DROCKSDB_SUPPORT_THREAD_LOCAL", - "-DOS_LINUX", - # Flags to enable libs we include - "-DSNAPPY", - "-DZLIB", - "-DBZIP2", - "-DLZ4", - "-DZSTD", - "-DGFLAGS=gflags", - "-DNUMA", - "-DTBB", - # Needed to compile in fbcode - "-Wno-expansion-to-defined", +rocksdb_target_header = """load("@fbcode_macros//build_defs:auto_headers.bzl", "AutoHeaders") +load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") +load(":defs.bzl", "test_binary") + +REPO_PATH = package_name() + "/" + +ROCKSDB_COMPILER_FLAGS = [ + "-fno-builtin-memcmp", + "-DROCKSDB_PLATFORM_POSIX", + "-DROCKSDB_LIB_IO_POSIX", + "-DROCKSDB_FALLOCATE_PRESENT", + "-DROCKSDB_MALLOC_USABLE_SIZE", + "-DROCKSDB_RANGESYNC_PRESENT", + "-DROCKSDB_SCHED_GETCPU_PRESENT", + "-DROCKSDB_SUPPORT_THREAD_LOCAL", + "-DOS_LINUX", + # Flags to enable libs we include + "-DSNAPPY", + "-DZLIB", + "-DBZIP2", + "-DLZ4", + "-DZSTD", + "-DZSTD_STATIC_LINKING_ONLY", + "-DGFLAGS=gflags", + "-DNUMA", + "-DTBB", + # Needed to compile in fbcode + "-Wno-expansion-to-defined", + # Added missing flags from output of build_detect_platform + "-DROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "-DROCKSDB_BACKTRACE", + "-Wnarrowing", ] -rocksdb_external_deps = [ - ('bzip2', None, 'bz2'), - ('snappy', None, "snappy"), - ('zlib', None, 'z'), - ('gflags', None, 'gflags'), - ('lz4', None, 'lz4'), - ('zstd', None), - ('tbb', None), - ("numa", None, "numa"), - ("googletest", None, "gtest"), +ROCKSDB_EXTERNAL_DEPS = [ + ("bzip2", None, "bz2"), + ("snappy", None, "snappy"), + ("zlib", None, "z"), + ("gflags", None, "gflags"), + ("lz4", None, "lz4"), + ("zstd", None), + ("tbb", None), + ("numa", None, "numa"), + ("googletest", None, "gtest"), ] -rocksdb_preprocessor_flags = [ - # Directories with files for #include - "-I" + REPO_PATH + "include/", - "-I" + REPO_PATH, +ROCKSDB_PREPROCESSOR_FLAGS = [ + # Directories with files 
for #include + "-I" + REPO_PATH + "include/", + "-I" + REPO_PATH, ] -rocksdb_arch_preprocessor_flags = { - "x86_64": ["-DHAVE_SSE42"], +ROCKSDB_ARCH_PREPROCESSOR_FLAGS = { + "x86_64": [ + "-DHAVE_SSE42", + "-DHAVE_PCLMUL", + ], } + +build_mode = read_config("fbcode", "build_mode") + +is_opt_mode = build_mode.startswith("opt") + +# -DNDEBUG is added by default in opt mode in fbcode. But adding it twice +# doesn't harm and avoid forgetting to add it. +ROCKSDB_COMPILER_FLAGS += (["-DNDEBUG"] if is_opt_mode else []) + +sanitizer = read_config("fbcode", "sanitizer") + +# Do not enable jemalloc if sanitizer presents. RocksDB will further detect +# whether the binary is linked with jemalloc at runtime. +ROCKSDB_COMPILER_FLAGS += (["-DROCKSDB_JEMALLOC"] if sanitizer == "" else []) + +ROCKSDB_EXTERNAL_DEPS += ([("jemalloc", None, "headers")] if sanitizer == "" else []) """ library_template = """ cpp_library( - name = "%s", - headers = %s, - srcs = [%s], - deps = [%s], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + name = "{name}", + srcs = [{srcs}], + {headers_attr_prefix}headers = {headers}, + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [{deps}], + external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ binary_template = """ cpp_binary( - name = "%s", - srcs = [%s], - deps = [%s], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, + name = "%s", + srcs = [%s], + arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + compiler_flags = ROCKSDB_COMPILER_FLAGS, + preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + deps = [%s], + external_deps = ROCKSDB_EXTERNAL_DEPS, ) """ +test_cfg_template = """ [ + "%s", + "%s", + "%s", + ], +""" + unittests_template = """ # [test_name, test_src, test_type] -ROCKS_TESTS = %s - +ROCKS_TESTS = [ +%s] # Generate a test rule for each entry in ROCKS_TESTS -for test_cfg in ROCKS_TESTS: - test_name = test_cfg[0] - test_cc = test_cfg[1] - ttype = "gtest" if test_cfg[2] == "parallel" else "simple" - test_bin = test_name + "_bin" - - cpp_binary ( - name = test_bin, - srcs = [test_cc], - deps = [":rocksdb_test_lib"], - preprocessor_flags = rocksdb_preprocessor_flags, - arch_preprocessor_flags = rocksdb_arch_preprocessor_flags, - compiler_flags = rocksdb_compiler_flags, - external_deps = rocksdb_external_deps, - ) - - custom_unittest( - name = test_name, - type = ttype, - deps = [":" + test_bin], - command = [TEST_RUNNER, BUCK_BINS + test_bin] +# Do not build the tests in opt mode, since SyncPoint and other test code +# will not be included. 
+[ + test_binary( + parallelism = parallelism, + rocksdb_arch_preprocessor_flags = ROCKSDB_ARCH_PREPROCESSOR_FLAGS, + rocksdb_compiler_flags = ROCKSDB_COMPILER_FLAGS, + rocksdb_external_deps = ROCKSDB_EXTERNAL_DEPS, + rocksdb_preprocessor_flags = ROCKSDB_PREPROCESSOR_FLAGS, + test_cc = test_cc, + test_name = test_name, ) - -custom_unittest( - name = "make_rocksdbjavastatic", - type = "simple", - command = ["internal_repo_rocksdb/make_rocksdbjavastatic.sh"], -) - -custom_unittest( - name = "make_rocksdb_lite_release", - type = "simple", - command = ["internal_repo_rocksdb/make_rocksdb_lite_release.sh"], -) + for test_name, test_cc, parallelism in ROCKS_TESTS + if not is_opt_mode +] """ diff --git a/thirdparty/rocksdb/build_tools/RocksDBCommonHelper.php b/thirdparty/rocksdb/build_tools/RocksDBCommonHelper.php index 9fe770fe95..e7bfb52034 100644 --- a/thirdparty/rocksdb/build_tools/RocksDBCommonHelper.php +++ b/thirdparty/rocksdb/build_tools/RocksDBCommonHelper.php @@ -7,12 +7,12 @@ // Name of the environment variables which need to be set by the entity which // triggers continuous runs so that code at the end of the file gets executed // and Sandcastle run starts. -define("ENV_POST_RECEIVE_HOOK", "POST_RECEIVE_HOOK"); -define("ENV_HTTPS_APP_VALUE", "HTTPS_APP_VALUE"); -define("ENV_HTTPS_TOKEN_VALUE", "HTTPS_TOKEN_VALUE"); +const ENV_POST_RECEIVE_HOOK = "POST_RECEIVE_HOOK"; +const ENV_HTTPS_APP_VALUE = "HTTPS_APP_VALUE"; +const ENV_HTTPS_TOKEN_VALUE = "HTTPS_TOKEN_VALUE"; -define("PRIMARY_TOKEN_FILE", '/home/krad/.sandcastle'); -define("CONT_RUN_ALIAS", "leveldb"); +const PRIMARY_TOKEN_FILE = '/home/krad/.sandcastle'; +const CONT_RUN_ALIAS = "leveldb"; ////////////////////////////////////////////////////////////////////// /* Run tests in sandcastle */ @@ -97,7 +97,7 @@ function getSteps($applyDiff, $diffID, $username, $test) { } // fbcode is a sub-repo. We cannot patch until we add it to ignore otherwise - // Git thinks it is an uncommited change. + // Git thinks it is an uncommitted change. $fix_git_ignore = array( "name" => "Fix git ignore", "shell" => "echo fbcode >> .git/info/exclude", diff --git a/thirdparty/rocksdb/build_tools/build_detect_platform b/thirdparty/rocksdb/build_tools/build_detect_platform index c7ddb7ccee..057f77ec53 100755 --- a/thirdparty/rocksdb/build_tools/build_detect_platform +++ b/thirdparty/rocksdb/build_tools/build_detect_platform @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env bash # # Detects OS we're compiling on and outputs a file specified by the first # argument, which in turn gets read while processing Makefile. @@ -16,6 +16,8 @@ # PLATFORM_CXXFLAGS C++ compiler flags. Will contain: # PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned # shared libraries, empty otherwise. +# FIND Command for the find utility +# WATCH Command for the watch utility # # The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following: # @@ -51,11 +53,13 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" # If we're compiling with TSAN we need pic build PIC_BUILD=$COMPILE_WITH_TSAN - if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then - source "$PWD/build_tools/fbcode_config.sh" - else + if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then # we need this to build with MySQL. Don't use for other purposes. 
source "$PWD/build_tools/fbcode_config4.8.1.sh" + elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007" ]; then + source "$PWD/build_tools/fbcode_config_platform007.sh" + else + source "$PWD/build_tools/fbcode_config.sh" fi fi @@ -64,11 +68,23 @@ rm -f "$OUTPUT" touch "$OUTPUT" if test -z "$CC"; then - CC=cc + if [ -x "$(command -v cc)" ]; then + CC=cc + elif [ -x "$(command -v clang)" ]; then + CC=clang + else + CC=cc + fi fi if test -z "$CXX"; then - CXX=g++ + if [ -x "$(command -v g++)" ]; then + CXX=g++ + elif [ -x "$(command -v clang++)" ]; then + CXX=clang++ + else + CXX=g++ + fi fi # Detect OS @@ -85,7 +101,15 @@ if test -z "$CLANG_SCAN_BUILD"; then fi if test -z "$CLANG_ANALYZER"; then - CLANG_ANALYZER=$(which clang++ 2> /dev/null) + CLANG_ANALYZER=$(command -v clang++ 2> /dev/null) +fi + +if test -z "$FIND"; then + FIND=find +fi + +if test -z "$WATCH"; then + WATCH=watch fi COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" @@ -122,6 +146,8 @@ case "$TARGET_OS" in COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX" if [ -z "$USE_CLANG" ]; then COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + else + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" # PORT_FILES=port/linux/linux_specific.cc @@ -141,6 +167,7 @@ case "$TARGET_OS" in ;; FreeBSD) PLATFORM=OS_FREEBSD + CXX=clang++ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread" # PORT_FILES=port/freebsd/freebsd_specific.cc @@ -153,9 +180,12 @@ case "$TARGET_OS" in ;; OpenBSD) PLATFORM=OS_OPENBSD + CXX=clang++ COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread" # PORT_FILES=port/openbsd/openbsd_specific.cc + FIND=gfind + WATCH=gnuwatch ;; DragonFly) PLATFORM=OS_DRAGONFLYBSD @@ -170,6 +200,8 @@ case "$TARGET_OS" in COMMON_FLAGS="$COMMON_FLAGS -DCYGWIN" if [ -z "$USE_CLANG" ]; then COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp" + else + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic" fi PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt" # PORT_FILES=port/linux/linux_specific.cc @@ -202,7 +234,7 @@ else #include int main() { int fd = open("/dev/null", 0); - fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); + fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024); } EOF if [ "$?" = 0 ]; then @@ -210,119 +242,147 @@ EOF fi fi - # Test whether Snappy library is installed - # http://code.google.com/p/snappy/ - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() {} + if ! test $ROCKSDB_DISABLE_SNAPPY; then + # Test whether Snappy library is installed + # http://code.google.com/p/snappy/ + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lsnappy" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lsnappy" + fi fi - # Test whether gflags library is installed - # http://gflags.github.io/gflags/ - # check if the namespace is gflags - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF - #include - using namespace gflags; - int main() {} + if ! test $ROCKSDB_DISABLE_GFLAGS; then + # Test whether gflags library is installed + # http://gflags.github.io/gflags/ + # check if the namespace is gflags + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + #include + int main() {} EOF - if [ "$?" 
= 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=gflags" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - else - # check if namespace is google - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF - #include - using namespace google; - int main() {} + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + else + # check if namespace is google + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null << EOF + #include + using namespace google; + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" - fi + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=google" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags" + fi + fi fi - # Test whether zlib library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() {} + if ! test $ROCKSDB_DISABLE_ZLIB; then + # Test whether zlib library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DZLIB" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lz" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DZLIB" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lz" + fi fi - # Test whether bzip library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() {} + if ! test $ROCKSDB_DISABLE_BZIP; then + # Test whether bzip library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DBZIP2" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lbz2" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DBZIP2" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lbz2" + fi fi - # Test whether lz4 library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < - #include - int main() {} + if ! test $ROCKSDB_DISABLE_LZ4; then + # Test whether lz4 library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + #include + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DLZ4" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4" - JAVA_LDFLAGS="$JAVA_LDFLAGS -llz4" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DLZ4" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4" + JAVA_LDFLAGS="$JAVA_LDFLAGS -llz4" + fi fi - # Test whether zstd library is installed - $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() {} + if ! test $ROCKSDB_DISABLE_ZSTD; then + # Test whether zstd library is installed + $CXX $CFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DZSTD" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lzstd" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lzstd" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DZSTD" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lzstd" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lzstd" + fi fi - # Test whether numa is available - $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null < - #include - int main() {} + if ! test $ROCKSDB_DISABLE_NUMA; then + # Test whether numa is available + $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null < + #include + int main() {} EOF - if [ "$?" 
= 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DNUMA" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lnuma" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DNUMA" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lnuma" + fi fi - # Test whether tbb is available - $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null < - int main() {} + if ! test $ROCKSDB_DISABLE_TBB; then + # Test whether tbb is available + $CXX $CFLAGS $LDFLAGS -x c++ - -o /dev/null -ltbb 2>/dev/null < + int main() {} EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DTBB" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltbb" - JAVA_LDFLAGS="$JAVA_LDFLAGS -ltbb" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DTBB" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltbb" + JAVA_LDFLAGS="$JAVA_LDFLAGS -ltbb" + fi fi - # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ - 2>/dev/null; then - # This will enable some preprocessor identifiers in the Makefile - JEMALLOC=1 - # JEMALLOC can be enabled either using the flag (like here) or by - # providing direct link to the jemalloc library - WITH_JEMALLOC_FLAG=1 - else + if ! test $ROCKSDB_DISABLE_JEMALLOC; then + # Test whether jemalloc is available + if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null -ljemalloc \ + 2>/dev/null; then + # This will enable some preprocessor identifiers in the Makefile + JEMALLOC=1 + # JEMALLOC can be enabled either using the flag (like here) or by + # providing direct link to the jemalloc library + WITH_JEMALLOC_FLAG=1 + # check for JEMALLOC installed with HomeBrew + if [ "$PLATFORM" == "OS_MACOSX" ]; then + if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then + JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ') + JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include" + JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB" + JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB" + fi + fi + fi + fi + if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then # jemalloc is not available. Let's try tcmalloc if echo 'int main() {}' | $CXX $CFLAGS -x c++ - -o /dev/null \ -ltcmalloc 2>/dev/null; then @@ -331,88 +391,111 @@ EOF fi fi - # Test whether malloc_usable_size is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() { - size_t res = malloc_usable_size(0); - return 0; - } + if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then + # Test whether malloc_usable_size is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + size_t res = malloc_usable_size(0); + return 0; + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE" + fi fi - # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() { - int x = PTHREAD_MUTEX_ADAPTIVE_NP; - return 0; - } + if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then + # Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int x = PTHREAD_MUTEX_ADAPTIVE_NP; + return 0; + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX" + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX" + fi fi - # Test whether backtrace is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> - int main() { - void* frames[1]; - backtrace_symbols(frames, backtrace(frames, 1)); - return 0; - } -EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" - else - # Test whether execinfo library is installed - $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null < + if ! test $ROCKSDB_DISABLE_BACKTRACE; then + # Test whether backtrace is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null <> int main() { void* frames[1]; backtrace_symbols(frames, backtrace(frames, 1)); + return 0; } EOF if [ "$?" = 0 ]; then COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lexecinfo" - JAVA_LDFLAGS="$JAVA_LDFLAGS -lexecinfo" + else + # Test whether execinfo library is installed + $CXX $CFLAGS -lexecinfo -x c++ - -o /dev/null 2>/dev/null < + int main() { + void* frames[1]; + backtrace_symbols(frames, backtrace(frames, 1)); + } +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lexecinfo" + JAVA_LDFLAGS="$JAVA_LDFLAGS -lexecinfo" + fi fi fi - # Test if -pg is supported - $CXX $CFLAGS -pg -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null < - int main() { - int fd = open("/dev/null", 0); - sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE); - } + if ! test $ROCKSDB_DISABLE_SYNC_FILE_RANGE; then + # Test whether sync_file_range is supported for compatibility with an old glibc + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int fd = open("/dev/null", 0); + sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE); + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_RANGESYNC_PRESENT" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_RANGESYNC_PRESENT" + fi fi - # Test whether sched_getcpu is supported - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - int main() { - int cpuid = sched_getcpu(); - } + if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then + # Test whether sched_getcpu is supported + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() { + int cpuid = sched_getcpu(); + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_SCHED_GETCPU_PRESENT" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_SCHED_GETCPU_PRESENT" + fi + fi + + if ! test $ROCKSDB_DISABLE_ALIGNED_NEW; then + # Test whether c++17 aligned-new is supported + $CXX $PLATFORM_CXXFLAGS -faligned-new -x c++ - -o /dev/null 2>/dev/null </dev/null </dev/null < + #include + int main() { + const auto a = _mm_set_epi64x(0, 0); + const auto b = _mm_set_epi64x(0, 0); + const auto c = _mm_clmulepi64_si128(a, b, 0x00); + auto d = _mm_cvtsi128_si64(c); + } +EOF +if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DHAVE_PCLMUL" +elif test "$USE_SSE"; then + echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" + exit 1 fi # iOS doesn't support thread-local storage, but this check would erroneously @@ -519,6 +627,8 @@ echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" +echo "FIND=$FIND" >> "$OUTPUT" +echo "WATCH=$WATCH" >> "$OUTPUT" # This will enable some related identifiers for the preprocessor if test -n "$JEMALLOC"; then echo "JEMALLOC=1" >> "$OUTPUT" diff --git a/thirdparty/rocksdb/build_tools/cont_integration.sh b/thirdparty/rocksdb/build_tools/cont_integration.sh index 06f25c596e..66d2552278 100755 --- a/thirdparty/rocksdb/build_tools/cont_integration.sh +++ b/thirdparty/rocksdb/build_tools/cont_integration.sh @@ -13,10 +13,12 @@ error=0 function log { DATE=`date +%Y-%m-%d:%H:%M:%S` + # shellcheck disable=SC2068 echo $DATE $@ } function log_err { + # shellcheck disable=SC2145 log "ERROR: $@ Error code: $error." } diff --git a/thirdparty/rocksdb/build_tools/dependencies.sh b/thirdparty/rocksdb/build_tools/dependencies.sh index 868753b8a7..868e0bbddf 100644 --- a/thirdparty/rocksdb/build_tools/dependencies.sh +++ b/thirdparty/rocksdb/build_tools/dependencies.sh @@ -1,18 +1,18 @@ -GCC_BASE=/mnt/gvfs/third-party2/gcc/2928bb3ed95bf64f5b388ee88c30dc74710c3b35/5.x/centos6-native/f4950a1 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/a5fea028cb7ba43498976e1f8054b0b2e790c295/stable/centos6-native/6aaf4de -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/7a9099f6587ee4378c0b1fa32bb8934019d30ca4/5.x/gcc-5-glibc-2.23/339d858 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3b7c6469854dfc7832a1c3cc5b86919a84e5f865/2.23/gcc-5-glibc-2.23/ca1d1c0 -SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/8c38a4c1e52b4c2cc8a9cdc31b9c947ed7dbfcb4/1.1.3/gcc-5-glibc-2.23/9bc6787 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/d7861abe6f0e27ab98c9303b95a662f0e4cdedb5/1.2.8/gcc-5-glibc-2.23/9bc6787 -BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/740325875f6729f42d28deaa2147b0854f3a347e/1.0.6/gcc-5-glibc-2.23/9bc6787 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/0815d59804160c96caac5f27ca004f51af893dc6/r131/gcc-5-glibc-2.23/9bc6787 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/c15a4f5f619a2930478d01e2e34dc1e0652b0873/1.1.4/gcc-5-glibc-2.23/03859b5 -GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/f905a5e1032fb30c05db3d3752319857388c0c49/2.2.0/gcc-5-glibc-2.23/9bc6787 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/8d60633d822a2a55849c73db24e74a25e52b71db/master/gcc-5-glibc-2.23/1c32b4b -NUMA_BASE=/mnt/gvfs/third-party2/numa/17c514c4d102a25ca15f4558be564eeed76f4b6a/2.0.8/gcc-5-glibc-2.23/9bc6787 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/8db74270cd6d0212ac92d69e7fc7beefe617d772/trunk/gcc-5-glibc-2.23/b1847cb -TBB_BASE=/mnt/gvfs/third-party2/tbb/9d9a554877d0c5bef330fe818ab7178806dd316a/4.0_update2/gcc-5-glibc-2.23/9bc6787 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/90c9734afc5579c9d1db529fa788d09f97763b85/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/9e829389ef61b92c62de8748c80169aaf25ce1f0/2.26.1/centos6-native/da39a3e -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/d7f4d4d86674a57668e3a96f76f0e17dd0eb8765/3.11.0/gcc-5-glibc-2.23/9bc6787 -LUA_BASE=/mnt/gvfs/third-party2/lua/61e4abf5813bbc39bc4f548757ccfcadde175a48/5.2.3/gcc-5-glibc-2.23/65372bd 
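Every feature probe in the build_detect_platform hunks earlier in this patch follows the same shape: a tiny translation unit is piped into $CXX through a heredoc, a zero exit status is turned into a -D define, and a ROCKSDB_DISABLE_* environment variable serves as an opt-out. For illustration, here is a minimal sketch of one such probe program, modeled on the PCLMUL check above; the compile command in the comment is indicative, not the exact script line.

// Probe translation unit of the kind build_detect_platform feeds to $CXX, e.g.
//   $CXX $PLATFORM_CXXFLAGS -mpclmul -x c++ - -o /dev/null <<EOF ... EOF
// A zero exit status makes the script append -DHAVE_PCLMUL to COMMON_FLAGS;
// exporting ROCKSDB_DISABLE_PCLMUL skips the probe entirely.
#include <cstdint>
#include <wmmintrin.h>

int main() {
  const auto a = _mm_set_epi64x(0, 0);
  const auto b = _mm_set_epi64x(0, 0);
  const auto c = _mm_clmulepi64_si128(a, b, 0x00);  // carry-less multiply
  auto d = _mm_cvtsi128_si64(c);
  (void)d;  // the probe only needs to compile and link
  return 0;
}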
+GCC_BASE=/mnt/gvfs/third-party2/gcc/112ec378fec7002ad3e09afde022e656049f7191/5.x/centos7-native/c447969 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/04999bdb3ce81a11073535dcb00b5e13dc1cbaf5/stable/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/92b0c8e5c8eecc71eb042594ce1ab3413799b385/5.x/gcc-5-glibc-2.23/339d858 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/3d8698d5973ba94f41620a80a67e4457fdf01e90/2.23/gcc-5-glibc-2.23/ca1d1c0 +SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/gcc-5-glibc-2.23/9bc6787 +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/gcc-5-glibc-2.23/9bc6787 +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/gcc-5-glibc-2.23/9bc6787 +LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/gcc-5-glibc-2.23/9bc6787 +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/af6628a46758f1a15484a1760cd7294164bc5ba1/1.3.5/gcc-5-glibc-2.23/03859b5 +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/gcc-5-glibc-2.23/9bc6787 +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b1a0e56c1e3e6929813a4331ade3a58ff083afbb/master/gcc-5-glibc-2.23/aa64d6b +NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/gcc-5-glibc-2.23/9bc6787 +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/gcc-5-glibc-2.23/b443de1 +TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/gcc-5-glibc-2.23/9bc6787 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/4.0.9-36_fbk5_2933_gd092e3f/gcc-5-glibc-2.23/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/55031de95a2b46c82948743419a603b3d6aefe28/2.29.1/centos7-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/gcc-5-glibc-2.23/9bc6787 +LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.2.3/gcc-5-glibc-2.23/65372bd diff --git a/thirdparty/rocksdb/build_tools/dependencies_4.8.1.sh b/thirdparty/rocksdb/build_tools/dependencies_4.8.1.sh index ef0cda2398..bd02165d8a 100644 --- a/thirdparty/rocksdb/build_tools/dependencies_4.8.1.sh +++ b/thirdparty/rocksdb/build_tools/dependencies_4.8.1.sh @@ -1,3 +1,4 @@ +# shellcheck disable=SC2148 GCC_BASE=/mnt/gvfs/third-party2/gcc/cf7d14c625ce30bae1a4661c2319c5a283e4dd22/4.8.1/centos6-native/cc6c9dc CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/8598c375b0e94e1448182eb3df034704144a838d/stable/centos6-native/3f16ddd LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/d6e0a7da6faba45f5e5b1638f9edd7afc2f34e7d/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc diff --git a/thirdparty/rocksdb/build_tools/dependencies_platform007.sh b/thirdparty/rocksdb/build_tools/dependencies_platform007.sh new file mode 100644 index 0000000000..44e9e58f8e --- /dev/null +++ b/thirdparty/rocksdb/build_tools/dependencies_platform007.sh @@ -0,0 +1,18 @@ +GCC_BASE=/mnt/gvfs/third-party2/gcc/6e8e715624fd15256a7970073387793dfcf79b46/7.x/centos7-native/b2ef2b6 +CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/ef37e1faa1c29782abfac1ae65a291b9b7966f6d/stable/centos7-native/c9f9104 +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c67031f0f739ac61575a061518d6ef5038f99f90/7.x/platform007/5620abc +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/60d6f124a78798b73944f5ba87c2306ae3460153/2.26/platform007/f259413 
+SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/7f9bdaada18f59bc27ec2b0871eb8a6144343aef/1.1.3/platform007/ca4da3d +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/22c2d65676fb7c23cfa797c4f6937f38b026f3cf/1.2.8/platform007/ca4da3d +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/dc49a21c5fceec6456a7a28a94dcd16690af1337/1.0.6/platform007/ca4da3d +LZ4_BASE=/mnt/gvfs/third-party2/lz4/907b498203d297947f3bb70b9466f47e100f1873/r131/platform007/ca4da3d +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/3ee276cbacfad3074e3f07bf826ac47f06970f4e/1.3.5/platform007/15a3614 +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/0b9929d2588991c65a57168bf88aff2db87c5d48/2.2.0/platform007/ca4da3d +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/9c910d36d6235cc40e8ff559358f1833452300ca/master/platform007/5b0f53e +NUMA_BASE=/mnt/gvfs/third-party2/numa/9cbf2460284c669ed19c3ccb200a71f7dd7e53c7/2.0.11/platform007/ca4da3d +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/bf3d7497fe4e6d007354f0adffa16ce3003f8338/1.3/platform007/6f3e0a9 +TBB_BASE=/mnt/gvfs/third-party2/tbb/ff4e0b093534704d8abab678a4fd7f5ea7b094c7/2018_U5/platform007/ca4da3d +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/b5c4a61a5c483ba24722005ae07895971a2ac707/fb/platform007/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/92ff90349e2f43ea0a8246d8b1cf17b6869013e3/2.29.1/centos7-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/f3f697a28122e6bcd513273dd9c1ff23852fc59f/3.13.0/platform007/ca4da3d +LUA_BASE=/mnt/gvfs/third-party2/lua/f0cd714433206d5139df61659eb7b28b1dea6683/5.3.4/platform007/5007832 diff --git a/thirdparty/rocksdb/build_tools/error_filter.py b/thirdparty/rocksdb/build_tools/error_filter.py index 9f619cf4ba..5ef1e9c269 100644 --- a/thirdparty/rocksdb/build_tools/error_filter.py +++ b/thirdparty/rocksdb/build_tools/error_filter.py @@ -64,8 +64,12 @@ def parse_error(self, line): class CompilerErrorParser(MatchErrorParser): def __init__(self): - # format: '<filename>:<line #>:<column #>: error: <error msg>' - super(CompilerErrorParser, self).__init__(r'\S+:\d+:\d+: error:') + # format (compile error): + # '<filename>:<line #>:<column #>: error: <error msg>' + # format (link error): + # '<filename>:<line #>: error: <error msg>' + # The below regex catches both + super(CompilerErrorParser, self).__init__(r'\S+:\d+: error:') class ScanBuildErrorParser(MatchErrorParser): @@ -128,11 +132,14 @@ def __init__(self): 'lite': [CompilerErrorParser], 'lite_test': [CompilerErrorParser, GTestErrorParser], 'stress_crash': [CompilerErrorParser, DbCrashErrorParser], + 'stress_crash_with_atomic_flush': [CompilerErrorParser, DbCrashErrorParser], 'write_stress': [CompilerErrorParser, WriteStressErrorParser], 'asan': [CompilerErrorParser, GTestErrorParser, AsanErrorParser], 'asan_crash': [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser], + 'asan_crash_with_atomic_flush': [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser], 'ubsan': [CompilerErrorParser, GTestErrorParser, UbsanErrorParser], 'ubsan_crash': [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser], + 'ubsan_crash_with_atomic_flush': [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser], 'valgrind': [CompilerErrorParser, GTestErrorParser, ValgrindErrorParser], 'tsan': [CompilerErrorParser, GTestErrorParser, TsanErrorParser], 'format_compatible': [CompilerErrorParser, CompatErrorParser], diff --git a/thirdparty/rocksdb/build_tools/fbcode_config.sh b/thirdparty/rocksdb/build_tools/fbcode_config.sh index b8609a11c6..f46a580bde 100644 --- a/thirdparty/rocksdb/build_tools/fbcode_config.sh +++ b/thirdparty/rocksdb/build_tools/fbcode_config.sh @@ -43,11 +43,15 @@ if test -z $PIC_BUILD; then
LZ4_INCLUDE=" -I $LZ4_BASE/include/" LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" CFLAGS+=" -DLZ4" +fi - ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +if test -z $PIC_BUILD; then ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" - CFLAGS+=" -DZSTD" +else + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a" fi +CFLAGS+=" -DZSTD -DZSTD_STATIC_LINKING_ONLY" # location of gflags headers and libraries GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" @@ -83,6 +87,7 @@ CFLAGS+=" -DTBB" # use Intel SSE support for checksum calculations export USE_SSE=1 +export PORTABLE=1 BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" diff --git a/thirdparty/rocksdb/build_tools/fbcode_config4.8.1.sh b/thirdparty/rocksdb/build_tools/fbcode_config4.8.1.sh index f5b8334db2..c40c10131a 100644 --- a/thirdparty/rocksdb/build_tools/fbcode_config4.8.1.sh +++ b/thirdparty/rocksdb/build_tools/fbcode_config4.8.1.sh @@ -54,6 +54,7 @@ TBB_LIBS="$TBB_BASE/lib/libtbb.a" # use Intel SSE support for checksum calculations export USE_SSE=1 +export PORTABLE=1 BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" diff --git a/thirdparty/rocksdb/build_tools/fbcode_config_platform007.sh b/thirdparty/rocksdb/build_tools/fbcode_config_platform007.sh new file mode 100644 index 0000000000..67d156e4c9 --- /dev/null +++ b/thirdparty/rocksdb/build_tools/fbcode_config_platform007.sh @@ -0,0 +1,157 @@ +#!/bin/sh +# +# Set environment variables so that we can compile rocksdb using +# fbcode settings. It uses the latest g++ and clang compilers and also +# uses jemalloc +# Environment variables that change the behavior of this script: +# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included + + +BASEDIR=`dirname $BASH_SOURCE` +source "$BASEDIR/dependencies_platform007.sh" + +CFLAGS="" + +# libgcc +LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/7.3.0" +LIBGCC_LIBS=" -L $LIBGCC_BASE/lib" + +# glibc +GLIBC_INCLUDE="$GLIBC_BASE/include" +GLIBC_LIBS=" -L $GLIBC_BASE/lib" + +# snappy +SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/" +if test -z $PIC_BUILD; then + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a" +else + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a" +fi +CFLAGS+=" -DSNAPPY" + +if test -z $PIC_BUILD; then + # location of zlib headers and libraries + ZLIB_INCLUDE=" -I $ZLIB_BASE/include/" + ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a" + CFLAGS+=" -DZLIB" + + # location of bzip headers and libraries + BZIP_INCLUDE=" -I $BZIP2_BASE/include/" + BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a" + CFLAGS+=" -DBZIP2" + + LZ4_INCLUDE=" -I $LZ4_BASE/include/" + LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" + CFLAGS+=" -DLZ4" +fi + +ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" +if test -z $PIC_BUILD; then + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" +else + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a" +fi +CFLAGS+=" -DZSTD" + +# location of gflags headers and libraries +GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" +if test -z $PIC_BUILD; then + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a" +else + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a" +fi +CFLAGS+=" -DGFLAGS=gflags" + +# location of jemalloc +JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/" +JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a" + +if test -z $PIC_BUILD; then + # location of numa + NUMA_INCLUDE=" -I $NUMA_BASE/include/" + NUMA_LIB=" $NUMA_BASE/lib/libnuma.a" + CFLAGS+=" -DNUMA" + + # location of libunwind + LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" +fi + +# location of TBB +TBB_INCLUDE=" -isystem $TBB_BASE/include/" +if test -z $PIC_BUILD; then + 
TBB_LIBS="$TBB_BASE/lib/libtbb.a" +else + TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a" +fi +CFLAGS+=" -DTBB" + +# use Intel SSE support for checksum calculations +export USE_SSE=1 +export PORTABLE=1 + +BINUTILS="$BINUTILS_BASE/bin" +AR="$BINUTILS/ar" + +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE" + +STDLIBS="-L $GCC_BASE/lib64" + +CLANG_BIN="$CLANG_BASE/bin" +CLANG_LIB="$CLANG_BASE/lib" +CLANG_SRC="$CLANG_BASE/../../src" + +CLANG_ANALYZER="$CLANG_BIN/clang++" +CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build" + +if [ -z "$USE_CLANG" ]; then + # gcc + CC="$GCC_BASE/bin/gcc" + CXX="$GCC_BASE/bin/g++" + + CFLAGS+=" -B$BINUTILS/gold" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $GLIBC_INCLUDE" + JEMALLOC=1 +else + # clang + CLANG_INCLUDE="$CLANG_LIB/clang/stable/include" + CC="$CLANG_BIN/clang" + CXX="$CLANG_BIN/clang++" + + KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" + + CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/7.x/x86_64-facebook-linux " + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $CLANG_INCLUDE" + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CFLAGS+=" -Wno-expansion-to-defined " + CXXFLAGS="-nostdinc++" +fi + +CFLAGS+=" $DEPS_INCLUDE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_SUPPORT_THREAD_LOCAL -DHAVE_SSE42" +CXXFLAGS+=" $CFLAGS" + +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS" +EXEC_LDFLAGS+=" -B$BINUTILS/gold" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform007/lib/ld.so" +EXEC_LDFLAGS+=" $LIBUNWIND" +EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform007/lib" +# required by libtbb +EXEC_LDFLAGS+=" -ldl" + +PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" + +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS" + +VALGRIND_VER="$VALGRIND_BASE/bin/" + +# lua not supported because it's on track for deprecation, I think +LUA_PATH= +LUA_LIB= + +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/thirdparty/rocksdb/build_tools/gnu_parallel b/thirdparty/rocksdb/build_tools/gnu_parallel index abbf8f1008..1cf164fff0 100755 --- a/thirdparty/rocksdb/build_tools/gnu_parallel +++ b/thirdparty/rocksdb/build_tools/gnu_parallel @@ -5082,8 +5082,8 @@ sub openoutputfiles { # Set reading FD if using --group (--ungroup does not need) for my $fdno (1,2) { # Re-open the file for reading - # so fdw can be closed seperately - # and fdr can be seeked seperately (for --line-buffer) + # so fdw can be closed separately + # and fdr can be seeked separately (for --line-buffer) open(my $fdr,"<", $self->fh($fdno,'name')) || ::die_bug("fdr: Cannot open ".$self->fh($fdno,'name')); $self->set_fh($fdno,'r',$fdr); diff --git a/thirdparty/rocksdb/build_tools/make_package.sh b/thirdparty/rocksdb/build_tools/make_package.sh index 58bac44739..0d86548e82 100755 --- a/thirdparty/rocksdb/build_tools/make_package.sh +++ b/thirdparty/rocksdb/build_tools/make_package.sh @@ -1,3 +1,4 @@ +# 
shellcheck disable=SC1113 #/usr/bin/env bash set -e @@ -28,12 +29,14 @@ function package() { if dpkg --get-selections | grep --quiet $1; then log "$1 is already installed. skipping." else + # shellcheck disable=SC2068 apt-get install $@ -y fi elif [[ $OS = "centos" ]]; then if rpm -qa | grep --quiet $1; then log "$1 is already installed. skipping." else + # shellcheck disable=SC2068 yum install $@ -y fi fi @@ -52,6 +55,7 @@ function gem_install() { if gem list | grep --quiet $1; then log "$1 is already installed. skipping." else + # shellcheck disable=SC2068 gem install $@ fi } @@ -125,4 +129,5 @@ function main() { include $LIB_DIR } +# shellcheck disable=SC2068 main $@ diff --git a/thirdparty/rocksdb/build_tools/rocksdb-lego-determinator b/thirdparty/rocksdb/build_tools/rocksdb-lego-determinator index 6e8ae9cd73..2447a19ae4 100755 --- a/thirdparty/rocksdb/build_tools/rocksdb-lego-determinator +++ b/thirdparty/rocksdb/build_tools/rocksdb-lego-determinator @@ -85,9 +85,12 @@ NON_SHM="TMPD=/tmp/rocksdb_test_tmp" GCC_481="ROCKSDB_FBCODE_BUILD_WITH_481=1" ASAN="COMPILE_WITH_ASAN=1" CLANG="USE_CLANG=1" -LITE="OPT=\"-DROCKSDB_LITE -g\"" -TSAN="COMPILE_WITH_TSAN=1" +# in gcc-5 there are known problems with TSAN like https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71090. +# using platform007 gives us gcc-8 or higher which has that bug fixed. +TSAN="ROCKSDB_FBCODE_BUILD_WITH_PLATFORM007=1 COMPILE_WITH_TSAN=1" UBSAN="COMPILE_WITH_UBSAN=1" +TSAN_CRASH='CRASH_TEST_EXT_ARGS="--compression_type=zstd --log2_keys_per_lock=22"' +NON_TSAN_CRASH="CRASH_TEST_EXT_ARGS=--compression_type=zstd" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" HTTP_PROXY="https_proxy=http://fwdproxy.29.prn1:8080 http_proxy=http://fwdproxy.29.prn1:8080 ftp_proxy=http://fwdproxy.29.prn1:8080" SETUP_JAVA_ENV="export $HTTP_PROXY; export JAVA_HOME=/usr/local/jdk-8u60-64/; export PATH=\$JAVA_HOME/bin:\$PATH" @@ -343,7 +346,7 @@ LITE_BUILD_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build RocksDB debug version', - 'shell':'$LITE make J=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', + 'shell':'make J=1 LITE=1 all check || $CONTRUN_NAME=lite $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -352,6 +355,22 @@ LITE_BUILD_COMMANDS="[ } ]" +# +# Report RocksDB lite binary size to scuba +REPORT_LITE_BINARY_SIZE_COMMANDS="[ + { + 'name':'Rocksdb Lite Binary Size', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Report RocksDB Lite binary size to scuba', + 'shell':'tools/report_lite_binary_size.sh', + 'user':'root', + }, + ], +]" + # # RocksDB stress/crash test # @@ -364,14 +383,43 @@ STRESS_CRASH_TEST_COMMANDS="[ $CLEANUP_ENV, { 'name':'Build and run RocksDB debug stress tests', - 'shell':'$SHM $DEBUG make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', 'user':'root', $PARSER }, { 'name':'Build and run RocksDB debug crash tests', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test || $CONTRUN_NAME=crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + } + ], + $ARTIFACTS, + $REPORT + } +]" + +# +# RocksDB stress/crash test with atomic flush +# +STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ + { + 'name':'Rocksdb Stress/Crash Test (atomic flush)', + 'oncall':'$ONCALL', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug stress tests', + 'shell':'$SHM $DEBUG 
$NON_TSAN_CRASH make J=1 db_stress || $CONTRUN_NAME=db_stress $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + { + 'name':'Build and run RocksDB debug crash tests with atomic flush', + 'timeout': 86400, + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER } @@ -436,7 +484,29 @@ ASAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug asan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test || $CONTRUN_NAME=asan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB crash testing with atomic flush under address sanitizer +# +ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ + { + 'name':'Rocksdb crash test (atomic flush) under ASAN', + 'oncall':'$ONCALL', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug asan_crash_test_with_atomic_flush', + 'timeout': 86400, + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 asan_crash_test_with_atomic_flush || $CONTRUN_NAME=asan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -478,7 +548,29 @@ UBSAN_CRASH_TEST_COMMANDS="[ { 'name':'Build and run RocksDB debug ubsan_crash_test', 'timeout': 86400, - 'shell':'$SHM $DEBUG make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test || $CONTRUN_NAME=ubsan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB crash testing with atomic flush under undefined behavior sanitizer +# +UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ + { + 'name':'Rocksdb crash test (atomic flush) under UBSAN', + 'oncall':'$ONCALL', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build and run RocksDB debug ubsan_crash_test_with_atomic_flush', + 'timeout': 86400, + 'shell':'$SHM $DEBUG $NON_TSAN_CRASH make J=1 ubsan_crash_test_with_atomic_flush || $CONTRUN_NAME=ubsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -544,7 +636,29 @@ TSAN_CRASH_TEST_COMMANDS="[ { 'name':'Compile and run', 'timeout': 86400, - 'shell':'set -o pipefail && $SHM $DEBUG $TSAN CRASH_TEST_KILL_ODD=1887 CRASH_TEST_EXT_ARGS=--log2_keys_per_lock=22 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test || $CONTRUN_NAME=tsan_crash_test $TASK_CREATION_TOOL', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + +# +# RocksDB crash test with atomic flush under TSAN +# +TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS="[ + { + 'name':'Rocksdb Crash Test with atomic flush under TSAN', + 'oncall':'$ONCALL', + 'timeout': 86400, + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Compile and run', + 'timeout': 86400, + 'shell':'set -o pipefail && $SHM $DEBUG $TSAN $TSAN_CRASH CRASH_TEST_KILL_ODD=1887 make J=1 crash_test_with_atomic_flush || $CONTRUN_NAME=tsan_crash_test_with_atomic_flush $TASK_CREATION_TOOL', 'user':'root', $PARSER }, @@ -592,7 +706,7 @@ run_no_compression() rm -rf /dev/shm/rocksdb mkdir /dev/shm/rocksdb make clean - cat build_tools/fbcode_config.sh | grep -iv dzlib | grep -iv dlz4 | grep -iv dsnappy | grep -iv dbzip2 > .tmp.fbcode_config.sh + cat build_tools/fbcode_config.sh | grep 
-iv dzstd | grep -iv dzlib | grep -iv dlz4 | grep -iv dsnappy | grep -iv dbzip2 > .tmp.fbcode_config.sh mv .tmp.fbcode_config.sh build_tools/fbcode_config.sh cat Makefile | grep -v tools/ldb_test.py > .tmp.Makefile mv .tmp.Makefile Makefile @@ -645,12 +759,12 @@ run_regression() # === lite build === make clean - OPT=-DROCKSDB_LITE make -j$(nproc) static_lib + make LITE=1 -j$(nproc) static_lib send_size_to_ods static_lib_lite $(stat --printf="%s" librocksdb.a) strip librocksdb.a send_size_to_ods static_lib_lite_stripped $(stat --printf="%s" librocksdb.a) - OPT=-DROCKSDB_LITE make -j$(nproc) shared_lib + make LITE=1 -j$(nproc) shared_lib send_size_to_ods shared_lib_lite $(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` send_size_to_ods shared_lib_lite_stripped $(stat --printf="%s" `readlink -f librocksdb.so`) @@ -728,9 +842,15 @@ case $1 in lite) echo $LITE_BUILD_COMMANDS ;; + report_lite_binary_size) + echo $REPORT_LITE_BINARY_SIZE_COMMANDS + ;; stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; + stress_crash_with_atomic_flush) + echo $STRESS_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS + ;; write_stress) echo $WRITE_STRESS_COMMANDS ;; @@ -740,12 +860,18 @@ case $1 in asan_crash) echo $ASAN_CRASH_TEST_COMMANDS ;; + asan_crash_with_atomic_flush) + echo $ASAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS + ;; ubsan) echo $UBSAN_TEST_COMMANDS ;; ubsan_crash) echo $UBSAN_CRASH_TEST_COMMANDS ;; + ubsan_crash_with_atomic_flush) + echo $UBSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS + ;; valgrind) echo $VALGRIND_TEST_COMMANDS ;; @@ -755,6 +881,9 @@ case $1 in tsan_crash) echo $TSAN_CRASH_TEST_COMMANDS ;; + tsan_crash_with_atomic_flush) + echo $TSAN_CRASH_TEST_WITH_ATOMIC_FLUSH_COMMANDS + ;; format_compatible) echo $FORMAT_COMPATIBLE_COMMANDS ;; diff --git a/thirdparty/rocksdb/build_tools/run_ci_db_test.ps1 b/thirdparty/rocksdb/build_tools/run_ci_db_test.ps1 index c8167ed957..0f8198b484 100644 --- a/thirdparty/rocksdb/build_tools/run_ci_db_test.ps1 +++ b/thirdparty/rocksdb/build_tools/run_ci_db_test.ps1 @@ -1,13 +1,16 @@ # This script enables you to run RocksDB tests by running # All the tests concurrently and utilizing all the cores Param( - [switch]$EnableJE = $false, # Look for and use _je executable, append _je to listed exclusions + [switch]$EnableJE = $false, # Look for and use test executable, append _je to listed exclusions [switch]$RunAll = $false, # Will attempt to discover all *_test[_je].exe binaries and run all # of them as Google suites. I.e. It will run test cases concurrently # except those mentioned as $Run, those will run as individual test cases # And any excluded with $ExcludeExes or $ExcludeCases # It will also not run any individual test cases # excluded by $ExcludeCases + [switch]$RunAllExe = $false, # Look for and use test executables, append _je to exclusions automatically + # It will attempt to run them in parallel w/o breaking them up on individual + # test cases.
Those listed with $ExcludeExes will be excluded [string]$SuiteRun = "", # Split test suites in test cases and run in parallel, not compatible with $RunAll [string]$Run = "", # Run specified executables in parallel but do not split to test cases [string]$ExcludeCases = "", # Exclude test cases, expects a comma separated list, no spaces @@ -39,13 +42,18 @@ $RunOnly.Add("compact_on_deletion_collector_test") | Out-Null $RunOnly.Add("merge_test") | Out-Null $RunOnly.Add("stringappend_test") | Out-Null # Apparently incorrectly written $RunOnly.Add("backupable_db_test") | Out-Null # Disabled - +$RunOnly.Add("timer_queue_test") | Out-Null # Not a gtest if($RunAll -and $SuiteRun -ne "") { Write-Error "$RunAll and $SuiteRun are not compatible" exit 1 } +if($RunAllExe -and $Run -ne "") { + Write-Error "$RunAllExe and $Run are not compatible" + exit 1 +} + # If running under Appveyor assume that root [string]$Appveyor = $Env:APPVEYOR_BUILD_FOLDER if($Appveyor -ne "") { @@ -131,12 +139,8 @@ function ExtractTestCases([string]$GTestExe, $HashTable) { # Leading whitespace is fine $l = $l -replace '^\s+','' - # but no whitespace any other place - if($l -match "\s+") { - continue - } # Trailing dot is a test group but no whitespace - elseif ( $l -match "\.$" ) { + if ($l -match "\.$" -and $l -notmatch "\s+") { $Group = $l } else { # Otherwise it is a test name, remove leading space @@ -223,13 +227,11 @@ $TestExes = [ordered]@{} if($Run -ne "") { $test_list = $Run -split ' ' - ForEach($t in $test_list) { if($EnableJE) { $t += "_je" } - MakeAndAdd -token $t -HashTable $TestExes } @@ -237,6 +239,38 @@ if($Run -ne "") { Write-Error "Failed to extract tests from $Run" exit 1 } +} elseif($RunAllExe) { + # Discover all the test binaries + if($EnableJE) { + $pattern = "*_test_je.exe" + } else { + $pattern = "*_test.exe" + } + + $search_path = -join ($BinariesFolder, $pattern) + Write-Host "Binaries Search Path: $search_path" + + $DiscoveredExe = @() + dir -Path $search_path | ForEach-Object { + $DiscoveredExe += ($_.Name) + } + + # Remove exclusions + ForEach($e in $DiscoveredExe) { + $e = $e -replace '.exe$', '' + $bare_name = $e -replace '_je$', '' + + if($ExcludeExesSet.Contains($bare_name)) { + Write-Warning "Test $e is excluded" + continue + } + MakeAndAdd -token $e -HashTable $TestExes + } + + if($TestExes.Count -lt 1) { + Write-Error "Failed to discover test executables" + exit 1 + } } # Ordered by exe @{ Exe = @{ TestCase = LogName }} @@ -245,9 +279,7 @@ $CasesToRun = [ordered]@{} if($SuiteRun -ne "") { $suite_list = $SuiteRun -split ' ' ProcessSuites -ListOfSuites $suite_list -HashOfHashes $CasesToRun -} - -if($RunAll) { +} elseif ($RunAll) { # Discover all the test binaries if($EnableJE) { $pattern = "*_test_je.exe" @@ -255,7 +287,6 @@ if($RunAll) { $pattern = "*_test.exe" } - $search_path = -join ($BinariesFolder, $pattern) Write-Host "Binaries Search Path: $search_path" @@ -287,8 +318,6 @@ if($RunAll) { } -Write-Host "Attempting to start: $NumTestsToStart tests" - # Invoke a test with a filter and redirect all output $InvokeTestCase = { param($exe, $test, $log); @@ -307,7 +336,7 @@ $InvokeTestAsync = { # Test limiting factor here [int]$count = 0 # Overall status -[bool]$success = $true; +[bool]$script:success = $true; function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) { @@ -365,6 +394,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) break } + Write-Host "Starting $exe_name" [string]$Exe = -Join ($BinariesFolder, $exe_name) $job = Start-Job -Name $exe_name -ScriptBlock 
$InvokeTestAsync -ArgumentList @($Exe,$log_path) $JobToLog.Add($job, $log_path) @@ -395,7 +425,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) $log_content = @(Get-Content $log) if($completed.State -ne "Completed") { - $success = $false + $script:success = $false Write-Warning $message $log_content | Write-Warning } else { @@ -419,7 +449,7 @@ function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal) } if(!$pass_found) { - $success = $false; + $script:success = $false; Write-Warning $message $log_content | Write-Warning } else { @@ -443,7 +473,7 @@ New-TimeSpan -Start $StartDate -End $EndDate | } -if(!$success) { +if(!$script:success) { # This does not succeed killing off jobs quick # So we simply exit # Remove-Job -Job $jobs -Force diff --git a/thirdparty/rocksdb/build_tools/setup_centos7.sh b/thirdparty/rocksdb/build_tools/setup_centos7.sh new file mode 100755 index 0000000000..c633131de8 --- /dev/null +++ b/thirdparty/rocksdb/build_tools/setup_centos7.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -e + +ROCKSDB_VERSION="5.10.3" +ZSTD_VERSION="1.1.3" + +echo "This script configures CentOS with everything needed to build and run RocksDB" + +yum update -y && yum install epel-release -y + +yum install -y \ + wget \ + gcc-c++ \ + snappy snappy-devel \ + zlib zlib-devel \ + bzip2 bzip2-devel \ + lz4-devel \ + libasan \ + gflags + +mkdir -pv /usr/local/rocksdb-${ROCKSDB_VERSION} +ln -sfT /usr/local/rocksdb-${ROCKSDB_VERSION} /usr/local/rocksdb + +wget -qO /tmp/zstd-${ZSTD_VERSION}.tar.gz https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz +wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz + +cd /tmp + +tar xzvf zstd-${ZSTD_VERSION}.tar.gz +tar xzvf rocksdb-${ROCKSDB_VERSION}.tar.gz -C /usr/local/ + +echo "Installing ZSTD..." +pushd zstd-${ZSTD_VERSION} +make && make install +popd + +echo "Compiling RocksDB..." 
+cd /usr/local/rocksdb +chown -R vagrant:vagrant /usr/local/rocksdb/ +sudo -u vagrant make static_lib +cd examples/ +sudo -u vagrant make all +sudo -u vagrant ./c_simple_example diff --git a/thirdparty/rocksdb/build_tools/update_dependencies.sh b/thirdparty/rocksdb/build_tools/update_dependencies.sh index c7b9932646..b060544dfd 100755 --- a/thirdparty/rocksdb/build_tools/update_dependencies.sh +++ b/thirdparty/rocksdb/build_tools/update_dependencies.sh @@ -38,6 +38,7 @@ function get_lib_base() # platform is not provided, use latest gcc result=`ls -dr1v $result/gcc-*[^fb]/ | head -n1` else + echo $lib_platform result="$result/$lib_platform/" fi @@ -52,6 +53,45 @@ function get_lib_base() log_variable $__res_var } +########################################################### +# platform007 dependencies # +########################################################### + +OUTPUT="$BASEDIR/dependencies_platform007.sh" + +rm -f "$OUTPUT" +touch "$OUTPUT" + +echo "Writing dependencies to $OUTPUT" + +# Compilers locations +GCC_BASE=`readlink -f $TP2_LATEST/gcc/7.x/centos7-native/*/` +CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` + +log_variable GCC_BASE +log_variable CLANG_BASE + +# Libraries locations +get_lib_base libgcc 7.x platform007 +get_lib_base glibc 2.26 platform007 +get_lib_base snappy LATEST platform007 +get_lib_base zlib LATEST platform007 +get_lib_base bzip2 LATEST platform007 +get_lib_base lz4 LATEST platform007 +get_lib_base zstd LATEST platform007 +get_lib_base gflags LATEST platform007 +get_lib_base jemalloc LATEST platform007 +get_lib_base numa LATEST platform007 +get_lib_base libunwind LATEST platform007 +get_lib_base tbb LATEST platform007 + +get_lib_base kernel-headers fb platform007 +get_lib_base binutils LATEST centos7-native +get_lib_base valgrind LATEST platform007 +get_lib_base lua 5.3.4 platform007 + +git diff $OUTPUT + ########################################################### # 5.x dependencies # ########################################################### @@ -64,29 +104,29 @@ touch "$OUTPUT" echo "Writing dependencies to $OUTPUT" # Compilers locations -GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos6-native/*/` -CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos6-native/*/` +GCC_BASE=`readlink -f $TP2_LATEST/gcc/5.x/centos7-native/*/` +CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/stable/centos7-native/*/` log_variable GCC_BASE log_variable CLANG_BASE # Libraries locations -get_lib_base libgcc 5.x -get_lib_base glibc 2.23 -get_lib_base snappy LATEST gcc-5-glibc-2.23 -get_lib_base zlib LATEST -get_lib_base bzip2 LATEST -get_lib_base lz4 LATEST -get_lib_base zstd LATEST -get_lib_base gflags LATEST -get_lib_base jemalloc LATEST -get_lib_base numa LATEST -get_lib_base libunwind LATEST -get_lib_base tbb 4.0_update2 gcc-5-glibc-2.23 - -get_lib_base kernel-headers LATEST -get_lib_base binutils LATEST centos6-native -get_lib_base valgrind 3.10.0 gcc-5-glibc-2.23 +get_lib_base libgcc 5.x gcc-5-glibc-2.23 +get_lib_base glibc 2.23 gcc-5-glibc-2.23 +get_lib_base snappy LATEST gcc-5-glibc-2.23 +get_lib_base zlib LATEST gcc-5-glibc-2.23 +get_lib_base bzip2 LATEST gcc-5-glibc-2.23 +get_lib_base lz4 LATEST gcc-5-glibc-2.23 +get_lib_base zstd LATEST gcc-5-glibc-2.23 +get_lib_base gflags LATEST gcc-5-glibc-2.23 +get_lib_base jemalloc LATEST gcc-5-glibc-2.23 +get_lib_base numa LATEST gcc-5-glibc-2.23 +get_lib_base libunwind LATEST gcc-5-glibc-2.23 +get_lib_base tbb LATEST gcc-5-glibc-2.23 + +get_lib_base kernel-headers 4.0.9-36_fbk5_2933_gd092e3f 
gcc-5-glibc-2.23 +get_lib_base binutils LATEST centos7-native +get_lib_base valgrind LATEST gcc-5-glibc-2.23 get_lib_base lua 5.2.3 gcc-5-glibc-2.23 git diff $OUTPUT diff --git a/thirdparty/rocksdb/build_tools/version.sh b/thirdparty/rocksdb/build_tools/version.sh index f3ca98cf61..4e3b9f20de 100755 --- a/thirdparty/rocksdb/build_tools/version.sh +++ b/thirdparty/rocksdb/build_tools/version.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env bash if [ "$#" = "0" ]; then echo "Usage: $0 major|minor|patch|full" exit 1 fi diff --git a/thirdparty/rocksdb/cache/cache_bench.cc b/thirdparty/rocksdb/cache/cache_bench.cc index 16c2ced1dd..098813d9d7 100644 --- a/thirdparty/rocksdb/cache/cache_bench.cc +++ b/thirdparty/rocksdb/cache/cache_bench.cc @@ -17,16 +17,16 @@ int main() { #include #include #include -#include <gflags/gflags.h> -#include "rocksdb/db.h" +#include "port/port.h" #include "rocksdb/cache.h" +#include "rocksdb/db.h" #include "rocksdb/env.h" -#include "port/port.h" +#include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/random.h" -using GFLAGS::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::ParseCommandLineFlags; static const uint32_t KB = 1024; @@ -52,7 +52,7 @@ namespace rocksdb { class CacheBench; namespace { -void deleter(const Slice& key, void* value) { +void deleter(const Slice& /*key*/, void* value) { delete reinterpret_cast<char*>(value); } diff --git a/thirdparty/rocksdb/cache/cache_test.cc b/thirdparty/rocksdb/cache/cache_test.cc index 8e241226d9..f9f77234cd 100644 --- a/thirdparty/rocksdb/cache/cache_test.cc +++ b/thirdparty/rocksdb/cache/cache_test.cc @@ -40,9 +40,9 @@ static int DecodeValue(void* v) { const std::string kLRU = "lru"; const std::string kClock = "clock"; -void dumbDeleter(const Slice& key, void* value) {} +void dumbDeleter(const Slice& /*key*/, void* /*value*/) {} -void eraseDeleter(const Slice& key, void* value) { +void eraseDeleter(const Slice& /*key*/, void* value) { Cache* cache = reinterpret_cast<Cache*>(value); cache->Erase("foo"); } @@ -64,8 +64,8 @@ class CacheTest : public testing::TestWithParam<std::string> { std::vector<int> deleted_keys_; std::vector<int> deleted_values_; - shared_ptr<Cache> cache_; - shared_ptr<Cache> cache2_; + std::shared_ptr<Cache> cache_; + std::shared_ptr<Cache> cache2_; CacheTest() : cache_(NewCache(kCacheSize, kNumShardBits, false)), @@ -73,8 +73,7 @@ class CacheTest : public testing::TestWithParam<std::string> { current_ = this; } - ~CacheTest() { - } + ~CacheTest() override {} std::shared_ptr<Cache> NewCache(size_t capacity) { auto type = GetParam(); @@ -99,7 +98,7 @@ class CacheTest : public testing::TestWithParam<std::string> { return nullptr; } - int Lookup(shared_ptr<Cache> cache, int key) { + int Lookup(std::shared_ptr<Cache> cache, int key) { Cache::Handle* handle = cache->Lookup(EncodeKey(key)); const int r = (handle == nullptr) ? -1 : DecodeValue(cache->Value(handle)); if (handle != nullptr) { @@ -108,16 +107,16 @@ class CacheTest : public testing::TestWithParam<std::string> { return r; } - void Insert(shared_ptr<Cache> cache, int key, int value, int charge = 1) { + void Insert(std::shared_ptr<Cache> cache, int key, int value, + int charge = 1) { cache->Insert(EncodeKey(key), EncodeValue(value), charge, &CacheTest::Deleter); } - void Erase(shared_ptr<Cache> cache, int key) { + void Erase(std::shared_ptr<Cache> cache, int key) { cache->Erase(EncodeKey(key)); } - int Lookup(int key) { return Lookup(cache_, key); } @@ -145,7 +144,7 @@ class CacheTest : public testing::TestWithParam<std::string> { CacheTest* CacheTest::current_; TEST_P(CacheTest, UsageTest) { - // cache is shared_ptr and will be automatically cleaned up.
+ // cache is std::shared_ptr and will be automatically cleaned up. const uint64_t kCapacity = 100000; auto cache = NewCache(kCapacity, 8, false); @@ -173,7 +172,7 @@ } TEST_P(CacheTest, PinnedUsageTest) { - // cache is shared_ptr and will be automatically cleaned up. + // cache is std::shared_ptr and will be automatically cleaned up. const uint64_t kCapacity = 100000; auto cache = NewCache(kCapacity, 8, false); @@ -307,7 +306,7 @@ TEST_P(CacheTest, EvictionPolicy) { Insert(200, 201); // Frequently used entry must be kept around - for (int i = 0; i < kCacheSize + 100; i++) { + for (int i = 0; i < kCacheSize + 200; i++) { Insert(1000+i, 2000+i); ASSERT_EQ(101, Lookup(100)); } @@ -360,7 +359,7 @@ TEST_P(CacheTest, EvictionPolicyRef) { Insert(303, 104); // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize + 100; i++) { + for (int i = 0; i < kCacheSize + 200; i++) { Insert(1000 + i, 2000 + i); } @@ -470,7 +469,7 @@ class Value { }; namespace { -void deleter(const Slice& key, void* value) { +void deleter(const Slice& /*key*/, void* value) { delete static_cast<Value*>(value); } } // namespace @@ -688,7 +687,8 @@ TEST_P(CacheTest, DefaultShardBits) { } #ifdef SUPPORT_CLOCK_CACHE -shared_ptr<Cache> (*new_clock_cache_func)(size_t, int, bool) = NewClockCache; +std::shared_ptr<Cache> (*new_clock_cache_func)(size_t, int, + bool) = NewClockCache; INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, testing::Values(kLRU, kClock)); #else diff --git a/thirdparty/rocksdb/cache/clock_cache.cc b/thirdparty/rocksdb/cache/clock_cache.cc index 7e42714ef1..89173834e2 100644 --- a/thirdparty/rocksdb/cache/clock_cache.cc +++ b/thirdparty/rocksdb/cache/clock_cache.cc @@ -13,8 +13,8 @@ namespace rocksdb { -std::shared_ptr<Cache> NewClockCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) { +std::shared_ptr<Cache> NewClockCache(size_t /*capacity*/, int /*num_shard_bits*/, + bool /*strict_capacity_limit*/) { // Clock cache not supported. return nullptr; } @@ -234,38 +234,35 @@ struct CleanupContext { }; // A cache shard which maintains its own CLOCK cache. -class ClockCacheShard : public CacheShard { +class ClockCacheShard final : public CacheShard { public: // Hash map type. typedef tbb::concurrent_hash_map HashTable; ClockCacheShard(); - ~ClockCacheShard(); + ~ClockCacheShard() override; // Interfaces - virtual void SetCapacity(size_t capacity) override; - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override; - virtual Status Insert(const Slice& key, uint32_t hash, void* value, - size_t charge, - void (*deleter)(const Slice& key, void* value), - Cache::Handle** handle, - Cache::Priority priority) override; - virtual Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; + void SetCapacity(size_t capacity) override; + void SetStrictCapacityLimit(bool strict_capacity_limit) override; + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Cache::Handle** handle, Cache::Priority priority) override; + Cache::Handle* Lookup(const Slice& key, uint32_t hash) override; // If the entry is in cache, increase reference count and return true. // Return false otherwise. // // Not necessary to hold mutex_ before being called.
- virtual bool Ref(Cache::Handle* handle) override; - virtual bool Release(Cache::Handle* handle, - bool force_erase = false) override; - virtual void Erase(const Slice& key, uint32_t hash) override; + bool Ref(Cache::Handle* handle) override; + bool Release(Cache::Handle* handle, bool force_erase = false) override; + void Erase(const Slice& key, uint32_t hash) override; bool EraseAndConfirm(const Slice& key, uint32_t hash, CleanupContext* context); - virtual size_t GetUsage() const override; - virtual size_t GetPinnedUsage() const override; - virtual void EraseUnRefEntries() override; - virtual void ApplyToAllCacheEntries(void (*callback)(void*, size_t), - bool thread_safe) override; + size_t GetUsage() const override; + size_t GetPinnedUsage() const override; + void EraseUnRefEntries() override; + void ApplyToAllCacheEntries(void (*callback)(void*, size_t), + bool thread_safe) override; private: static const uint32_t kInCacheBit = 1; @@ -367,7 +364,9 @@ ClockCacheShard::~ClockCacheShard() { for (auto& handle : list_) { uint32_t flags = handle.flags.load(std::memory_order_relaxed); if (InCache(flags) || CountRefs(flags) > 0) { - (*handle.deleter)(handle.key, handle.value); + if (handle.deleter != nullptr) { + (*handle.deleter)(handle.key, handle.value); + } delete[] handle.key.data(); } } @@ -586,7 +585,7 @@ Status ClockCacheShard::Insert(const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value), Cache::Handle** out_handle, - Cache::Priority priority) { + Cache::Priority /*priority*/) { CleanupContext context; HashTable::accessor accessor; char* key_data = new char[key.size()]; @@ -673,7 +672,7 @@ void ClockCacheShard::EraseUnRefEntries() { Cleanup(context); } -class ClockCache : public ShardedCache { +class ClockCache final : public ShardedCache { public: ClockCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit) : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { @@ -683,31 +682,31 @@ class ClockCache : public ShardedCache { SetStrictCapacityLimit(strict_capacity_limit); } - virtual ~ClockCache() { delete[] shards_; } + ~ClockCache() override { delete[] shards_; } - virtual const char* Name() const override { return "ClockCache"; } + const char* Name() const override { return "ClockCache"; } - virtual CacheShard* GetShard(int shard) override { + CacheShard* GetShard(int shard) override { return reinterpret_cast(&shards_[shard]); } - virtual const CacheShard* GetShard(int shard) const override { + const CacheShard* GetShard(int shard) const override { return reinterpret_cast(&shards_[shard]); } - virtual void* Value(Handle* handle) override { + void* Value(Handle* handle) override { return reinterpret_cast(handle)->value; } - virtual size_t GetCharge(Handle* handle) const override { + size_t GetCharge(Handle* handle) const override { return reinterpret_cast(handle)->charge; } - virtual uint32_t GetHash(Handle* handle) const override { + uint32_t GetHash(Handle* handle) const override { return reinterpret_cast(handle)->hash; } - virtual void DisownData() override { shards_ = nullptr; } + void DisownData() override { shards_ = nullptr; } private: ClockCacheShard* shards_; diff --git a/thirdparty/rocksdb/cache/lru_cache.cc b/thirdparty/rocksdb/cache/lru_cache.cc index d29e709342..fdcbb4e86c 100644 --- a/thirdparty/rocksdb/cache/lru_cache.cc +++ b/thirdparty/rocksdb/cache/lru_cache.cc @@ -99,12 +99,22 @@ void LRUHandleTable::Resize() { length_ = new_length; } -LRUCacheShard::LRUCacheShard() - : 
high_pri_pool_usage_(0), usage_(0), lru_usage_(0) { +LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio, + bool use_adaptive_mutex) + : capacity_(0), + high_pri_pool_usage_(0), + strict_capacity_limit_(strict_capacity_limit), + high_pri_pool_ratio_(high_pri_pool_ratio), + high_pri_pool_capacity_(0), + usage_(0), + lru_usage_(0), + mutex_(use_adaptive_mutex) { // Make empty circular linked list lru_.next = &lru_; lru_.prev = &lru_; lru_low_pri_ = &lru_; + SetCapacity(capacity); } LRUCacheShard::~LRUCacheShard() {} @@ -167,6 +177,11 @@ size_t LRUCacheShard::TEST_GetLRUSize() { return lru_size; } +double LRUCacheShard::GetHighPriPoolRatio() { + MutexLock l(&mutex_); + return high_pri_pool_ratio_; +} + void LRUCacheShard::LRU_Remove(LRUHandle* e) { assert(e->next != nullptr); assert(e->prev != nullptr); @@ -186,7 +201,7 @@ void LRUCacheShard::LRU_Remove(LRUHandle* e) { void LRUCacheShard::LRU_Insert(LRUHandle* e) { assert(e->next == nullptr); assert(e->prev == nullptr); - if (high_pri_pool_ratio_ > 0 && e->IsHighPri()) { + if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) { // Insert "e" to head of LRU list. e->next = &lru_; e->prev = lru_.prev; @@ -233,22 +248,6 @@ void LRUCacheShard::EvictFromLRU(size_t charge, } } -void* LRUCacheShard::operator new(size_t size) { - return port::cacheline_aligned_alloc(size); -} - -void* LRUCacheShard::operator new[](size_t size) { - return port::cacheline_aligned_alloc(size); -} - -void LRUCacheShard::operator delete(void *memblock) { - port::cacheline_aligned_free(memblock); -} - -void LRUCacheShard::operator delete[](void* memblock) { - port::cacheline_aligned_free(memblock); -} - void LRUCacheShard::SetCapacity(size_t capacity) { autovector<LRUHandle*> last_reference_list; { @@ -278,6 +277,7 @@ Cache::Handle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash) { LRU_Remove(e); } e->refs++; + e->SetHit(); } return reinterpret_cast<Cache::Handle*>(e); } @@ -353,6 +353,7 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value, e->deleter = deleter; e->charge = charge; e->key_length = key.size(); + e->flags = 0; e->hash = hash; e->refs = (handle == nullptr ?
1 @@ -462,18 +463,31 @@ std::string LRUCacheShard::GetPrintableOptions() const { } LRUCache::LRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, double high_pri_pool_ratio) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit) { + bool strict_capacity_limit, double high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> allocator, + bool use_adaptive_mutex) + : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, + std::move(allocator)) { num_shards_ = 1 << num_shard_bits; - shards_ = new LRUCacheShard[num_shards_]; - SetCapacity(capacity); - SetStrictCapacityLimit(strict_capacity_limit); + shards_ = reinterpret_cast<LRUCacheShard*>( + port::cacheline_aligned_alloc(sizeof(LRUCacheShard) * num_shards_)); + size_t per_shard = (capacity + (num_shards_ - 1)) / num_shards_; for (int i = 0; i < num_shards_; i++) { - shards_[i].SetHighPriorityPoolRatio(high_pri_pool_ratio); + new (&shards_[i]) + LRUCacheShard(per_shard, strict_capacity_limit, high_pri_pool_ratio, + use_adaptive_mutex); } } -LRUCache::~LRUCache() { delete[] shards_; } +LRUCache::~LRUCache() { + if (shards_ != nullptr) { + assert(num_shards_ > 0); + for (int i = 0; i < num_shards_; i++) { + shards_[i].~LRUCacheShard(); + } + port::cacheline_aligned_free(shards_); + } +} CacheShard* LRUCache::GetShard(int shard) { return reinterpret_cast<CacheShard*>(&shards_[shard]); @@ -497,9 +511,17 @@ uint32_t LRUCache::GetHash(Handle* handle) const { void LRUCache::DisownData() { // Do not drop data if compiled with ASAN to suppress leak warning. +#if defined(__clang__) +#if !defined(__has_feature) || !__has_feature(address_sanitizer) + shards_ = nullptr; + num_shards_ = 0; +#endif +#else // __clang__ #ifndef __SANITIZE_ADDRESS__ shards_ = nullptr; + num_shards_ = 0; #endif // !__SANITIZE_ADDRESS__ +#endif // __clang__ } size_t LRUCache::TEST_GetLRUSize() { @@ -510,9 +532,27 @@ return lru_size_of_all_shards; } -std::shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - double high_pri_pool_ratio) { +double LRUCache::GetHighPriPoolRatio() { + double result = 0.0; + if (num_shards_ > 0) { + result = shards_[0].GetHighPriPoolRatio(); + } + return result; +} + +std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) { + return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, + cache_opts.strict_capacity_limit, + cache_opts.high_pri_pool_ratio, + cache_opts.memory_allocator, + cache_opts.use_adaptive_mutex); +} + +std::shared_ptr<Cache> NewLRUCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + double high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator, + bool use_adaptive_mutex) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } @@ -524,7 +564,9 @@ std::shared_ptr<Cache> NewLRUCache(size_t capacity, int num_shard_bits, num_shard_bits = GetDefaultCacheShardBits(capacity); } return std::make_shared<LRUCache>(capacity, num_shard_bits, - strict_capacity_limit, high_pri_pool_ratio); + strict_capacity_limit, high_pri_pool_ratio, + std::move(memory_allocator), + use_adaptive_mutex); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/cache/lru_cache.h b/thirdparty/rocksdb/cache/lru_cache.h index abe78fd0c7..0d9a317486 100644 --- a/thirdparty/rocksdb/cache/lru_cache.h +++ b/thirdparty/rocksdb/cache/lru_cache.h @@ -55,10 +55,18 @@ struct LRUHandle { // cache itself is counted as 1 // Include the following flags: - // in_cache: whether this entry is referenced by the hash table.
- // is_high_pri: whether this entry is high priority entry. - // in_high_pro_pool: whether this entry is in high-pri pool. - char flags; + // IN_CACHE: whether this entry is referenced by the hash table. + // IS_HIGH_PRI: whether this entry is a high priority entry. + // IN_HIGH_PRI_POOL: whether this entry is in high-pri pool. + // HAS_HIT: whether this entry has had any lookups (hits). + enum Flags : uint8_t { + IN_CACHE = (1 << 0), + IS_HIGH_PRI = (1 << 1), + IN_HIGH_PRI_POOL = (1 << 2), + HAS_HIT = (1 << 3), + }; + + uint8_t flags; uint32_t hash; // Hash of key(); used for fast sharding and comparisons @@ -74,34 +82,37 @@ struct LRUHandle { } } - bool InCache() { return flags & 1; } - bool IsHighPri() { return flags & 2; } - bool InHighPriPool() { return flags & 4; } + bool InCache() const { return flags & IN_CACHE; } + bool IsHighPri() const { return flags & IS_HIGH_PRI; } + bool InHighPriPool() const { return flags & IN_HIGH_PRI_POOL; } + bool HasHit() const { return flags & HAS_HIT; } void SetInCache(bool in_cache) { if (in_cache) { - flags |= 1; + flags |= IN_CACHE; } else { - flags &= ~1; + flags &= ~IN_CACHE; } } void SetPriority(Cache::Priority priority) { if (priority == Cache::Priority::HIGH) { - flags |= 2; + flags |= IS_HIGH_PRI; } else { - flags &= ~2; + flags &= ~IS_HIGH_PRI; } } void SetInHighPriPool(bool in_high_pri_pool) { if (in_high_pri_pool) { - flags |= 4; + flags |= IN_HIGH_PRI_POOL; } else { - flags &= ~4; + flags &= ~IN_HIGH_PRI_POOL; } } + void SetHit() { flags |= HAS_HIT; } + void Free() { assert((refs == 1 && InCache()) || (refs == 0 && !InCache())); if (deleter) { @@ -154,9 +165,10 @@ class LRUHandleTable { }; // A single shard of sharded cache. -class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard { +class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShard { public: - LRUCacheShard(); + LRUCacheShard(size_t capacity, bool strict_capacity_limit, + double high_pri_pool_ratio, bool use_adaptive_mutex); virtual ~LRUCacheShard(); // Separate from constructor so caller can easily make an array of LRUCache @@ -202,14 +214,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard { // not threadsafe size_t TEST_GetLRUSize(); - // Overloading to aligned it to cache line size - void* operator new(size_t); - - void* operator new[](size_t); - - void operator delete(void *); - - void operator delete[](void*); + // Retrieves high pri pool ratio + double GetHighPriPoolRatio(); private: void LRU_Remove(LRUHandle* e); @@ -278,10 +284,16 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard : public CacheShard { mutable port::Mutex mutex_; }; -class LRUCache : public ShardedCache { +class LRUCache +#ifdef NDEBUG + final +#endif + : public ShardedCache { public: LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio); + double high_pri_pool_ratio, + std::shared_ptr<MemoryAllocator> memory_allocator = nullptr, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex); virtual ~LRUCache(); virtual const char* Name() const override { return "LRUCache"; } virtual CacheShard* GetShard(int shard) override; @@ -293,9 +305,11 @@ class LRUCache : public ShardedCache { // Retrieves number of elements in LRU, for unit test purpose only size_t TEST_GetLRUSize(); + // Retrieves high pri pool ratio + double GetHighPriPoolRatio(); private: - LRUCacheShard* shards_; + LRUCacheShard* shards_ = nullptr; int num_shards_ = 0; }; diff --git a/thirdparty/rocksdb/cache/lru_cache_test.cc b/thirdparty/rocksdb/cache/lru_cache_test.cc index
1b83033c36..9980dd72b7 100644 --- a/thirdparty/rocksdb/cache/lru_cache_test.cc +++ b/thirdparty/rocksdb/cache/lru_cache_test.cc @@ -7,6 +7,7 @@ #include <string> #include <vector> +#include "port/port.h" #include "util/testharness.h" namespace rocksdb { @@ -14,22 +15,23 @@ namespace rocksdb { class LRUCacheTest : public testing::Test { public: LRUCacheTest() {} - ~LRUCacheTest() {} - - void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0) { - cache_.reset( -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable: 4316) // We've validated the alignment with the new operators -#endif - new LRUCacheShard() -#if defined(_MSC_VER) -#pragma warning(pop) -#endif - ); - cache_->SetCapacity(capacity); - cache_->SetStrictCapacityLimit(false); - cache_->SetHighPriorityPoolRatio(high_pri_pool_ratio); + ~LRUCacheTest() override { DeleteCache(); } + + void DeleteCache() { + if (cache_ != nullptr) { + cache_->~LRUCacheShard(); + port::cacheline_aligned_free(cache_); + cache_ = nullptr; + } + } + + void NewCache(size_t capacity, double high_pri_pool_ratio = 0.0, + bool use_adaptive_mutex = kDefaultToAdaptiveMutex) { + DeleteCache(); + cache_ = reinterpret_cast<LRUCacheShard*>( + port::cacheline_aligned_alloc(sizeof(LRUCacheShard))); + new (cache_) LRUCacheShard(capacity, false /*strict_capacity_limit*/, + high_pri_pool_ratio, use_adaptive_mutex); } void Insert(const std::string& key, @@ -85,7 +87,7 @@ class LRUCacheTest : public testing::Test { } private: - std::unique_ptr<LRUCacheShard> cache_; + LRUCacheShard* cache_ = nullptr; }; TEST_F(LRUCacheTest, BasicLRU) { @@ -114,7 +116,30 @@ TEST_F(LRUCacheTest, BasicLRU) { ValidateLRUList({"e", "z", "d", "u", "v"}); } -TEST_F(LRUCacheTest, MidPointInsertion) { +TEST_F(LRUCacheTest, MidpointInsertion) { + // Allocate 2 cache entries to high-pri pool. + NewCache(5, 0.45); + + Insert("a", Cache::Priority::LOW); + Insert("b", Cache::Priority::LOW); + Insert("c", Cache::Priority::LOW); + Insert("x", Cache::Priority::HIGH); + Insert("y", Cache::Priority::HIGH); + ValidateLRUList({"a", "b", "c", "x", "y"}, 2); + + // Low-pri entries inserted to the tail of low-pri list (the midpoint). + // After lookup, it will move to the tail of the full list. + Insert("d", Cache::Priority::LOW); + ValidateLRUList({"b", "c", "d", "x", "y"}, 2); + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"b", "c", "x", "y", "d"}, 2); + + // High-pri entries will be inserted to the tail of full list. + Insert("z", Cache::Priority::HIGH); + ValidateLRUList({"c", "x", "y", "d", "z"}, 2); +} + +TEST_F(LRUCacheTest, EntriesWithPriority) { // Allocate 2 cache entries to high-pri pool. NewCache(5, 0.45); @@ -140,15 +165,15 @@ TEST_F(LRUCacheTest, MidPointInsertion) { Insert("a", Cache::Priority::LOW); ValidateLRUList({"v", "X", "a", "Y", "Z"}, 2); - // Low-pri entries will be inserted to head of low-pri pool after lookup. + // Low-pri entries will be inserted to head of high-pri pool after lookup. ASSERT_TRUE(Lookup("v")); - ValidateLRUList({"X", "a", "v", "Y", "Z"}, 2); + ValidateLRUList({"X", "a", "Y", "Z", "v"}, 2); // High-pri entries will be inserted to the head of the list after lookup.
void Insert(const std::string& key, @@ -85,7 +87,7 @@ class LRUCacheTest : public testing::Test { } private: - std::unique_ptr<LRUCacheShard> cache_; + LRUCacheShard* cache_ = nullptr; }; TEST_F(LRUCacheTest, BasicLRU) { @@ -114,7 +116,30 @@ TEST_F(LRUCacheTest, BasicLRU) { ValidateLRUList({"e", "z", "d", "u", "v"}); } -TEST_F(LRUCacheTest, MidPointInsertion) { +TEST_F(LRUCacheTest, MidpointInsertion) { + // Allocate 2 cache entries to high-pri pool. + NewCache(5, 0.45); + + Insert("a", Cache::Priority::LOW); + Insert("b", Cache::Priority::LOW); + Insert("c", Cache::Priority::LOW); + Insert("x", Cache::Priority::HIGH); + Insert("y", Cache::Priority::HIGH); + ValidateLRUList({"a", "b", "c", "x", "y"}, 2); + + // Low-pri entries are inserted at the tail of the low-pri list (the midpoint). + // After lookup, they move to the tail of the full list. + Insert("d", Cache::Priority::LOW); + ValidateLRUList({"b", "c", "d", "x", "y"}, 2); + ASSERT_TRUE(Lookup("d")); + ValidateLRUList({"b", "c", "x", "y", "d"}, 2); + + // High-pri entries will be inserted to the tail of the full list. + Insert("z", Cache::Priority::HIGH); + ValidateLRUList({"c", "x", "y", "d", "z"}, 2); +} + +TEST_F(LRUCacheTest, EntriesWithPriority) { // Allocate 2 cache entries to high-pri pool. NewCache(5, 0.45); @@ -140,15 +165,15 @@ TEST_F(LRUCacheTest, MidPointInsertion) { Insert("a", Cache::Priority::LOW); ValidateLRUList({"v", "X", "a", "Y", "Z"}, 2); - // Low-pri entries will be inserted to head of low-pri pool after lookup. + // Low-pri entries will be inserted to head of high-pri pool after lookup. ASSERT_TRUE(Lookup("v")); - ValidateLRUList({"X", "a", "v", "Y", "Z"}, 2); + ValidateLRUList({"X", "a", "Y", "Z", "v"}, 2); // High-pri entries will be inserted to the head of the list after lookup. ASSERT_TRUE(Lookup("X")); - ValidateLRUList({"a", "v", "Y", "Z", "X"}, 2); + ValidateLRUList({"a", "Y", "Z", "v", "X"}, 2); ASSERT_TRUE(Lookup("Z")); - ValidateLRUList({"a", "v", "Y", "X", "Z"}, 2); + ValidateLRUList({"a", "Y", "v", "X", "Z"}, 2); Erase("Y"); ValidateLRUList({"a", "v", "X", "Z"}, 2); @@ -161,7 +186,7 @@ TEST_F(LRUCacheTest, MidPointInsertion) { Insert("g", Cache::Priority::LOW); ValidateLRUList({"d", "e", "f", "g", "Z"}, 1); ASSERT_TRUE(Lookup("d")); - ValidateLRUList({"e", "f", "g", "d", "Z"}, 1); + ValidateLRUList({"e", "f", "g", "Z", "d"}, 2); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/cache/sharded_cache.cc b/thirdparty/rocksdb/cache/sharded_cache.cc index 9bdea3a08e..a48a32185b 100644 --- a/thirdparty/rocksdb/cache/sharded_cache.cc +++ b/thirdparty/rocksdb/cache/sharded_cache.cc @@ -20,8 +20,10 @@ namespace rocksdb { ShardedCache::ShardedCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit) - : num_shard_bits_(num_shard_bits), + bool strict_capacity_limit, + std::shared_ptr<MemoryAllocator> allocator) + : Cache(std::move(allocator)), + num_shard_bits_(num_shard_bits), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit), last_id_(1) {} @@ -53,7 +55,7 @@ Status ShardedCache::Insert(const Slice& key, void* value, size_t charge, ->Insert(key, hash, value, charge, deleter, handle, priority); } -Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* stats) { +Cache::Handle* ShardedCache::Lookup(const Slice& key, Statistics* /*stats*/) { uint32_t hash = HashSlice(key); return GetShard(Shard(hash))->Lookup(key, hash); } @@ -142,6 +144,9 @@ std::string ShardedCache::GetPrintableOptions() const { strict_capacity_limit_); ret.append(buffer); } + snprintf(buffer, kBufferSize, " memory_allocator : %s\n", + memory_allocator() ? memory_allocator()->Name() : "None"); + ret.append(buffer); ret.append(GetShard(0)->GetPrintableOptions()); return ret; } diff --git a/thirdparty/rocksdb/cache/sharded_cache.h b/thirdparty/rocksdb/cache/sharded_cache.h index 4f9dea2ad0..920898b871 100644 --- a/thirdparty/rocksdb/cache/sharded_cache.h +++ b/thirdparty/rocksdb/cache/sharded_cache.h @@ -47,7 +47,8 @@ class CacheShard { // Keys are sharded by the highest num_shard_bits bits of hash value.
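A sketch of the shard-selection arithmetic that comment describes, assuming the usual top-bits scheme; ShardIndex is illustrative, and note that the HashSlice change below now truncates the 64-bit NPHash to 32 bits before this step:

#include <cstdint>
#include <iostream>

// Pick a shard from the top num_shard_bits bits of a 32-bit key hash.
static uint32_t ShardIndex(uint32_t hash, int num_shard_bits) {
  // With num_shard_bits == 0 there is a single shard.
  return num_shard_bits > 0 ? (hash >> (32 - num_shard_bits)) : 0;
}

int main() {
  const int kNumShardBits = 4;        // 2^4 == 16 shards
  const uint32_t hash = 0xDEADBEEFu;  // stand-in for HashSlice(key)
  std::cout << ShardIndex(hash, kNumShardBits) << std::endl;  // prints 13 (0xD)
  return 0;
}

Using the highest bits keeps shard choice independent of the low bits that the per-shard hash table itself indexes with.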
class ShardedCache : public Cache { public: - ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit); + ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, + std::shared_ptr memory_allocator = nullptr); virtual ~ShardedCache() = default; virtual const char* Name() const override = 0; virtual CacheShard* GetShard(int shard) = 0; @@ -82,7 +83,7 @@ class ShardedCache : public Cache { private: static inline uint32_t HashSlice(const Slice& s) { - return Hash(s.data(), s.size(), 0); + return static_cast(GetSliceNPHash64(s)); } uint32_t Shard(uint32_t hash) { diff --git a/thirdparty/rocksdb/coverage/parse_gcov_output.py b/thirdparty/rocksdb/coverage/parse_gcov_output.py index 72e8b07230..4c358610c4 100644 --- a/thirdparty/rocksdb/coverage/parse_gcov_output.py +++ b/thirdparty/rocksdb/coverage/parse_gcov_output.py @@ -1,4 +1,3 @@ -import optparse import re import sys @@ -72,7 +71,7 @@ def display_file_coverage(per_file_coverage, total_coverage): header_template = \ "%" + str(max_file_name_length) + "s\t%s\t%s" separator = "-" * (max_file_name_length + 10 + 20) - print header_template % ("Filename", "Coverage", "Lines") + print header_template % ("Filename", "Coverage", "Lines") # noqa: E999 T25377293 Grandfathered in print separator # -- Print body diff --git a/thirdparty/rocksdb/db/builder.cc b/thirdparty/rocksdb/db/builder.cc index 7cfa7800cc..a41a8ca4c3 100644 --- a/thirdparty/rocksdb/db/builder.cc +++ b/thirdparty/rocksdb/db/builder.cc @@ -18,6 +18,7 @@ #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" +#include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" #include "monitoring/iostats_context_imp.h" @@ -28,6 +29,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based_table_builder.h" +#include "table/format.h" #include "table/internal_iterator.h" #include "util/file_reader_writer.h" #include "util/filename.h" @@ -39,23 +41,24 @@ namespace rocksdb { class TableFactory; TableBuilder* NewTableBuilder( - const ImmutableCFOptions& ioptions, + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, const std::string& column_family_name, WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, int level, - const std::string* compression_dict, const bool skip_filters, - const uint64_t creation_time, const uint64_t oldest_key_time) { + uint64_t sample_for_compression, const CompressionOptions& compression_opts, + int level, const bool skip_filters, const uint64_t creation_time, + const uint64_t oldest_key_time, const uint64_t target_file_size) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); return ioptions.table_factory->NewTableBuilder( - TableBuilderOptions( - ioptions, internal_comparator, int_tbl_prop_collector_factories, - compression_type, compression_opts, compression_dict, skip_filters, - column_family_name, level, creation_time, oldest_key_time), + TableBuilderOptions(ioptions, moptions, internal_comparator, + int_tbl_prop_collector_factories, compression_type, + sample_for_compression, compression_opts, + skip_filters, column_family_name, level, + creation_time, oldest_key_time, target_file_size), column_family_id, file); } @@ -63,19 +66,21 @@ Status 
BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, TableCache* table_cache, InternalIterator* iter, - std::unique_ptr<InternalIterator> range_del_iter, FileMetaData* meta, - const InternalKeyComparator& internal_comparator, + std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> + range_del_iters, + FileMetaData* meta, const InternalKeyComparator& internal_comparator, const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>* int_tbl_prop_collector_factories, uint32_t column_family_id, const std::string& column_family_name, std::vector<SequenceNumber> snapshots, SequenceNumber earliest_write_conflict_snapshot, - const CompressionType compression, - const CompressionOptions& compression_opts, bool paranoid_file_checks, - InternalStats* internal_stats, TableFileCreationReason reason, - EventLogger* event_logger, int job_id, const Env::IOPriority io_priority, - TableProperties* table_properties, int level, const uint64_t creation_time, - const uint64_t oldest_key_time) { + SnapshotChecker* snapshot_checker, const CompressionType compression, + uint64_t sample_for_compression, const CompressionOptions& compression_opts, + bool paranoid_file_checks, InternalStats* internal_stats, + TableFileCreationReason reason, EventLogger* event_logger, int job_id, + const Env::IOPriority io_priority, TableProperties* table_properties, + int level, const uint64_t creation_time, const uint64_t oldest_key_time, + Env::WriteLifeTimeHint write_hint) { assert((column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == column_family_name.empty()); @@ -84,15 +89,13 @@ Status BuildTable( Status s; meta->fd.file_size = 0; iter->SeekToFirst(); - std::unique_ptr<RangeDelAggregator> range_del_agg( - new RangeDelAggregator(internal_comparator, snapshots)); - s = range_del_agg->AddTombstones(std::move(range_del_iter)); - if (!s.ok()) { - // may be non-ok if a range tombstone key is unparsable - return s; + std::unique_ptr<CompactionRangeDelAggregator> range_del_agg( + new CompactionRangeDelAggregator(&internal_comparator, snapshots)); + for (auto& range_del_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(range_del_iter)); } - std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); #ifndef ROCKSDB_LITE EventHelpers::NotifyTableFileCreationStarted( @@ -100,11 +103,16 @@ Status BuildTable( #endif // !ROCKSDB_LITE TableProperties tp; - if (iter->Valid() || range_del_agg->ShouldAddTombstones()) { + if (iter->Valid() || !range_del_agg->IsEmpty()) { TableBuilder* builder; - unique_ptr<WritableFileWriter> file_writer; + std::unique_ptr<WritableFileWriter> file_writer; + // Currently we only enable dictionary compression during compaction to the + // bottommost level.
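A plausible reading of that comment: a compression dictionary is trained by sampling many data blocks, which only pays off for large, long-lived bottommost files, while flush outputs are small and usually compacted away soon. The copy-then-zero pattern that follows, in isolation (CompressionOptionsSketch is a stand-in for RocksDB's CompressionOptions, with made-up defaults):

#include <cassert>
#include <cstdint>

struct CompressionOptionsSketch {
  uint32_t max_dict_bytes = 16 * 1024;              // dictionary sampling on
  uint32_t zstd_max_train_bytes = 100 * 16 * 1024;  // zstd training budget
  int level = -1;                                   // other knobs are copied
};

int main() {
  CompressionOptionsSketch compaction_opts;              // user-configured
  CompressionOptionsSketch flush_opts = compaction_opts; // copy for the flush path
  flush_opts.max_dict_bytes = 0;        // disable dictionary for flush output
  flush_opts.zstd_max_train_bytes = 0;
  assert(flush_opts.level == compaction_opts.level);     // rest is untouched
  assert(flush_opts.max_dict_bytes == 0);
  return 0;
}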
+ CompressionOptions compression_opts_for_flush(compression_opts); + compression_opts_for_flush.max_dict_bytes = 0; + compression_opts_for_flush.zstd_max_train_bytes = 0; { - unique_ptr<WritableFile> file; + std::unique_ptr<WritableFile> file; #ifndef NDEBUG bool use_direct_writes = env_options.use_direct_writes; TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); @@ -117,24 +125,29 @@ Status BuildTable( return s; } file->SetIOPriority(io_priority); + file->SetWriteLifeTimeHint(write_hint); - file_writer.reset(new WritableFileWriter(std::move(file), env_options, - ioptions.statistics)); + file_writer.reset( + new WritableFileWriter(std::move(file), fname, env_options, env, + ioptions.statistics, ioptions.listeners)); builder = NewTableBuilder( - ioptions, internal_comparator, int_tbl_prop_collector_factories, - column_family_id, column_family_name, file_writer.get(), compression, - compression_opts, level, nullptr /* compression_dict */, + ioptions, mutable_cf_options, internal_comparator, + int_tbl_prop_collector_factories, column_family_id, + column_family_name, file_writer.get(), compression, + sample_for_compression, compression_opts_for_flush, level, false /* skip_filters */, creation_time, oldest_key_time); } MergeHelper merge(env, internal_comparator.user_comparator(), ioptions.merge_operator, nullptr, ioptions.info_log, true /* internal key corruption is not ok */, - snapshots.empty() ? 0 : snapshots.back()); + snapshots.empty() ? 0 : snapshots.back(), + snapshot_checker); CompactionIterator c_iter( iter, internal_comparator.user_comparator(), &merge, kMaxSequenceNumber, - &snapshots, earliest_write_conflict_snapshot, env, + &snapshots, earliest_write_conflict_snapshot, snapshot_checker, env, + ShouldReportDetailedTime(env, ioptions.statistics), true /* internal key corruption is not ok */, range_del_agg.get()); c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { @@ -150,12 +163,20 @@ Status BuildTable( ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written)); } } - // nullptr for table_{min,max} so all range tombstones will be flushed - range_del_agg->AddToBuilder(builder, nullptr /* lower_bound */, - nullptr /* upper_bound */, meta); + + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); + auto kv = tombstone.Serialize(); + builder->Add(kv.first.Encode(), kv.second); + meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), + tombstone.seq_, internal_comparator); + } // Finish and check for builder errors - bool empty = builder->NumEntries() == 0; + tp = builder->GetTableProperties(); + bool empty = builder->NumEntries() == 0 && tp.num_range_deletions == 0; s = c_iter.status(); if (!s.ok() || empty) { builder->Abandon(); @@ -168,7 +189,7 @@ Status BuildTable( meta->fd.file_size = file_size; meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); - tp = builder->GetTableProperties(); + tp = builder->GetTableProperties(); // refresh now that builder is finished if (table_properties) { *table_properties = tp; } @@ -192,8 +213,9 @@ Status BuildTable( // we will regard this verification as user reads since the goal is // to cache it here for further user reads std::unique_ptr<InternalIterator> it(table_cache->NewIterator( - ReadOptions(), env_options, internal_comparator, meta->fd, - nullptr /* range_del_agg */, nullptr, + ReadOptions(), env_options, internal_comparator, *meta, + nullptr /* range_del_agg */, +
mutable_cf_options.prefix_extractor.get(), nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), false /* for_compaction */, nullptr /* arena */, diff --git a/thirdparty/rocksdb/db/builder.h b/thirdparty/rocksdb/db/builder.h index 5a5081c647..c00c8273ce 100644 --- a/thirdparty/rocksdb/db/builder.h +++ b/thirdparty/rocksdb/db/builder.h @@ -9,6 +9,7 @@ #include #include #include +#include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" @@ -29,29 +30,27 @@ struct FileMetaData; class Env; struct EnvOptions; class Iterator; +class SnapshotChecker; class TableCache; class VersionEdit; class TableBuilder; class WritableFileWriter; class InternalStats; -class InternalIterator; // @param column_family_name Name of the column family that is also identified // by column_family_id, or empty string if unknown. It must outlive the // TableBuilder returned by this function. -// @param compression_dict Data for presetting the compression library's -// dictionary, or nullptr. TableBuilder* NewTableBuilder( - const ImmutableCFOptions& options, + const ImmutableCFOptions& options, const MutableCFOptions& moptions, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, const std::string& column_family_name, WritableFileWriter* file, const CompressionType compression_type, + const uint64_t sample_for_compression, const CompressionOptions& compression_opts, int level, - const std::string* compression_dict = nullptr, const bool skip_filters = false, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0); + const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. 
On success, the rest of @@ -65,19 +64,22 @@ extern Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, TableCache* table_cache, InternalIterator* iter, - std::unique_ptr range_del_iter, FileMetaData* meta, - const InternalKeyComparator& internal_comparator, + std::vector> + range_del_iters, + FileMetaData* meta, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, const std::string& column_family_name, std::vector snapshots, SequenceNumber earliest_write_conflict_snapshot, - const CompressionType compression, + SnapshotChecker* snapshot_checker, const CompressionType compression, + const uint64_t sample_for_compression, const CompressionOptions& compression_opts, bool paranoid_file_checks, InternalStats* internal_stats, TableFileCreationReason reason, EventLogger* event_logger = nullptr, int job_id = 0, const Env::IOPriority io_priority = Env::IO_HIGH, TableProperties* table_properties = nullptr, int level = -1, - const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0); + const uint64_t creation_time = 0, const uint64_t oldest_key_time = 0, + Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET); } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/c.cc b/thirdparty/rocksdb/db/c.cc index cbfb8557d0..8610871abd 100644 --- a/thirdparty/rocksdb/db/c.cc +++ b/thirdparty/rocksdb/db/c.cc @@ -21,23 +21,30 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/iterator.h" +#include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/options.h" -#include "rocksdb/status.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/memtablerep.h" -#include "rocksdb/universal_compaction.h" -#include "rocksdb/statistics.h" +#include "rocksdb/rate_limiter.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" #include "rocksdb/table.h" -#include "rocksdb/rate_limiter.h" +#include "rocksdb/universal_compaction.h" #include "rocksdb/utilities/backupable_db.h" -#include "rocksdb/utilities/write_batch_with_index.h" -#include "utilities/merge_operators.h" +#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" -#include "rocksdb/utilities/optimistic_transaction_db.h" -#include "rocksdb/utilities/checkpoint.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/write_batch.h" +#include "rocksdb/perf_context.h" +#include "utilities/merge_operators.h" + +#include +#include +#include using rocksdb::BytewiseComparator; using rocksdb::Cache; @@ -88,8 +95,10 @@ using rocksdb::LiveFileMetaData; using rocksdb::BackupEngine; using rocksdb::BackupableDBOptions; using rocksdb::BackupInfo; +using rocksdb::BackupID; using rocksdb::RestoreOptions; using rocksdb::CompactRangeOptions; +using rocksdb::BottommostLevelCompaction; using rocksdb::RateLimiter; using rocksdb::NewGenericRateLimiter; using rocksdb::PinnableSlice; @@ -100,8 +109,16 @@ using rocksdb::OptimisticTransactionDB; using rocksdb::OptimisticTransactionOptions; using rocksdb::Transaction; using rocksdb::Checkpoint; +using rocksdb::TransactionLogIterator; +using rocksdb::BatchResult; +using rocksdb::PerfLevel; +using 
rocksdb::PerfContext; +using rocksdb::MemoryUtil; using std::shared_ptr; +using std::vector; +using std::unordered_set; +using std::map; extern "C" { @@ -117,7 +134,9 @@ struct rocksdb_flushoptions_t { FlushOptions rep; }; struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; }; struct rocksdb_readoptions_t { ReadOptions rep; - Slice upper_bound; // stack variable to set pointer to in ReadOptions + // stack variables to set pointers to in ReadOptions + Slice upper_bound; + Slice lower_bound; }; struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_options_t { Options rep; }; @@ -129,15 +148,24 @@ struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; }; struct rocksdb_seqfile_t { SequentialFile* rep; }; struct rocksdb_randomfile_t { RandomAccessFile* rep; }; struct rocksdb_writablefile_t { WritableFile* rep; }; +struct rocksdb_wal_iterator_t { TransactionLogIterator* rep; }; +struct rocksdb_wal_readoptions_t { TransactionLogIterator::ReadOptions rep; }; struct rocksdb_filelock_t { FileLock* rep; }; -struct rocksdb_logger_t { shared_ptr rep; }; -struct rocksdb_cache_t { shared_ptr rep; }; +struct rocksdb_logger_t { + std::shared_ptr rep; +}; +struct rocksdb_cache_t { + std::shared_ptr rep; +}; struct rocksdb_livefiles_t { std::vector rep; }; struct rocksdb_column_family_handle_t { ColumnFamilyHandle* rep; }; struct rocksdb_envoptions_t { EnvOptions rep; }; struct rocksdb_ingestexternalfileoptions_t { IngestExternalFileOptions rep; }; struct rocksdb_sstfilewriter_t { SstFileWriter* rep; }; -struct rocksdb_ratelimiter_t { RateLimiter* rep; }; +struct rocksdb_ratelimiter_t { + std::shared_ptr rep; +}; +struct rocksdb_perfcontext_t { PerfContext* rep; }; struct rocksdb_pinnableslice_t { PinnableSlice rep; }; @@ -180,13 +208,10 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { const char* (*name_)(void*); unsigned char ignore_snapshots_; - virtual ~rocksdb_compactionfilter_t() { - (*destructor_)(state_); - } + ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); } - virtual bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int level, const Slice& key, const Slice& existing_value, + std::string* new_value, bool* value_changed) const override { char* c_new_value = nullptr; size_t new_value_length = 0; unsigned char c_value_changed = 0; @@ -203,9 +228,9 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { return result; } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } - virtual bool IgnoreSnapshots() const override { return ignore_snapshots_; } + bool IgnoreSnapshots() const override { return ignore_snapshots_; } }; struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory { @@ -215,9 +240,9 @@ struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory { void*, rocksdb_compactionfiltercontext_t* context); const char* (*name_)(void*); - virtual ~rocksdb_compactionfilterfactory_t() { (*destructor_)(state_); } + ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); } - virtual std::unique_ptr CreateCompactionFilter( + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { rocksdb_compactionfiltercontext_t ccontext; ccontext.rep = context; @@ -225,7 +250,7 @@ struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory { return 
std::unique_ptr(cf); } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } }; struct rocksdb_comparator_t : public Comparator { @@ -237,20 +262,17 @@ struct rocksdb_comparator_t : public Comparator { const char* b, size_t blen); const char* (*name_)(void*); - virtual ~rocksdb_comparator_t() { - (*destructor_)(state_); - } + ~rocksdb_comparator_t() override { (*destructor_)(state_); } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } // No-ops since the C binding does not support key shortening methods. - virtual void FindShortestSeparator(std::string*, - const Slice&) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + void FindShortestSeparator(std::string*, const Slice&) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; struct rocksdb_filterpolicy_t : public FilterPolicy { @@ -270,14 +292,11 @@ struct rocksdb_filterpolicy_t : public FilterPolicy { void*, const char* filter, size_t filter_length); - virtual ~rocksdb_filterpolicy_t() { - (*destructor_)(state_); - } + ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } - virtual void CreateFilter(const Slice* keys, int n, - std::string* dst) const override { + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { std::vector key_pointers(n); std::vector key_sizes(n); for (int i = 0; i < n; i++) { @@ -295,8 +314,7 @@ struct rocksdb_filterpolicy_t : public FilterPolicy { } } - virtual bool KeyMayMatch(const Slice& key, - const Slice& filter) const override { + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { return (*key_match_)(state_, key.data(), key.size(), filter.data(), filter.size()); } @@ -321,14 +339,12 @@ struct rocksdb_mergeoperator_t : public MergeOperator { void*, const char* value, size_t value_length); - virtual ~rocksdb_mergeoperator_t() { - (*destructor_)(state_); - } + ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { size_t n = merge_in.operand_list.size(); std::vector operand_pointers(n); std::vector operand_sizes(n); @@ -362,10 +378,10 @@ struct rocksdb_mergeoperator_t : public MergeOperator { return success; } - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override { size_t operand_count = operand_list.size(); std::vector operand_pointers(operand_count); std::vector operand_sizes(operand_count); @@ -416,23 +432,21 @@ struct rocksdb_slicetransform_t : public SliceTransform { void*, const 
char* key, size_t length); - virtual ~rocksdb_slicetransform_t() { - (*destructor_)(state_); - } + ~rocksdb_slicetransform_t() override { (*destructor_)(state_); } - virtual const char* Name() const override { return (*name_)(state_); } + const char* Name() const override { return (*name_)(state_); } - virtual Slice Transform(const Slice& src) const override { + Slice Transform(const Slice& src) const override { size_t len; char* dst = (*transform_)(state_, src.data(), src.size(), &len); return Slice(dst, len); } - virtual bool InDomain(const Slice& src) const override { + bool InDomain(const Slice& src) const override { return (*in_domain_)(state_, src.data(), src.size()); } - virtual bool InRange(const Slice& src) const override { + bool InRange(const Slice& src) const override { return (*in_range_)(state_, src.data(), src.size()); } }; @@ -475,6 +489,20 @@ rocksdb_t* rocksdb_open( return result; } +rocksdb_t* rocksdb_open_with_ttl( + const rocksdb_options_t* options, + const char* name, + int ttl, + char** errptr) { + rocksdb::DBWithTTL* db; + if (SaveError(errptr, rocksdb::DBWithTTL::Open(options->rep, std::string(name), &db, ttl))) { + return nullptr; + } + rocksdb_t* result = new rocksdb_t; + result->rep = db; + return result; +} + rocksdb_t* rocksdb_open_for_read_only( const rocksdb_options_t* options, const char* name, @@ -506,10 +534,18 @@ rocksdb_backup_engine_t* rocksdb_backup_engine_open( } void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, - rocksdb_t* db, char** errptr) { + rocksdb_t* db, + char** errptr) { SaveError(errptr, be->rep->CreateNewBackup(db->rep)); } +void rocksdb_backup_engine_create_new_backup_flush(rocksdb_backup_engine_t* be, + rocksdb_t* db, + unsigned char flush_before_backup, + char** errptr) { + SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup)); +} + void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr) { @@ -529,6 +565,12 @@ void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt, opt->rep.keep_log_files = v; } + +void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be, + uint32_t backup_id, char** errptr) { + SaveError(errptr, be->rep->VerifyBackup(static_cast(backup_id))); +} + void rocksdb_backup_engine_restore_db_from_latest_backup( rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, const rocksdb_restore_options_t* restore_options, char** errptr) { @@ -909,6 +951,54 @@ rocksdb_iterator_t* rocksdb_create_iterator( return result; } +rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, + char** errptr) { + std::unique_ptr iter; + TransactionLogIterator::ReadOptions ro; + if (options!=nullptr) { + ro = options->rep; + } + if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) { + return nullptr; + } + rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t; + result->rep = iter.release(); + return result; +} + +void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { + iter->rep->Next(); +} + +unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) { + return iter->rep->Valid(); +} + +void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) { + SaveError(errptr, iter->rep->status()); +} + +void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) { + delete iter->rep; + delete iter; +} + +rocksdb_writebatch_t* rocksdb_wal_iter_get_batch 
(const rocksdb_wal_iterator_t* iter, uint64_t* seq) { + rocksdb_writebatch_t* result = rocksdb_writebatch_create(); + BatchResult wal_batch = iter->rep->GetBatch(); + result->rep = * wal_batch.writeBatchPtr.release(); + if (seq != nullptr) { + *seq = wal_batch.sequence; + } + return result; +} + +uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db) { + return db->rep->GetLatestSequenceNumber(); +} + rocksdb_iterator_t* rocksdb_create_iterator_cf( rocksdb_t* db, const rocksdb_readoptions_t* options, @@ -1386,23 +1476,24 @@ void rocksdb_writebatch_put_log_data( b->rep.PutLogData(Slice(blob, len)); } +class H : public WriteBatch::Handler { + public: + void* state_; + void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); + void (*deleted_)(void*, const char* k, size_t klen); + void Put(const Slice& key, const Slice& value) override { + (*put_)(state_, key.data(), key.size(), value.data(), value.size()); + } + void Delete(const Slice& key) override { + (*deleted_)(state_, key.data(), key.size()); + } +}; + void rocksdb_writebatch_iterate( rocksdb_writebatch_t* b, void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)) { - class H : public WriteBatch::Handler { - public: - void* state_; - void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); - void (*deleted_)(void*, const char* k, size_t klen); - virtual void Put(const Slice& key, const Slice& value) override { - (*put_)(state_, key.data(), key.size(), value.data(), value.size()); - } - virtual void Delete(const Slice& key) override { - (*deleted_)(state_, key.data(), key.size()); - } - }; H handler; handler.state_ = state; handler.put_ = put; @@ -1647,18 +1738,6 @@ void rocksdb_writebatch_wi_iterate( void* state, void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), void (*deleted)(void*, const char* k, size_t klen)) { - class H : public WriteBatch::Handler { - public: - void* state_; - void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); - void (*deleted_)(void*, const char* k, size_t klen); - virtual void Put(const Slice& key, const Slice& value) override { - (*put_)(state_, key.data(), key.size(), value.data(), value.size()); - } - virtual void Delete(const Slice& key) override { - (*deleted_)(state_, key.data(), key.size()); - } - }; H handler; handler.state_ = state; handler.put_ = put; @@ -1691,11 +1770,11 @@ rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base( } rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf( - rocksdb_writebatch_wi_t* wbwi, - rocksdb_iterator_t* base_iterator, + rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator, rocksdb_column_family_handle_t* column_family) { rocksdb_iterator_t* result = new rocksdb_iterator_t; - result->rep = wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep); + result->rep = + wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep); delete base_iterator; return result; } @@ -1824,6 +1903,26 @@ void rocksdb_block_based_options_set_block_restart_interval( options->rep.block_restart_interval = block_restart_interval; } +void rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, int index_block_restart_interval) { + options->rep.index_block_restart_interval = index_block_restart_interval; +} + +void rocksdb_block_based_options_set_metadata_block_size( + 
rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size) { + options->rep.metadata_block_size = metadata_block_size; +} + +void rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, unsigned char partition_filters) { + options->rep.partition_filters = partition_filters; +} + +void rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, unsigned char use_delta_encoding) { + options->rep.use_delta_encoding = use_delta_encoding; +} + void rocksdb_block_based_options_set_filter_policy( rocksdb_block_based_table_options_t* options, rocksdb_filterpolicy_t* filter_policy) { @@ -1877,11 +1976,21 @@ void rocksdb_block_based_options_set_cache_index_and_filter_blocks( options->rep.cache_index_and_filter_blocks = v; } +void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.cache_index_and_filter_blocks_with_high_priority = v; +} + void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( rocksdb_block_based_table_options_t* options, unsigned char v) { options->rep.pin_l0_filter_and_index_blocks_in_cache = v; } +void rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.pin_top_level_index_and_filter = v; +} + void rocksdb_options_set_block_based_table_factory( rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options) { @@ -1891,7 +2000,6 @@ void rocksdb_options_set_block_based_table_factory( } } - rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() { return new rocksdb_cuckoo_table_options_t; @@ -1945,6 +2053,15 @@ void rocksdb_set_options( db->rep->SetOptions(options_map)); } +void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr) { + std::unordered_map options_map; + for (int i=0; irep->SetOptions(handle->rep, options_map)); + } + rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; } @@ -1973,6 +2090,11 @@ void rocksdb_options_optimize_universal_style_compaction( opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget); } +void rocksdb_options_set_allow_ingest_behind( + rocksdb_options_t* opt, unsigned char v) { + opt->rep.allow_ingest_behind = v; +} + void rocksdb_options_set_compaction_filter( rocksdb_options_t* opt, rocksdb_compactionfilter_t* filter) { @@ -2023,8 +2145,8 @@ void rocksdb_options_set_paranoid_checks( opt->rep.paranoid_checks = v; } -void rocksdb_options_set_db_paths(rocksdb_options_t* opt, - const rocksdb_dbpath_t** dbpath_values, +void rocksdb_options_set_db_paths(rocksdb_options_t* opt, + const rocksdb_dbpath_t** dbpath_values, size_t num_paths) { std::vector db_paths(num_paths); for (size_t i = 0; i < num_paths; ++i) { @@ -2111,7 +2233,8 @@ void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { opt->rep.statistics = rocksdb::CreateDBStatistics(); } -void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val) { +void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val) { opt->rep.skip_stats_update_on_db_open = val; } @@ -2134,8 +2257,8 @@ void rocksdb_options_set_level0_stop_writes_trigger( opt->rep.level0_stop_writes_trigger = n; } -void 
rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* opt, - int n) {} +void rocksdb_options_set_max_mem_compaction_level(rocksdb_options_t* /*opt*/, + int /*n*/) {} void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt,int mode) { opt->rep.wal_recovery_mode = static_cast(mode); @@ -2155,6 +2278,18 @@ void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt, } } +void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt, + int w_bits, int level, + int strategy, + int max_dict_bytes, + bool enabled) { + opt->rep.bottommost_compression_opts.window_bits = w_bits; + opt->rep.bottommost_compression_opts.level = level; + opt->rep.bottommost_compression_opts.strategy = strategy; + opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes; + opt->rep.bottommost_compression_opts.enabled = enabled; +} + void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits, int level, int strategy, int max_dict_bytes) { @@ -2199,8 +2334,8 @@ void rocksdb_options_set_manifest_preallocation_size( } // noop -void rocksdb_options_set_purge_redundant_kvs_while_flush(rocksdb_options_t* opt, - unsigned char v) {} +void rocksdb_options_set_purge_redundant_kvs_while_flush( + rocksdb_options_t* /*opt*/, unsigned char /*v*/) {} void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt, unsigned char v) { @@ -2265,11 +2400,21 @@ void rocksdb_options_set_use_adaptive_mutex( opt->rep.use_adaptive_mutex = v; } +void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t* opt, uint64_t v) { + opt->rep.wal_bytes_per_sync = v; +} + void rocksdb_options_set_bytes_per_sync( rocksdb_options_t* opt, uint64_t v) { opt->rep.bytes_per_sync = v; } +void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt, + uint64_t v) { + opt->rep.writable_file_max_buffer_size = static_cast(v); +} + void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt, unsigned char v) { opt->rep.allow_concurrent_memtable_write = v; @@ -2298,6 +2443,20 @@ void rocksdb_options_set_max_write_buffer_number_to_maintain( opt->rep.max_write_buffer_number_to_maintain = n; } +void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt, + unsigned char v) { + opt->rep.enable_pipelined_write = v; +} + +void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt, + uint32_t n) { + opt->rep.max_subcompactions = n; +} + +void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) { + opt->rep.max_background_jobs = n; +} + void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt, int n) { opt->rep.max_background_compactions = n; } @@ -2360,7 +2519,7 @@ void rocksdb_options_set_table_cache_numshardbits( } void rocksdb_options_set_table_cache_remove_scan_count_limit( - rocksdb_options_t* opt, int v) { + rocksdb_options_t* /*opt*/, int /*v*/) { // this option is deprecated } @@ -2474,8 +2633,9 @@ char *rocksdb_options_statistics_get_string(rocksdb_options_t *opt) { } void rocksdb_options_set_ratelimiter(rocksdb_options_t *opt, rocksdb_ratelimiter_t *limiter) { - opt->rep.rate_limiter.reset(limiter->rep); - limiter->rep = nullptr; + if (limiter) { + opt->rep.rate_limiter = limiter->rep; + } } rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( @@ -2483,18 +2643,186 @@ rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t refill_period_us, int32_t fairness) { rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; - rate_limiter->rep = NewGenericRateLimiter(rate_bytes_per_sec, - 
refill_period_us, fairness); + rate_limiter->rep.reset( + NewGenericRateLimiter(rate_bytes_per_sec, + refill_period_us, fairness)); return rate_limiter; } void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t *limiter) { - if (limiter->rep) { - delete limiter->rep; - } delete limiter; } +void rocksdb_set_perf_level(int v) { + PerfLevel level = static_cast(v); + SetPerfLevel(level); +} + +rocksdb_perfcontext_t* rocksdb_perfcontext_create() { + rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t; + context->rep = rocksdb::get_perf_context(); + return context; +} + +void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) { + context->rep->Reset(); +} + +char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context, + unsigned char exclude_zero_counters) { + return strdup(context->rep->ToString(exclude_zero_counters).c_str()); +} + +uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, + int metric) { + PerfContext* rep = context->rep; + switch (metric) { + case rocksdb_user_key_comparison_count: + return rep->user_key_comparison_count; + case rocksdb_block_cache_hit_count: + return rep->block_cache_hit_count; + case rocksdb_block_read_count: + return rep->block_read_count; + case rocksdb_block_read_byte: + return rep->block_read_byte; + case rocksdb_block_read_time: + return rep->block_read_time; + case rocksdb_block_checksum_time: + return rep->block_checksum_time; + case rocksdb_block_decompress_time: + return rep->block_decompress_time; + case rocksdb_get_read_bytes: + return rep->get_read_bytes; + case rocksdb_multiget_read_bytes: + return rep->multiget_read_bytes; + case rocksdb_iter_read_bytes: + return rep->iter_read_bytes; + case rocksdb_internal_key_skipped_count: + return rep->internal_key_skipped_count; + case rocksdb_internal_delete_skipped_count: + return rep->internal_delete_skipped_count; + case rocksdb_internal_recent_skipped_count: + return rep->internal_recent_skipped_count; + case rocksdb_internal_merge_count: + return rep->internal_merge_count; + case rocksdb_get_snapshot_time: + return rep->get_snapshot_time; + case rocksdb_get_from_memtable_time: + return rep->get_from_memtable_time; + case rocksdb_get_from_memtable_count: + return rep->get_from_memtable_count; + case rocksdb_get_post_process_time: + return rep->get_post_process_time; + case rocksdb_get_from_output_files_time: + return rep->get_from_output_files_time; + case rocksdb_seek_on_memtable_time: + return rep->seek_on_memtable_time; + case rocksdb_seek_on_memtable_count: + return rep->seek_on_memtable_count; + case rocksdb_next_on_memtable_count: + return rep->next_on_memtable_count; + case rocksdb_prev_on_memtable_count: + return rep->prev_on_memtable_count; + case rocksdb_seek_child_seek_time: + return rep->seek_child_seek_time; + case rocksdb_seek_child_seek_count: + return rep->seek_child_seek_count; + case rocksdb_seek_min_heap_time: + return rep->seek_min_heap_time; + case rocksdb_seek_max_heap_time: + return rep->seek_max_heap_time; + case rocksdb_seek_internal_seek_time: + return rep->seek_internal_seek_time; + case rocksdb_find_next_user_entry_time: + return rep->find_next_user_entry_time; + case rocksdb_write_wal_time: + return rep->write_wal_time; + case rocksdb_write_memtable_time: + return rep->write_memtable_time; + case rocksdb_write_delay_time: + return rep->write_delay_time; + case rocksdb_write_pre_and_post_process_time: + return rep->write_pre_and_post_process_time; + case rocksdb_db_mutex_lock_nanos: + return rep->db_mutex_lock_nanos; + case 
rocksdb_db_condition_wait_nanos: + return rep->db_condition_wait_nanos; + case rocksdb_merge_operator_time_nanos: + return rep->merge_operator_time_nanos; + case rocksdb_read_index_block_nanos: + return rep->read_index_block_nanos; + case rocksdb_read_filter_block_nanos: + return rep->read_filter_block_nanos; + case rocksdb_new_table_block_iter_nanos: + return rep->new_table_block_iter_nanos; + case rocksdb_new_table_iterator_nanos: + return rep->new_table_iterator_nanos; + case rocksdb_block_seek_nanos: + return rep->block_seek_nanos; + case rocksdb_find_table_nanos: + return rep->find_table_nanos; + case rocksdb_bloom_memtable_hit_count: + return rep->bloom_memtable_hit_count; + case rocksdb_bloom_memtable_miss_count: + return rep->bloom_memtable_miss_count; + case rocksdb_bloom_sst_hit_count: + return rep->bloom_sst_hit_count; + case rocksdb_bloom_sst_miss_count: + return rep->bloom_sst_miss_count; + case rocksdb_key_lock_wait_time: + return rep->key_lock_wait_time; + case rocksdb_key_lock_wait_count: + return rep->key_lock_wait_count; + case rocksdb_env_new_sequential_file_nanos: + return rep->env_new_sequential_file_nanos; + case rocksdb_env_new_random_access_file_nanos: + return rep->env_new_random_access_file_nanos; + case rocksdb_env_new_writable_file_nanos: + return rep->env_new_writable_file_nanos; + case rocksdb_env_reuse_writable_file_nanos: + return rep->env_reuse_writable_file_nanos; + case rocksdb_env_new_random_rw_file_nanos: + return rep->env_new_random_rw_file_nanos; + case rocksdb_env_new_directory_nanos: + return rep->env_new_directory_nanos; + case rocksdb_env_file_exists_nanos: + return rep->env_file_exists_nanos; + case rocksdb_env_get_children_nanos: + return rep->env_get_children_nanos; + case rocksdb_env_get_children_file_attributes_nanos: + return rep->env_get_children_file_attributes_nanos; + case rocksdb_env_delete_file_nanos: + return rep->env_delete_file_nanos; + case rocksdb_env_create_dir_nanos: + return rep->env_create_dir_nanos; + case rocksdb_env_create_dir_if_missing_nanos: + return rep->env_create_dir_if_missing_nanos; + case rocksdb_env_delete_dir_nanos: + return rep->env_delete_dir_nanos; + case rocksdb_env_get_file_size_nanos: + return rep->env_get_file_size_nanos; + case rocksdb_env_get_file_modification_time_nanos: + return rep->env_get_file_modification_time_nanos; + case rocksdb_env_rename_file_nanos: + return rep->env_rename_file_nanos; + case rocksdb_env_link_file_nanos: + return rep->env_link_file_nanos; + case rocksdb_env_lock_file_nanos: + return rep->env_lock_file_nanos; + case rocksdb_env_unlock_file_nanos: + return rep->env_unlock_file_nanos; + case rocksdb_env_new_logger_nanos: + return rep->env_new_logger_nanos; + default: + break; + } + return 0; +} + +void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) { + delete context; +} + /* TODO: DB::OpenForReadOnly @@ -2524,7 +2852,7 @@ rocksdb_compactionfilter_t* rocksdb_compactionfilter_create( result->state_ = state; result->destructor_ = destructor; result->filter_ = filter; - result->ignore_snapshots_ = false; + result->ignore_snapshots_ = true; result->name_ = name; return result; } @@ -2624,7 +2952,7 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(int bits_per_ke // supplied C functions. 
struct Wrapper : public rocksdb_filterpolicy_t { const FilterPolicy* rep_; - ~Wrapper() { delete rep_; } + ~Wrapper() override { delete rep_; } const char* Name() const override { return rep_->Name(); } void CreateFilter(const Slice* keys, int n, std::string* dst) const override { @@ -2717,6 +3045,18 @@ void rocksdb_readoptions_set_iterate_upper_bound( } } +void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t *opt, + const char* key, size_t keylen) { + if (key == nullptr) { + opt->lower_bound = Slice(); + opt->rep.iterate_lower_bound = nullptr; + } else { + opt->lower_bound = Slice(key, keylen); + opt->rep.iterate_lower_bound = &opt->lower_bound; + } +} + void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); @@ -2727,11 +3067,21 @@ void rocksdb_readoptions_set_tailing( opt->rep.tailing = v; } +void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.managed = v; +} + void rocksdb_readoptions_set_readahead_size( rocksdb_readoptions_t* opt, size_t v) { opt->rep.readahead_size = v; } +void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.prefix_same_as_start = v; +} + void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt, unsigned char v) { opt->rep.pin_data = v; @@ -2742,6 +3092,22 @@ void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt, opt->rep.total_order_seek = v; } +void rocksdb_readoptions_set_max_skippable_internal_keys( + rocksdb_readoptions_t* opt, + uint64_t v) { + opt->rep.max_skippable_internal_keys = v; +} + +void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.background_purge_on_iterator_cleanup = v; +} + +void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t* opt, unsigned char v) { + opt->rep.ignore_range_deletions = v; +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -2759,6 +3125,24 @@ void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt, int disable) opt->rep.disableWAL = disable; } +void rocksdb_writeoptions_set_ignore_missing_column_families( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.ignore_missing_column_families = v; +} + +void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.no_slowdown = v; +} + +void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t* opt, + unsigned char v) { + opt->rep.low_pri = v; +} + rocksdb_compactoptions_t* rocksdb_compactoptions_create() { return new rocksdb_compactoptions_t; } @@ -2767,6 +3151,11 @@ void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) { delete opt; } +void rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t* opt, unsigned char v) { + opt->rep.bottommost_level_compaction = static_cast(v); +} + void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t* opt, unsigned char v) { opt->rep.exclusive_manual_compaction = v; @@ -2875,7 +3264,7 @@ rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create( rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator( const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options, - const rocksdb_comparator_t* comparator) { + const rocksdb_comparator_t* /*comparator*/) { rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t; writer->rep = new 
SstFileWriter(env->rep, io_options->rep); return writer; @@ -2913,7 +3302,12 @@ void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer, void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer, char** errptr) { - SaveError(errptr, writer->rep->Finish(NULL)); + SaveError(errptr, writer->rep->Finish(nullptr)); +} + +void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer, + uint64_t* file_size) { + *file_size = writer->rep->FileSize(); } void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) { @@ -2951,6 +3345,12 @@ void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( opt->rep.allow_blocking_flush = allow_blocking_flush; } +void rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char ingest_behind) { + opt->rep.ingest_behind = ingest_behind; +} + void rocksdb_ingestexternalfileoptions_destroy( rocksdb_ingestexternalfileoptions_t* opt) { delete opt; @@ -3005,20 +3405,21 @@ void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; } +struct Wrapper : public rocksdb_slicetransform_t { + const SliceTransform* rep_; + ~Wrapper() override { delete rep_; } + const char* Name() const override { return rep_->Name(); } + Slice Transform(const Slice& src) const override { + return rep_->Transform(src); + } + bool InDomain(const Slice& src) const override { + return rep_->InDomain(src); + } + bool InRange(const Slice& src) const override { return rep_->InRange(src); } + static void DoNothing(void*) { } +}; + rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t prefixLen) { - struct Wrapper : public rocksdb_slicetransform_t { - const SliceTransform* rep_; - ~Wrapper() { delete rep_; } - const char* Name() const override { return rep_->Name(); } - Slice Transform(const Slice& src) const override { - return rep_->Transform(src); - } - bool InDomain(const Slice& src) const override { - return rep_->InDomain(src); - } - bool InRange(const Slice& src) const override { return rep_->InRange(src); } - static void DoNothing(void*) { } - }; Wrapper* wrapper = new Wrapper; wrapper->rep_ = rocksdb::NewFixedPrefixTransform(prefixLen); wrapper->state_ = nullptr; @@ -3027,19 +3428,6 @@ rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(size_t pref } rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() { - struct Wrapper : public rocksdb_slicetransform_t { - const SliceTransform* rep_; - ~Wrapper() { delete rep_; } - const char* Name() const override { return rep_->Name(); } - Slice Transform(const Slice& src) const override { - return rep_->Transform(src); - } - bool InDomain(const Slice& src) const override { - return rep_->InDomain(src); - } - bool InRange(const Slice& src) const override { return rep_->InRange(src); } - static void DoNothing(void*) { } - }; Wrapper* wrapper = new Wrapper; wrapper->rep_ = rocksdb::NewNoopTransform(); wrapper->state_ = nullptr; @@ -3157,6 +3545,18 @@ const char* rocksdb_livefiles_largestkey( return lf->rep[index].largestkey.data(); } +uint64_t rocksdb_livefiles_entries( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].num_entries; +} + +uint64_t rocksdb_livefiles_deletions( + const rocksdb_livefiles_t* lf, + int index) { + return lf->rep[index].num_deletions; +} + extern void rocksdb_livefiles_destroy( const rocksdb_livefiles_t* lf) { delete lf; @@ -3302,6 +3702,38 @@ rocksdb_transactiondb_t* rocksdb_transactiondb_open( return result; } +rocksdb_transactiondb_t* 
rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + TransactionDB* txn_db; + std::vector handles; + if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep, + std::string(name), column_families, + &handles, &txn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t; + result->rep = txn_db; + return result; +} + const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot( rocksdb_transactiondb_t* txn_db) { rocksdb_snapshot_t* result = new rocksdb_snapshot_t; @@ -3339,6 +3771,14 @@ void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) { SaveError(errptr, txn->rep->Rollback()); } +void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) { + txn->rep->SetSavePoint(); +} + +void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn, char** errptr) { + SaveError(errptr, txn->rep->RollbackToSavePoint()); +} + void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) { delete txn->rep; delete txn; @@ -3414,6 +3854,26 @@ char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn, return result; } +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr) { + char* result = nullptr; + std::string tmp; + Status s = txn->rep->GetForUpdate(options->rep, column_family->rep, + Slice(key, klen), &tmp, exclusive); + if (s.ok()) { + *vlen = tmp.size(); + result = CopyString(tmp); + } else { + *vlen = 0; + if (!s.IsNotFound()) { + SaveError(errptr, s); + } + } + return result; +} + // Read a key outside a transaction char* rocksdb_transactiondb_get( rocksdb_transactiondb_t* txn_db, @@ -3476,8 +3936,8 @@ void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, const char* val, size_t vlen, char** errptr) { - SaveError(errptr, txn_db->rep->Put(options->rep, Slice(key, klen), - Slice(val, vlen))); + SaveError(errptr, + txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen))); } void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db, @@ -3506,13 +3966,29 @@ void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key, SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen))); } +void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn, + rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, + size_t vlen, char** errptr) { + SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen), + Slice(val, vlen))); +} + // Merge a key outside a transaction void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* 
txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, const char* val, size_t vlen, char** errptr) { - SaveError(errptr, - txn_db->rep->Merge(options->rep, Slice(key, klen), Slice(val, vlen))); + SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen), + Slice(val, vlen))); +} + +void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr) { + SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep, + Slice(key, klen), Slice(val, vlen))); } // Delete a key inside a transaction @@ -3550,6 +4026,15 @@ rocksdb_iterator_t* rocksdb_transaction_create_iterator( return result; } +// Create an iterator inside a transaction with column family +rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn->rep->GetIterator(options->rep, column_family->rep); + return result; +} + // Create an iterator outside a transaction rocksdb_iterator_t* rocksdb_transactiondb_create_iterator( rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) { @@ -3558,6 +4043,14 @@ rocksdb_iterator_t* rocksdb_transactiondb_create_iterator( return result; } +rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family) { + rocksdb_iterator_t* result = new rocksdb_iterator_t; + result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep); + return result; +} + void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) { delete txn_db->rep; delete txn_db; @@ -3575,8 +4068,7 @@ rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create( } rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open( - const rocksdb_options_t* options, const char* name, - char** errptr) { + const rocksdb_options_t* options, const char* name, char** errptr) { OptimisticTransactionDB* otxn_db; if (SaveError(errptr, OptimisticTransactionDB::Open( options->rep, std::string(name), &otxn_db))) { @@ -3588,6 +4080,56 @@ rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open( return result; } +rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* db_options, const char* name, + int num_column_families, const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr) { + std::vector column_families; + for (int i = 0; i < num_column_families; i++) { + column_families.push_back(ColumnFamilyDescriptor( + std::string(column_family_names[i]), + ColumnFamilyOptions(column_family_options[i]->rep))); + } + + OptimisticTransactionDB* otxn_db; + std::vector handles; + if (SaveError(errptr, OptimisticTransactionDB::Open( + DBOptions(db_options->rep), std::string(name), + column_families, &handles, &otxn_db))) { + return nullptr; + } + + for (size_t i = 0; i < handles.size(); i++) { + rocksdb_column_family_handle_t* c_handle = + new rocksdb_column_family_handle_t; + c_handle->rep = handles[i]; + column_family_handles[i] = c_handle; + } + rocksdb_optimistictransactiondb_t* result = + new 
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db) { + DB* base_db = otxn_db->rep->GetBaseDB(); + + if (base_db != nullptr) { + rocksdb_t* result = new rocksdb_t; + result->rep = base_db; + return result; + } + + return nullptr; +} + +void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) { + delete base_db; +} + rocksdb_transaction_t* rocksdb_optimistictransaction_begin( rocksdb_optimistictransactiondb_t* otxn_db, const rocksdb_writeoptions_t* write_options, @@ -3623,7 +4165,7 @@ rocksdb_pinnableslice_t* rocksdb_get_pinned( if (!s.IsNotFound()) { SaveError(errptr, s); } - return NULL; + return nullptr; } return v; } @@ -3640,7 +4182,7 @@ rocksdb_pinnableslice_t* rocksdb_get_pinned_cf( if (!s.IsNotFound()) { SaveError(errptr, s); } - return NULL; + return nullptr; } return v; } @@ -3651,12 +4193,104 @@ const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v, size_t* vlen) { if (!v) { *vlen = 0; - return NULL; + return nullptr; } *vlen = v->rep.size(); return v->rep.data(); } + +// container to keep databases and caches in order to use rocksdb::MemoryUtil +struct rocksdb_memory_consumers_t { + std::vector<rocksdb_t*> dbs; + std::unordered_set<rocksdb_cache_t*> caches; +}; + +// initializes new container of memory consumers +rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() { + return new rocksdb_memory_consumers_t; +} + +// adds database to the container of memory consumers +void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers, + rocksdb_t* db) { + consumers->dbs.push_back(db); +} + +// adds cache to the container of memory consumers +void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers, + rocksdb_cache_t* cache) { + consumers->caches.insert(cache); +} + +// deletes container with memory consumers +void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) { + delete consumers; +} + +// contains memory usage statistics provided by rocksdb::MemoryUtil +struct rocksdb_memory_usage_t { + uint64_t mem_table_total; + uint64_t mem_table_unflushed; + uint64_t mem_table_readers_total; + uint64_t cache_total; +}; + +// estimates amount of memory occupied by consumers (dbs and caches) +rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create( + rocksdb_memory_consumers_t* consumers, char** errptr) { + + vector<DB*> dbs; + for (auto db : consumers->dbs) { + dbs.push_back(db->rep); + } + + unordered_set<const Cache*> cache_set; + for (auto cache : consumers->caches) { + cache_set.insert(const_cast<const Cache*>(cache->rep.get())); + } + + std::map<MemoryUtil::UsageType, uint64_t> usage_by_type; + + auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, + &usage_by_type); + if (SaveError(errptr, status)) { + return nullptr; + } + + auto result = new rocksdb_memory_usage_t; + result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal]; + result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed]; + result->mem_table_readers_total = usage_by_type[MemoryUtil::kTableReadersTotal]; + result->cache_total = usage_by_type[MemoryUtil::kCacheTotal]; + return result; +} +
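The memory accounting API above is used in three steps: register consumers, build a usage report, then read it back through the per-type getters that follow. A hedged sketch (assumes an open `db` and a `cache` that its table factory uses; error handling elided):

  rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
  rocksdb_memory_consumers_add_db(consumers, db);
  rocksdb_memory_consumers_add_cache(consumers, cache);
  char* err = NULL;
  rocksdb_memory_usage_t* usage =
      rocksdb_approximate_memory_usage_create(consumers, &err);
  if (err == NULL) {
    uint64_t memtable_bytes =
        rocksdb_approximate_memory_usage_get_mem_table_total(usage);
    uint64_t cache_bytes =
        rocksdb_approximate_memory_usage_get_cache_total(usage);
    /* ...report memtable_bytes and cache_bytes... */
  }
  rocksdb_approximate_memory_usage_destroy(usage);
  rocksdb_memory_consumers_destroy(consumers);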
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_unflushed; +} + +uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->mem_table_readers_total; +} + +uint64_t rocksdb_approximate_memory_usage_get_cache_total( + rocksdb_memory_usage_t* memory_usage) { + return memory_usage->cache_total; +} + +// deletes container with memory usage estimates +void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) { + delete usage; +} + } // end extern "C" #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/c_test.c b/thirdparty/rocksdb/db/c_test.c index 7b76badf1c..39b8b1cfb0 100644 --- a/thirdparty/rocksdb/db/c_test.c +++ b/thirdparty/rocksdb/db/c_test.c @@ -19,11 +19,8 @@ // Can not use port/port.h macros as this is a c file #ifdef OS_WIN - #include <windows.h> -#define snprintf _snprintf - // Ok for uniqueness int geteuid() { int result = 0; @@ -34,6 +31,11 @@ int geteuid() { return result; } +// VS < 2015 +#if defined(_MSC_VER) && (_MSC_VER < 1900) +#define snprintf _snprintf +#endif + #endif const char* phase = ""; @@ -47,12 +49,19 @@ static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); phase = name; } +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning (disable: 4996) // getenv security warning +#endif static const char* GetTempDir(void) { const char* ret = getenv("TEST_TMPDIR"); if (ret == NULL || ret[0] == '\0') ret = "/tmp"; return ret; } +#ifdef _MSC_VER +#pragma warning(pop) +#endif #define CheckNoError(err) \ if ((err) != NULL) { \ @@ -192,10 +201,11 @@ static void CheckDel(void* ptr, const char* k, size_t klen) { (*state)++; } -static void CmpDestroy(void* arg) { } +static void CmpDestroy(void* arg) { (void)arg; } static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, size_t blen) { + (void)arg; size_t n = (alen < blen) ?
alen : blen; int r = memcmp(a, b, n); if (r == 0) { @@ -206,13 +216,15 @@ static int CmpCompare(void* arg, const char* a, size_t alen, } static const char* CmpName(void* arg) { + (void)arg; return "foo"; } // Custom filter policy static unsigned char fake_filter_result = 1; -static void FilterDestroy(void* arg) { } +static void FilterDestroy(void* arg) { (void)arg; } static const char* FilterName(void* arg) { + (void)arg; return "TestFilter"; } static char* FilterCreate( @@ -220,6 +232,10 @@ static char* FilterCreate( const char* const* key_array, const size_t* key_length_array, int num_keys, size_t* filter_length) { + (void)arg; + (void)key_array; + (void)key_length_array; + (void)num_keys; *filter_length = 4; char* result = malloc(4); memcpy(result, "fake", 4); @@ -229,20 +245,30 @@ static unsigned char FilterKeyMatch( void* arg, const char* key, size_t length, const char* filter, size_t filter_length) { + (void)arg; + (void)key; + (void)length; CheckCondition(filter_length == 4); CheckCondition(memcmp(filter, "fake", 4) == 0); return fake_filter_result; } // Custom compaction filter -static void CFilterDestroy(void* arg) {} -static const char* CFilterName(void* arg) { return "foo"; } +static void CFilterDestroy(void* arg) { (void)arg; } +static const char* CFilterName(void* arg) { + (void)arg; + return "foo"; +} static unsigned char CFilterFilter(void* arg, int level, const char* key, size_t key_length, const char* existing_value, size_t value_length, char** new_value, size_t* new_value_length, unsigned char* value_changed) { + (void)arg; + (void)level; + (void)existing_value; + (void)value_length; if (key_length == 3) { if (memcmp(key, "bar", key_length) == 0) { return 1; @@ -256,10 +282,15 @@ static unsigned char CFilterFilter(void* arg, int level, const char* key, return 0; } -static void CFilterFactoryDestroy(void* arg) {} -static const char* CFilterFactoryName(void* arg) { return "foo"; } +static void CFilterFactoryDestroy(void* arg) { (void)arg; } +static const char* CFilterFactoryName(void* arg) { + (void)arg; + return "foo"; +} static rocksdb_compactionfilter_t* CFilterCreate( void* arg, rocksdb_compactionfiltercontext_t* context) { + (void)arg; + (void)context; return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter, CFilterName); } @@ -290,8 +321,9 @@ static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options, } // Custom merge operator -static void MergeOperatorDestroy(void* arg) { } +static void MergeOperatorDestroy(void* arg) { (void)arg; } static const char* MergeOperatorName(void* arg) { + (void)arg; return "TestMergeOperator"; } static char* MergeOperatorFullMerge( @@ -301,6 +333,14 @@ static char* MergeOperatorFullMerge( const char* const* operands_list, const size_t* operands_list_length, int num_operands, unsigned char* success, size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + (void)existing_value; + (void)existing_value_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; *new_value_length = 4; *success = 1; char* result = malloc(4); @@ -313,6 +353,12 @@ static char* MergeOperatorPartialMerge( const char* const* operands_list, const size_t* operands_list_length, int num_operands, unsigned char* success, size_t* new_value_length) { + (void)arg; + (void)key; + (void)key_length; + (void)operands_list; + (void)operands_list_length; + (void)num_operands; *new_value_length = 4; *success = 1; char* result = malloc(4); @@ -334,6 +380,20 @@ static void CheckTxnGet( Free(&val); } 
+static void CheckTxnGetCF(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get_cf(txn, options, column_family, key, + strlen(key), &val_len, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + static void CheckTxnDBGet( rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, @@ -363,6 +423,8 @@ static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db, } int main(int argc, char** argv) { + (void)argc; + (void)argv; rocksdb_t* db; rocksdb_comparator_t* cmp; rocksdb_cache_t* cache; @@ -378,6 +440,8 @@ int main(int argc, char** argv) { rocksdb_transactiondb_options_t* txn_db_options; rocksdb_transaction_t* txn; rocksdb_transaction_options_t* txn_options; + rocksdb_optimistictransactiondb_t* otxn_db; + rocksdb_optimistictransaction_options_t* otxn_options; char* err = NULL; int run = -1; @@ -588,7 +652,7 @@ int main(int argc, char** argv) { rocksdb_sstfilewriter_t* writer = rocksdb_sstfilewriter_create(env_opt, io_options); - unlink(sstfilename); + remove(sstfilename); rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err); @@ -609,7 +673,7 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "sstk2", "v2"); CheckGet(db, roptions, "sstk3", "v3"); - unlink(sstfilename); + remove(sstfilename); rocksdb_sstfilewriter_open(writer, sstfilename, &err); CheckNoError(err); rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err); @@ -853,7 +917,8 @@ int main(int argc, char** argv) { rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1); rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1); rocksdb_writebatch_wi_delete(wbi, "foo", 3); - rocksdb_iterator_t* iter = rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter); + rocksdb_iterator_t* iter = + rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter); CheckCondition(!rocksdb_iter_valid(iter)); rocksdb_iter_seek_to_first(iter); CheckCondition(rocksdb_iter_valid(iter)); @@ -1279,6 +1344,47 @@ int main(int argc, char** argv) { rocksdb_destroy_db(options, dbname, &err); } + // Check memory usage stats + StartPhase("approximate_memory_usage"); + { + // Create database + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_memory_consumers_t* consumers; + consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, db); + rocksdb_memory_consumers_add_cache(consumers, cache); + + // take memory usage report before write-read operation + rocksdb_memory_usage_t* mu1; + mu1 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // Put data (this should affect memtables) + rocksdb_put(db, woptions, "memory", 6, "test", 4, &err); + CheckNoError(err); + CheckGet(db, roptions, "memory", "test"); + + // take memory usage report after write-read operation + rocksdb_memory_usage_t* mu2; + mu2 = rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + + // amount of memory used within memtables should grow + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_total(mu1)); + CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >= + rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1)); + + 
rocksdb_memory_consumers_destroy(consumers); + rocksdb_approximate_memory_usage_destroy(mu1); + rocksdb_approximate_memory_usage_destroy(mu2); + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + } + StartPhase("cuckoo_options"); { rocksdb_cuckoo_table_options_t* cuckoo_options; @@ -1422,7 +1528,7 @@ int main(int argc, char** argv) { const rocksdb_snapshot_t* snapshot; snapshot = rocksdb_transactiondb_create_snapshot(txn_db); rocksdb_readoptions_set_snapshot(roptions, snapshot); - + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); CheckNoError(err); @@ -1447,6 +1553,25 @@ int main(int argc, char** argv) { CheckNoError(err); CheckTxnDBGet(txn_db, roptions, "bar", NULL); + // save point + rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err); + rocksdb_transaction_set_savepoint(txn); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err); + CheckTxnGet(txn, roptions, "foo2", "hi2"); + + // rollback to savepoint + rocksdb_transaction_rollback_to_savepoint(txn, &err); + CheckNoError(err); + CheckTxnGet(txn, roptions, "foo2", NULL); + CheckTxnGet(txn, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo1", NULL); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + CheckTxnDBGet(txn_db, roptions, "foo1", "hi1"); + CheckTxnDBGet(txn_db, roptions, "foo2", NULL); + // Column families. rocksdb_column_family_handle_t* cfh; cfh = rocksdb_transactiondb_create_column_family(txn_db, options, @@ -1473,6 +1598,105 @@ int main(int argc, char** argv) { rocksdb_transactiondb_options_destroy(txn_db_options); } + StartPhase("optimistic_transactions"); + { + rocksdb_options_t* db_options = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(db_options, 1); + rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1); + otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err); + otxn_options = rocksdb_optimistictransaction_options_create(); + rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err); + CheckNoError(err); + rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err); + CheckNoError(err); + CheckTxnGet(txn1, roptions, "key", "value"); + rocksdb_transaction_commit(txn1, &err); + CheckNoError(err); + rocksdb_transaction_commit(txn2, &err); + CheckNoError(err); + rocksdb_transaction_destroy(txn1); + rocksdb_transaction_destroy(txn2); + + // Check column family + db = rocksdb_optimistictransactiondb_get_base_db(otxn_db); + rocksdb_put(db, woptions, "key", 3, "value", 5, &err); + CheckNoError(err); + rocksdb_column_family_handle_t *cfh1, *cfh2; + cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err); + cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err); + txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options, + NULL); + rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err); + CheckNoError(err); + rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err); + CheckNoError(err); + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options, + txn); + CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1"); + 
CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1"); + + // Check iterator with column family + rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err); + CheckNoError(err); + rocksdb_iterator_t* iter = + rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1); + CheckCondition(!rocksdb_iter_valid(iter)); + rocksdb_iter_seek_to_first(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "key1_cf", "val1_cf"); + rocksdb_iter_get_error(iter, &err); + CheckNoError(err); + rocksdb_iter_destroy(iter); + + rocksdb_transaction_destroy(txn); + rocksdb_column_family_handle_destroy(cfh1); + rocksdb_column_family_handle_destroy(cfh2); + rocksdb_optimistictransactiondb_close_base_db(db); + rocksdb_optimistictransactiondb_close(otxn_db); + + // Check open optimistic transaction db with column families + size_t cf_len; + char** column_fams = + rocksdb_list_column_families(db_options, dbname, &cf_len, &err); + CheckNoError(err); + CheckEqual("default", column_fams[0], 7); + CheckEqual("txn_db_cf1", column_fams[1], 10); + CheckEqual("txn_db_cf2", column_fams[2], 10); + CheckCondition(cf_len == 3); + rocksdb_list_column_families_destroy(column_fams, cf_len); + + const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"}; + rocksdb_options_t* cf_options = rocksdb_options_create(); + const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options}; + + rocksdb_options_set_error_if_exists(cf_options, 0); + rocksdb_column_family_handle_t* cf_handles[3]; + otxn_db = rocksdb_optimistictransactiondb_open_column_families( + db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err); + CheckNoError(err); + rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin( + otxn_db, woptions, otxn_options, NULL); + CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value"); + CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1"); + CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2"); + rocksdb_transaction_destroy(txn_cf); + rocksdb_options_destroy(cf_options); + rocksdb_column_family_handle_destroy(cf_handles[0]); + rocksdb_column_family_handle_destroy(cf_handles[1]); + rocksdb_column_family_handle_destroy(cf_handles[2]); + rocksdb_optimistictransactiondb_close(otxn_db); + rocksdb_destroy_db(db_options, dbname, &err); + rocksdb_options_destroy(db_options); + rocksdb_optimistictransaction_options_destroy(otxn_options); + CheckNoError(err); + } + // Simple sanity check that setting memtable rep works. 
StartPhase("memtable_reps"); { @@ -1502,7 +1726,7 @@ int main(int argc, char** argv) { db = rocksdb_open(options, dbname, &err); CheckNoError(err); } - + StartPhase("cleanup"); rocksdb_close(db); rocksdb_options_destroy(options); diff --git a/thirdparty/rocksdb/db/column_family.cc b/thirdparty/rocksdb/db/column_family.cc index 6fd0787847..f9a4ae66d8 100644 --- a/thirdparty/rocksdb/db/column_family.cc +++ b/thirdparty/rocksdb/db/column_family.cc @@ -20,10 +20,12 @@ #include #include "db/compaction_picker.h" +#include "db/compaction_picker_fifo.h" #include "db/compaction_picker_universal.h" #include "db/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" +#include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" @@ -31,8 +33,10 @@ #include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "table/block_based_table_factory.h" +#include "table/merging_iterator.h" #include "util/autovector.h" #include "util/compression.h" +#include "util/sst_file_manager_impl.h" namespace rocksdb { @@ -53,6 +57,9 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { #endif // ROCKSDB_LITE // Job id == 0 means that this is not our background process, but rather // user thread + // Need to hold some shared pointers owned by the initial_cf_options + // before final cleaning up finishes. + ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options(); JobContext job_context(0); mutex_->Lock(); if (cfd_->Unref()) { @@ -61,7 +68,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { db_->FindObsoleteFiles(&job_context, false, true); mutex_->Unlock(); if (job_context.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(job_context); + bool defer_purge = + db_->immutable_db_options().avoid_unnecessary_blocking_io; + db_->PurgeObsoleteFiles(job_context, defer_purge); + if (defer_purge) { + mutex_->Lock(); + db_->SchedulePurge(); + mutex_->Unlock(); + } } job_context.Clean(); } @@ -80,6 +94,7 @@ Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) { *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions()); return Status::OK(); #else + (void)desc; return Status::NotSupported(); #endif // !ROCKSDB_LITE } @@ -99,9 +114,6 @@ void GetIntTblPropCollectorFactory( int_tbl_prop_collector_factories->emplace_back( new UserKeyTablePropertiesCollectorFactory(collector_factories[i])); } - // Add collector to collect internal key statistics - int_tbl_prop_collector_factories->emplace_back( - new InternalKeyPropertiesCollectorFactory); } Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { @@ -123,6 +135,18 @@ Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) { " is not linked with the binary."); } } + if (cf_options.compression_opts.zstd_max_train_bytes > 0) { + if (!ZSTD_TrainDictionarySupported()) { + return Status::InvalidArgument( + "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ " + "is not linked with the binary."); + } + if (cf_options.compression_opts.max_dict_bytes == 0) { + return Status::InvalidArgument( + "The dictionary size limit (`CompressionOptions::max_dict_bytes`) " + "should be nonzero if we're using zstd's dictionary generator."); + } + } return Status::OK(); } @@ -139,6 +163,28 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { return Status::OK(); } +Status CheckCFPathsSupported(const DBOptions& db_options, + const ColumnFamilyOptions& 
cf_options) { + // More than one cf_paths are supported only in universal + // and level compaction styles. This function also checks the case + // in which cf_paths is not specified, which results in db_paths + // being used. + if ((cf_options.compaction_style != kCompactionStyleUniversal) && + (cf_options.compaction_style != kCompactionStyleLevel)) { + if (cf_options.cf_paths.size() > 1) { + return Status::NotSupported( + "More than one CF paths are only supported in " + "universal and level compaction styles. "); + } else if (cf_options.cf_paths.empty() && + db_options.db_paths.size() > 1) { + return Status::NotSupported( + "More than one DB paths are only supported in " + "universal and level compaction styles. "); + } + } + return Status::OK(); +} + ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -257,9 +303,24 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.hard_pending_compaction_bytes_limit; } +#ifndef ROCKSDB_LITE + // When the DB is stopped, it's possible that there are some .trash files that + // were not deleted yet, when we open the DB we will find these .trash files + // and schedule them to be deleted (or delete immediately if SstFileManager + // was not used) + auto sfm = static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get()); + for (size_t i = 0; i < result.cf_paths.size(); i++) { + DeleteScheduler::CleanupDirectory(db_options.env, sfm, result.cf_paths[i].path); + } +#endif + + if (result.cf_paths.empty()) { + result.cf_paths = db_options.db_paths; + } + if (result.level_compaction_dynamic_level_bytes) { if (result.compaction_style != kCompactionStyleLevel || - db_options.db_paths.size() > 1U) { + result.cf_paths.size() > 1U) { // 1. level_compaction_dynamic_level_bytes only makes sense for // level-based compaction. // 2. we don't yet know how to make both of this feature and multiple @@ -328,12 +389,13 @@ void SuperVersionUnrefHandle(void* ptr) { // When latter happens, we are in ~ColumnFamilyData(), no get should happen as // well. SuperVersion* sv = static_cast<SuperVersion*>(ptr); - if (sv->Unref()) { - sv->db_mutex->Lock(); - sv->Cleanup(); - sv->db_mutex->Unlock(); - delete sv; - } + bool was_last_ref __attribute__((__unused__)); + was_last_ref = sv->Unref(); + // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_. + // This is important because we can't do SuperVersion cleanup here. + // That would require locking DB mutex, which would deadlock because + // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex. + assert(!was_last_ref); } } // anonymous namespace @@ -365,11 +427,13 @@ ColumnFamilyData::ColumnFamilyData( next_(nullptr), prev_(nullptr), log_number_(0), + flush_reason_(FlushReason::kOthers), column_family_set_(column_family_set), - pending_flush_(false), - pending_compaction_(false), + queued_for_flush_(false), + queued_for_compaction_(false), prev_compaction_needed_bytes_(0), - allow_2pc_(db_options.allow_2pc) { + allow_2pc_(db_options.allow_2pc), + last_memtable_id_(0) { Ref(); // Convert user defined table properties collector factories to internal ones.
@@ -442,8 +506,8 @@ ColumnFamilyData::~ColumnFamilyData() { // It would be wrong if this ColumnFamilyData is in flush_queue_ or // compaction_queue_ and we destroyed it - assert(!pending_flush_); - assert(!pending_compaction_); + assert(!queued_for_flush_); + assert(!queued_for_compaction_); if (super_version_ != nullptr) { // Release SuperVersion reference kept in ThreadLocalPtr. @@ -452,7 +516,7 @@ local_sv_.reset(); super_version_->db_mutex->Lock(); - bool is_last_reference __attribute__((unused)); + bool is_last_reference __attribute__((__unused__)); is_last_reference = super_version_->Unref(); assert(is_last_reference); super_version_->Cleanup(); @@ -463,7 +527,8 @@ if (dummy_versions_ != nullptr) { // List must be empty assert(dummy_versions_->TEST_Next() == dummy_versions_); - bool deleted __attribute__((unused)) = dummy_versions_->Unref(); + bool deleted __attribute__((__unused__)); + deleted = dummy_versions_->Unref(); assert(deleted); } @@ -495,7 +560,9 @@ uint64_t ColumnFamilyData::OldestLogToKeep() { auto current_log = GetLogNumber(); if (allow_2pc_) { - auto imm_prep_log = imm()->GetMinLogContainingPrepSection(); + autovector<MemTable*> empty_list; + auto imm_prep_log = + imm()->PrecomputeMinLogContainingPrepSection(empty_list); auto mem_prep_log = mem()->GetMinLogContainingPrepSection(); if (imm_prep_log > 0 && imm_prep_log < current_log) { @@ -613,58 +680,97 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, } } // namespace -void ColumnFamilyData::RecalculateWriteStallConditions( +std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause> +ColumnFamilyData::GetWriteStallConditionAndCause( + int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options) { + if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) { + return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) { + return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && + num_compaction_needed_bytes >= + mutable_cf_options.hard_pending_compaction_bytes_limit) { + return {WriteStallCondition::kStopped, + WriteStallCause::kPendingCompactionBytes}; + } else if (mutable_cf_options.max_write_buffer_number > 3 && + num_unflushed_memtables >= + mutable_cf_options.max_write_buffer_number - 1) { + return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.level0_slowdown_writes_trigger >= 0 && + num_l0_files >= + mutable_cf_options.level0_slowdown_writes_trigger) { + return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit}; + } else if (!mutable_cf_options.disable_auto_compactions && + mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && + num_compaction_needed_bytes >= + mutable_cf_options.soft_pending_compaction_bytes_limit) { + return {WriteStallCondition::kDelayed, + WriteStallCause::kPendingCompactionBytes}; + } + return {WriteStallCondition::kNormal, WriteStallCause::kNone}; +} + +WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { + auto write_stall_condition = WriteStallCondition::kNormal; if (current_ != nullptr)
{ auto* vstorage = current_->storage_info(); auto write_controller = column_family_set_->write_controller_; uint64_t compaction_needed_bytes = vstorage->estimated_compaction_needed_bytes(); + auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( + imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options); + write_stall_condition = write_stall_condition_and_cause.first; + auto write_stall_cause = write_stall_condition_and_cause.second; + bool was_stopped = write_controller->IsStopped(); bool needed_delay = write_controller->NeedsDelay(); - if (imm()->NumNotFlushed() >= mutable_cf_options.max_write_buffer_number) { + if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kMemtableLimit) { write_controller_token_ = write_controller->GetStopToken(); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( ioptions_.info_log, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number); - } else if (!mutable_cf_options.disable_auto_compactions && - vstorage->l0_delay_trigger_count() >= - mutable_cf_options.level0_stop_writes_trigger) { + } else if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kL0FileCountLimit) { write_controller_token_ = write_controller->GetStopToken(); - internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES_TOTAL, 1); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LEVEL0_NUM_FILES_WITH_COMPACTION, 1); + InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); } ROCKS_LOG_WARN(ioptions_.info_log, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); - } else if (!mutable_cf_options.disable_auto_compactions && - mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && - compaction_needed_bytes >= - mutable_cf_options.hard_pending_compaction_bytes_limit) { + } else if (write_stall_condition == WriteStallCondition::kStopped && + write_stall_cause == WriteStallCause::kPendingCompactionBytes) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats( - InternalStats::HARD_PENDING_COMPACTION_BYTES_LIMIT, 1); + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( ioptions_.info_log, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); - } else if (mutable_cf_options.max_write_buffer_number > 3 && - imm()->NumNotFlushed() >= - mutable_cf_options.max_write_buffer_number - 1) { + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kMemtableLimit) { write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_SLOWDOWN, 1); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( ioptions_.info_log, "[%s] Stalling writes because we have %d immutable memtables " @@ -673,10 +779,8 @@ void 
ColumnFamilyData::RecalculateWriteStallConditions( name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number, write_controller->delayed_write_rate()); - } else if (!mutable_cf_options.disable_auto_compactions && - mutable_cf_options.level0_slowdown_writes_trigger >= 0 && - vstorage->l0_delay_trigger_count() >= - mutable_cf_options.level0_slowdown_writes_trigger) { + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kL0FileCountLimit) { // L0 is the last two files from stopping. bool near_stop = vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_stop_writes_trigger - 2; @@ -684,20 +788,19 @@ void ColumnFamilyData::RecalculateWriteStallConditions( SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN_TOTAL, 1); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS, + 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LEVEL0_SLOWDOWN_WITH_COMPACTION, 1); + InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); } ROCKS_LOG_WARN(ioptions_.info_log, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), write_controller->delayed_write_rate()); - } else if (!mutable_cf_options.disable_auto_compactions && - mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && - vstorage->estimated_compaction_needed_bytes() >= - mutable_cf_options.soft_pending_compaction_bytes_limit) { + } else if (write_stall_condition == WriteStallCondition::kDelayed && + write_stall_cause == WriteStallCause::kPendingCompactionBytes) { // If the distance to hard limit is less than 1/4 of the gap between soft // and // hard bytes limit, we think it is near stop and speed up the slowdown. 
@@ -714,7 +817,7 @@ void ColumnFamilyData::RecalculateWriteStallConditions( prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats( - InternalStats::SOFT_PENDING_COMPACTION_BYTES_LIMIT, 1); + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); ROCKS_LOG_WARN( ioptions_.info_log, "[%s] Stalling writes because of estimated pending compaction " @@ -722,6 +825,7 @@ name_.c_str(), vstorage->estimated_compaction_needed_bytes(), write_controller->delayed_write_rate()); } else { + assert(write_stall_condition == WriteStallCondition::kNormal); if (vstorage->l0_delay_trigger_count() >= GetL0ThresholdSpeedupCompaction( mutable_cf_options.level0_file_num_compaction_trigger, @@ -769,6 +873,7 @@ } prev_compaction_needed_bytes_ = compaction_needed_bytes; } + return write_stall_condition; } const EnvOptions* ColumnFamilyData::soptions() const { @@ -787,6 +892,10 @@ uint64_t ColumnFamilyData::GetTotalSstFilesSize() const { return VersionSet::GetTotalSstFilesSize(dummy_versions_); } +uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { + return current_->GetSstFilesSize(); +} + MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, @@ -823,16 +932,70 @@ bool ColumnFamilyData::RangeOverlapWithCompaction( smallest_user_key, largest_user_key, level); } +Status ColumnFamilyData::RangesOverlapWithMemtables( + const autovector<Range>& ranges, SuperVersion* super_version, + bool* overlap) { + assert(overlap != nullptr); + *overlap = false; + // Create an InternalIterator over all unflushed memtables + Arena arena; + ReadOptions read_opts; + read_opts.total_order_seek = true; + MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(read_opts, &arena)); + super_version->imm->AddIterators(read_opts, &merge_iter_builder); + ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); + + auto read_seq = super_version->current->version_set()->LastSequence(); + ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq); + auto* active_range_del_iter = + super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); + range_del_agg.AddTombstones( + std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter)); + super_version->imm->AddRangeTombstoneIterators(read_opts, nullptr /* arena */, + &range_del_agg); + + Status status; + for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { + auto* vstorage = super_version->current->storage_info(); + auto* ucmp = vstorage->InternalComparator()->user_comparator(); + InternalKey range_start(ranges[i].start, kMaxSequenceNumber, + kValueTypeForSeek); + memtable_iter->Seek(range_start.Encode()); + status = memtable_iter->status(); + ParsedInternalKey seek_result; + if (status.ok()) { + if (memtable_iter->Valid() && + !ParseInternalKey(memtable_iter->key(), &seek_result)) { + status = Status::Corruption("DB have corrupted keys"); + } + } + if (status.ok()) { + if (memtable_iter->Valid() && + ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { + *overlap = true; + } else if (range_del_agg.IsRangeOverlapped(ranges[i].start, + ranges[i].limit)) { + *overlap = true; + } + } + } + return status; +} + const int ColumnFamilyData::kCompactAllLevels = -1;
const int ColumnFamilyData::kCompactToBaseLevel = -2; Compaction* ColumnFamilyData::CompactRange( const MutableCFOptions& mutable_cf_options, int input_level, - int output_level, uint32_t output_path_id, const InternalKey* begin, - const InternalKey* end, InternalKey** compaction_end, bool* conflict) { + int output_level, uint32_t output_path_id, uint32_t max_subcompactions, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* conflict) { auto* result = compaction_picker_->CompactRange( GetName(), mutable_cf_options, current_->storage_info(), input_level, - output_level, output_path_id, begin, end, compaction_end, conflict); + output_level, output_path_id, max_subcompactions, begin, end, + compaction_end, conflict); if (result != nullptr) { result->SetInputVersion(current_); } @@ -841,10 +1004,13 @@ Compaction* ColumnFamilyData::CompactRange( SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( InstrumentedMutex* db_mutex) { - SuperVersion* sv = nullptr; - sv = GetThreadLocalSuperVersion(db_mutex); + SuperVersion* sv = GetThreadLocalSuperVersion(db_mutex); sv->Ref(); if (!ReturnThreadLocalSuperVersion(sv)) { + // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion() + // when the thread-local pointer was populated. So, the Ref() earlier in + // this function still prevents the returned SuperVersion* from being + // deleted out from under the caller. sv->Unref(); } return sv; @@ -852,7 +1018,6 @@ SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( InstrumentedMutex* db_mutex) { - SuperVersion* sv = nullptr; // The SuperVersion is cached in thread local storage to avoid acquiring // mutex when SuperVersion does not change since the last use. When a new // SuperVersion is installed, the compaction or flush thread cleans up @@ -871,7 +1036,7 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( // should only keep kSVInUse before ReturnThreadLocalSuperVersion call // (if no Scrape happens). 
assert(ptr != SuperVersion::kSVInUse); - sv = static_cast<SuperVersion*>(ptr); + SuperVersion* sv = static_cast<SuperVersion*>(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); @@ -914,15 +1079,16 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } -SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, InstrumentedMutex* db_mutex) { +void ColumnFamilyData::InstallSuperVersion( + SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); - return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); + return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_); } -SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, InstrumentedMutex* db_mutex, +void ColumnFamilyData::InstallSuperVersion( + SuperVersionContext* sv_context, InstrumentedMutex* db_mutex, const MutableCFOptions& mutable_cf_options) { + SuperVersion* new_superversion = sv_context->new_superversion.release(); new_superversion->db_mutex = db_mutex; new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(mem_, imm_.current(), current_); @@ -930,23 +1096,31 @@ super_version_ = new_superversion; ++super_version_number_; super_version_->version_number = super_version_number_; + super_version_->write_stall_condition = + RecalculateWriteStallConditions(mutable_cf_options); + if (old_superversion != nullptr) { + // Reset SuperVersions cached in thread local storage. + // This should be done before old_superversion->Unref(). That's to ensure + // that local_sv_ never holds the last reference to SuperVersion, since + // it has no means to safely do SuperVersion cleanup. + ResetThreadLocalSuperVersions(); + if (old_superversion->mutable_cf_options.write_buffer_size != mutable_cf_options.write_buffer_size) { mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size); } + if (old_superversion->write_stall_condition != + new_superversion->write_stall_condition) { + sv_context->PushWriteStallNotification( + old_superversion->write_stall_condition, + new_superversion->write_stall_condition, GetName(), ioptions()); + } + if (old_superversion->Unref()) { + old_superversion->Cleanup(); + sv_context->superversions_to_free.push_back(old_superversion); + } } - - // Reset SuperVersions cached in thread local storage - ResetThreadLocalSuperVersions(); - - RecalculateWriteStallConditions(mutable_cf_options); - - if (old_superversion != nullptr && old_superversion->Unref()) { - old_superversion->Cleanup(); - return old_superversion; // will let caller delete outside of mutex - } - return nullptr; } void ColumnFamilyData::ResetThreadLocalSuperVersions() { @@ -958,10 +1132,12 @@ continue; } auto sv = static_cast<SuperVersion*>(ptr); - if (sv->Unref()) { - sv->Cleanup(); - delete sv; - } + bool was_last_ref __attribute__((__unused__)); + was_last_ref = sv->Unref(); + // sv couldn't have been the last reference because + // ResetThreadLocalSuperVersions() is called before + // unref'ing super_version_.
+ assert(!was_last_ref); } } @@ -969,8 +1145,9 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { Status ColumnFamilyData::SetOptions( const std::unordered_map<std::string, std::string>& options_map) { MutableCFOptions new_mutable_cf_options; - Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, - &new_mutable_cf_options); + Status s = + GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + ioptions_.info_log, &new_mutable_cf_options); if (s.ok()) { mutable_cf_options_ = new_mutable_cf_options; mutable_cf_options_.RefreshDerivedOptions(ioptions_); @@ -979,6 +1156,49 @@ Status ColumnFamilyData::SetOptions( } #endif // ROCKSDB_LITE +// REQUIRES: DB mutex held +Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { + if (initial_cf_options_.compaction_style != kCompactionStyleLevel) { + return Env::WLTH_NOT_SET; + } + if (level == 0) { + return Env::WLTH_MEDIUM; + } + int base_level = current_->storage_info()->base_level(); + + // L1: medium, L2: long, ... + if (level - base_level >= 2) { + return Env::WLTH_EXTREME; + } + return static_cast<Env::WriteLifeTimeHint>(level - base_level + + static_cast<int>(Env::WLTH_MEDIUM)); +} + +Status ColumnFamilyData::AddDirectories() { + Status s; + assert(data_dirs_.empty()); + for (auto& p : ioptions_.cf_paths) { + std::unique_ptr<Directory> path_directory; + s = DBImpl::CreateAndNewDirectory(ioptions_.env, p.path, &path_directory); + if (!s.ok()) { + return s; + } + assert(path_directory != nullptr); + data_dirs_.emplace_back(path_directory.release()); + } + assert(data_dirs_.size() == ioptions_.cf_paths.size()); + return s; +} + +Directory* ColumnFamilyData::GetDataDir(size_t path_id) const { + if (data_dirs_.empty()) { + return nullptr; + } + + assert(path_id < data_dirs_.size()); + return data_dirs_[path_id].get(); +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, @@ -1005,10 +1225,14 @@ ColumnFamilySet::~ColumnFamilySet() { while (column_family_data_.size() > 0) { // cfd destructor will delete itself from column_family_data_ auto cfd = column_family_data_.begin()->second; - cfd->Unref(); + bool last_ref __attribute__((__unused__)); + last_ref = cfd->Unref(); + assert(last_ref); delete cfd; } - dummy_cfd_->Unref(); + bool dummy_last_ref __attribute__((__unused__)); + dummy_last_ref = dummy_cfd_->Unref(); + assert(dummy_last_ref); delete dummy_cfd_; }
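CalculateSSTWriteHint above maps LSM levels to Env write-life hints so the filesystem can group SSTs by expected lifetime. A standalone restatement of just the arithmetic, with the hint enum mirrored locally so the mapping can be sanity-checked in isolation (a hedged sketch, not library code):

  #include <cassert>
  enum WriteLifeTimeHint { WLTH_NOT_SET = 0, WLTH_NONE, WLTH_SHORT,
                           WLTH_MEDIUM, WLTH_LONG, WLTH_EXTREME };
  static WriteLifeTimeHint HintFor(int level, int base_level) {
    if (level == 0) return WLTH_MEDIUM;                // L0 flushes
    if (level - base_level >= 2) return WLTH_EXTREME;  // deep, long-lived levels
    return static_cast<WriteLifeTimeHint>(level - base_level +
                                          static_cast<int>(WLTH_MEDIUM));
  }
  int main() {
    assert(HintFor(1, 1) == WLTH_MEDIUM);   // base level
    assert(HintFor(2, 1) == WLTH_LONG);     // base + 1
    assert(HintFor(3, 1) == WLTH_EXTREME);  // base + 2 and deeper
    return 0;
  }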
diff --git a/thirdparty/rocksdb/db/column_family.h b/thirdparty/rocksdb/db/column_family.h index 3a807d22b9..5ed20604ac 100644 --- a/thirdparty/rocksdb/db/column_family.h +++ b/thirdparty/rocksdb/db/column_family.h @@ -30,6 +30,7 @@ namespace rocksdb { class Version; class VersionSet; +class VersionStorageInfo; class MemTable; class MemTableListVersion; class CompactionPicker; @@ -41,6 +42,7 @@ class DBImpl; class LogBuffer; class InstrumentedMutex; class InstrumentedMutexLock; +struct SuperVersionContext; extern const double kIncSlowdownRatio; @@ -76,7 +78,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle { class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { public: ColumnFamilyHandleInternal() - : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {} + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), internal_cfd_(nullptr) {} void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } @@ -95,6 +97,7 @@ struct SuperVersion { MutableCFOptions mutable_cf_options; // Version number of the current SuperVersion uint64_t version_number; + WriteStallCondition write_stall_condition; InstrumentedMutex* db_mutex; @@ -136,6 +139,9 @@ extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options); extern Status CheckConcurrentWritesSupported( const ColumnFamilyOptions& cf_options); +extern Status CheckCFPathsSupported(const DBOptions& db_options, + const ColumnFamilyOptions& cf_options); + extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src); // Wrap user defined table proproties collector factories `from cf_options` @@ -192,7 +198,7 @@ class ColumnFamilyData { // *) delete all memory associated with that column family // *) delete all the files associated with that column family void SetDropped(); - bool IsDropped() const { return dropped_; } + bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); } // thread-safe int NumberLevels() const { return ioptions_.num_levels; } @@ -200,6 +206,10 @@ void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } + void SetFlushReason(FlushReason flush_reason) { + flush_reason_ = flush_reason; + } + FlushReason GetFlushReason() const { return flush_reason_; } // thread-safe const EnvOptions* soptions() const; const ImmutableCFOptions* ioptions() const { return &ioptions_; } @@ -237,7 +247,12 @@ void SetCurrent(Version* _current); uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held - void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } + uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held + void SetMemtable(MemTable* new_mem) { + uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; + new_mem->SetID(memtable_id); + mem_ = new_mem; + } // calculate the oldest log needed for the durability of this column family uint64_t OldestLogToKeep(); @@ -263,17 +278,27 @@ const Slice& largest_user_key, int level) const; + // Check if the passed ranges overlap with any unflushed memtables + // (immutable or mutable). + // + // @param super_version A referenced SuperVersion that will be held for the + // duration of this function. + // + // Thread-safe + Status RangesOverlapWithMemtables(const autovector<Range>& ranges, + SuperVersion* super_version, bool* overlap); + // A flag to tell a manual compaction is to compact all levels together - // instad of for specific level. + // instead of a specific level. static const int kCompactAllLevels; // A flag to tell a manual compaction's output is base level. static const int kCompactToBaseLevel; // REQUIRES: DB mutex held Compaction* CompactRange(const MutableCFOptions& mutable_cf_options, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, - const InternalKey* end, InternalKey** compaction_end, - bool* manual_conflict); + uint32_t output_path_id, uint32_t max_subcompactions, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict); CompactionPicker* compaction_picker() { return compaction_picker_.get(); } // thread-safe @@ -311,32 +336,55 @@ // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex.
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion() - SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - InstrumentedMutex* db_mutex, - const MutableCFOptions& mutable_cf_options); - SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - InstrumentedMutex* db_mutex); + void InstallSuperVersion(SuperVersionContext* sv_context, + InstrumentedMutex* db_mutex, + const MutableCFOptions& mutable_cf_options); + void InstallSuperVersion(SuperVersionContext* sv_context, + InstrumentedMutex* db_mutex); void ResetThreadLocalSuperVersions(); // Protected by DB mutex - void set_pending_flush(bool value) { pending_flush_ = value; } - void set_pending_compaction(bool value) { pending_compaction_ = value; } - bool pending_flush() { return pending_flush_; } - bool pending_compaction() { return pending_compaction_; } + void set_queued_for_flush(bool value) { queued_for_flush_ = value; } + void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; } + bool queued_for_flush() { return queued_for_flush_; } + bool queued_for_compaction() { return queued_for_compaction_; } + + enum class WriteStallCause { + kNone, + kMemtableLimit, + kL0FileCountLimit, + kPendingCompactionBytes, + }; + static std::pair<WriteStallCondition, WriteStallCause> + GetWriteStallConditionAndCause(int num_unflushed_memtables, int num_l0_files, + uint64_t num_compaction_needed_bytes, + const MutableCFOptions& mutable_cf_options); // Recalculate some small conditions, which are changed only during // compaction, adding new memtable and/or // recalculation of compaction score. These values are used in // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall - void RecalculateWriteStallConditions( + WriteStallCondition RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options); void set_initialized() { initialized_.store(true); } bool initialized() const { return initialized_.load(); } + const ColumnFamilyOptions& initial_cf_options() { + return initial_cf_options_; + } + + Env::WriteLifeTimeHint CalculateSSTWriteHint(int level); + + Status AddDirectories(); + + Directory* GetDataDir(size_t path_id) const; + + ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, @@ -354,7 +402,7 @@ std::atomic<int> refs_; // outstanding references to ColumnFamilyData std::atomic<bool> initialized_; - bool dropped_; // true if client dropped it + std::atomic<bool> dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; std::vector<std::unique_ptr<IntTblPropCollectorFactory>> @@ -396,6 +444,8 @@ // recovered from uint64_t log_number_; + std::atomic<FlushReason> flush_reason_; + // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr<CompactionPicker> compaction_picker_; @@ -405,16 +455,22 @@ std::unique_ptr<WriteControllerToken> write_controller_token_; // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ - bool pending_flush_; + bool queued_for_flush_; // If true --> this ColumnFamily is currently present in // DBImpl::compaction_queue_ - bool pending_compaction_; + bool queued_for_compaction_; uint64_t prev_compaction_needed_bytes_; // if the database was opened with 2pc enabled bool allow_2pc_; + + // Memtable id to track flush. + std::atomic<uint64_t> last_memtable_id_; + + // Directories corresponding to cf_paths. + std::vector<std::unique_ptr<Directory>> data_dirs_; }; // ColumnFamilySet has interesting thread-safety requirements
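The data_dirs_/cf_paths machinery declared above lets each column family keep its SSTs in its own set of directories, falling back to db_paths when cf_paths is empty (see SanitizeOptions in column_family.cc). A hedged configuration sketch; the paths and the 64 GB target are invented placeholders, and per CheckCFPathsSupported more than one path requires level or universal compaction:

  #include "rocksdb/options.h"
  rocksdb::ColumnFamilyOptions MakeTieredCfOptions() {
    rocksdb::ColumnFamilyOptions cf_opts;
    cf_opts.compaction_style = rocksdb::kCompactionStyleLevel;
    // Fill the fast path up to roughly its target size, then spill over.
    cf_opts.cf_paths.emplace_back("/mnt/fast_ssd/hot", 64ull << 30);
    cf_opts.cf_paths.emplace_back("/mnt/bulk_hdd/cold", 0 /* unbounded tail */);
    return cf_opts;
  }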
diff --git a/thirdparty/rocksdb/db/column_family_test.cc b/thirdparty/rocksdb/db/column_family_test.cc index 88786d469d..bdc832bd23 100644 --- a/thirdparty/rocksdb/db/column_family_test.cc +++ b/thirdparty/rocksdb/db/column_family_test.cc @@ -14,6 +14,7 @@ #include "db/db_impl.h" #include "db/db_test_util.h" +#include "memtable/hash_skiplist_rep.h" #include "options/options_parser.h" #include "port/port.h" #include "rocksdb/db.h" @@ -47,7 +48,7 @@ class EnvCounter : public EnvWrapper { int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; } - Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r, + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, const EnvOptions& soptions) override { ++num_new_writable_file_; return EnvWrapper::NewWritableFile(f, r, soptions); @@ -57,24 +58,36 @@ std::atomic<int> num_new_writable_file_; }; -class ColumnFamilyTest : public testing::Test { +class ColumnFamilyTestBase : public testing::Test { public: - ColumnFamilyTest() : rnd_(139) { + ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) { env_ = new EnvCounter(Env::Default()); - dbname_ = test::TmpDir() + "/column_family_test"; + dbname_ = test::PerThreadDBPath("column_family_test"); db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; DestroyDB(dbname_, Options(db_options_, column_family_options_)); } - ~ColumnFamilyTest() { + ~ColumnFamilyTestBase() override { + std::vector<ColumnFamilyDescriptor> column_families; + for (auto h : handles_) { + ColumnFamilyDescriptor cfdescriptor; + h->GetDescriptor(&cfdescriptor); + column_families.push_back(cfdescriptor); + } Close(); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - Destroy(); + Destroy(column_families); delete env_; } + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions options; + options.format_version = format_; + return options; + } + // Return the value to associate with the specified key Slice Value(int k, std::string* storage) { if (k == 0) { @@ -236,9 +249,11 @@ #endif // !ROCKSDB_LITE } - void Destroy() { + void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families = + std::vector<ColumnFamilyDescriptor>()) { Close(); - ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); + ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_), + column_families)); } void CreateColumnFamilies( @@ -291,6 +306,9 @@ } void PutRandomData(int cf, int num, int key_value_size, bool save = false) { + if (cf >= static_cast<int>(keys_.size())) { + keys_.resize(cf + 1); + } for (int i = 0; i < num; ++i) { // 10 bytes for key, rest is value if (!save) { @@ -298,7 +316,7 @@ RandomString(&rnd_, key_value_size - 10))); } else { std::string key = test::RandomKey(&rnd_, 11); - keys_.insert(key); + keys_[cf].insert(key); ASSERT_OK(Put(cf, key, RandomString(&rnd_, key_value_size - 10))); } } @@ -383,6 +401,9 @@ void AssertFilesPerLevel(const std::string& value, int cf) { #ifndef ROCKSDB_LITE ASSERT_EQ(value, FilesPerLevel(cf)); +#else + (void) value; + (void) cf; #endif } @@ -397,6 +418,8 @@ void AssertCountLiveFiles(int expected_value) { #ifndef ROCKSDB_LITE ASSERT_EQ(expected_value,
CountLiveFiles()); +#else + (void) expected_value; #endif } @@ -445,6 +468,8 @@ class ColumnFamilyTest : public testing::Test { void AssertCountLiveLogFiles(int value) { #ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported ASSERT_EQ(value, CountLiveLogFiles()); +#else + (void) value; #endif // !ROCKSDB_LITE } @@ -462,9 +487,9 @@ class ColumnFamilyTest : public testing::Test { void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0) { const EnvOptions soptions; - unique_ptr srcfile; + std::unique_ptr srcfile; ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); - unique_ptr destfile; + std::unique_ptr destfile; ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); if (size == 0) { @@ -483,18 +508,47 @@ class ColumnFamilyTest : public testing::Test { ASSERT_OK(destfile->Close()); } + int GetSstFileCount(std::string path) { + std::vector files; + DBTestBase::GetSstFiles(env_, path, &files); + return static_cast(files.size()); + } + + void RecalculateWriteStallConditions(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options) { + // add lock to avoid race condition between + // `RecalculateWriteStallConditions` which writes to CFStats and + // background `DBImpl::DumpStats()` threads which read CFStats + dbfull()->TEST_LockMutex(); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + dbfull()-> TEST_UnlockMutex(); + } + std::vector handles_; std::vector names_; - std::set keys_; + std::vector> keys_; ColumnFamilyOptions column_family_options_; DBOptions db_options_; std::string dbname_; DB* db_ = nullptr; EnvCounter* env_; Random rnd_; + uint32_t format_; }; -TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) { +class ColumnFamilyTest + : public ColumnFamilyTestBase, + virtual public ::testing::WithParamInterface { + public: + ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {} +}; + +INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { for (int iter = 0; iter < 3; ++iter) { Open(); CreateColumnFamilies({"one", "two", "three"}); @@ -513,7 +567,8 @@ TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) { Reopen(); } CreateColumnFamilies({"three2"}); - // ID 3 that was used for dropped column family "three" should not be reused + // ID 3 that was used for dropped column family "three" should not be + // reused auto cfh3 = reinterpret_cast(handles_[3]); ASSERT_EQ(4U, cfh3->GetID()); Close(); @@ -522,7 +577,7 @@ TEST_F(ColumnFamilyTest, DontReuseColumnFamilyID) { } #ifndef ROCKSDB_LITE -TEST_F(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { +TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { Open(); rocksdb::SyncPoint::GetInstance()->LoadDependency( @@ -545,10 +600,13 @@ TEST_F(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { } #endif // !ROCKSDB_LITE -class FlushEmptyCFTestWithParam : public ColumnFamilyTest, - public testing::WithParamInterface { +class FlushEmptyCFTestWithParam + : public ColumnFamilyTestBase, + virtual public testing::WithParamInterface> { public: - FlushEmptyCFTestWithParam() { allow_2pc_ = GetParam(); } + FlushEmptyCFTestWithParam() + : ColumnFamilyTestBase(std::get<0>(GetParam())), + allow_2pc_(std::get<1>(GetParam())) {} // Required if inheriting from testing::WithParamInterface<> static void SetUpTestCase() {} @@ -673,10 +731,16 @@ TEST_P(FlushEmptyCFTestWithParam, 
FlushEmptyCFTest2) { db_options_.env = env_; } -INSTANTIATE_TEST_CASE_P(FlushEmptyCFTestWithParam, FlushEmptyCFTestWithParam, - ::testing::Bool()); +INSTANTIATE_TEST_CASE_P( + FormatDef, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(test::kDefaultFormatVersion, true), + std::make_tuple(test::kDefaultFormatVersion, false))); +INSTANTIATE_TEST_CASE_P( + FormatLatest, FlushEmptyCFTestWithParam, + testing::Values(std::make_tuple(test::kLatestFormatVersion, true), + std::make_tuple(test::kLatestFormatVersion, false))); -TEST_F(ColumnFamilyTest, AddDrop) { +TEST_P(ColumnFamilyTest, AddDrop) { Open(); CreateColumnFamilies({"one", "two", "three"}); ASSERT_EQ("NOT_FOUND", Get(1, "fodor")); @@ -702,7 +766,7 @@ TEST_F(ColumnFamilyTest, AddDrop) { std::vector({"default", "four", "three"})); } -TEST_F(ColumnFamilyTest, BulkAddDrop) { +TEST_P(ColumnFamilyTest, BulkAddDrop) { constexpr int kNumCF = 1000; ColumnFamilyOptions cf_options; WriteOptions write_options; @@ -740,7 +804,7 @@ TEST_F(ColumnFamilyTest, BulkAddDrop) { ASSERT_TRUE(families == std::vector({"default"})); } -TEST_F(ColumnFamilyTest, DropTest) { +TEST_P(ColumnFamilyTest, DropTest) { // first iteration - dont reopen DB before dropping // second iteration - reopen DB before dropping for (int iter = 0; iter < 2; ++iter) { @@ -764,7 +828,7 @@ TEST_F(ColumnFamilyTest, DropTest) { } } -TEST_F(ColumnFamilyTest, WriteBatchFailure) { +TEST_P(ColumnFamilyTest, WriteBatchFailure) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; @@ -782,7 +846,7 @@ TEST_F(ColumnFamilyTest, WriteBatchFailure) { Close(); } -TEST_F(ColumnFamilyTest, ReadWrite) { +TEST_P(ColumnFamilyTest, ReadWrite) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); ASSERT_OK(Put(0, "foo", "v1")); @@ -806,7 +870,7 @@ TEST_F(ColumnFamilyTest, ReadWrite) { Close(); } -TEST_F(ColumnFamilyTest, IgnoreRecoveredLog) { +TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { std::string backup_logs = dbname_ + "/backup_logs"; // delete old files in backup_logs directory @@ -882,7 +946,7 @@ TEST_F(ColumnFamilyTest, IgnoreRecoveredLog) { } #ifndef ROCKSDB_LITE // TEST functions used are not supported -TEST_F(ColumnFamilyTest, FlushTest) { +TEST_P(ColumnFamilyTest, FlushTest) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); ASSERT_OK(Put(0, "foo", "v1")); @@ -930,65 +994,65 @@ TEST_F(ColumnFamilyTest, FlushTest) { } // Makes sure that obsolete log files get deleted -TEST_F(ColumnFamilyTest, LogDeletionTest) { +TEST_P(ColumnFamilyTest, LogDeletionTest) { db_options_.max_total_wal_size = std::numeric_limits::max(); column_family_options_.arena_block_size = 4 * 1024; - column_family_options_.write_buffer_size = 100000; // 100KB + column_family_options_.write_buffer_size = 128000; // 128KB Open(); CreateColumnFamilies({"one", "two", "three", "four"}); // Each bracket is one log file. 
if number is in (), it means // we don't need it anymore (it's been flushed) // [] AssertCountLiveLogFiles(0); - PutRandomData(0, 1, 100); + PutRandomData(0, 1, 128); // [0] - PutRandomData(1, 1, 100); + PutRandomData(1, 1, 128); // [0, 1] - PutRandomData(1, 1000, 100); + PutRandomData(1, 1000, 128); WaitForFlush(1); // [0, (1)] [1] AssertCountLiveLogFiles(2); - PutRandomData(0, 1, 100); + PutRandomData(0, 1, 128); // [0, (1)] [0, 1] AssertCountLiveLogFiles(2); - PutRandomData(2, 1, 100); + PutRandomData(2, 1, 128); // [0, (1)] [0, 1, 2] - PutRandomData(2, 1000, 100); + PutRandomData(2, 1000, 128); WaitForFlush(2); // [0, (1)] [0, 1, (2)] [2] AssertCountLiveLogFiles(3); - PutRandomData(2, 1000, 100); + PutRandomData(2, 1000, 128); WaitForFlush(2); // [0, (1)] [0, 1, (2)] [(2)] [2] AssertCountLiveLogFiles(4); - PutRandomData(3, 1, 100); + PutRandomData(3, 1, 128); // [0, (1)] [0, 1, (2)] [(2)] [2, 3] - PutRandomData(1, 1, 100); + PutRandomData(1, 1, 128); // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3] AssertCountLiveLogFiles(4); - PutRandomData(1, 1000, 100); + PutRandomData(1, 1000, 128); WaitForFlush(1); // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1] AssertCountLiveLogFiles(5); - PutRandomData(0, 1000, 100); + PutRandomData(0, 1000, 128); WaitForFlush(0); // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0] // delete obsolete logs --> // [(1), 2, 3] [1, (0)] [0] AssertCountLiveLogFiles(3); - PutRandomData(0, 1000, 100); + PutRandomData(0, 1000, 128); WaitForFlush(0); // [(1), 2, 3] [1, (0)], [(0)] [0] AssertCountLiveLogFiles(4); - PutRandomData(1, 1000, 100); + PutRandomData(1, 1000, 128); WaitForFlush(1); // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1] AssertCountLiveLogFiles(5); - PutRandomData(2, 1000, 100); + PutRandomData(2, 1000, 128); WaitForFlush(2); // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2] AssertCountLiveLogFiles(6); - PutRandomData(3, 1000, 100); + PutRandomData(3, 1000, 128); WaitForFlush(3); // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3] // delete obsolete logs --> @@ -998,7 +1062,7 @@ TEST_F(ColumnFamilyTest, LogDeletionTest) { } #endif // !ROCKSDB_LITE -TEST_F(ColumnFamilyTest, CrashAfterFlush) { +TEST_P(ColumnFamilyTest, CrashAfterFlush) { std::unique_ptr fault_env( new FaultInjectionTestEnv(env_)); db_options_.env = fault_env.get(); @@ -1030,7 +1094,7 @@ TEST_F(ColumnFamilyTest, CrashAfterFlush) { db_options_.env = env_; } -TEST_F(ColumnFamilyTest, OpenNonexistentColumnFamily) { +TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) { ASSERT_OK(TryOpen({"default"})); Close(); ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument()); @@ -1038,7 +1102,7 @@ TEST_F(ColumnFamilyTest, OpenNonexistentColumnFamily) { #ifndef ROCKSDB_LITE // WaitForFlush() is not supported // Makes sure that obsolete log files get deleted -TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { +TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { // disable flushing stale column families db_options_.max_total_wal_size = std::numeric_limits::max(); Open(); @@ -1143,45 +1207,49 @@ TEST_F(ColumnFamilyTest, DifferentWriteBufferSizes) { } #endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // Cuckoo is not supported in lite -TEST_F(ColumnFamilyTest, MemtableNotSupportSnapshot) { - db_options_.allow_concurrent_memtable_write = false; - Open(); - auto* s1 = dbfull()->GetSnapshot(); - ASSERT_TRUE(s1 != nullptr); - dbfull()->ReleaseSnapshot(s1); - - // Add a column family that doesn't support snapshot - ColumnFamilyOptions first; - 
first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024));
-  CreateColumnFamilies({"first"}, {first});
-  auto* s2 = dbfull()->GetSnapshot();
-  ASSERT_TRUE(s2 == nullptr);
-
-  // Add a column family that supports snapshot. Snapshot stays not supported.
-  ColumnFamilyOptions second;
-  CreateColumnFamilies({"second"}, {second});
-  auto* s3 = dbfull()->GetSnapshot();
-  ASSERT_TRUE(s3 == nullptr);
-  Close();
-}
-#endif  // !ROCKSDB_LITE
+// The test is commented out because we want to verify that a snapshot is
+// not created for memtables that do not support it, but there isn't a
+// memtable without snapshot support right now. If we have one later, we
+// can re-enable the test.
+//
+// #ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+//   db_options_.allow_concurrent_memtable_write = false;
+//   Open();
+//   auto* s1 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s1 != nullptr);
+//   dbfull()->ReleaseSnapshot(s1);
+
+//   // Add a column family that doesn't support snapshot
+//   ColumnFamilyOptions first;
+//   first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+//   CreateColumnFamilies({"first"}, {first});
+//   auto* s2 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s2 == nullptr);
+
+//   // Add a column family that supports snapshot. Snapshot stays not
+//   // supported.
+//   ColumnFamilyOptions second;
+//   CreateColumnFamilies({"second"}, {second});
+//   auto* s3 = dbfull()->GetSnapshot();
+//   ASSERT_TRUE(s3 == nullptr);
+//   Close();
+// }
+// #endif  // !ROCKSDB_LITE
 
 class TestComparator : public Comparator {
-  int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const override {
+  int Compare(const rocksdb::Slice& /*a*/,
+              const rocksdb::Slice& /*b*/) const override {
     return 0;
   }
   const char* Name() const override { return "Test"; }
-  void FindShortestSeparator(std::string* start,
-                             const rocksdb::Slice& limit) const override {}
-  void FindShortSuccessor(std::string* key) const override {}
+  void FindShortestSeparator(std::string* /*start*/,
+                             const rocksdb::Slice& /*limit*/) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
 };
 
 static TestComparator third_comparator;
 static TestComparator fourth_comparator;
 
 // Test that we can retrieve the comparator from a created CF
-TEST_F(ColumnFamilyTest, GetComparator) {
+TEST_P(ColumnFamilyTest, GetComparator) {
   Open();
   // Add a column family with no comparator specified
   CreateColumnFamilies({"first"});
@@ -1200,7 +1268,7 @@
   Close();
 }
 
-TEST_F(ColumnFamilyTest, DifferentMergeOperators) {
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
   Open();
   CreateColumnFamilies({"first", "second"});
   ColumnFamilyOptions default_cf, first, second;
@@ -1231,7 +1299,7 @@
 }
 
 #ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
-TEST_F(ColumnFamilyTest, DifferentCompactionStyles) {
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
   Open();
   CreateColumnFamilies({"one", "two"});
   ColumnFamilyOptions default_cf, one, two;
@@ -1243,7 +1311,7 @@
   default_cf.target_file_size_base = 30 << 10;
   default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
 
-  BlockBasedTableOptions table_options;
+  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
   table_options.no_block_cache = true;
   default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
@@ -1303,7 +1371,7 @@ TEST_F(ColumnFamilyTest,
DifferentCompactionStyles) { #ifndef ROCKSDB_LITE // Sync points not supported in RocksDB Lite -TEST_F(ColumnFamilyTest, MultipleManualCompactions) { +TEST_P(ColumnFamilyTest, MultipleManualCompactions) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; @@ -1315,7 +1383,7 @@ TEST_F(ColumnFamilyTest, MultipleManualCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1346,7 +1414,7 @@ TEST_F(ColumnFamilyTest, MultipleManualCompactions) { {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"}, {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4"); cf_1_1 = false; @@ -1392,15 +1460,17 @@ TEST_F(ColumnFamilyTest, MultipleManualCompactions) { CompactAll(2); AssertFilesPerLevel("0,1", 2); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); Close(); } -TEST_F(ColumnFamilyTest, AutomaticAndManualCompactions) { +TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; @@ -1412,7 +1482,8 @@ TEST_F(ColumnFamilyTest, AutomaticAndManualCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1439,7 +1510,7 @@ TEST_F(ColumnFamilyTest, AutomaticAndManualCompactions) { {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"}, {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { cf_1_1 = false; TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4"); @@ -1486,14 +1557,16 @@ TEST_F(ColumnFamilyTest, AutomaticAndManualCompactions) { CompactAll(2); AssertFilesPerLevel("0,1", 2); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(ColumnFamilyTest, ManualAndAutomaticCompactions) { +TEST_P(ColumnFamilyTest, 
ManualAndAutomaticCompactions) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; @@ -1505,7 +1578,8 @@ TEST_F(ColumnFamilyTest, ManualAndAutomaticCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1540,7 +1614,7 @@ TEST_F(ColumnFamilyTest, ManualAndAutomaticCompactions) { {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"}, {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); cf_1_1 = false; @@ -1582,14 +1656,16 @@ TEST_F(ColumnFamilyTest, ManualAndAutomaticCompactions) { CompactAll(2); AssertFilesPerLevel("0,1", 2); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(ColumnFamilyTest, SameCFManualManualCompactions) { +TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) { Open(); CreateColumnFamilies({"one"}); ColumnFamilyOptions default_cf, one; @@ -1601,7 +1677,8 @@ TEST_F(ColumnFamilyTest, SameCFManualManualCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1633,7 +1710,7 @@ TEST_F(ColumnFamilyTest, SameCFManualManualCompactions) { {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4"); cf_1_1 = false; @@ -1681,14 +1758,16 @@ TEST_F(ColumnFamilyTest, SameCFManualManualCompactions) { ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactions) { +TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) { Open(); CreateColumnFamilies({"one"}); ColumnFamilyOptions default_cf, one; @@ -1700,7 +1779,8 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB 
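  // (Editor's note: 64 << 10 is 64 * 1024 = 65536 bytes, i.e. 64KB, and the
  // target_file_size_base below, 30 << 10, is 30KB; the buffers are kept this
  // small so the flushes and compactions this test choreographs trigger
  // quickly.)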
default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1731,7 +1811,7 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactions) { {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"}, {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); cf_1_1 = false; @@ -1771,14 +1851,16 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactions) { ASSERT_LE(NumTableFilesAtLevel(0, 1), 2); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } -TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { +TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { Open(); CreateColumnFamilies({"one"}); ColumnFamilyOptions default_cf, one; @@ -1790,7 +1872,8 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1823,7 +1906,7 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { "ColumnFamilyTest::ManualAuto:3"}, {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4"); cf_1_1 = false; @@ -1861,131 +1944,13 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) { AssertFilesPerLevel("0,1", 1); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { - ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); - key_iter++; - } -} - -// This test checks for automatic getting a conflict if there is a -// manual which has not yet been scheduled. -// The manual compaction waits in NotScheduled -// We generate more files and then trigger an automatic compaction -// This will wait because there is an unscheduled manual compaction. -// Once the conflict is hit, the manual compaction starts and ends -// Then another automatic will start and end. 
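(Editor's aside: the conflict test removed below, like the surviving compaction tests in this file, sequences its threads with RocksDB's test-only SyncPoint facility. A minimal sketch of the pattern, assuming the "util/sync_point.h" include path of this vendored RocksDB; the marker names "Test:First" and "Test:Second" are hypothetical, and only the SyncPoint/TEST_SYNC_POINT calls are real API:)

#include "util/sync_point.h"

void OrderTwoThreads() {
  // Every TEST_SYNC_POINT("Test:Second") now blocks until some thread has
  // passed TEST_SYNC_POINT("Test:First").
  rocksdb::SyncPoint::GetInstance()->LoadDependency(
      {{"Test:First", "Test:Second"}});
  // Callbacks run inside whichever thread hits the named point, e.g. a
  // background compaction thread.
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
      [](void* /*arg*/) { TEST_SYNC_POINT("Test:First"); });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
  // ... drive the database; TEST_SYNC_POINT("Test:Second") in the test
  // thread now waits for the compaction callback above ...
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
}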
-TEST_F(ColumnFamilyTest, SameCFManualAutomaticConflict) { - Open(); - CreateColumnFamilies({"one"}); - ColumnFamilyOptions default_cf, one; - db_options_.max_open_files = 20; // only 10 files in file cache - db_options_.max_background_compactions = 3; - - default_cf.compaction_style = kCompactionStyleLevel; - default_cf.num_levels = 3; - default_cf.write_buffer_size = 64 << 10; // 64KB - default_cf.target_file_size_base = 30 << 10; - default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - one.compaction_style = kCompactionStyleUniversal; - - one.num_levels = 1; - // trigger compaction if there are >= 4 files - one.level0_file_num_compaction_trigger = 4; - one.write_buffer_size = 120000; - - Reopen({default_cf, one}); - // make sure all background compaction jobs can be scheduled - auto stop_token = - dbfull()->TEST_write_controler().GetCompactionPressureToken(); - - // SETUP column family "one" -- universal style - for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) { - PutRandomData(1, 10, 12000, true); - PutRandomData(1, 1, 10, true); - WaitForFlush(1); - AssertFilesPerLevel(ToString(i + 1), 1); - } - bool cf_1_1 = true; - bool cf_1_2 = true; - rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCompaction()::Conflict", - "ColumnFamilyTest::ManualAutoCon:7"}, - {"ColumnFamilyTest::ManualAutoCon:9", - "ColumnFamilyTest::ManualAutoCon:8"}, - {"ColumnFamilyTest::ManualAutoCon:2", - "ColumnFamilyTest::ManualAutoCon:6"}, - {"ColumnFamilyTest::ManualAutoCon:4", - "ColumnFamilyTest::ManualAutoCon:5"}, - {"ColumnFamilyTest::ManualAutoCon:1", - "ColumnFamilyTest::ManualAutoCon:2"}, - {"ColumnFamilyTest::ManualAutoCon:1", - "ColumnFamilyTest::ManualAutoCon:3"}}); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { - if (cf_1_1) { - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:4"); - cf_1_1 = false; - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:3"); - } else if (cf_1_2) { - cf_1_2 = false; - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:2"); - } - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::RunManualCompaction:NotScheduled", [&](void* arg) { - InstrumentedMutex* mutex = static_cast(arg); - mutex->Unlock(); - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:9"); - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:7"); - mutex->Lock(); - }); - - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - rocksdb::port::Thread threads([&] { - CompactRangeOptions compact_options; - compact_options.exclusive_manual_compaction = false; - ASSERT_OK( - db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:6"); - }); - - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:8"); - WaitForFlush(1); - - // Add more L0 files and force automatic compaction - for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { - PutRandomData(1, 10, 12000, true); - PutRandomData(1, 1, 10, true); - WaitForFlush(1); - AssertFilesPerLevel(ToString(one.level0_file_num_compaction_trigger + i), - 1); - } - - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:5"); - // Add more L0 files and force automatic compaction - for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) { - PutRandomData(1, 10, 12000, true); - PutRandomData(1, 1, 10, true); 
- WaitForFlush(1); - } - TEST_SYNC_POINT("ColumnFamilyTest::ManualAutoCon:1"); - - threads.join(); - WaitForCompaction(); - // VERIFY compaction "one" - ASSERT_LE(NumTableFilesAtLevel(0, 1), 3); - - // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } // In this test, we generate enough files to trigger automatic compactions. @@ -1994,7 +1959,7 @@ TEST_F(ColumnFamilyTest, SameCFManualAutomaticConflict) { // This will wait because the automatic compaction has files it needs. // Once the conflict is hit, the automatic compaction starts and ends // Then the manual will run and end. -TEST_F(ColumnFamilyTest, SameCFAutomaticManualCompactions) { +TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) { Open(); CreateColumnFamilies({"one"}); ColumnFamilyOptions default_cf, one; @@ -2006,7 +1971,8 @@ TEST_F(ColumnFamilyTest, SameCFAutomaticManualCompactions) { default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + ; table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -2030,7 +1996,7 @@ TEST_F(ColumnFamilyTest, SameCFAutomaticManualCompactions) { {"CompactionPicker::CompactRange:Conflict", "ColumnFamilyTest::AutoManual:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (cf_1_1) { TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4"); cf_1_1 = false; @@ -2070,15 +2036,17 @@ TEST_F(ColumnFamilyTest, SameCFAutomaticManualCompactions) { // VERIFY compaction "one" AssertFilesPerLevel("1", 1); // Compare against saved keys - std::set::iterator key_iter = keys_.begin(); - while (key_iter != keys_.end()) { + std::set::iterator key_iter = keys_[1].begin(); + while (key_iter != keys_[1].end()) { ASSERT_NE("NOT_FOUND", Get(1, *key_iter)); key_iter++; } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } #endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // Tailing interator not supported +#ifndef ROCKSDB_LITE // Tailing iterator not supported namespace { std::string IterStatus(Iterator* iter) { std::string result; @@ -2091,7 +2059,7 @@ std::string IterStatus(Iterator* iter) { } } // anonymous namespace -TEST_F(ColumnFamilyTest, NewIteratorsTest) { +TEST_P(ColumnFamilyTest, NewIteratorsTest) { // iter == 0 -- no tailing // iter == 2 -- tailing for (int iter = 0; iter < 2; ++iter) { @@ -2138,7 +2106,7 @@ TEST_F(ColumnFamilyTest, NewIteratorsTest) { #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // ReadOnlyDB is not supported -TEST_F(ColumnFamilyTest, ReadOnlyDBTest) { +TEST_P(ColumnFamilyTest, ReadOnlyDBTest) { Open(); CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); ASSERT_OK(Put(0, "a", "b")); @@ -2190,7 +2158,7 @@ TEST_F(ColumnFamilyTest, ReadOnlyDBTest) { #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite -TEST_F(ColumnFamilyTest, 
DontRollEmptyLogs) { +TEST_P(ColumnFamilyTest, DontRollEmptyLogs) { Open(); CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); @@ -2214,7 +2182,7 @@ TEST_F(ColumnFamilyTest, DontRollEmptyLogs) { #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite -TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) { +TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; @@ -2249,7 +2217,7 @@ TEST_F(ColumnFamilyTest, FlushStaleColumnFamilies) { } #endif // !ROCKSDB_LITE -TEST_F(ColumnFamilyTest, CreateMissingColumnFamilies) { +TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) { Status s = TryOpen({"one", "two"}); ASSERT_TRUE(!s.ok()); db_options_.create_missing_column_families = true; @@ -2258,7 +2226,7 @@ TEST_F(ColumnFamilyTest, CreateMissingColumnFamilies) { Close(); } -TEST_F(ColumnFamilyTest, SanitizeOptions) { +TEST_P(ColumnFamilyTest, SanitizeOptions) { DBOptions db_options; for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) { for (int l = 0; l <= 2; l++) { @@ -2307,7 +2275,7 @@ TEST_F(ColumnFamilyTest, SanitizeOptions) { } } -TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) { +TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) { // iter 0 -- drop CF, don't reopen // iter 1 -- delete CF, reopen for (int iter = 0; iter < 2; ++iter) { @@ -2379,7 +2347,7 @@ TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) { } } -TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { +TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) { db_options_.create_missing_column_families = true; Open({"default", "one"}); ColumnFamilyOptions options; @@ -2445,12 +2413,13 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { // skipped as persisting options is not supported in ROCKSDB_LITE namespace { std::atomic test_stage(0); +std::atomic ordered_by_writethread(false); const int kMainThreadStartPersistingOptionsFile = 1; const int kChildThreadFinishDroppingColumnFamily = 2; -const int kChildThreadWaitingMainThreadPersistOptions = 3; void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id, std::vector* comparators) { - while (test_stage < kMainThreadStartPersistingOptionsFile) { + while (test_stage < kMainThreadStartPersistingOptionsFile && + !ordered_by_writethread) { Env::Default()->SleepForMicroseconds(100); } cf_test->DropColumnFamilies({cf_id}); @@ -2462,7 +2431,7 @@ void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id, } } // namespace -TEST_F(ColumnFamilyTest, CreateAndDropRace) { +TEST_P(ColumnFamilyTest, CreateAndDropRace) { const int kCfCount = 5; std::vector cf_opts; std::vector comparators; @@ -2477,28 +2446,25 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { auto main_thread_id = std::this_thread::get_id(); rocksdb::SyncPoint::GetInstance()->SetCallBack("PersistRocksDBOptions:start", - [&](void* arg) { + [&](void* /*arg*/) { auto current_thread_id = std::this_thread::get_id(); // If it's the main thread hitting this sync-point, then it // will be blocked until some other thread update the test_stage. 
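        // (Editor's note: test_stage is a std::atomic<int> and
        // ordered_by_writethread a std::atomic<bool>, both shared with the
        // drop thread; whichever condition flips first releases this
        // busy-wait, so the race being staged here cannot deadlock the test.)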
if (main_thread_id == current_thread_id) { test_stage = kMainThreadStartPersistingOptionsFile; - while (test_stage < kChildThreadFinishDroppingColumnFamily) { + while (test_stage < kChildThreadFinishDroppingColumnFamily && + !ordered_by_writethread) { Env::Default()->SleepForMicroseconds(100); } } }); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "WriteThread::EnterUnbatched:Wait", [&](void* arg) { + "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) { // This means a thread doing DropColumnFamily() is waiting for // other thread to finish persisting options. // In such case, we update the test_stage to unblock the main thread. - test_stage = kChildThreadWaitingMainThreadPersistOptions; - - // Note that based on the test setting, this must not be the - // main thread. - ASSERT_NE(main_thread_id, std::this_thread::get_id()); + ordered_by_writethread = true; }); // Create a database with four column families @@ -2509,7 +2475,8 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { // Start a thread that will drop the first column family // and its comparator - rocksdb::port::Thread drop_cf_thread(DropSingleColumnFamily, this, 1, &comparators); + rocksdb::port::Thread drop_cf_thread(DropSingleColumnFamily, this, 1, + &comparators); DropColumnFamilies({2}); @@ -2521,10 +2488,13 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { delete comparator; } } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } #endif // !ROCKSDB_LITE -TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { +TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { const uint64_t kBaseRate = 800000u; db_options_.delayed_write_rate = kBaseRate; db_options_.max_background_compactions = 6; @@ -2544,139 +2514,139 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { mutable_cf_options.disable_auto_compactions = false; vstorage->TEST_set_estimated_compaction_needed_bytes(50); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage->TEST_set_estimated_compaction_needed_bytes(201); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(400); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(500); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(450); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, 
GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(205); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(202); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(201); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(198); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage->TEST_set_estimated_compaction_needed_bytes(399); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(599); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(2001); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(3001); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage->TEST_set_estimated_compaction_needed_bytes(390); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(100); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage->set_l0_delay_trigger_count(100); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(101); - 
cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(0); vstorage->TEST_set_estimated_compaction_needed_bytes(300); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(101); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(200); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(0); vstorage->TEST_set_estimated_compaction_needed_bytes(0); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); mutable_cf_options.disable_auto_compactions = true; dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage->set_l0_delay_trigger_count(50); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(0, GetDbDelayedWriteRate()); @@ -2684,7 +2654,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { vstorage->set_l0_delay_trigger_count(60); vstorage->TEST_set_estimated_compaction_needed_bytes(300); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(0, GetDbDelayedWriteRate()); @@ -2693,20 +2663,20 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { mutable_cf_options.disable_auto_compactions = false; vstorage->set_l0_delay_trigger_count(70); vstorage->TEST_set_estimated_compaction_needed_bytes(500); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->set_l0_delay_trigger_count(71); vstorage->TEST_set_estimated_compaction_needed_bytes(501); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); } 
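(Editor's aside: the assertions above encode a simple geometric decay of the delayed write rate -- each additional stall trigger divides the current rate by 1.25, and clearing a trigger steps it back up. A minimal standalone sketch of that arithmetic, assuming only the 800000 bytes/s base rate and the 1.25 factor implied by the ASSERT_EQ values; neither constant is a public RocksDB API:)

#include <cstdio>

// Reproduces the delayed-write-rate steps asserted in
// WriteStallSingleColumnFamily: kBaseRate, kBaseRate / 1.25,
// kBaseRate / 1.25 / 1.25, ...
int main() {
  const double kBaseRate = 800000.0;  // db_options_.delayed_write_rate
  double rate = kBaseRate;
  for (int step = 0; step <= 3; ++step) {
    std::printf("after %d slowdown step(s): %.0f bytes/s\n", step, rate);
    rate /= 1.25;  // one more stall condition observed
  }
  return 0;  // prints 800000, 640000, 512000, 409600
}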
-TEST_F(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { +TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { db_options_.max_background_compactions = 6; Open({"default"}); ColumnFamilyData* cfd = @@ -2725,31 +2695,31 @@ TEST_F(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; vstorage->TEST_set_estimated_compaction_needed_bytes(40); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(50); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(300); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(45); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(7); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(9); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(6); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6 @@ -2758,19 +2728,19 @@ TEST_F(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { mutable_cf_options.level0_stop_writes_trigger = 30; vstorage->set_l0_delay_trigger_count(5); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(7); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(3); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); } -TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { +TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) { const uint64_t kBaseRate = 810000u; db_options_.delayed_write_rate = kBaseRate; Open(); @@ -2793,59 +2763,59 @@ TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { mutable_cf_options1.soft_pending_compaction_bytes_limit = 500; vstorage->TEST_set_estimated_compaction_needed_bytes(50); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage1->TEST_set_estimated_compaction_needed_bytes(201); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, 
mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); vstorage1->TEST_set_estimated_compaction_needed_bytes(600); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(70); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate()); vstorage1->TEST_set_estimated_compaction_needed_bytes(800); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(300); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage1->TEST_set_estimated_compaction_needed_bytes(700); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); vstorage->TEST_set_estimated_compaction_needed_bytes(500); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate()); vstorage1->TEST_set_estimated_compaction_needed_bytes(600); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_TRUE(!IsDbWriteStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate()); } -TEST_F(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { +TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { db_options_.max_background_compactions = 6; column_family_options_.soft_pending_compaction_bytes_limit = 200; column_family_options_.hard_pending_compaction_bytes_limit = 2000; @@ -2872,46 +2842,79 @@ TEST_F(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { mutable_cf_options1.level0_slowdown_writes_trigger = 16; vstorage->TEST_set_estimated_compaction_needed_bytes(40); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(60); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); 
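  // (Editor's note: TEST_BGCompactionsAllowed() is a DB-wide value driven by
  // the neediest column family: with soft_pending_compaction_bytes_limit at
  // 200, the speedup threshold is 200 / 4 = 50, so cfd's estimate of 60
  // lifts the allowance to 6 even while cfd1's own estimate stays low.)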
vstorage1->TEST_set_estimated_compaction_needed_bytes(30); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage1->TEST_set_estimated_compaction_needed_bytes(70); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(20); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage1->TEST_set_estimated_compaction_needed_bytes(3); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(9); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage1->set_l0_delay_trigger_count(2); - cfd1->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd1, mutable_cf_options); ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(0); - cfd->RecalculateWriteStallConditions(mutable_cf_options); + RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); } +TEST_P(ColumnFamilyTest, CreateAndDestoryOptions) { + std::unique_ptr cfo(new ColumnFamilyOptions()); + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh)); + cfo.reset(); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + +TEST_P(ColumnFamilyTest, CreateDropAndDestroy) { + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh)); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + #ifndef ROCKSDB_LITE -TEST_F(ColumnFamilyTest, FlushCloseWALFiles) { +TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) { + ColumnFamilyHandle* cfh; + Open(); + ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh)); + ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar")); + ASSERT_OK(db_->Flush(FlushOptions(), cfh)); + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); +} + +TEST_P(ColumnFamilyTest, FlushCloseWALFiles) { SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; @@ -2953,7 +2956,7 @@ TEST_F(ColumnFamilyTest, FlushCloseWALFiles) { #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // WaitForFlush() is not supported -TEST_F(ColumnFamilyTest, IteratorCloseWALFile1) { +TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) { SpecialEnv env(Env::Default()); db_options_.env = &env; db_options_.max_background_flushes = 1; @@ -2998,7 +3001,7 @@ TEST_F(ColumnFamilyTest, IteratorCloseWALFile1) { Close(); } -TEST_F(ColumnFamilyTest, IteratorCloseWALFile2) { +TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { SpecialEnv 
env(Env::Default()); // Allow both of flush and purge job to schedule. env.SetBackgroundThreads(2, Env::HIGH); @@ -3055,7 +3058,7 @@ TEST_F(ColumnFamilyTest, IteratorCloseWALFile2) { #endif // !ROCKSDB_LITE #ifndef ROCKSDB_LITE // TEST functions are not supported in lite -TEST_F(ColumnFamilyTest, ForwardIteratorCloseWALFile) { +TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { SpecialEnv env(Env::Default()); // Allow both of flush and purge job to schedule. env.SetBackgroundThreads(2, Env::HIGH); @@ -3132,7 +3135,7 @@ TEST_F(ColumnFamilyTest, ForwardIteratorCloseWALFile) { // Disable on windows because SyncWAL requires env->IsSyncThreadSafe() // to return true which is not so in unbuffered mode. #ifndef OS_WIN -TEST_F(ColumnFamilyTest, LogSyncConflictFlush) { +TEST_P(ColumnFamilyTest, LogSyncConflictFlush) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); @@ -3167,7 +3170,7 @@ TEST_F(ColumnFamilyTest, LogSyncConflictFlush) { // test is being used to ensure a roll of wal files. // Basic idea is to test that WAL truncation is being detected and not // ignored -TEST_F(ColumnFamilyTest, DISABLED_LogTruncationTest) { +TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); @@ -3236,6 +3239,77 @@ TEST_F(ColumnFamilyTest, DISABLED_LogTruncationTest) { // cleanup env_->DeleteDir(backup_logs); } + +TEST_P(ColumnFamilyTest, DefaultCfPathsTest) { + Open(); + // Leave cf_paths for one column families to be empty. + // Files should be generated according to db_paths for that + // column family. + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1", + std::numeric_limits::max()); + CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2}); + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + + // Fill Column family 1. + PutRandomData(1, 100, 100); + Flush(1); + + ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Fill column family 2 + PutRandomData(2, 100, 100); + Flush(2); + + // SST from Column family 2 should be generated in + // db_paths which is dbname_ in this case. + ASSERT_EQ(1, GetSstFileCount(dbname_)); +} + +TEST_P(ColumnFamilyTest, MultipleCFPathsTest) { + Open(); + // Configure Column family specific paths. + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1", + std::numeric_limits::max()); + cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1", + std::numeric_limits::max()); + CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2}); + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + + PutRandomData(1, 100, 100, true /* save */); + Flush(1); + + // Check that files are generated in appropriate paths. + ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + PutRandomData(2, 100, 100, true /* save */); + Flush(2); + + ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Re-open and verify the keys. 
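+  // (Editor's note: verifying through both the iterator scan and the point
+  // lookups below means a key flushed into the wrong cf_paths directory, or
+  // lost across the Reopen(), fails the test either way.)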
+ // Re-open and verify the keys. + Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2}); + DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); + for (int cf = 1; cf != 3; ++cf) { + ReadOptions read_options; + read_options.readahead_size = 0; + auto it = dbi->NewIterator(read_options, handles_[cf]); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + Slice key(it->key()); + ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString())); + } + delete it; + + for (const auto& key : keys_[cf]) { + ASSERT_NE("NOT_FOUND", Get(cf, key)); + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/compact_files_test.cc b/thirdparty/rocksdb/db/compact_files_test.cc index 5aad6114f5..ce80375e0e 100644 --- a/thirdparty/rocksdb/db/compact_files_test.cc +++ b/thirdparty/rocksdb/db/compact_files_test.cc @@ -24,7 +24,7 @@ class CompactFilesTest : public testing::Test { public: CompactFilesTest() { env_ = Env::Default(); - db_name_ = test::TmpDir(env_) + "/compact_files_test"; + db_name_ = test::PerThreadDBPath("compact_files_test"); } std::string db_name_; @@ -35,10 +35,9 @@ class CompactFilesTest : public testing::Test { class FlushedFileCollector : public EventListener { public: FlushedFileCollector() {} - ~FlushedFileCollector() {} + ~FlushedFileCollector() override {} - virtual void OnFlushCompleted( - DB* db, const FlushJobInfo& info) override { + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard<std::mutex> lock(mutex_); flushed_files_.push_back(info.file_path); } @@ -257,9 +256,9 @@ TEST_F(CompactFilesTest, CapturingPendingFiles) { TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { class FilterWithGet : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { if (db_ == nullptr) { return true; } @@ -272,7 +271,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { db_ = db; } - virtual const char* Name() const override { return "FilterWithGet"; } + const char* Name() const override { return "FilterWithGet"; } private: DB* db_; @@ -309,6 +308,100 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { delete db; } +TEST_F(CompactFilesTest, SentinelCompressionType) { + if (!Zlib_Supported()) { + fprintf(stderr, "zlib compression not supported, skip this test\n"); + return; + } + if (!Snappy_Supported()) { + fprintf(stderr, "snappy compression not supported, skip this test\n"); + return; + } + // Check that passing `CompressionType::kDisableCompressionOption` to + // `CompactFiles` causes it to use the column family compression options.
+ for (auto compaction_style : + {CompactionStyle::kCompactionStyleLevel, + CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleNone}) { + DestroyDB(db_name_, Options()); + Options options; + options.compaction_style = compaction_style; + // L0: Snappy, L1: Zlib, L2: Snappy + options.compression_per_level = {CompressionType::kSnappyCompression, + CompressionType::kZlibCompression, + CompressionType::kSnappyCompression}; + options.create_if_missing = true; + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + DB* db = nullptr; + ASSERT_OK(DB::Open(options, db_name_, &db)); + + db->Put(WriteOptions(), "key", "val"); + db->Flush(FlushOptions()); + + auto l0_files = collector->GetFlushedFiles(); + ASSERT_EQ(1, l0_files.size()); + + // L0->L1 compaction, so output should be Zlib-compressed + CompactionOptions compaction_opts; + compaction_opts.compression = CompressionType::kDisableCompressionOption; + ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1)); + + rocksdb::TablePropertiesCollection all_tables_props; + ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props)); + for (const auto& name_and_table_props : all_tables_props) { + ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression), + name_and_table_props.second->compression_name); + } + delete db; + } +} + +TEST_F(CompactFilesTest, GetCompactionJobInfo) { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + options.level0_slowdown_writes_trigger = 1000; + options.level0_stop_writes_trigger = 1000; + options.write_buffer_size = 65536; + options.max_write_buffer_number = 2; + options.compression = kNoCompression; + options.max_compaction_bytes = 5000; + + // Add listener + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + assert(s.ok()); + assert(db); + + // create a couple of files + for (int i = 0; i < 500; ++i) { + db->Put(WriteOptions(), ToString(i), std::string(1000, 'a' + (i % 26))); + } + reinterpret_cast<DBImpl*>(db)->TEST_WaitForFlushMemTable(); + auto l0_files_1 = collector->GetFlushedFiles(); + CompactionOptions co; + co.compression = CompressionType::kLZ4Compression; + CompactionJobInfo compaction_job_info; + ASSERT_OK( + db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info)); + ASSERT_EQ(compaction_job_info.base_input_level, 0); + ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID()); + ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName()); + ASSERT_EQ(compaction_job_info.compaction_reason, + CompactionReason::kManualCompaction); + ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression); + ASSERT_EQ(compaction_job_info.output_level, 0); + ASSERT_OK(compaction_job_info.status); + // no assertion failure + delete db; +} + } // namespace rocksdb int main(int argc, char** argv) { @@ -319,7 +412,7 @@ int main(int argc, char** argv) { #else #include <stdio.h> -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n"); return 0;
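// --- Illustrative sketch (not part of the patch): the behaviour the
// SentinelCompressionType test above verifies, as standalone API usage.
// kDisableCompressionOption is a sentinel meaning "no explicit choice", so
// CompactFiles falls back to the column family's per-level compression.
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

rocksdb::Status CompactWithCfCompression(
    rocksdb::DB* db, const std::vector<std::string>& input_files,
    int output_level) {
  rocksdb::CompactionOptions opts;
  opts.compression = rocksdb::kDisableCompressionOption;
  return db->CompactFiles(opts, input_files, output_level);
}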
diff --git a/thirdparty/rocksdb/db/compacted_db_impl.cc b/thirdparty/rocksdb/db/compacted_db_impl.cc index d1007d972a..acdaad4ec2 100644 --- a/thirdparty/rocksdb/db/compacted_db_impl.cc +++ b/thirdparty/rocksdb/db/compacted_db_impl.cc @@ -17,29 +17,20 @@ extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, CompactedDBImpl::CompactedDBImpl( const DBOptions& options, const std::string& dbname) - : DBImpl(options, dbname) { + : DBImpl(options, dbname), cfd_(nullptr), version_(nullptr), + user_comparator_(nullptr) { } CompactedDBImpl::~CompactedDBImpl() { } size_t CompactedDBImpl::FindFile(const Slice& key) { - size_t left = 0; size_t right = files_.num_files - 1; - while (left < right) { - size_t mid = (left + right) >> 1; - const FdWithKeyRange& f = files_.files[mid]; - if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0; + }; + return static_cast<size_t>(std::lower_bound(files_.files, + files_.files + right, key, cmp) - files_.files); } Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, @@ -48,8 +39,8 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, GetContext::kNotFound, key, value, nullptr, nullptr, nullptr, nullptr); LookupKey lkey(key, kMaxSequenceNumber); - files_.files[FindFile(key)].fd.table_reader->Get( - options, lkey.internal_key(), &get_context); + files_.files[FindFile(key)].fd.table_reader->Get(options, lkey.internal_key(), + &get_context, nullptr); if (get_context.State() == GetContext::kFound) { return Status::OK(); } @@ -81,7 +72,7 @@ std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options, GetContext::kNotFound, keys[idx], &pinnable_val, nullptr, nullptr, nullptr, nullptr); LookupKey lkey(keys[idx], kMaxSequenceNumber); - r->Get(options, lkey.internal_key(), &get_context); + r->Get(options, lkey.internal_key(), &get_context, nullptr); value.assign(pinnable_val.data(), pinnable_val.size()); if (get_context.State() == GetContext::kFound) { statuses[idx] = Status::OK(); @@ -93,6 +84,7 @@ std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options, } Status CompactedDBImpl::Init(const Options& options) { + SuperVersionContext sv_context(/* create_superversion */ true); mutex_.Lock(); ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, ColumnFamilyOptions(options)); @@ -100,9 +92,10 @@ Status CompactedDBImpl::Init(const Options& options) { if (s.ok()) { cfd_ = reinterpret_cast<ColumnFamilyHandleImpl*>( DefaultColumnFamily())->cfd(); - delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_); + cfd_->InstallSuperVersion(&sv_context, &mutex_); } mutex_.Unlock(); + sv_context.Clean(); if (!s.ok()) { return s; } @@ -154,6 +147,7 @@ Status CompactedDBImpl::Open(const Options& options, std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname)); Status s = db->Init(options); if (s.ok()) { + db->StartTimedTasks(); ROCKS_LOG_INFO(db->immutable_db_options_.info_log, "Opened the db as fully compacted mode"); LogFlush(db->immutable_db_options_.info_log);
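// --- Illustrative sketch (not part of the patch): the std::lower_bound
// pattern the FindFile() rewrite above relies on, reduced to plain strings
// instead of FdWithKeyRange. The predicate must mean "this file's largest
// key is still smaller than the target", so lower_bound returns the first
// file whose range could contain the key.
#include <algorithm>
#include <string>
#include <vector>

size_t FindFileSketch(const std::vector<std::string>& largest_keys,
                      const std::string& key) {
  auto it = std::lower_bound(
      largest_keys.begin(), largest_keys.end(), key,
      [](const std::string& largest, const std::string& k) {
        return largest < k;  // mirrors user_comparator_->Compare(...) < 0
      });
  return static_cast<size_t>(it - largest_keys.begin());
}
// E.g. files with largest keys {"b", "d", "f"}: key "c" maps to index 1.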
diff --git a/thirdparty/rocksdb/db/compacted_db_impl.h b/thirdparty/rocksdb/db/compacted_db_impl.h index de32f21e68..5c574b4b9a 100644 --- a/thirdparty/rocksdb/db/compacted_db_impl.h +++ b/thirdparty/rocksdb/db/compacted_db_impl.h @@ -32,55 +32,57 @@ class CompactedDBImpl : public DBImpl { override; using DBImpl::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override { + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { return Status::NotSupported("Not supported in compacted db mode."); } virtual Status DisableFileDeletions() override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status GetLiveFiles(std::vector<std::string>&, + virtual Status GetLiveFiles(std::vector<std::string>& ret, uint64_t* manifest_file_size, - bool flush_memtable = true) override { - return Status::NotSupported("Not supported in compacted db mode."); + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported in compacted db mode."); } using DB::IngestExternalFile; virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector<std::string>& external_files, - const IngestExternalFileOptions& ingestion_options) override { + ColumnFamilyHandle* /*column_family*/, + const std::vector<std::string>& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported in compacted db mode."); } diff --git a/thirdparty/rocksdb/db/compaction.cc b/thirdparty/rocksdb/db/compaction.cc index 706eb3be03..f8805376f1 100644 --- a/thirdparty/rocksdb/db/compaction.cc +++ b/thirdparty/rocksdb/db/compaction.cc @@ -23,6 +23,43 @@ namespace rocksdb { +const uint64_t kRangeTombstoneSentinel = + PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); + +int sstableKeyCompare(const Comparator* user_cmp,
const InternalKey& a, + const InternalKey& b) { + auto c = user_cmp->Compare(a.user_key(), b.user_key()); + if (c != 0) { + return c; + } + auto a_footer = ExtractInternalKeyFooter(a.Encode()); + auto b_footer = ExtractInternalKeyFooter(b.Encode()); + if (a_footer == kRangeTombstoneSentinel) { + if (b_footer != kRangeTombstoneSentinel) { + return -1; + } + } else if (b_footer == kRangeTombstoneSentinel) { + return 1; + } + return 0; +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b) { + if (a == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, *a, b); +} + +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b) { + if (b == nullptr) { + return -1; + } + return sstableKeyCompare(user_cmp, a, *b); +} + uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { @@ -81,40 +118,71 @@ void Compaction::GetBoundaryKeys( } } +std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) { + const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); + for (size_t i = 0; i < inputs.size(); i++) { + if (inputs[i].level == 0 || inputs[i].files.empty()) { + continue; + } + inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size()); + AtomicCompactionUnitBoundary cur_boundary; + size_t first_atomic_idx = 0; + auto add_unit_boundary = [&](size_t to) { + if (first_atomic_idx == to) return; + for (size_t k = first_atomic_idx; k < to; k++) { + inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary); + } + first_atomic_idx = to; + }; + for (size_t j = 0; j < inputs[i].files.size(); j++) { + const auto* f = inputs[i].files[j]; + if (j == 0) { + // First file in a level. + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) == + 0) { + // SSTs overlap but the end key of the previous file was not + // artificially extended by a range tombstone. Extend the current + // boundary. + cur_boundary.largest = &f->largest; + } else { + // Atomic compaction unit has ended. + add_unit_boundary(j); + cur_boundary.smallest = &f->smallest; + cur_boundary.largest = &f->largest; + } + } + add_unit_boundary(inputs[i].files.size()); + assert(inputs[i].files.size() == + inputs[i].atomic_compaction_unit_boundaries.size()); + } + return inputs; +} +
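// --- Illustrative sketch (not part of the patch): how
// PopulateWithAtomicBoundaries() above groups a sorted level into atomic
// compaction units. Files whose end key matches the next file's start key
// must be compacted together. Types are simplified: plain strings stand in
// for InternalKey, and raw equality stands in for sstableKeyCompare(...)==0.
#include <string>
#include <vector>

struct FileRange { std::string smallest, largest; };

// Returns, for each file, the index of the first file of its atomic unit.
std::vector<size_t> AtomicUnits(const std::vector<FileRange>& files) {
  std::vector<size_t> unit_start(files.size());
  for (size_t i = 0; i < files.size(); ++i) {
    if (i > 0 && files[i - 1].largest == files[i].smallest) {
      unit_start[i] = unit_start[i - 1];  // extend the previous unit
    } else {
      unit_start[i] = i;  // a new unit begins here
    }
  }
  return unit_start;
}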
// helper function to determine if compaction is creating files at the // bottommost level bool Compaction::IsBottommostLevel( int output_level, VersionStorageInfo* vstorage, const std::vector<CompactionInputFiles>& inputs) { - if (inputs[0].level == 0 && - inputs[0].files.back() != vstorage->LevelFiles(0).back()) { - return false; + int output_l0_idx; + if (output_level == 0) { + output_l0_idx = 0; + for (const auto* file : vstorage->LevelFiles(0)) { + if (inputs[0].files.back() == file) { + break; + } + ++output_l0_idx; + } + assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size()); + } else { + output_l0_idx = -1; } - Slice smallest_key, largest_key; GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key); - - // Checks whether there are files living beyond the output_level. - // If lower levels have files, it checks for overlap between files - // of the compaction process and those files. - // Bottomlevel optimizations can be made if there are no files in - // lower levels or if there is no overlap with the files in - // the lower levels. - for (int i = output_level + 1; i < vstorage->num_levels(); i++) { - // It is not the bottommost level if there are files in higher - // levels when the output level is 0 or if there are files in - // higher levels which overlap with files to be compacted. - // output_level == 0 means that we want it to be considered - // as the bottommost level only if the last file on the level - // is a part of the files to be compacted - this is verified by - // the first if condition in this function - if (vstorage->NumLevelFiles(i) > 0 && - (output_level == 0 || - vstorage->OverlapInLevel(i, &smallest_key, &largest_key))) { - return false; - } - } - return true; + return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key, + output_level, output_l0_idx); } // test function to validate the functionality of IsBottommostLevel() @@ -146,6 +214,8 @@ Compaction::Compaction(VersionStorageInfo* vstorage, int _output_level, uint64_t _target_file_size, uint64_t _max_compaction_bytes, uint32_t _output_path_id, CompressionType _compression, + CompressionOptions _compression_opts, + uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents, bool _manual_compaction, double _score, bool _deletion_compaction, @@ -155,6 +225,7 @@ Compaction::Compaction(VersionStorageInfo* vstorage, output_level_(_output_level), max_output_file_size_(_target_file_size), max_compaction_bytes_(_max_compaction_bytes), + max_subcompactions_(_max_subcompactions), immutable_cf_options_(_immutable_cf_options), mutable_cf_options_(_mutable_cf_options), input_version_(nullptr), @@ -162,8 +233,9 @@ Compaction::Compaction(VersionStorageInfo* vstorage, cfd_(nullptr), output_path_id_(_output_path_id), output_compression_(_compression), + output_compression_opts_(_compression_opts), deletion_compaction_(_deletion_compaction), - inputs_(std::move(_inputs)), + inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), score_(_score), bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)), @@ -175,6 +247,15 @@ Compaction::Compaction(VersionStorageInfo* vstorage, if (is_manual_compaction_) { compaction_reason_ = CompactionReason::kManualCompaction; } + if (max_subcompactions_ == 0) { + max_subcompactions_ = immutable_cf_options_.max_subcompactions; + } + if (!bottommost_level_) { + // Currently we only enable dictionary compression during compaction to the + // bottommost level.
+ output_compression_opts_.max_dict_bytes = 0; + output_compression_opts_.zstd_max_train_bytes = 0; + } #ifndef NDEBUG for (size_t i = 1; i < inputs_.size(); ++i) { @@ -241,7 +322,7 @@ bool Compaction::IsTrivialMove() const { // Used in universal compaction, where trivial move can be done if the // input files are non overlapping - if ((immutable_cf_options_.compaction_options_universal.allow_trivial_move) && + if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && (output_level_ != 0)) { return is_trivial_move_; } @@ -284,10 +365,10 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( assert(input_version_ != nullptr); assert(level_ptrs != nullptr); assert(level_ptrs->size() == static_cast<size_t>(number_levels_)); - if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - if (output_level_ == 0) { - return false; - } + if (bottommost_level_) { + return true; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { // Maybe use binary search to find right entry instead of linear search? const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { @@ -298,8 +379,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { // We've advanced far enough if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely - // exists beyond output level + // Key falls in this file's range, so it may + // exist beyond output level return false; } break; @@ -307,9 +388,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( } } return true; - } else { - return bottommost_level_; } + return false; } // Mark (or clear) each file that is being compacted @@ -337,12 +417,14 @@ const char* Compaction::InputLevelSummary( if (!is_first) { len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + "); + len = std::min(len, static_cast<size_t>(sizeof(scratch->buffer))); } else { is_first = false; } len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "%" ROCKSDB_PRIszt "@%d", input_level.size(), input_level.level); + len = std::min(len, static_cast<size_t>(sizeof(scratch->buffer))); } snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " files to L%d", output_level()); @@ -418,20 +500,23 @@ void Compaction::Summary(char* output, int len) { uint64_t Compaction::OutputFilePreallocationSize() const { uint64_t preallocation_size = 0; + for (const auto& level_files : inputs_) { + for (const auto& file : level_files.files) { + preallocation_size += file->fd.GetFileSize(); + } + } + if (max_output_file_size_ != port::kMaxUint64 && - (cfd_->ioptions()->compaction_style == kCompactionStyleLevel || + (immutable_cf_options_.compaction_style == kCompactionStyleLevel || output_level() > 0)) { - preallocation_size = max_output_file_size_; - } else { - for (const auto& level_files : inputs_) { - for (const auto& file : level_files.files) { - preallocation_size += file->fd.GetFileSize(); - } - } + preallocation_size = std::min(max_output_file_size_, preallocation_size); } + // Over-estimate slightly so we don't end up just barely crossing // the threshold - return preallocation_size + (preallocation_size / 10); + // No point to preallocate more than 1GB. + return std::min(uint64_t{1073741824}, + preallocation_size + (preallocation_size / 10)); }
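// --- Illustrative sketch (not part of the patch): the preallocation policy
// introduced in OutputFilePreallocationSize() above: roughly 110% of the
// estimated output size, capped at 1GB.
#include <algorithm>
#include <cstdint>

uint64_t PreallocationSizeSketch(uint64_t estimated_output_bytes) {
  // +10% slack so the file doesn't just barely cross the threshold.
  uint64_t size = estimated_output_bytes + estimated_output_bytes / 10;
  return std::min(uint64_t{1073741824}, size);  // never preallocate over 1GB
}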
std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const { @@ -452,11 +537,12 @@ bool Compaction::IsOutputLevelEmpty() const { } bool Compaction::ShouldFormSubcompactions() const { - if (immutable_cf_options_.max_subcompactions <= 1 || cfd_ == nullptr) { + if (max_subcompactions_ <= 1 || cfd_ == nullptr) { return false; } if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { - return start_level_ == 0 && output_level_ > 0 && !IsOutputLevelEmpty(); + return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0 && + !IsOutputLevelEmpty(); } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return number_levels_ > 1 && output_level_ > 0; } else { @@ -477,4 +563,8 @@ uint64_t Compaction::MaxInputFileCreationTime() const { return max_creation_time; } +int Compaction::GetInputBaseLevel() const { + return input_vstorage_->base_level(); +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/compaction.h b/thirdparty/rocksdb/db/compaction.h index 7be6df2c1e..2cf737b676 100644 --- a/thirdparty/rocksdb/db/compaction.h +++ b/thirdparty/rocksdb/db/compaction.h @@ -15,11 +15,43 @@ namespace rocksdb { +// Utility for comparing sstable boundary keys. Returns -1 if either a or b is +// null which provides the property that a==null indicates a key that is less +// than any key and b==null indicates a key that is greater than any key. Note +// that the comparison is performed primarily on the user-key portion of the +// key. If the user-keys compare equal, an additional test is made to sort +// range tombstone sentinel keys before other keys with the same user-key. The +// result is that 2 user-keys will compare equal if they differ purely on +// their sequence number and value, but the range tombstone sentinel for that +// user-key will compare not equal. This is necessary because the range +// tombstone sentinel key is set as the largest key for an sstable even though +// that key never appears in the database. We don't want adjacent sstables to +// be considered overlapping if they are separated by the range tombstone +// sentinel. +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, + const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey* b); + +// An AtomicCompactionUnitBoundary represents a range of keys [smallest, +// largest] that exactly spans one or more neighbouring SSTs on the same +// level. Every pair of SSTs in this range "overlap" (i.e., the largest +// user key of one file is the smallest user key of the next file). These +// boundaries are propagated down to RangeDelAggregator during compaction +// to provide safe truncation boundaries for range tombstones. +struct AtomicCompactionUnitBoundary { + const InternalKey* smallest = nullptr; + const InternalKey* largest = nullptr; +}; + // The structure that manages compaction input files associated // with the same physical level.
struct CompactionInputFiles { int level; std::vector<FileMetaData*> files; + std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries; inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } @@ -40,6 +72,7 @@ class Compaction { std::vector<CompactionInputFiles> inputs, int output_level, uint64_t target_file_size, uint64_t max_compaction_bytes, uint32_t output_path_id, CompressionType compression, + CompressionOptions compression_opts, uint32_t max_subcompactions, std::vector<FileMetaData*> grandparents, bool manual_compaction = false, double score = -1, bool deletion_compaction = false, @@ -95,6 +128,12 @@ class Compaction { return inputs_[compaction_input_level][i]; } + const std::vector<AtomicCompactionUnitBoundary>* boundaries( + size_t compaction_input_level) const { + assert(compaction_input_level < inputs_.size()); + return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries; + } + // Returns the list of file meta data of the specified compaction // input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and @@ -118,6 +157,11 @@ class Compaction { // What compression for output CompressionType output_compression() const { return output_compression_; } + // What compression options for output + CompressionOptions output_compression_opts() const { + return output_compression_opts_; + } + // Whether need to write output file to second DB path. uint32_t output_path_id() const { return output_path_id_; } @@ -233,6 +277,8 @@ class Compaction { Slice GetLargestUserKey() const { return largest_user_key_; } + int GetInputBaseLevel() const; + CompactionReason compaction_reason() { return compaction_reason_; } const std::vector<FileMetaData*>& grandparents() const { @@ -241,6 +287,8 @@ class Compaction { uint64_t max_compaction_bytes() const { return max_compaction_bytes_; } + uint32_t max_subcompactions() const { return max_subcompactions_; } + uint64_t MaxInputFileCreationTime() const; private: @@ -252,6 +300,13 @@ class Compaction { const std::vector<CompactionInputFiles>& inputs, Slice* smallest_key, Slice* largest_key); + // Get the atomic file boundaries for all files in the compaction. Necessary + // in order to avoid the scenario described in + // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and plumb + // down appropriate key boundaries to RangeDelAggregator during compaction. + static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries( + VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs); + // helper function to determine if compaction with inputs and storage is // bottommost static bool IsBottommostLevel( @@ -267,6 +322,7 @@ class Compaction { const int output_level_; // levels to which output files are stored uint64_t max_output_file_size_; uint64_t max_compaction_bytes_; + uint32_t max_subcompactions_; const ImmutableCFOptions immutable_cf_options_; const MutableCFOptions mutable_cf_options_; Version* input_version_; @@ -277,6 +333,7 @@ class Compaction { const uint32_t output_path_id_; CompressionType output_compression_; + CompressionOptions output_compression_opts_; // If true, then the compaction can be done by simply deleting input files. const bool deletion_compaction_; diff --git a/thirdparty/rocksdb/db/compaction_iterator.cc b/thirdparty/rocksdb/db/compaction_iterator.cc index ae63f04d83..93c2b5fa9e 100644 --- a/thirdparty/rocksdb/db/compaction_iterator.cc +++ b/thirdparty/rocksdb/db/compaction_iterator.cc @@ -4,102 +4,105 @@ // (found in the LICENSE.Apache file in the root directory).
#include "db/compaction_iterator.h" + +#include "db/snapshot_checker.h" +#include "port/likely.h" #include "rocksdb/listener.h" #include "table/internal_iterator.h" +#include "util/sync_point.h" -namespace rocksdb { +#define DEFINITELY_IN_SNAPSHOT(seq, snapshot) \ + ((seq) <= (snapshot) && \ + (snapshot_checker_ == nullptr || \ + LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kInSnapshot))) -#ifndef ROCKSDB_LITE -CompactionEventListener::CompactionListenerValueType fromInternalValueType( - ValueType vt) { - switch (vt) { - case kTypeDeletion: - return CompactionEventListener::CompactionListenerValueType::kDelete; - case kTypeValue: - return CompactionEventListener::CompactionListenerValueType::kValue; - case kTypeMerge: - return CompactionEventListener::CompactionListenerValueType:: - kMergeOperand; - case kTypeSingleDeletion: - return CompactionEventListener::CompactionListenerValueType:: - kSingleDelete; - case kTypeRangeDeletion: - return CompactionEventListener::CompactionListenerValueType::kRangeDelete; - case kTypeBlobIndex: - return CompactionEventListener::CompactionListenerValueType::kBlobIndex; - default: - assert(false); - return CompactionEventListener::CompactionListenerValueType::kInvalid; - } -} -#endif // ROCKSDB_LITE +#define DEFINITELY_NOT_IN_SNAPSHOT(seq, snapshot) \ + ((seq) > (snapshot) || \ + (snapshot_checker_ != nullptr && \ + UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == \ + SnapshotCheckerResult::kNotInSnapshot))) + +#define IN_EARLIEST_SNAPSHOT(seq) \ + ((seq) <= earliest_snapshot_ && \ + (snapshot_checker_ == nullptr || LIKELY(IsInEarliestSnapshot(seq)))) + +namespace rocksdb { CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, - SequenceNumber earliest_write_conflict_snapshot, Env* env, - bool expect_valid_internal_key, RangeDelAggregator* range_del_agg, - const Compaction* compaction, const CompactionFilter* compaction_filter, - CompactionEventListener* compaction_listener, - const std::atomic* shutting_down) + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, + const CompactionFilter* compaction_filter, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum) : CompactionIterator( input, cmp, merge_helper, last_sequence, snapshots, - earliest_write_conflict_snapshot, env, expect_valid_internal_key, - range_del_agg, + earliest_write_conflict_snapshot, snapshot_checker, env, + report_detailed_time, expect_valid_internal_key, range_del_agg, std::unique_ptr( compaction ? 
CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, - SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, - SequenceNumber earliest_write_conflict_snapshot, Env* env, - bool expect_valid_internal_key, RangeDelAggregator* range_del_agg, + SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, std::unique_ptr<CompactionProxy> compaction, const CompactionFilter* compaction_filter, - CompactionEventListener* compaction_listener, - const std::atomic<bool>* shutting_down) + const std::atomic<bool>* shutting_down, + const SequenceNumber preserve_deletes_seqnum) : input_(input), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), env_(env), + report_detailed_time_(report_detailed_time), expect_valid_internal_key_(expect_valid_internal_key), range_del_agg_(range_del_agg), compaction_(std::move(compaction)), compaction_filter_(compaction_filter), -#ifndef ROCKSDB_LITE - compaction_listener_(compaction_listener), -#endif // ROCKSDB_LITE shutting_down_(shutting_down), - ignore_snapshots_(false), - merge_out_iter_(merge_helper_) { + preserve_deletes_seqnum_(preserve_deletes_seqnum), + current_user_key_sequence_(0), + current_user_key_snapshot_(0), + merge_out_iter_(merge_helper_), + current_key_committed_(false) { assert(compaction_filter_ == nullptr || compaction_ != nullptr); + assert(snapshots_ != nullptr); bottommost_level_ = compaction_ == nullptr ? false : compaction_->bottommost_level(); if (compaction_ != nullptr) { level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0); } - if (snapshots_->size() == 0) { // optimize for fast path if there are no snapshots visible_at_tip_ = true; + earliest_snapshot_iter_ = snapshots_->end(); earliest_snapshot_ = kMaxSequenceNumber; latest_snapshot_ = 0; } else { visible_at_tip_ = false; + earliest_snapshot_iter_ = snapshots_->begin(); earliest_snapshot_ = snapshots_->at(0); latest_snapshot_ = snapshots_->back(); } - if (compaction_filter_ != nullptr) { - if (compaction_filter_->IgnoreSnapshots()) { - ignore_snapshots_ = true; - } - } else { - ignore_snapshots_ = false; +#ifndef NDEBUG + // findEarliestVisibleSnapshot assumes this ordering. + for (size_t i = 1; i < snapshots_->size(); ++i) { + assert(snapshots_->at(i - 1) < snapshots_->at(i)); } +#endif input_->SetPinnedItersMgr(&pinned_iters_mgr_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get()); } CompactionIterator::~CompactionIterator() { @@ -131,8 +134,8 @@ void CompactionIterator::Next() { if (merge_out_iter_.Valid()) { key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)) = - ParseInternalKey(key_, &ikey_); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid.
assert(valid_key); @@ -166,6 +169,59 @@ void CompactionIterator::Next() { PrepareOutput(); } +void CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, + Slice* skip_until) { + if (compaction_filter_ != nullptr && + (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. If the return value of the compaction filter is true, + // replace the entry with a deletion marker. + CompactionFilter::Decision filter; + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + CompactionFilter::ValueType value_type = + ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue + : CompactionFilter::ValueType::kBlobIndex; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs + // to get sequence number. + Slice& filter_key = ikey_.type == kTypeValue ? ikey_.user_key : key_; + { + StopWatchNano timer(env_, report_detailed_time_); + filter = compaction_filter_->FilterV2( + compaction_->level(), filter_key, value_type, value_, + &compaction_filter_value_, compaction_filter_skip_until_.rep()); + iter_stats_.total_filter_time += + env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; + } + + if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= + 0) { + // Can't skip to a key smaller than the current one. + // Keep the key as per FilterV2 documentation. + filter = CompactionFilter::Decision::kKeep; + } + + if (filter == CompactionFilter::Decision::kRemove) { + // convert the current key to a delete; key_ is pointing into + // current_key_ at this point, so updating current_key_ updates key() + ikey_.type = kTypeDeletion; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); + // no value associated with delete + value_.clear(); + iter_stats_.num_record_drop_user++; + } else if (filter == CompactionFilter::Decision::kChangeValue) { + value_ = compaction_filter_value_; + } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + *need_skip = true; + compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, + kValueTypeForSeek); + *skip_until = compaction_filter_skip_until_.Encode(); + } + } +} + void CompactionIterator::NextFromInput() { at_next_ = false; valid_ = false; @@ -192,6 +248,7 @@ void CompactionIterator::NextFromInput() { valid_ = true; break; } + TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); // Update input statistics if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion) { @@ -220,73 +277,14 @@ void CompactionIterator::NextFromInput() { has_outputted_key_ = false; current_user_key_sequence_ = kMaxSequenceNumber; current_user_key_snapshot_ = 0; + current_key_committed_ = KeyCommitted(ikey_.sequence); -#ifndef ROCKSDB_LITE - if (compaction_listener_) { - compaction_listener_->OnCompaction(compaction_->level(), ikey_.user_key, - fromInternalValueType(ikey_.type), - value_, ikey_.sequence, true); - } -#endif // ROCKSDB_LITE - - // apply the compaction filter to the first occurrence of the user key - if (compaction_filter_ != nullptr && - (ikey_.type == kTypeValue || ikey_.type == kTypeBlobIndex) && - (visible_at_tip_ || ikey_.sequence > latest_snapshot_ || - ignore_snapshots_)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. 
If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); - CompactionFilter::ValueType value_type = - ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; - { - StopWatchNano timer(env_, true); - filter = compaction_filter_->FilterV2( - compaction_->level(), ikey_.user_key, value_type, value_, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); - iter_stats_.total_filter_time += - env_ != nullptr ? timer.ElapsedNanos() : 0; - } - - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && - cmp_->Compare(*compaction_filter_skip_until_.rep(), - ikey_.user_key) <= 0) { - // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; - } - - if (filter == CompactionFilter::Decision::kRemove) { - // convert the current key to a delete; key_ is pointing into - // current_key_ at this point, so updating current_key_ updates key() - ikey_.type = kTypeDeletion; - current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion); - // no value associated with delete - value_.clear(); - iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { - need_skip = true; - compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, - kValueTypeForSeek); - skip_until = compaction_filter_skip_until_.Encode(); - } + // Apply the compaction filter to the first committed version of the user + // key. + if (current_key_committed_) { + InvokeFilterIfNeeded(&need_skip, &skip_until); } } else { -#ifndef ROCKSDB_LITE - if (compaction_listener_) { - compaction_listener_->OnCompaction(compaction_->level(), ikey_.user_key, - fromInternalValueType(ikey_.type), - value_, ikey_.sequence, false); - } -#endif // ROCKSDB_LITE - // Update the current key to reflect the new sequence number/type without // copying the user key. // TODO(rven): Compaction filter does not process keys in this path @@ -295,13 +293,32 @@ void CompactionIterator::NextFromInput() { current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); key_ = current_key_.GetInternalKey(); ikey_.user_key = current_key_.GetUserKey(); + + // Note that a newer version of a key is ordered before older versions. If a + // newer version of a key is committed, so is the older version. No need + // to query snapshot_checker_ in that case. + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + current_key_committed_ = KeyCommitted(ikey_.sequence); + // Apply the compaction filter to the first committed version of the + // user key. + if (current_key_committed_) { + InvokeFilterIfNeeded(&need_skip, &skip_until); + } + } + } + + if (UNLIKELY(!current_key_committed_)) { + assert(snapshot_checker_ != nullptr); + valid_ = true; + break; + }
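// --- Illustrative sketch (not part of the patch): a minimal FilterV2
// implementation showing the Decision values that InvokeFilterIfNeeded()
// above dispatches on. The "expired:" key-prefix convention is hypothetical.
#include <string>
#include "rocksdb/compaction_filter.h"

class PrefixDroppingFilter : public rocksdb::CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                    ValueType /*type*/, const rocksdb::Slice& /*value*/,
                    std::string* /*new_value*/,
                    std::string* /*skip_until*/) const override {
    // kRemove turns the entry into a deletion marker (as in the code above);
    // kKeep leaves it untouched. kChangeValue / kRemoveAndSkipUntil would
    // use the two out-parameters instead.
    return key.starts_with("expired:") ? Decision::kRemove : Decision::kKeep;
  }
  const char* Name() const override { return "PrefixDroppingFilter"; }
};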
// If there are no snapshots, then this kv affects visibility at tip. // Otherwise, search through all existing snapshots to find the earliest // snapshot that is affected by this kv. - SequenceNumber last_sequence __attribute__((__unused__)) = - current_user_key_sequence_; + SequenceNumber last_sequence __attribute__((__unused__)); + last_sequence = current_user_key_sequence_; current_user_key_sequence_ = ikey_.sequence; SequenceNumber last_snapshot = current_user_key_snapshot_; SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot @@ -366,7 +383,8 @@ void CompactionIterator::NextFromInput() { cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { // Check whether the next key belongs to the same snapshot as the // SingleDelete. - if (prev_snapshot == 0 || next_ikey.sequence > prev_snapshot) { + if (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot)) { if (next_ikey.type == kTypeSingleDeletion) { // We encountered two SingleDeletes in a row. This could be due to // unexpected user input. @@ -377,8 +395,9 @@ // input_->Next(). ++iter_stats_.num_record_drop_obsolete; ++iter_stats_.num_single_del_mismatch; - } else if ((ikey_.sequence <= earliest_write_conflict_snapshot_) || - has_outputted_key_) { + } else if (has_outputted_key_ || + DEFINITELY_IN_SNAPSHOT( + ikey_.sequence, earliest_write_conflict_snapshot_)) { // Found a matching value, we can drop the single delete and the // value. It is safe to drop both records since we've already // outputted a key in this snapshot, or there is no earlier @@ -388,7 +407,8 @@ // is an unexpected Merge or Delete. We will compact it out // either way. We will maintain counts of how many mismatches // happened - if (next_ikey.type != kTypeValue) { + if (next_ikey.type != kTypeValue && + next_ikey.type != kTypeBlobIndex) { ++iter_stats_.num_single_del_mismatch; } @@ -425,7 +445,7 @@ // iteration. If the next key is corrupt, we return before the // comparison, so the value of has_current_user_key does not matter. has_current_user_key_ = false; - if (compaction_ != nullptr && ikey_.sequence <= earliest_snapshot_ && + if (compaction_ != nullptr && IN_EARLIEST_SNAPSHOT(ikey_.sequence) && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { // Key doesn't exist outside of this range. @@ -444,21 +464,38 @@ if (valid_) { at_next_ = true; } - } else if (last_snapshot == current_user_key_snapshot_) { + } else if (last_snapshot == current_user_key_snapshot_ || + (last_snapshot > 0 && + last_snapshot < current_user_key_snapshot_)) { // If the earliest snapshot in which this key is visible // is the same as the visibility of a previous instance of the // same key, then this kv is not visible in any snapshot. // Hidden by a newer entry for same user key - // TODO(noetzli): why not > ? // // Note: Dropping this key will not affect TransactionDB write-conflict // checking since there has already been a record returned for this key // in this snapshot. assert(last_sequence >= current_user_key_sequence_); + + // Note2: if last_snapshot < current_user_key_snapshot, it can only + // mean that last_snapshot was released between the time we processed the + // last value and this value, and findEarliestVisibleSnapshot returns the + // next snapshot as current_user_key_snapshot. In this case the last value + // and the current value are both in current_user_key_snapshot currently.
+ // Although last_snapshot is released we might still get a definitive + // response when key sequence number changes, e.g., when seq is determined + // too old and visible in all snapshots. + assert(last_snapshot == current_user_key_snapshot_ || + (snapshot_checker_ != nullptr && + snapshot_checker_->CheckInSnapshot(current_user_key_sequence_, + last_snapshot) != + SnapshotCheckerResult::kNotInSnapshot)); + ++iter_stats_.num_record_drop_hidden; // (A) input_->Next(); } else if (compaction_ != nullptr && ikey_.type == kTypeDeletion && - ikey_.sequence <= earliest_snapshot_ && + IN_EARLIEST_SNAPSHOT(ikey_.sequence) && + ikeyNotNeededForIncrementalSnapshot() && compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key, &level_ptrs_)) { // TODO(noetzli): This is the only place where we use compaction_ @@ -475,11 +512,38 @@ // // Note: Dropping this Delete will not affect TransactionDB // write-conflict checking since it is earlier than any snapshot. + // + // It seems that we can also drop deletion later than earliest snapshot + // given that: + // (1) The deletion is earlier than earliest_write_conflict_snapshot, and + // (2) No value exists earlier than the deletion. ++iter_stats_.num_record_drop_obsolete; if (!bottommost_level_) { ++iter_stats_.num_optimized_del_drop_obsolete; } input_->Next(); + } else if ((ikey_.type == kTypeDeletion) && bottommost_level_ && + ikeyNotNeededForIncrementalSnapshot()) { + // Handle the case where we have a delete key at the bottommost level + // We can skip outputting the key iff there are no subsequent puts for this + // key + ParsedInternalKey next_ikey; + input_->Next(); + // Skip over all versions of this key that happen to occur in the same snapshot + // range as the delete + while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key) && + (prev_snapshot == 0 || + DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) { + input_->Next(); + } + // If we still need to output a row with this key, we need to output the + // delete too + if (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) && + cmp_->Equal(ikey_.user_key, next_ikey.user_key)) { + valid_ = true; + at_next_ = true; + } } else if (ikey_.type == kTypeMerge) { if (!merge_helper_->HasOperator()) { status_ = Status::InvalidArgument( @@ -504,8 +568,8 @@ // These will be correctly set below. key_ = merge_out_iter_.key(); value_ = merge_out_iter_.value(); - bool valid_key __attribute__((__unused__)) = - ParseInternalKey(key_, &ikey_); + bool valid_key __attribute__((__unused__)); + valid_key = ParseInternalKey(key_, &ikey_); // MergeUntil stops when it encounters a corrupt key and does not // include them in the result, so we expect the keys here to be valid. assert(valid_key); @@ -529,7 +593,7 @@ // 1. new user key -OR- // 2. different snapshot stripe bool should_delete = range_del_agg_->ShouldDelete( - key_, RangeDelAggregator::RangePositioningMode::kForwardTraversal); + key_, RangeDelPositioningMode::kForwardTraversal); if (should_delete) { ++iter_stats_.num_record_drop_hidden; ++iter_stats_.num_record_drop_range_del; @@ -555,13 +619,15 @@ void CompactionIterator::PrepareOutput() { // and the earliest snapshot is larger than this seqno // and the userkey differs from the last userkey in compaction // then we can squash the seqno to zero.
- + // This is safe for TransactionDB write-conflict checking since transactions // only care about sequence number larger than any active snapshots. + // + // Can we do the same for levels above bottom level as long as + // KeyNotExistsBeyondOutputLevel() returns true? if ((compaction_ != nullptr && !compaction_->allow_ingest_behind()) && - bottommost_level_ && valid_ && ikey_.sequence <= earliest_snapshot_ && - ikey_.type != kTypeMerge && - !cmp_->Equal(compaction_->GetLargestUserKey(), ikey_.user_key)) { + ikeyNotNeededForIncrementalSnapshot() && bottommost_level_ && valid_ && + IN_EARLIEST_SNAPSHOT(ikey_.sequence) && ikey_.type != kTypeMerge) { assert(ikey_.type != kTypeDeletion && ikey_.type != kTypeSingleDeletion); ikey_.sequence = 0; current_key_.UpdateInternalKey(0, ikey_.type); @@ -571,18 +637,68 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot) { assert(snapshots_->size()); - SequenceNumber prev __attribute__((__unused__)) = kMaxSequenceNumber; - for (const auto cur : *snapshots_) { - assert(prev == kMaxSequenceNumber || prev <= cur); - if (cur >= in) { - *prev_snapshot = prev == kMaxSequenceNumber ? 0 : prev; + auto snapshots_iter = std::lower_bound( + snapshots_->begin(), snapshots_->end(), in); + if (snapshots_iter == snapshots_->begin()) { + *prev_snapshot = 0; + } else { + *prev_snapshot = *std::prev(snapshots_iter); + assert(*prev_snapshot < in); + } + if (snapshot_checker_ == nullptr) { + return snapshots_iter != snapshots_->end() + ? *snapshots_iter : kMaxSequenceNumber; + } + bool has_released_snapshot = !released_snapshots_.empty(); + for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) { + auto cur = *snapshots_iter; + assert(in <= cur); + // Skip if cur is in released_snapshots. + if (has_released_snapshot && released_snapshots_.count(cur) > 0) { + continue; + } + auto res = snapshot_checker_->CheckInSnapshot(in, cur); + if (res == SnapshotCheckerResult::kInSnapshot) { return cur; + } else if (res == SnapshotCheckerResult::kSnapshotReleased) { + released_snapshots_.insert(cur); } - prev = cur; - assert(prev < kMaxSequenceNumber); + *prev_snapshot = cur; } - *prev_snapshot = prev; return kMaxSequenceNumber; } +// used in 2 places - prevents deletion markers from being dropped if they may +// be needed and disables seqnum zero-out in PrepareOutput for recent keys. +inline bool CompactionIterator::ikeyNotNeededForIncrementalSnapshot() { + return (!compaction_->preserve_deletes()) || + (ikey_.sequence < preserve_deletes_seqnum_); +} +
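// --- Illustrative sketch (not part of the patch): the snapshot search that
// findEarliestVisibleSnapshot() above performs in the common case where no
// SnapshotChecker is installed: a plain lower_bound over the ascending
// snapshot list, also reporting the previous (smaller) snapshot.
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

uint64_t EarliestVisibleSnapshotSketch(const std::vector<uint64_t>& snapshots,
                                       uint64_t seq, uint64_t* prev_snapshot) {
  auto it = std::lower_bound(snapshots.begin(), snapshots.end(), seq);
  *prev_snapshot = (it == snapshots.begin()) ? 0 : *std::prev(it);
  // UINT64_MAX stands in for kMaxSequenceNumber: no snapshot sees this seq.
  return it == snapshots.end() ? UINT64_MAX : *it;
}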
+bool CompactionIterator::IsInEarliestSnapshot(SequenceNumber sequence) { + assert(snapshot_checker_ != nullptr); + assert(earliest_snapshot_ == kMaxSequenceNumber || + (earliest_snapshot_iter_ != snapshots_->end() && + *earliest_snapshot_iter_ == earliest_snapshot_)); + auto in_snapshot = + snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + while (UNLIKELY(in_snapshot == SnapshotCheckerResult::kSnapshotReleased)) { + // Avoid the current earliest_snapshot_ being returned as + // earliest visible snapshot for the next value. So if a value's sequence + // is zero-ed out by PrepareOutput(), the next value will be compacted out. + released_snapshots_.insert(earliest_snapshot_); + earliest_snapshot_iter_++; + + if (earliest_snapshot_iter_ == snapshots_->end()) { + earliest_snapshot_ = kMaxSequenceNumber; + } else { + earliest_snapshot_ = *earliest_snapshot_iter_; + } + in_snapshot = + snapshot_checker_->CheckInSnapshot(sequence, earliest_snapshot_); + } + assert(in_snapshot != SnapshotCheckerResult::kSnapshotReleased); + return in_snapshot == SnapshotCheckerResult::kInSnapshot; +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/compaction_iterator.h b/thirdparty/rocksdb/db/compaction_iterator.h index cad2386669..a9e7a26207 100644 --- a/thirdparty/rocksdb/db/compaction_iterator.h +++ b/thirdparty/rocksdb/db/compaction_iterator.h @@ -7,6 +7,7 @@ #include <algorithm> #include <deque> #include <string> +#include <unordered_set> #include <vector> #include "db/compaction.h" @@ -14,13 +15,12 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" #include "options/cf_options.h" #include "rocksdb/compaction_filter.h" namespace rocksdb { -class CompactionEventListener; - class CompactionIterator { public: // A wrapper around Compaction. Has a much smaller interface, only what // : compaction_(compaction) {} virtual ~CompactionProxy() = default; - virtual int level(size_t compaction_input_level = 0) const { + virtual int level(size_t /*compaction_input_level*/ = 0) const { return compaction_->level(); } virtual bool KeyNotExistsBeyondOutputLevel( @@ -48,6 +48,9 @@ virtual bool allow_ingest_behind() const { return compaction_->immutable_cf_options()->allow_ingest_behind; } + virtual bool preserve_deletes() const { + return compaction_->immutable_cf_options()->preserve_deletes; + } protected: CompactionProxy() = default; @@ -59,25 +62,27 @@ CompactionIterator(InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, - SequenceNumber earliest_write_conflict_snapshot, Env* env, - bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, - CompactionEventListener* compaction_listener = nullptr, - const std::atomic<bool>* shutting_down = nullptr); + const std::atomic<bool>* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0); // Constructor with custom CompactionProxy, used for tests.
CompactionIterator(InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, - SequenceNumber earliest_write_conflict_snapshot, Env* env, - bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, + SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, Env* env, + bool report_detailed_time, bool expect_valid_internal_key, + CompactionRangeDelAggregator* range_del_agg, std::unique_ptr<CompactionProxy> compaction, const CompactionFilter* compaction_filter = nullptr, - CompactionEventListener* compaction_listener = nullptr, - const std::atomic<bool>* shutting_down = nullptr); + const std::atomic<bool>* shutting_down = nullptr, + const SequenceNumber preserve_deletes_seqnum = 0); ~CompactionIterator(); @@ -111,6 +116,9 @@ // compression. void PrepareOutput(); + // Invoke compaction filter if needed. + void InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until); + // Given a sequence number, return the sequence number of the // earliest snapshot that this sequence number is visible in. // The snapshots themselves are arranged in ascending order of @@ -120,26 +128,45 @@ inline SequenceNumber findEarliestVisibleSnapshot( SequenceNumber in, SequenceNumber* prev_snapshot); + // Checks whether the currently seen ikey_ is needed for + // incremental (differential) snapshot and hence can't be dropped + // or seqnum be zero-ed out even if all other conditions for it are met. + inline bool ikeyNotNeededForIncrementalSnapshot(); + + inline bool KeyCommitted(SequenceNumber sequence) { + return snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(sequence, kMaxSequenceNumber) == + SnapshotCheckerResult::kInSnapshot; + } + + bool IsInEarliestSnapshot(SequenceNumber sequence); + InternalIterator* input_; const Comparator* cmp_; MergeHelper* merge_helper_; const std::vector<SequenceNumber>* snapshots_; + // List of snapshots released during compaction. + // findEarliestVisibleSnapshot() finds them from the return value of + // snapshot_checker_, and makes sure they will not be returned as the + // earliest visible snapshot of an older value. + // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3. + std::unordered_set<SequenceNumber> released_snapshots_; + std::vector<SequenceNumber>::const_iterator earliest_snapshot_iter_; const SequenceNumber earliest_write_conflict_snapshot_; + const SnapshotChecker* const snapshot_checker_; Env* env_; + bool report_detailed_time_; bool expect_valid_internal_key_; - RangeDelAggregator* range_del_agg_; + CompactionRangeDelAggregator* range_del_agg_; std::unique_ptr<CompactionProxy> compaction_; const CompactionFilter* compaction_filter_; -#ifndef ROCKSDB_LITE - CompactionEventListener* compaction_listener_; -#endif // ROCKSDB_LITE const std::atomic<bool>* shutting_down_; + const SequenceNumber preserve_deletes_seqnum_; bool bottommost_level_; bool valid_ = false; bool visible_at_tip_; SequenceNumber earliest_snapshot_; SequenceNumber latest_snapshot_; - bool ignore_snapshots_; // State // @@ -189,6 +216,10 @@ std::vector<size_t> level_ptrs_; CompactionIterationStats iter_stats_; + // Used to avoid purging uncommitted values. The application can specify + // uncommitted values by providing a SnapshotChecker object. + bool current_key_committed_; + bool IsShuttingDown() { // This is a best-effort facility, so memory_order_relaxed is sufficient.
return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); diff --git a/thirdparty/rocksdb/db/compaction_iterator_test.cc b/thirdparty/rocksdb/db/compaction_iterator_test.cc index dfc4139363..c466f6c912 100644 --- a/thirdparty/rocksdb/db/compaction_iterator_test.cc +++ b/thirdparty/rocksdb/db/compaction_iterator_test.cc @@ -9,23 +9,25 @@ #include #include "port/port.h" +#include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" +#include "utilities/merge_operators.h" namespace rocksdb { // Expects no merging attempts. class NoMergingMergeOp : public MergeOperator { public: - bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* /*merge_out*/) const override { ADD_FAILURE(); return false; } - bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& /*operand_list*/, + std::string* /*new_value*/, + Logger* /*logger*/) const override { ADD_FAILURE(); return false; } @@ -39,9 +41,9 @@ class NoMergingMergeOp : public MergeOperator { // Always returns Decition::kRemove. class StallingFilter : public CompactionFilter { public: - virtual Decision FilterV2(int level, const Slice& key, ValueType t, - const Slice& existing_value, std::string* new_value, - std::string* skip_until) const override { + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { int k = std::atoi(key.ToString().c_str()); last_seen.store(k); while (k >= stall_at.load()) { @@ -72,6 +74,18 @@ class StallingFilter : public CompactionFilter { mutable std::atomic last_seen{0}; }; +// Compaction filter that filter out all keys. 
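// A sketch of the expected effect, with hypothetical data: when kRemove is
// returned for a committed value in a non-bottommost compaction, the entry is
// not dropped outright but rewritten as a tombstone, so that older versions
// of the key in lower levels stay shadowed:
//
//   input:  k@1 -> "v1"        output:  k@1 -> kTypeDeletion, ""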
+class FilterAllKeysCompactionFilter : public CompactionFilter { + public: + Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "AllKeysCompactionFilter"; } +}; + class LoggingForwardVectorIterator : public InternalIterator { public: struct Action { @@ -98,39 +112,39 @@ class LoggingForwardVectorIterator : public InternalIterator { assert(keys_.size() == values_.size()); } - virtual bool Valid() const override { return current_ < keys_.size(); } + bool Valid() const override { return current_ < keys_.size(); } - virtual void SeekToFirst() override { + void SeekToFirst() override { log.emplace_back(Action::Type::SEEK_TO_FIRST); current_ = 0; } - virtual void SeekToLast() override { assert(false); } + void SeekToLast() override { assert(false); } - virtual void Seek(const Slice& target) override { + void Seek(const Slice& target) override { log.emplace_back(Action::Type::SEEK, target.ToString()); current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - keys_.begin(); } - virtual void SeekForPrev(const Slice& target) override { assert(false); } + void SeekForPrev(const Slice& /*target*/) override { assert(false); } - virtual void Next() override { + void Next() override { assert(Valid()); log.emplace_back(Action::Type::NEXT); current_++; } - virtual void Prev() override { assert(false); } + void Prev() override { assert(false); } - virtual Slice key() const override { + Slice key() const override { assert(Valid()); return Slice(keys_[current_]); } - virtual Slice value() const override { + Slice value() const override { assert(Valid()); return Slice(values_[current_]); } - virtual Status status() const override { return Status::OK(); } + Status status() const override { return Status::OK(); } std::vector log; @@ -144,71 +158,156 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { public: FakeCompaction() = default; - virtual int level(size_t compaction_input_level) const { return 0; } - virtual bool KeyNotExistsBeyondOutputLevel( - const Slice& user_key, std::vector* level_ptrs) const { - return key_not_exists_beyond_output_level; + int level(size_t /*compaction_input_level*/) const override { return 0; } + bool KeyNotExistsBeyondOutputLevel( + const Slice& /*user_key*/, + std::vector* /*level_ptrs*/) const override { + return is_bottommost_level || key_not_exists_beyond_output_level; } - virtual bool bottommost_level() const { return false; } - virtual int number_levels() const { return 1; } - virtual Slice GetLargestUserKey() const { + bool bottommost_level() const override { return is_bottommost_level; } + int number_levels() const override { return 1; } + Slice GetLargestUserKey() const override { return "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; } - virtual bool allow_ingest_behind() const { return false; } + bool allow_ingest_behind() const override { return false; } + + bool preserve_deletes() const override { return false; } bool key_not_exists_beyond_output_level = false; + + bool is_bottommost_level = false; }; -class CompactionIteratorTest : public testing::Test { +// A simplifed snapshot checker which assumes each snapshot has a global +// last visible sequence. 
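// As a sketch of the contract (hypothetical numbers): a checker built with
// last committed sequence 10 and snapshot map {{5, 5}} would be expected to
// answer
//   CheckInSnapshot(3, 5)                  -> kInSnapshot    (3 <= 5)
//   CheckInSnapshot(7, 5)                  -> kNotInSnapshot (7 > 5)
//   CheckInSnapshot(7, kMaxSequenceNumber) -> kInSnapshot    (7 <= 10, i.e.
//                                             the write is committed)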
+class TestSnapshotChecker : public SnapshotChecker { + public: + explicit TestSnapshotChecker( + SequenceNumber last_committed_sequence, + const std::unordered_map& snapshots = {}) + : last_committed_sequence_(last_committed_sequence), + snapshots_(snapshots) {} + + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + if (snapshot_seq == kMaxSequenceNumber) { + return seq <= last_committed_sequence_ + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + assert(snapshots_.count(snapshot_seq) > 0); + return seq <= snapshots_.at(snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + private: + SequenceNumber last_committed_sequence_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshots_; +}; + +// Test param: +// bool: whether to pass snapshot_checker to compaction iterator. +class CompactionIteratorTest : public testing::TestWithParam { public: CompactionIteratorTest() : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {} - void InitIterators(const std::vector& ks, - const std::vector& vs, - const std::vector& range_del_ks, - const std::vector& range_del_vs, - SequenceNumber last_sequence, - MergeOperator* merge_op = nullptr, - CompactionFilter* filter = nullptr) { - std::unique_ptr range_del_iter( + void InitIterators( + const std::vector& ks, const std::vector& vs, + const std::vector& range_del_ks, + const std::vector& range_del_vs, + SequenceNumber last_sequence, + SequenceNumber last_committed_sequence = kMaxSequenceNumber, + MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + std::unique_ptr unfragmented_range_del_iter( new test::VectorIterator(range_del_ks, range_del_vs)); - range_del_agg_.reset(new RangeDelAggregator(icmp_, snapshots_)); - ASSERT_OK(range_del_agg_->AddTombstones(std::move(range_del_iter))); + auto tombstone_list = std::make_shared( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; - if (filter) { + if (filter || bottommost_level) { compaction_proxy_ = new FakeCompaction(); + compaction_proxy_->is_bottommost_level = bottommost_level; compaction.reset(compaction_proxy_); } + bool use_snapshot_checker = UseSnapshotChecker() || GetParam(); + if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) { + snapshot_checker_.reset( + new TestSnapshotChecker(last_committed_sequence, snapshot_map_)); + } + merge_helper_.reset( + new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false, + 0 /*latest_snapshot*/, snapshot_checker_.get(), + 0 /*level*/, nullptr /*statistics*/, &shutting_down_)); - merge_helper_.reset(new MergeHelper(Env::Default(), cmp_, merge_op, filter, - nullptr, false, 0, 0, nullptr, - &shutting_down_)); iter_.reset(new LoggingForwardVectorIterator(ks, vs)); iter_->SeekToFirst(); c_iter_.reset(new CompactionIterator( iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_, - kMaxSequenceNumber, Env::Default(), false, range_del_agg_.get(), - std::move(compaction), filter, nullptr, &shutting_down_)); + 
earliest_write_conflict_snapshot, snapshot_checker_.get(), + Env::Default(), false /* report_detailed_time */, false, + range_del_agg_.get(), std::move(compaction), filter, &shutting_down_)); + } + + void AddSnapshot(SequenceNumber snapshot, + SequenceNumber last_visible_seq = kMaxSequenceNumber) { + snapshots_.push_back(snapshot); + snapshot_map_[snapshot] = last_visible_seq; } - void AddSnapshot(SequenceNumber snapshot) { snapshots_.push_back(snapshot); } + virtual bool UseSnapshotChecker() const { return false; } + + void RunTest( + const std::vector& input_keys, + const std::vector& input_values, + const std::vector& expected_keys, + const std::vector& expected_values, + SequenceNumber last_committed_seq = kMaxSequenceNumber, + MergeOperator* merge_operator = nullptr, + CompactionFilter* compaction_filter = nullptr, + bool bottommost_level = false, + SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { + InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber, + last_committed_seq, merge_operator, compaction_filter, + bottommost_level, earliest_write_conflict_snapshot); + c_iter_->SeekToFirst(); + for (size_t i = 0; i < expected_keys.size(); i++) { + std::string info = "i = " + ToString(i); + ASSERT_TRUE(c_iter_->Valid()) << info; + ASSERT_OK(c_iter_->status()) << info; + ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info; + ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info; + c_iter_->Next(); + } + ASSERT_FALSE(c_iter_->Valid()); + } const Comparator* cmp_; const InternalKeyComparator icmp_; std::vector snapshots_; + // A map of valid snapshot to last visible sequence to the snapshot. + std::unordered_map snapshot_map_; std::unique_ptr merge_helper_; std::unique_ptr iter_; std::unique_ptr c_iter_; - std::unique_ptr range_del_agg_; + std::unique_ptr range_del_agg_; + std::unique_ptr snapshot_checker_; std::atomic shutting_down_{false}; FakeCompaction* compaction_proxy_; }; // It is possible that the output of the compaction iterator is empty even if // the input is not. -TEST_F(CompactionIteratorTest, EmptyResult) { +TEST_P(CompactionIteratorTest, EmptyResult) { InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), test::KeyStr("a", 3, kTypeValue)}, {"", "val"}, {}, {}, 5); @@ -218,7 +317,7 @@ TEST_F(CompactionIteratorTest, EmptyResult) { // If there is a corruption after a single deletion, the corrupted key should // be preserved. 
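// A worked instance of that rule, using the data of the test below: given
//   a@5 kTypeSingleDeletion, a@3 kTypeValue (corrupt), b@10 kTypeValue
// the single-delete/put pair for "a" cannot be collapsed, because the corrupt
// entry cannot be trusted; both "a" entries are emitted unchanged ahead of
// "b".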
-TEST_F(CompactionIteratorTest, CorruptionAfterSingleDeletion) { +TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) { InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion), test::KeyStr("a", 3, kTypeValue, true), test::KeyStr("b", 10, kTypeValue)}, @@ -237,7 +336,7 @@ TEST_F(CompactionIteratorTest, CorruptionAfterSingleDeletion) { ASSERT_FALSE(c_iter_->Valid()); } -TEST_F(CompactionIteratorTest, SimpleRangeDeletion) { +TEST_P(CompactionIteratorTest, SimpleRangeDeletion) { InitIterators({test::KeyStr("morning", 5, kTypeValue), test::KeyStr("morning", 2, kTypeValue), test::KeyStr("night", 3, kTypeValue)}, @@ -253,7 +352,7 @@ TEST_F(CompactionIteratorTest, SimpleRangeDeletion) { ASSERT_FALSE(c_iter_->Valid()); } -TEST_F(CompactionIteratorTest, RangeDeletionWithSnapshots) { +TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) { AddSnapshot(10); std::vector ks1; ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion)); @@ -274,12 +373,11 @@ TEST_F(CompactionIteratorTest, RangeDeletionWithSnapshots) { ASSERT_FALSE(c_iter_->Valid()); } -TEST_F(CompactionIteratorTest, CompactionFilterSkipUntil) { +TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) { class Filter : public CompactionFilter { - virtual Decision FilterV2(int level, const Slice& key, ValueType t, - const Slice& existing_value, - std::string* new_value, - std::string* skip_until) const override { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* skip_until) const override { std::string k = key.ToString(); std::string v = existing_value.ToString(); // See InitIterators() call below for the sequence of keys and their @@ -349,7 +447,7 @@ TEST_F(CompactionIteratorTest, CompactionFilterSkipUntil) { test::KeyStr("j", 99, kTypeValue)}, {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30", "fv25", "gv90", "hv91", "im95", "jv99"}, - {}, {}, kMaxSequenceNumber, &merge_op, &filter); + {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter); // Compaction should output just "a", "e" and "h" keys. c_iter_->SeekToFirst(); @@ -384,13 +482,14 @@ TEST_F(CompactionIteratorTest, CompactionFilterSkipUntil) { ASSERT_EQ(expected_actions, iter_->log); } -TEST_F(CompactionIteratorTest, ShuttingDownInFilter) { +TEST_P(CompactionIteratorTest, ShuttingDownInFilter) { NoMergingMergeOp merge_op; StallingFilter filter; InitIterators( {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue), test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)}, - {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, &merge_op, &filter); + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); // Don't leave tombstones (kTypeDeletion) for filtered keys. compaction_proxy_->key_not_exists_beyond_output_level = true; @@ -421,13 +520,14 @@ TEST_F(CompactionIteratorTest, ShuttingDownInFilter) { // Same as ShuttingDownInFilter, but shutdown happens during filter call for // a merge operand, not for a value. 
-TEST_F(CompactionIteratorTest, ShuttingDownInMerge) { +TEST_P(CompactionIteratorTest, ShuttingDownInMerge) { NoMergingMergeOp merge_op; StallingFilter filter; InitIterators( {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge), test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)}, - {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, &merge_op, &filter); + {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, + &merge_op, &filter); compaction_proxy_->key_not_exists_beyond_output_level = true; std::atomic seek_done{false}; @@ -455,12 +555,11 @@ TEST_F(CompactionIteratorTest, ShuttingDownInMerge) { EXPECT_EQ(2, filter.last_seen.load()); } -TEST_F(CompactionIteratorTest, SingleMergeOperand) { +TEST_P(CompactionIteratorTest, SingleMergeOperand) { class Filter : public CompactionFilter { - virtual Decision FilterV2(int level, const Slice& key, ValueType t, - const Slice& existing_value, - std::string* new_value, - std::string* skip_until) const override { + Decision FilterV2(int /*level*/, const Slice& key, ValueType t, + const Slice& existing_value, std::string* /*new_value*/, + std::string* /*skip_until*/) const override { std::string k = key.ToString(); std::string v = existing_value.ToString(); @@ -511,7 +610,7 @@ TEST_F(CompactionIteratorTest, SingleMergeOperand) { bool PartialMergeMulti(const Slice& key, const std::deque& operand_list, std::string* new_value, - Logger* logger) const override { + Logger* /*logger*/) const override { std::string string_key = key.ToString(); EXPECT_TRUE(string_key == "a" || string_key == "b"); @@ -547,7 +646,7 @@ TEST_F(CompactionIteratorTest, SingleMergeOperand) { // c should invoke FullMerge due to kTypeValue at the beginning. test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)}, {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber, - &merge_op, &filter); + kMaxSequenceNumber, &merge_op, &filter); c_iter_->SeekToFirst(); ASSERT_TRUE(c_iter_->Valid()); @@ -560,6 +659,315 @@ TEST_F(CompactionIteratorTest, SingleMergeOperand) { ASSERT_EQ("cv1cv2", c_iter_->value().ToString()); } +// In bottommost level, values earlier than earliest snapshot can be output +// with sequence = 0. +TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)}, + {"v1", "v2"}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// In bottommost level, deletions earlier than earliest snapshot can be removed +// permanently. +TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 1, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("b", 3, kTypeDeletion), + test::KeyStr("b", 0, kTypeValue)}, + {"", ""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +// In bottommost level, single deletions earlier than earliest snapshot can be +// removed permanently. 
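// Concretely, with AddSnapshot(1) in the test below: a@1 (1 <= 1) sits at or
// below the earliest snapshot and is dropped for good, while b@2 (2 > 1) is
// newer than the earliest snapshot and must survive.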
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) { + AddSnapshot(1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion)}, + {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""}, + kMaxSequenceNumber /*last_commited_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, true /*bottommost_level*/); +} + +INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest, + testing::Values(true, false)); + +// Tests how CompactionIterator work together with SnapshotChecker. +class CompactionIteratorWithSnapshotCheckerTest + : public CompactionIteratorTest { + public: + bool UseSnapshotChecker() const override { return true; } +}; + +// Uncommitted keys (keys with seq > last_committed_seq) should be output as-is +// while committed version of these keys should get compacted as usual. + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Value) { + RunTest( + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Deletion) { + RunTest({test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_Merge) { + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)}, + {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_SingleDelete) { + RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("foo", 2, kTypeSingleDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"", "v1"}, 1 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + PreserveUncommittedKeys_BlobIndex) { + RunTest({test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v3", "v2", "v1"}, + {test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex)}, + {"v3", "v2"}, 2 /*last_committed_seq*/); +} + +// Test compaction iterator dedup keys visible to the same snapshot. 
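// A worked example for the first case below: snapshot map {2 -> last visible
// sequence 1}, last committed sequence 3, versions foo@4..foo@1:
//   foo@4: 4 > 3, uncommitted                          -> kept as-is
//   foo@3: newest committed version                    -> kept
//   foo@2: the snapshot at 2 only sees up to sequence 1, so foo@2 shares a
//          snapshot stripe with foo@3                  -> deduped away
//   foo@1: last version visible to the snapshot at 2   -> kept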
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeDeletion), + test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) { + AddSnapshot(2, 1); + AddSnapshot(4, 3); + auto merge_op = MergeOperators::CreateStringAppendOperator(); + RunTest( + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge), + test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge), + test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)}, + {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + DedupSameSnapshot_SingleDeletion) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("foo", 4, kTypeValue), + test::KeyStr("foo", 3, kTypeSingleDeletion), + test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)}, + {"v4", "v1"}, 3 /*last_committed_seq*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 2, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v2", "v1"}, + {test::KeyStr("foo", 4, kTypeBlobIndex), + test::KeyStr("foo", 3, kTypeBlobIndex), + test::KeyStr("foo", 1, kTypeBlobIndex)}, + {"v4", "v3", "v1"}, 3 /*last_committed_seq*/); +} + +// At bottom level, sequence numbers can be zero out, and deletions can be +// removed, but only when they are visible to earliest snapshot. 
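// For the first case below, the earliest snapshot is 2 but its last visible
// sequence is 1: a@1 (1 <= 1) is inside the snapshot and has its sequence
// zeroed, while the checker rejects b@2 (2 > 1) and c@3 lies above the
// snapshot entirely, so both keep their sequence numbers.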
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, + {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue), + test::KeyStr("c", 3, kTypeValue)}, + {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest( + {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion), + test::KeyStr("c", 3, kTypeDeletion)}, + {"", "", ""}, + {}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveDeletionIfValuePresentToEarlierSnapshot) { + AddSnapshot(2,1); + RunTest( + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, + {test::KeyStr("a", 4, kTypeDeletion), test::KeyStr("a", 0, kTypeValue), + test::KeyStr("b", 3, kTypeValue)}, + {"", "", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 1, kTypeSingleDeletion), + test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", "", ""}, + {test::KeyStr("b", 2, kTypeSingleDeletion), + test::KeyStr("c", 3, kTypeSingleDeletion)}, + {"", ""}, kMaxSequenceNumber /*last_commited_seq*/, + nullptr /*merge_operator*/, nullptr /*compaction_filter*/, + true /*bottommost_level*/); +} + +// Single delete should not cancel out values that not visible to the +// same set of snapshots +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + SingleDeleteAcrossSnapshotBoundary) { + AddSnapshot(2, 1); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, 2 /*last_committed_seq*/); +} + +// Single delete should be kept in case it is not visible to the +// earliest write conflict snapshot. If a single delete is kept for this reason, +// corresponding value can be trimmed to save space. +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + KeepSingleDeletionForWriteConflictChecking) { + AddSnapshot(2, 0); + RunTest({test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeSingleDeletion), + test::KeyStr("a", 1, kTypeValue)}, + {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/, + nullptr /*compaction_filter*/, false /*bottommost_level*/, + 2 /*earliest_write_conflict_snapshot*/); +} + +// Compaction filter should keep uncommitted key as-is, and +// * Convert the latest velue to deletion, and/or +// * if latest value is a merge, apply filter to all suequent merges. 
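// In the first case below (last committed sequence 1), a@2 and b@3 are
// uncommitted and pass through untouched, while the committed a@1 and c@1 are
// filtered and, since this is not the bottommost level, rewritten as
// kTypeDeletion tombstones rather than dropped.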
+ +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) { + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)}, + {"v2", "v1", "v3", "v4"}, + {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion), + test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)}, + {"v2", "", "v3", ""}, 1 /*last_committed_seq*/, + nullptr /*merge_operator*/, compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) { + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)}, + {"", "v1"}, + {test::KeyStr("a", 2, kTypeDeletion), + test::KeyStr("a", 1, kTypeDeletion)}, + {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/, + compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, + CompactionFilter_PartialMerge) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeMerge)}, + {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"}, + 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get()); +} + +TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) { + std::shared_ptr merge_op = + MergeOperators::CreateStringAppendOperator(); + std::unique_ptr compaction_filter( + new FilterAllKeysCompactionFilter()); + RunTest( + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge), + test::KeyStr("a", 1, kTypeValue)}, + {"v3", "v2", "v1"}, + {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)}, + {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(), + compaction_filter.get()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/compaction_job.cc b/thirdparty/rocksdb/db/compaction_job.cc index 1d023ca456..65e9719a39 100644 --- a/thirdparty/rocksdb/db/compaction_job.cc +++ b/thirdparty/rocksdb/db/compaction_job.cc @@ -25,8 +25,10 @@ #include #include "db/builder.h" +#include "db/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" +#include "db/error_handler.h" #include "db/event_helpers.h" #include "db/log_reader.h" #include "db/log_writer.h" @@ -34,11 +36,11 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" +#include "db/range_del_aggregator.h" #include "db/version_set.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "port/likely.h" #include "port/port.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -63,6 +65,46 @@ namespace rocksdb { +const char* GetCompactionReasonString(CompactionReason compaction_reason) { + switch (compaction_reason) { + case CompactionReason::kUnknown: + return "Unknown"; + case CompactionReason::kLevelL0FilesNum: + return "LevelL0FilesNum"; + case CompactionReason::kLevelMaxLevelSize: + return "LevelMaxLevelSize"; + case CompactionReason::kUniversalSizeAmplification: + return "UniversalSizeAmplification"; + case CompactionReason::kUniversalSizeRatio: + return "UniversalSizeRatio"; + case 
CompactionReason::kUniversalSortedRunNum: + return "UniversalSortedRunNum"; + case CompactionReason::kFIFOMaxSize: + return "FIFOMaxSize"; + case CompactionReason::kFIFOReduceNumFiles: + return "FIFOReduceNumFiles"; + case CompactionReason::kFIFOTtl: + return "FIFOTtl"; + case CompactionReason::kManualCompaction: + return "ManualCompaction"; + case CompactionReason::kFilesMarkedForCompaction: + return "FilesMarkedForCompaction"; + case CompactionReason::kBottommostFiles: + return "BottommostFiles"; + case CompactionReason::kTtl: + return "Ttl"; + case CompactionReason::kFlush: + return "Flush"; + case CompactionReason::kExternalSstIngestion: + return "ExternalSstIngestion"; + case CompactionReason::kNumOfReasons: + // fall through + default: + assert(false); + return "Invalid"; + } +} + // Maintains state for each sub-compaction struct CompactionJob::SubcompactionState { const Compaction* compaction; @@ -115,7 +157,6 @@ struct CompactionJob::SubcompactionState { uint64_t overlapped_bytes = 0; // A flag determine whether the key has been seen in ShouldStopBefore() bool seen_key = false; - std::string compression_dict; SubcompactionState(Compaction* c, Slice* _start, Slice* _end, uint64_t size = 0) @@ -131,8 +172,7 @@ struct CompactionJob::SubcompactionState { approx_size(size), grandparent_index(0), overlapped_bytes(0), - seen_key(false), - compression_dict() { + seen_key(false) { assert(compaction != nullptr); } @@ -155,11 +195,10 @@ struct CompactionJob::SubcompactionState { grandparent_index = std::move(o.grandparent_index); overlapped_bytes = std::move(o.overlapped_bytes); seen_key = std::move(o.seen_key); - compression_dict = std::move(o.compression_dict); return *this; } - // Because member unique_ptrs do not have these. + // Because member std::unique_ptrs do not have these. 
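  // (std::unique_ptr is move-only, so the copy operations below are deleted
  // explicitly, while the move constructor/assignment above are hand-written.)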
SubcompactionState(const SubcompactionState&) = delete; SubcompactionState& operator=(const SubcompactionState&) = delete; @@ -264,37 +303,46 @@ void CompactionJob::AggregateStatistics() { CompactionJob::CompactionJob( int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, VersionSet* versions, - const std::atomic* shutting_down, LogBuffer* log_buffer, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, Statistics* stats, - InstrumentedMutex* db_mutex, Status* db_bg_error, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, - std::shared_ptr table_cache, EventLogger* event_logger, - bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, - CompactionJobStats* compaction_job_stats) + const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, + EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, + const std::string& dbname, CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri) : job_id_(job_id), compact_(new CompactionState(compaction)), compaction_job_stats_(compaction_job_stats), - compaction_stats_(1), + compaction_stats_(compaction->compaction_reason(), 1), dbname_(dbname), db_options_(db_options), env_options_(env_options), env_(db_options.env), + env_optiosn_for_read_( + env_->OptimizeForCompactionTableRead(env_options, db_options_)), versions_(versions), shutting_down_(shutting_down), + preserve_deletes_seqnum_(preserve_deletes_seqnum), log_buffer_(log_buffer), db_directory_(db_directory), output_directory_(output_directory), stats_(stats), db_mutex_(db_mutex), - db_bg_error_(db_bg_error), + db_error_handler_(db_error_handler), existing_snapshots_(std::move(existing_snapshots)), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), table_cache_(std::move(table_cache)), event_logger_(event_logger), + bottommost_level_(false), paranoid_file_checks_(paranoid_file_checks), - measure_io_stats_(measure_io_stats) { + measure_io_stats_(measure_io_stats), + write_hint_(Env::WLTH_NOT_SET), + thread_pri_(thread_pri) { assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, @@ -308,15 +356,13 @@ CompactionJob::~CompactionJob() { ThreadStatusUtil::ResetThreadStatus(); } -void CompactionJob::ReportStartedCompaction( - Compaction* compaction) { +void CompactionJob::ReportStartedCompaction(Compaction* compaction) { const auto* cfd = compact_->compaction->column_family_data(); ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, db_options_.enable_thread_tracking); - ThreadStatusUtil::SetThreadOperationProperty( - ThreadStatus::COMPACTION_JOB_ID, - job_id_); + ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, + job_id_); ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, @@ -346,8 +392,7 @@ void CompactionJob::ReportStartedCompaction( // Set the thread operation after operation properties // to ensure GetThreadList() can always show them all together. 
- ThreadStatusUtil::SetThreadOperation( - ThreadStatus::OP_COMPACTION); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); if (compaction_job_stats_) { compaction_job_stats_->is_manual_compaction = @@ -362,18 +407,19 @@ void CompactionJob::Prepare() { // Generate file_levels_ for compaction berfore making Iterator auto* c = compact_->compaction; assert(c->column_family_data() != nullptr); - assert(c->column_family_data()->current()->storage_info() - ->NumLevelFiles(compact_->compaction->level()) > 0); + assert(c->column_family_data()->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + write_hint_ = + c->column_family_data()->CalculateSSTWriteHint(c->output_level()); // Is this compaction producing files at the bottommost level? bottommost_level_ = c->bottommost_level(); if (c->ShouldFormSubcompactions()) { - const uint64_t start_micros = env_->NowMicros(); - GenSubcompactionBoundaries(); - MeasureTime(stats_, SUBCOMPACTION_SETUP_TIME, - env_->NowMicros() - start_micros); - + { + StopWatch sw(env_, stats_, SUBCOMPACTION_SETUP_TIME); + GenSubcompactionBoundaries(); + } assert(sizes_.size() == boundaries_.size() + 1); for (size_t i = 0; i <= boundaries_.size(); i++) { @@ -381,8 +427,8 @@ void CompactionJob::Prepare() { Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i]; compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]); } - MeasureTime(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, - compact_->sub_compact_states.size()); + RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, + compact_->sub_compact_states.size()); } else { compact_->sub_compact_states.emplace_back(c, nullptr, nullptr); } @@ -447,20 +493,27 @@ void CompactionJob::GenSubcompactionBoundaries() { } std::sort(bounds.begin(), bounds.end(), - [cfd_comparator] (const Slice& a, const Slice& b) -> bool { - return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) < 0; - }); + [cfd_comparator](const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), + ExtractUserKey(b)) < 0; + }); // Remove duplicated entries from bounds - bounds.erase(std::unique(bounds.begin(), bounds.end(), - [cfd_comparator] (const Slice& a, const Slice& b) -> bool { - return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) == 0; - }), bounds.end()); + bounds.erase( + std::unique(bounds.begin(), bounds.end(), + [cfd_comparator](const Slice& a, const Slice& b) -> bool { + return cfd_comparator->Compare(ExtractUserKey(a), + ExtractUserKey(b)) == 0; + }), + bounds.end()); // Combine consecutive pairs of boundaries into ranges with an approximate // size of data covered by keys in that range uint64_t sum = 0; std::vector ranges; - auto* v = cfd->current(); + // Get input version from CompactionState since it's already referenced + // earlier in SetInputVersioCompaction::SetInputVersion and will not change + // when db_mutex_ is released below + auto* v = compact_->compaction->input_version(); for (auto it = bounds.begin();;) { const Slice a = *it; it++; @@ -470,19 +523,28 @@ void CompactionJob::GenSubcompactionBoundaries() { } const Slice b = *it; + + // ApproximateSize could potentially create table reader iterator to seek + // to the index block and may incur I/O cost in the process. 
Unlock db + // mutex to reduce contention + db_mutex_->Unlock(); uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1); + db_mutex_->Lock(); ranges.emplace_back(a, b, size); sum += size; } // Group the ranges into subcompactions const double min_file_fill_percent = 4.0 / 5; - uint64_t max_output_files = static_cast( - std::ceil(sum / min_file_fill_percent / - c->mutable_cf_options()->MaxFileSizeForLevel(out_lvl))); + int base_level = v->storage_info()->base_level(); + uint64_t max_output_files = static_cast(std::ceil( + sum / min_file_fill_percent / + MaxFileSizeForLevel(*(c->mutable_cf_options()), out_lvl, + c->immutable_cf_options()->compaction_style, base_level, + c->immutable_cf_options()->level_compaction_dynamic_level_bytes))); uint64_t subcompactions = std::min({static_cast(ranges.size()), - static_cast(db_options_.max_subcompactions), + static_cast(c->max_subcompactions()), max_output_files}); if (subcompactions > 1) { @@ -539,12 +601,18 @@ Status CompactionJob::Run() { thread.join(); } - if (output_directory_) { - output_directory_->Fsync(); + compaction_stats_.micros = env_->NowMicros() - start_micros; + compaction_stats_.cpu_micros = 0; + for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) { + compaction_stats_.cpu_micros += + compact_->sub_compact_states[i].compaction_job_stats.cpu_micros; } - compaction_stats_.micros = env_->NowMicros() - start_micros; - MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros); + RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, + compaction_stats_.cpu_micros); + + TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify"); // Check if any thread encountered an error during execution Status status; @@ -555,11 +623,79 @@ Status CompactionJob::Run() { } } + if (status.ok() && output_directory_) { + status = output_directory_->Fsync(); + } + + if (status.ok()) { + thread_pool.clear(); + std::vector files_meta; + for (const auto& state : compact_->sub_compact_states) { + for (const auto& output : state.outputs) { + files_meta.emplace_back(&output.meta); + } + } + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + auto prefix_extractor = + compact_->compaction->mutable_cf_options()->prefix_extractor.get(); + std::atomic next_file_meta_idx(0); + auto verify_table = [&](Status& output_status) { + while (true) { + size_t file_idx = next_file_meta_idx.fetch_add(1); + if (file_idx >= files_meta.size()) { + break; + } + // Verify that the table is usable + // We set for_compaction to false and don't OptimizeForCompactionTableRead + // here because this is a special case after we finish the table building + // No matter whether use_direct_io_for_flush_and_compaction is true, + // we will regard this verification as user reads since the goal is + // to cache it here for further user reads + InternalIterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), env_options_, cfd->internal_comparator(), + *files_meta[file_idx], nullptr /* range_del_agg */, + prefix_extractor, nullptr, + cfd->internal_stats()->GetFileReadHist( + compact_->compaction->output_level()), + false, nullptr /* arena */, false /* skip_filters */, + compact_->compaction->output_level()); + auto s = iter->status(); + + if (s.ok() && paranoid_file_checks_) { + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} + s = iter->status(); + } + + delete iter; + + if (!s.ok()) { + output_status = s; + break; + } + } + }; + for (size_t i = 1; i < 
compact_->sub_compact_states.size(); i++) { + thread_pool.emplace_back(verify_table, + std::ref(compact_->sub_compact_states[i].status)); + } + verify_table(compact_->sub_compact_states[0].status); + for (auto& thread : thread_pool) { + thread.join(); + } + for (const auto& state : compact_->sub_compact_states) { + if (!state.status.ok()) { + status = state.status; + break; + } + } + } + TablePropertiesCollection tp; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { - auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(), - output.meta.fd.GetPathId()); + auto fn = + TableFileName(state.compaction->immutable_cf_options()->cf_paths, + output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); tp[fn] = output.table_properties; } } @@ -583,7 +719,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { Status status = compact_->status; ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( - compact_->compaction->output_level(), compaction_stats_); + compact_->compaction->output_level(), thread_pri_, compaction_stats_); if (status.ok()) { status = InstallCompactionResults(mutable_cf_options); @@ -617,7 +753,8 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " "files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", + "write-amplify(%.1f) %s, records in: %" PRIu64 + ", records dropped: %" PRIu64 " output_compression: %s\n", cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec, bytes_written_per_sec, compact_->compaction->output_level(), stats.num_input_files_in_non_output_levels, @@ -626,20 +763,24 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { stats.bytes_read_output_level / 1048576.0, stats.bytes_written / 1048576.0, read_write_amp, write_amp, status.ToString().c_str(), stats.num_input_records, - stats.num_dropped_records); + stats.num_dropped_records, + CompressionTypeToString(compact_->compaction->output_compression()) + .c_str()); UpdateCompactionJobStats(stats); auto stream = event_logger_->LogToBuffer(log_buffer_); - stream << "job" << job_id_ - << "event" << "compaction_finished" + stream << "job" << job_id_ << "event" + << "compaction_finished" << "compaction_time_micros" << compaction_stats_.micros + << "compaction_time_cpu_micros" << compaction_stats_.cpu_micros << "output_level" << compact_->compaction->output_level() << "num_output_files" << compact_->NumOutputFiles() - << "total_output_size" << compact_->total_bytes - << "num_input_records" << compact_->num_input_records - << "num_output_records" << compact_->num_output_records - << "num_subcompactions" << compact_->sub_compact_states.size(); + << "total_output_size" << compact_->total_bytes << "num_input_records" + << compact_->num_input_records << "num_output_records" + << compact_->num_output_records << "num_subcompactions" + << compact_->sub_compact_states.size() << "output_compression" + << CompressionTypeToString(compact_->compaction->output_compression()); if (compaction_job_stats_ != nullptr) { stream << "num_single_delete_mismatches" @@ -670,11 +811,35 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact != nullptr); + + 
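// CPU time for this subcompaction is measured from this point; when detailed
// IO stats are enabled, the IO-bound CPU nanos are subtracted back out at the
// end of the function, so the reported figure approximates compute-only time.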
uint64_t prev_cpu_micros = env_->NowCPUNanos() / 1000; + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - std::unique_ptr range_del_agg( - new RangeDelAggregator(cfd->internal_comparator(), existing_snapshots_)); + + // Create compaction filter and fail the compaction if + // IgnoreSnapshots() = false because it is not supported anymore + const CompactionFilter* compaction_filter = + cfd->ioptions()->compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + if (compaction_filter == nullptr) { + compaction_filter_from_factory = + sub_compact->compaction->CreateCompactionFilter(); + compaction_filter = compaction_filter_from_factory.get(); + } + if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) { + sub_compact->status = Status::NotSupported( + "CompactionFilter::IgnoreSnapshots() = false is not supported " + "anymore."); + return; + } + + CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), + existing_snapshots_); + + // Although the v2 aggregator is what the level iterator(s) know about, + // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, range_del_agg.get())); + sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -686,54 +851,26 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { uint64_t prev_fsync_nanos = 0; uint64_t prev_range_sync_nanos = 0; uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; if (measure_io_stats_) { prev_perf_level = GetPerfLevel(); - SetPerfLevel(PerfLevel::kEnableTime); + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); prev_write_nanos = IOSTATS(write_nanos); prev_fsync_nanos = IOSTATS(fsync_nanos); prev_range_sync_nanos = IOSTATS(range_sync_nanos); prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); } - const MutableCFOptions* mutable_cf_options = - sub_compact->compaction->mutable_cf_options(); - - // To build compression dictionary, we sample the first output file, assuming - // it'll reach the maximum length, and then use the dictionary for compressing - // subsequent output files. The dictionary may be less than max_dict_bytes if - // the first output file's length is less than the maximum. 
- const int kSampleLenShift = 6; // 2^6 = 64-byte samples - std::set sample_begin_offsets; - if (bottommost_level_ && - cfd->ioptions()->compression_opts.max_dict_bytes > 0) { - const size_t kMaxSamples = - cfd->ioptions()->compression_opts.max_dict_bytes >> kSampleLenShift; - const size_t kOutFileLen = mutable_cf_options->MaxFileSizeForLevel( - compact_->compaction->output_level()); - if (kOutFileLen != port::kMaxSizet) { - const size_t kOutFileNumSamples = kOutFileLen >> kSampleLenShift; - Random64 generator{versions_->NewFileNumber()}; - for (size_t i = 0; i < kMaxSamples; ++i) { - sample_begin_offsets.insert(generator.Uniform(kOutFileNumSamples) - << kSampleLenShift); - } - } - } - - auto compaction_filter = cfd->ioptions()->compaction_filter; - std::unique_ptr compaction_filter_from_factory = nullptr; - if (compaction_filter == nullptr) { - compaction_filter_from_factory = - sub_compact->compaction->CreateCompactionFilter(); - compaction_filter = compaction_filter_from_factory.get(); - } MergeHelper merge( env_, cfd->user_comparator(), cfd->ioptions()->merge_operator, compaction_filter, db_options_.info_log.get(), false /* internal key corruption is expected */, existing_snapshots_.empty() ? 0 : existing_snapshots_.back(), - compact_->compaction->level(), db_options_.statistics.get(), - shutting_down_); + snapshot_checker_, compact_->compaction->level(), + db_options_.statistics.get(), shutting_down_); TEST_SYNC_POINT("CompactionJob::Run():Inprogress"); @@ -747,40 +884,23 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { input->SeekToFirst(); } - // we allow only 1 compaction event listener. Used by blob storage - CompactionEventListener* comp_event_listener = nullptr; -#ifndef ROCKSDB_LITE - for (auto& celitr : cfd->ioptions()->listeners) { - comp_event_listener = celitr->GetCompactionEventListener(); - if (comp_event_listener != nullptr) { - break; - } - } -#endif // ROCKSDB_LITE - Status status; sub_compact->c_iter.reset(new CompactionIterator( input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), - &existing_snapshots_, earliest_write_conflict_snapshot_, env_, false, - range_del_agg.get(), sub_compact->compaction, compaction_filter, - comp_event_listener, shutting_down_)); + &existing_snapshots_, earliest_write_conflict_snapshot_, + snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, + &range_del_agg, sub_compact->compaction, compaction_filter, + shutting_down_, preserve_deletes_seqnum_)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); - if (c_iter->Valid() && - sub_compact->compaction->output_level() != 0) { + if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { // ShouldStopBefore() maintains state based on keys processed so far. The // compaction loop always calls it on the "next" key, thus won't tell it the // first key. So we do that here. - sub_compact->ShouldStopBefore( - c_iter->key(), sub_compact->current_output_file_size); + sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size); } const auto& c_iter_stats = c_iter->iter_stats(); - auto sample_begin_offset_iter = sample_begin_offsets.cbegin(); - // data_begin_offset and compression_dict are only valid while generating - // dictionary from the first output file. 
- size_t data_begin_offset = 0; - std::string compression_dict; - compression_dict.reserve(cfd->ioptions()->compression_opts.max_dict_bytes); while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() @@ -816,55 +936,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { key, c_iter->ikey().sequence); sub_compact->num_output_records++; - if (sub_compact->outputs.size() == 1) { // first output file - // Check if this key/value overlaps any sample intervals; if so, appends - // overlapping portions to the dictionary. - for (const auto& data_elmt : {key, value}) { - size_t data_end_offset = data_begin_offset + data_elmt.size(); - while (sample_begin_offset_iter != sample_begin_offsets.cend() && - *sample_begin_offset_iter < data_end_offset) { - size_t sample_end_offset = - *sample_begin_offset_iter + (1 << kSampleLenShift); - // Invariant: Because we advance sample iterator while processing the - // data_elmt containing the sample's last byte, the current sample - // cannot end before the current data_elmt. - assert(data_begin_offset < sample_end_offset); - - size_t data_elmt_copy_offset, data_elmt_copy_len; - if (*sample_begin_offset_iter <= data_begin_offset) { - // The sample starts before data_elmt starts, so take bytes starting - // at the beginning of data_elmt. - data_elmt_copy_offset = 0; - } else { - // data_elmt starts before the sample starts, so take bytes starting - // at the below offset into data_elmt. - data_elmt_copy_offset = - *sample_begin_offset_iter - data_begin_offset; - } - if (sample_end_offset <= data_end_offset) { - // The sample ends before data_elmt ends, so take as many bytes as - // needed. - data_elmt_copy_len = - sample_end_offset - (data_begin_offset + data_elmt_copy_offset); - } else { - // data_elmt ends before the sample ends, so take all remaining - // bytes in data_elmt. - data_elmt_copy_len = - data_end_offset - (data_begin_offset + data_elmt_copy_offset); - } - compression_dict.append(&data_elmt.data()[data_elmt_copy_offset], - data_elmt_copy_len); - if (sample_end_offset > data_end_offset) { - // Didn't finish sample. Try to finish it with the next data_elmt. - break; - } - // Next sample may require bytes from same data_elmt. - sample_begin_offset_iter++; - } - data_begin_offset = data_end_offset; - } - } - // Close output file if it is big enough. Two possibilities determine it's // time to close it: (1) the current key should be this file's last key, (2) // the next key should not be in this file. @@ -886,8 +957,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { c_iter->Next(); if (!output_file_ended && c_iter->Valid() && sub_compact->compaction->output_level() != 0 && - sub_compact->ShouldStopBefore( - c_iter->key(), sub_compact->current_output_file_size) && + sub_compact->ShouldStopBefore(c_iter->key(), + sub_compact->current_output_file_size) && sub_compact->builder != nullptr) { // (2) this key belongs to the next file. 
For historical reasons, the // iterator status after advancing will be given to @@ -901,16 +972,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { next_key = &c_iter->key(); } CompactionIterationStats range_del_out_stats; - status = FinishCompactionOutputFile(input_status, sub_compact, - range_del_agg.get(), - &range_del_out_stats, next_key); + status = + FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, + &range_del_out_stats, next_key); RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); - if (sub_compact->outputs.size() == 1) { - // Use dictionary from first output file for compression of subsequent - // files. - sub_compact->compression_dict = std::move(compression_dict); - } } } @@ -933,8 +999,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); RecordCompactionIOStats(); - if (status.ok() && (shutting_down_->load(std::memory_order_relaxed) || - cfd->IsDropped())) { + if (status.ok() && + (shutting_down_->load(std::memory_order_relaxed) || cfd->IsDropped())) { status = Status::ShutdownInProgress( "Database shutdown or Column family drop during compaction"); } @@ -946,8 +1012,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } if (status.ok() && sub_compact->builder == nullptr && - sub_compact->outputs.size() == 0 && - range_del_agg->ShouldAddTombstones(bottommost_level_)) { + sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) { // handle subcompaction containing only range deletions status = OpenCompactionOutputFile(sub_compact); } @@ -956,14 +1021,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // close the output file. 
if (sub_compact->builder != nullptr) { CompactionIterationStats range_del_out_stats; - Status s = FinishCompactionOutputFile( - status, sub_compact, range_del_agg.get(), &range_del_out_stats); + Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg, + &range_del_out_stats); if (status.ok()) { status = s; } RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); } + sub_compact->compaction_job_stats.cpu_micros = + env_->NowCPUNanos() / 1000 - prev_cpu_micros; + if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += IOSTATS(write_nanos) - prev_write_nanos; @@ -973,7 +1041,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { IOSTATS(range_sync_nanos) - prev_range_sync_nanos; sub_compact->compaction_job_stats.file_prepare_write_nanos += IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos; - if (prev_perf_level != PerfLevel::kEnableTime) { + sub_compact->compaction_job_stats.cpu_micros -= + (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos + + IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) / + 1000; + if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) { SetPerfLevel(prev_perf_level); } } @@ -1022,7 +1094,7 @@ void CompactionJob::RecordDroppedKeys( Status CompactionJob::FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key /* = nullptr */) { AutoThreadOperationStageUpdater stage_updater( @@ -1035,48 +1107,185 @@ Status CompactionJob::FinishCompactionOutputFile( uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber(); assert(output_number != 0); + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + const Comparator* ucmp = cfd->user_comparator(); + // Check for iterator errors Status s = input_status; auto meta = &sub_compact->current_output()->meta; + assert(meta != nullptr); if (s.ok()) { Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; const Slice *lower_bound, *upper_bound; + bool lower_bound_from_sub_compact = false; if (sub_compact->outputs.size() == 1) { // For the first output table, include range tombstones before the min key // but after the subcompaction boundary. lower_bound = sub_compact->start; + lower_bound_from_sub_compact = true; } else if (meta->smallest.size() > 0) { // For subsequent output tables, only include range tombstones from min // key onwards since the previous file was extended to contain range // tombstones falling before min key. - lower_bound_guard = meta->smallest.user_key(); + smallest_user_key = meta->smallest.user_key().ToString(false /*hex*/); + lower_bound_guard = Slice(smallest_user_key); lower_bound = &lower_bound_guard; } else { lower_bound = nullptr; } if (next_table_min_key != nullptr) { - // This isn't the last file in the subcompaction, so extend until the next - // file starts. + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. 
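// For instance (hypothetical keys): if the subcompaction end is "k5" and the
// next table's min key is "k7", the comparison below clamps the upper bound
// to "k5", so no range tombstone is extended past the subcompaction boundary.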
upper_bound_guard = ExtractUserKey(*next_table_min_key); - upper_bound = &upper_bound_guard; + if (sub_compact->end != nullptr && + ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) { + upper_bound = sub_compact->end; + } else { + upper_bound = &upper_bound_guard; + } } else { // This is the last file in the subcompaction, so extend until the // subcompaction ends. upper_bound = sub_compact->end; } - range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound, - upper_bound, meta, range_del_out_stats, - bottommost_level_); + auto earliest_snapshot = kMaxSequenceNumber; + if (existing_snapshots_.size() > 0) { + earliest_snapshot = existing_snapshots_[0]; + } + bool has_overlapping_endpoints; + if (upper_bound != nullptr && meta->largest.size() > 0) { + has_overlapping_endpoints = + ucmp->Compare(meta->largest.user_key(), *upper_bound) == 0; + } else { + has_overlapping_endpoints = false; + } + + // The end key of the subcompaction must be bigger than or equal to the + // upper bound. If the end of the subcompaction is null or the upper bound + // is null, it means that this file is the last file in the compaction. So + // there will be no overlapping between this file and others. + assert(sub_compact->end == nullptr || + upper_bound == nullptr || + ucmp->Compare(*upper_bound, *sub_compact->end) <= 0); + auto it = range_del_agg->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. + if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } + for (; it->Valid(); it->Next()) { + auto tombstone = it->Tombstone(); + if (upper_bound != nullptr) { + int cmp = ucmp->Compare(*upper_bound, tombstone.start_key_); + if ((has_overlapping_endpoints && cmp < 0) || + (!has_overlapping_endpoints && cmp <= 0)) { + // Tombstones starting after upper_bound only need to be included in + // the next table. If the current SST ends before upper_bound, i.e., + // `has_overlapping_endpoints == false`, we can also skip over range + // tombstones that start exactly at upper_bound. Such range tombstones + // will be included in the next file and are not relevant to the point + // keys or endpoints of the current file. + break; + } + } + + if (bottommost_level_ && tombstone.seq_ <= earliest_snapshot) { + // TODO(andrewkr): tombstones that span multiple output files are + // counted for each compaction output file, so lots of double counting. + range_del_out_stats->num_range_del_drop_obsolete++; + range_del_out_stats->num_record_drop_obsolete++; + continue; + } + + auto kv = tombstone.Serialize(); + assert(lower_bound == nullptr || + ucmp->Compare(*lower_bound, kv.second) < 0); + sub_compact->builder->Add(kv.first.Encode(), kv.second); + InternalKey smallest_candidate = std::move(kv.first); + if (lower_bound != nullptr && + ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) { + // Pretend the smallest key has the same user key as lower_bound + // (the max key in the previous table or subcompaction) in order for + // files to appear key-space partitioned. + // + // When lower_bound is chosen by a subcompaction, we know that + // subcompactions over smaller keys cannot contain any keys at + // lower_bound. We also know that smaller subcompactions exist, because + // otherwise the subcompaction would be unbounded on the left.
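The bound selection at the top of this hunk picks the smaller of the next table's min key and the subcompaction's end key as the output file's upper bound, so output files never overlap. A minimal standalone sketch of that choice, with std::string standing in for Slice and ChooseUpperBound as a hypothetical helper:

#include <cassert>
#include <string>

const std::string* ChooseUpperBound(const std::string* next_table_min_key,
                                    const std::string* subcompaction_end) {
  if (next_table_min_key == nullptr) {
    // Last file in the subcompaction: extend to the subcompaction end.
    return subcompaction_end;
  }
  if (subcompaction_end != nullptr &&
      *next_table_min_key >= *subcompaction_end) {
    // The subcompaction end is the tighter (smaller-or-equal) bound.
    return subcompaction_end;
  }
  return next_table_min_key;
}

int main() {
  std::string next = "m", end = "k";
  assert(ChooseUpperBound(&next, &end) == &end);    // end key is smaller
  std::string next2 = "f";
  assert(ChooseUpperBound(&next2, &end) == &next2); // next min key is smaller
  assert(ChooseUpperBound(nullptr, &end) == &end);  // last file in range
  return 0;
}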
As a + // result, we know that no other files on the output level will contain + // actual keys at lower_bound (an output file may have a largest key of + // lower_bound@kMaxSequenceNumber, but this only indicates a large range + // tombstone was truncated). Therefore, it is safe to use the + // tombstone's sequence number, to ensure that keys at lower_bound at + // lower levels are covered by truncated tombstones. + // + // If lower_bound was chosen by the smallest data key in the file, + // choose lowest seqnum so this file's smallest internal key comes after + // the previous file's largest. The fake seqnum is OK because the read + // path's file-picking code only considers user key. + smallest_candidate = InternalKey( + *lower_bound, lower_bound_from_sub_compact ? tombstone.seq_ : 0, + kTypeRangeDeletion); + } + InternalKey largest_candidate = tombstone.SerializeEndKey(); + if (upper_bound != nullptr && + ucmp->Compare(*upper_bound, largest_candidate.user_key()) <= 0) { + // Pretend the largest key has the same user key as upper_bound (the + // min key in the following table or subcompaction) in order for files + // to appear key-space partitioned. + // + // Choose highest seqnum so this file's largest internal key comes + // before the next file's/subcompaction's smallest. The fake seqnum is + // OK because the read path's file-picking code only considers the user + // key portion. + // + // Note Seek() also creates InternalKey with (user_key, + // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of + // kTypeRangeDeletion (0xF), so the range tombstone comes before the + // Seek() key in InternalKey's ordering. So Seek() will look in the + // next file for the user key. + largest_candidate = + InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); + } +#ifndef NDEBUG + SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber; + if (meta->smallest.size() > 0) { + smallest_ikey_seqnum = GetInternalKeySeqno(meta->smallest.Encode()); + } +#endif + meta->UpdateBoundariesForRange(smallest_candidate, largest_candidate, + tombstone.seq_, + cfd->internal_comparator()); + + // The smallest key in a file is used for range tombstone truncation, so + // it cannot have a seqnum of 0 (unless the smallest data key in a file + // has a seqnum of 0). Otherwise, the truncated tombstone may expose + // deleted keys at lower levels. + assert(smallest_ikey_seqnum == 0 || + ExtractInternalKeyFooter(meta->smallest.Encode()) != + PackSequenceAndType(0, kTypeRangeDeletion)); + } + meta->marked_for_compaction = sub_compact->builder->NeedCompact(); } const uint64_t current_entries = sub_compact->builder->NumEntries(); - meta->marked_for_compaction = sub_compact->builder->NeedCompact(); if (s.ok()) { s = sub_compact->builder->Finish(); } else { sub_compact->builder->Abandon(); } const uint64_t current_bytes = sub_compact->builder->FileSize(); - meta->fd.file_size = current_bytes; + if (s.ok()) { + meta->fd.file_size = current_bytes; + } sub_compact->current_output()->finished = true; sub_compact->total_bytes += current_bytes; @@ -1090,92 +1299,67 @@ Status CompactionJob::FinishCompactionOutputFile( } sub_compact->outfile.reset(); - if (s.ok() && current_entries == 0) { + TableProperties tp; + if (s.ok()) { + tp = sub_compact->builder->GetTableProperties(); + } + + if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) { // If there is nothing to output, it is not necessary to generate an sst file.
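The truncation logic above relies on internal-key ordering: equal user keys sort by sequence number descending. A small self-contained sketch (simplified stand-ins for InternalKey and its comparator) showing why a largest key of (upper_bound, kMaxSequenceNumber) keeps adjacent output files key-space partitioned:

#include <cassert>
#include <cstdint>
#include <string>

constexpr uint64_t kMaxSeq = (1ull << 56) - 1;  // mirrors kMaxSequenceNumber

struct FakeInternalKey {  // simplified stand-in for InternalKey
  std::string user_key;
  uint64_t seq;
};

// True if a sorts before b: user key ascending, then seqnum descending.
bool Before(const FakeInternalKey& a, const FakeInternalKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key;
  return a.seq > b.seq;
}

int main() {
  // File N pretends to end exactly at upper_bound "k" with the max seqnum;
  // file N+1 starts at "k" with a truncated tombstone at seqnum 0.
  FakeInternalKey largest_of_file_n{"k", kMaxSeq};
  FakeInternalKey smallest_of_file_n1{"k", 0};
  // The files appear partitioned: largest(N) sorts before smallest(N+1).
  assert(Before(largest_of_file_n, smallest_of_file_n1));
  return 0;
}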
// This happens when the output level is the bottom level and the // sub_compact outputs nothing. - std::string fname = TableFileName( - db_options_.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); + std::string fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + meta->fd.GetNumber(), meta->fd.GetPathId()); env_->DeleteFile(fname); // Also need to remove the file from outputs, or it will be added to the // VersionEdit. assert(!sub_compact->outputs.empty()); sub_compact->outputs.pop_back(); - sub_compact->builder.reset(); - sub_compact->current_output_file_size = 0; - return s; + meta = nullptr; } - ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - TableProperties tp; - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - // We set for_compaction to false and don't OptimizeForCompactionTableRead - // here because this is a special case after we finish the table building - // No matter whether use_direct_io_for_flush_and_compaction is true, - // we will regrad this verification as user reads since the goal is - // to cache it here for further user reads - InternalIterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), env_options_, cfd->internal_comparator(), meta->fd, - nullptr /* range_del_agg */, nullptr, - cfd->internal_stats()->GetFileReadHist( - compact_->compaction->output_level()), - false); - s = iter->status(); - - if (s.ok() && paranoid_file_checks_) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} - s = iter->status(); - } - - delete iter; - + if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) { // Output to event logger and fire events. - if (s.ok()) { - tp = sub_compact->builder->GetTableProperties(); - sub_compact->current_output()->table_properties = - std::make_shared<TableProperties>(tp); - ROCKS_LOG_INFO(db_options_.info_log, - "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 - " keys, %" PRIu64 " bytes%s", - cfd->GetName().c_str(), job_id_, output_number, - current_entries, current_bytes, - meta->marked_for_compaction ? " (need compaction)" : ""); - } + sub_compact->current_output()->table_properties = + std::make_shared<TableProperties>(tp); + ROCKS_LOG_INFO(db_options_.info_log, + "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes%s", + cfd->GetName().c_str(), job_id_, output_number, + current_entries, current_bytes, + meta->marked_for_compaction ?
" (need compaction)" : ""); + } + std::string fname; + FileDescriptor output_fd; + if (meta != nullptr) { + fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + meta->fd.GetNumber(), meta->fd.GetPathId()); + output_fd = meta->fd; + } else { + fname = "(nil)"; } - std::string fname = TableFileName(db_options_.db_paths, meta->fd.GetNumber(), - meta->fd.GetPathId()); EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, - job_id_, meta->fd, tp, TableFileCreationReason::kCompaction, s); + job_id_, output_fd, tp, TableFileCreationReason::kCompaction, s); #ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl auto sfm = static_cast(db_options_.sst_file_manager.get()); - if (sfm && meta->fd.GetPathId() == 0) { - auto fn = TableFileName(cfd->ioptions()->db_paths, meta->fd.GetNumber(), - meta->fd.GetPathId()); - sfm->OnAddFile(fn); + if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) { + sfm->OnAddFile(fname); if (sfm->IsMaxAllowedSpaceReached()) { // TODO(ajkr): should we return OK() if max space was reached by the final // compaction output file (similarly to how flush works when full)? - s = Status::IOError("Max allowed space was reached"); + s = Status::SpaceLimit("Max allowed space was reached"); TEST_SYNC_POINT( "CompactionJob::FinishCompactionOutputFile:" "MaxAllowedSpaceReached"); InstrumentedMutexLock l(db_mutex_); - if (db_bg_error_->ok()) { - Status new_bg_error = s; - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError( - cfd->ioptions()->listeners, BackgroundErrorReason::kCompaction, - &new_bg_error, db_mutex_); - if (!new_bg_error.ok()) { - *db_bg_error_ = new_bg_error; - } - } + db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } #endif @@ -1211,7 +1395,7 @@ Status CompactionJob::InstallCompactionResults( compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes); } - // Add compaction outputs + // Add compaction inputs compaction->AddInputDeletions(compact_->compaction->edit()); for (const auto& sub_compact : compact_->sub_compact_states) { @@ -1241,8 +1425,9 @@ Status CompactionJob::OpenCompactionOutputFile( assert(sub_compact->builder == nullptr); // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); - std::string fname = TableFileName(db_options_.db_paths, file_number, - sub_compact->compaction->output_path_id()); + std::string fname = + TableFileName(sub_compact->compaction->immutable_cf_options()->cf_paths, + file_number, sub_compact->compaction->output_path_id()); // Fire events. 
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1251,12 +1436,13 @@ Status CompactionJob::OpenCompactionOutputFile( TableFileCreationReason::kCompaction); #endif // !ROCKSDB_LITE // Make the output file - unique_ptr<WritableFile> writable_file; - EnvOptions opt_env_opts = - env_->OptimizeForCompactionTableWrite(env_options_, db_options_); + std::unique_ptr<WritableFile> writable_file; +#ifndef NDEBUG + bool syncpoint_arg = env_options_.use_direct_writes; TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile", - &opt_env_opts.use_direct_writes); - Status s = NewWritableFile(env_, fname, &writable_file, opt_env_opts); + &syncpoint_arg); +#endif + Status s = NewWritableFile(env_, fname, &writable_file, env_options_); if (!s.ok()) { ROCKS_LOG_ERROR( db_options_.info_log, @@ -1279,10 +1465,14 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->outputs.push_back(out); writable_file->SetIOPriority(Env::IO_LOW); + writable_file->SetWriteLifeTimeHint(write_hint_); writable_file->SetPreallocationBlockSize(static_cast<size_t>( sub_compact->compaction->OutputFilePreallocationSize())); - sub_compact->outfile.reset(new WritableFileWriter( - std::move(writable_file), env_options_, db_options_.statistics.get())); + const auto& listeners = + sub_compact->compaction->immutable_cf_options()->listeners; + sub_compact->outfile.reset( + new WritableFileWriter(std::move(writable_file), fname, env_options_, + env_, db_options_.statistics.get(), listeners)); // If the Column family flag is to only optimize filters for hits, // we can skip creating filters if this is the bottommost_level where @@ -1294,17 +1484,28 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->MaxInputFileCreationTime(); if (output_file_creation_time == 0) { int64_t _current_time = 0; - db_options_.env->GetCurrentTime(&_current_time); // ignore error + auto status = db_options_.env->GetCurrentTime(&_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to get current time to populate creation_time property. " + "Status: %s", + status.ToString().c_str()); + } output_file_creation_time = static_cast<uint64_t>(_current_time); } sub_compact->builder.reset(NewTableBuilder( - *cfd->ioptions(), cfd->internal_comparator(), - cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - sub_compact->outfile.get(), sub_compact->compaction->output_compression(), - cfd->ioptions()->compression_opts, - sub_compact->compaction->output_level(), &sub_compact->compression_dict, - skip_filters, output_file_creation_time)); + *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), + cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), + cfd->GetID(), cfd->GetName(), sub_compact->outfile.get(), + sub_compact->compaction->output_compression(), + 0 /*sample_for_compression */, + sub_compact->compaction->output_compression_opts(), + sub_compact->compaction->output_level(), skip_filters, + output_file_creation_time, 0 /* oldest_key_time */, + sub_compact->compaction->max_output_file_size())); LogFlush(db_options_.info_log); return s; } @@ -1334,8 +1535,7 @@ void CompactionJob::CleanupCompaction() { #ifndef ROCKSDB_LITE namespace { -void CopyPrefix( - const Slice& src, size_t prefix_length, std::string* dst) { +void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { assert(prefix_length > 0); size_t length = src.size() > prefix_length ?
prefix_length : src.size(); dst->assign(src.data(), length); @@ -1354,13 +1554,11 @@ void CompactionJob::UpdateCompactionStats() { if (compaction->level(input_level) != compaction->output_level()) { UpdateCompactionInputStatsHelper( &compaction_stats_.num_input_files_in_non_output_levels, - &compaction_stats_.bytes_read_non_output_levels, - input_level); + &compaction_stats_.bytes_read_non_output_levels, input_level); } else { UpdateCompactionInputStatsHelper( &compaction_stats_.num_input_files_in_output_level, - &compaction_stats_.bytes_read_output_level, - input_level); + &compaction_stats_.bytes_read_output_level, input_level); } } @@ -1383,8 +1581,9 @@ void CompactionJob::UpdateCompactionStats() { } } -void CompactionJob::UpdateCompactionInputStatsHelper( - int* num_files, uint64_t* bytes_read, int input_level) { +void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, + uint64_t* bytes_read, + int input_level) { const Compaction* compaction = compact_->compaction; auto num_input_files = compaction->num_input_files(input_level); *num_files += static_cast(num_input_files); @@ -1405,10 +1604,8 @@ void CompactionJob::UpdateCompactionJobStats( // input information compaction_job_stats_->total_input_bytes = - stats.bytes_read_non_output_levels + - stats.bytes_read_output_level; - compaction_job_stats_->num_input_records = - compact_->num_input_records; + stats.bytes_read_non_output_levels + stats.bytes_read_output_level; + compaction_job_stats_->num_input_records = compact_->num_input_records; compaction_job_stats_->num_input_files = stats.num_input_files_in_non_output_levels + stats.num_input_files_in_output_level; @@ -1417,21 +1614,20 @@ void CompactionJob::UpdateCompactionJobStats( // output information compaction_job_stats_->total_output_bytes = stats.bytes_written; - compaction_job_stats_->num_output_records = - compact_->num_output_records; + compaction_job_stats_->num_output_records = compact_->num_output_records; compaction_job_stats_->num_output_files = stats.num_output_files; if (compact_->NumOutputFiles() > 0U) { - CopyPrefix( - compact_->SmallestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->smallest_output_key_prefix); - CopyPrefix( - compact_->LargestUserKey(), - CompactionJobStats::kMaxPrefixLength, - &compaction_job_stats_->largest_output_key_prefix); + CopyPrefix(compact_->SmallestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->smallest_output_key_prefix); + CopyPrefix(compact_->LargestUserKey(), + CompactionJobStats::kMaxPrefixLength, + &compaction_job_stats_->largest_output_key_prefix); } } +#else + (void)stats; #endif // !ROCKSDB_LITE } @@ -1454,7 +1650,9 @@ void CompactionJob::LogCompaction() { // build event logger report auto stream = event_logger_->Log(); stream << "job" << job_id_ << "event" - << "compaction_started"; + << "compaction_started" + << "compaction_reason" + << GetCompactionReasonString(compaction->compaction_reason()); for (size_t i = 0; i < compaction->num_input_levels(); ++i) { stream << ("files_L" + ToString(compaction->level(i))); stream.StartArray(); diff --git a/thirdparty/rocksdb/db/compaction_job.h b/thirdparty/rocksdb/db/compaction_job.h index 6ca5d627a7..9767985f33 100644 --- a/thirdparty/rocksdb/db/compaction_job.h +++ b/thirdparty/rocksdb/db/compaction_job.h @@ -29,6 +29,7 @@ #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" +#include "options/cf_options.h" #include "options/db_options.h" #include "port/port.h" #include 
"rocksdb/compaction_filter.h" @@ -45,28 +46,33 @@ namespace rocksdb { +class Arena; +class ErrorHandler; class MemTable; +class SnapshotChecker; class TableCache; class Version; class VersionEdit; class VersionSet; -class Arena; class CompactionJob { public: CompactionJob(int job_id, Compaction* compaction, const ImmutableDBOptions& db_options, - const EnvOptions& env_options, VersionSet* versions, - const std::atomic* shutting_down, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_directory, - Statistics* stats, InstrumentedMutex* db_mutex, - Status* db_bg_error, + const EnvOptions env_options, VersionSet* versions, + const std::atomic* shutting_down, + const SequenceNumber preserve_deletes_seqnum, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, + const SnapshotChecker* snapshot_checker, std::shared_ptr table_cache, EventLogger* event_logger, bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname, - CompactionJobStats* compaction_job_stats); + CompactionJobStats* compaction_job_stats, + Env::Priority thread_pri); ~CompactionJob(); @@ -98,7 +104,7 @@ class CompactionJob { Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); @@ -127,17 +133,20 @@ class CompactionJob { // DBImpl state const std::string& dbname_; const ImmutableDBOptions& db_options_; - const EnvOptions& env_options_; + const EnvOptions env_options_; Env* env_; + // env_option optimized for compaction table reads + EnvOptions env_optiosn_for_read_; VersionSet* versions_; const std::atomic* shutting_down_; + const SequenceNumber preserve_deletes_seqnum_; LogBuffer* log_buffer_; Directory* db_directory_; Directory* output_directory_; Statistics* stats_; InstrumentedMutex* db_mutex_; - Status* db_bg_error_; + ErrorHandler* db_error_handler_; // If there were two snapshots with seq numbers s1 and // s2 and s1 < s2, and if we find two instances of a key k1 then lies // entirely within s1 and s2, then the earlier version of k1 can be safely @@ -149,6 +158,8 @@ class CompactionJob { // should make sure not to remove evidence that a write occurred. 
SequenceNumber earliest_write_conflict_snapshot_; + const SnapshotChecker* const snapshot_checker_; + std::shared_ptr table_cache_; EventLogger* event_logger_; @@ -160,6 +171,8 @@ class CompactionJob { std::vector boundaries_; // Stores the approx size of keys covered in the range of each subcompaction std::vector sizes_; + Env::WriteLifeTimeHint write_hint_; + Env::Priority thread_pri_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/compaction_job_stats_test.cc b/thirdparty/rocksdb/db/compaction_job_stats_test.cc index 9a8372f578..48e883bc6c 100644 --- a/thirdparty/rocksdb/db/compaction_job_stats_test.cc +++ b/thirdparty/rocksdb/db/compaction_job_stats_test.cc @@ -98,7 +98,7 @@ class CompactionJobStatsTest : public testing::Test, CompactionJobStatsTest() : env_(Env::Default()) { env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); - dbname_ = test::TmpDir(env_) + "/compaction_job_stats_test"; + dbname_ = test::PerThreadDBPath("compaction_job_stats_test"); alternative_wal_dir_ = dbname_ + "/wal"; Options options; options.create_if_missing = true; @@ -113,7 +113,7 @@ class CompactionJobStatsTest : public testing::Test, Reopen(options); } - ~CompactionJobStatsTest() { + ~CompactionJobStatsTest() override { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -426,7 +426,7 @@ class CompactionJobStatsChecker : public EventListener { // Once a compaction completed, this function will verify the returned // CompactionJobInfo with the oldest CompactionJobInfo added earlier // in "expected_stats_" which has not yet being used for verification. - virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) { + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { if (verify_next_comp_io_stats_) { ASSERT_GT(ci.stats.file_write_nanos, 0); ASSERT_GT(ci.stats.file_range_sync_nanos, 0); @@ -523,7 +523,7 @@ class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker { public: // Verifies whether two CompactionJobStats match. 
void Verify(const CompactionJobStats& current_stats, - const CompactionJobStats& stats) { + const CompactionJobStats& stats) override { ASSERT_EQ( current_stats.num_input_deletion_records, stats.num_input_deletion_records); @@ -806,7 +806,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { stats_checker->set_verify_next_comp_io_stats(true); std::atomic first_prepare_write(true); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::Append:BeforePrepareWrite", [&](void* arg) { + "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) { if (first_prepare_write.load()) { options.env->SleepForMicroseconds(3); first_prepare_write.store(false); @@ -815,7 +815,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { std::atomic first_flush(true); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::Flush:BeforeAppend", [&](void* arg) { + "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) { if (first_flush.load()) { options.env->SleepForMicroseconds(3); first_flush.store(false); @@ -824,7 +824,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { std::atomic first_sync(true); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::SyncInternal:0", [&](void* arg) { + "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) { if (first_sync.load()) { options.env->SleepForMicroseconds(3); first_sync.store(false); @@ -833,7 +833,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { std::atomic first_range_sync(true); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "WritableFileWriter::RangeSync:0", [&](void* arg) { + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { if (first_range_sync.load()) { options.env->SleepForMicroseconds(3); first_range_sync.store(false); @@ -1034,7 +1034,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n"); return 0; } @@ -1043,5 +1043,5 @@ int main(int argc, char** argv) { #else -int main(int argc, char** argv) { return 0; } +int main(int /*argc*/, char** /*argv*/) { return 0; } #endif // !defined(IOS_CROSS_COMPILE) diff --git a/thirdparty/rocksdb/db/compaction_job_test.cc b/thirdparty/rocksdb/db/compaction_job_test.cc index cace1814ad..f05a8ec2ff 100644 --- a/thirdparty/rocksdb/db/compaction_job_test.cc +++ b/thirdparty/rocksdb/db/compaction_job_test.cc @@ -12,6 +12,7 @@ #include "db/column_family.h" #include "db/compaction_job.h" +#include "db/error_handler.h" #include "db/version_set.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -67,7 +68,7 @@ class CompactionJobTest : public testing::Test { public: CompactionJobTest() : env_(Env::Default()), - dbname_(test::TmpDir() + "/compaction_job_test"), + dbname_(test::PerThreadDBPath("compaction_job_test")), db_options_(), mutable_cf_options_(cf_options_), table_cache_(NewLRUCache(50000, 16)), @@ -76,7 +77,9 @@ class CompactionJobTest : public testing::Test { table_cache_.get(), &write_buffer_manager_, &write_controller_)), shutting_down_(false), - mock_table_factory_(new mock::MockTableFactory()) { + preserve_deletes_seqnum_(0), + mock_table_factory_(new mock::MockTableFactory()), + error_handler_(nullptr, db_options_, &mutex_) { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits::max()); @@ -142,7 +145,8 @@ class CompactionJobTest : public testing::Test { } void SetLastSequence(const SequenceNumber sequence_number) 
{ - versions_->SetLastToBeWrittenSequence(sequence_number + 1); + versions_->SetLastAllocatedSequence(sequence_number + 1); + versions_->SetLastPublishedSequence(sequence_number + 1); versions_->SetLastSequence(sequence_number + 1); } @@ -168,7 +172,7 @@ class CompactionJobTest : public testing::Test { // This is how the key will look like once it's written in bottommost // file InternalKey bottommost_internal_key( - key, (key == "9999") ? sequence_number : 0, kTypeValue); + key, 0, kTypeValue); if (corrupt_id(k)) { test::CorruptKeyType(&internal_key); @@ -196,12 +200,12 @@ class CompactionJobTest : public testing::Test { new_db.SetLastSequence(0); const std::string manifest = DescriptorFileName(dbname_, 1); - unique_ptr file; + std::unique_ptr file; Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), manifest, env_options_)); { log::Writer log(std::move(file_writer), 0, false); std::string record; @@ -244,18 +248,22 @@ class CompactionJobTest : public testing::Test { Compaction compaction(cfd->current()->storage_info(), *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), compaction_input_files, 1, 1024 * 1024, - 10 * 1024 * 1024, 0, kNoCompression, {}, true); + 10 * 1024 * 1024, 0, kNoCompression, + cfd->ioptions()->compression_opts, 0, {}, true); compaction.SetInputVersion(cfd->current()); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); mutex_.Lock(); EventLogger event_logger(db_options_.info_log.get()); + // TODO(yiwu) add a mock snapshot checker and add test for it. + SnapshotChecker* snapshot_checker = nullptr; CompactionJob compaction_job( 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, &mutex_, - &bg_error_, snapshots, earliest_write_conflict_snapshot, table_cache_, - &event_logger, false, false, dbname_, &compaction_job_stats_); - + &shutting_down_, preserve_deletes_seqnum_, &log_buffer, nullptr, + nullptr, nullptr, &mutex_, &error_handler_, snapshots, + earliest_write_conflict_snapshot, snapshot_checker, table_cache_, + &event_logger, false, false, dbname_, &compaction_job_stats_, + Env::Priority::USER); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); compaction_job.Prepare(); @@ -291,12 +299,13 @@ class CompactionJobTest : public testing::Test { std::unique_ptr versions_; InstrumentedMutex mutex_; std::atomic shutting_down_; + SequenceNumber preserve_deletes_seqnum_; std::shared_ptr mock_table_factory_; CompactionJobStats compaction_job_stats_; ColumnFamilyData* cfd_; std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; - Status bg_error_; + ErrorHandler error_handler_; }; TEST_F(CompactionJobTest, Simple) { @@ -371,7 +380,7 @@ TEST_F(CompactionJobTest, SimpleOverwrite) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"}, - {KeyStr("b", 4U, kTypeValue), "val3"}}); + {KeyStr("b", 0U, kTypeValue), "val3"}}); SetLastSequence(4U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -424,7 +433,7 @@ TEST_F(CompactionJobTest, SimpleMerge) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, - {KeyStr("b", 2U, kTypeValue), "1,2"}}); + {KeyStr("b", 0U, kTypeValue), "1,2"}}); SetLastSequence(5U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -448,8 +457,7 @@ 
TEST_F(CompactionJobTest, NonAssocMerge) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"}, - {KeyStr("b", 2U, kTypeMerge), "2"}, - {KeyStr("b", 1U, kTypeMerge), "1"}}); + {KeyStr("b", 0U, kTypeValue), "1,2"}}); SetLastSequence(5U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -476,7 +484,7 @@ TEST_F(CompactionJobTest, MergeOperandFilter) { auto expected_results = mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)}, - {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}}); + {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}}); SetLastSequence(5U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -739,7 +747,7 @@ TEST_F(CompactionJobTest, SingleDeleteZeroSeq) { AddMockFile(file2); auto expected_results = mock::MakeMockFile({ - {KeyStr("dummy", 5U, kTypeValue), "val2"}, + {KeyStr("dummy", 0U, kTypeValue), "val2"}, }); SetLastSequence(22U); @@ -923,7 +931,7 @@ TEST_F(CompactionJobTest, CorruptionAfterDeletion) { mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"}, {test::KeyStr("a", 0U, kTypeValue, true), "val"}, {test::KeyStr("b", 0U, kTypeValue, true), "val"}, - {test::KeyStr("c", 1U, kTypeValue), "val2"}}); + {test::KeyStr("c", 0U, kTypeValue), "val2"}}); SetLastSequence(6U); auto files = cfd_->current()->storage_info()->LevelFiles(0); @@ -940,7 +948,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); return 0; diff --git a/thirdparty/rocksdb/db/compaction_picker.cc b/thirdparty/rocksdb/db/compaction_picker.cc index 79af3ed9fe..6510d4bc0c 100644 --- a/thirdparty/rocksdb/db/compaction_picker.cc +++ b/thirdparty/rocksdb/db/compaction_picker.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "db/column_family.h" #include "monitoring/statistics.h" #include "util/filename.h" @@ -36,12 +37,13 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { } return sum; } +} // anonymous namespace bool FindIntraL0Compaction(const std::vector& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, CompactionInputFiles* comp_inputs) { - size_t compact_bytes = level_files[0]->fd.file_size; + size_t compact_bytes = static_cast(level_files[0]->fd.file_size); size_t compact_bytes_per_del_file = port::kMaxSizet; // compaction range will be [0, span_len). size_t span_len; @@ -49,7 +51,7 @@ bool FindIntraL0Compaction(const std::vector& level_files, // increasing. 
size_t new_compact_bytes_per_del_file = 0; for (span_len = 1; span_len < level_files.size(); ++span_len) { - compact_bytes += level_files[span_len]->fd.file_size; + compact_bytes += static_cast<size_t>(level_files[span_len]->fd.file_size); new_compact_bytes_per_del_file = compact_bytes / span_len; if (level_files[span_len]->being_compacted || new_compact_bytes_per_del_file > compact_bytes_per_del_file) { @@ -59,7 +61,7 @@ bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, } if (span_len >= min_files_to_compact && - new_compact_bytes_per_del_file < max_compact_bytes_per_del_file) { + compact_bytes_per_del_file < max_compact_bytes_per_del_file) { assert(comp_inputs != nullptr); comp_inputs->level = 0; for (size_t i = 0; i < span_len; ++i) { @@ -69,7 +71,6 @@ bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, } return false; } -} // anonymous namespace // Determine compression type, based on user options, level of the output // file and whether compression is disabled. @@ -89,7 +90,7 @@ CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, // If bottommost_compression is set and we are compacting to the // bottommost level then we should use it. if (ioptions.bottommost_compression != kDisableCompressionOption && - level > base_level && level >= (vstorage->num_non_empty_levels() - 1)) { + level >= (vstorage->num_non_empty_levels() - 1)) { return ioptions.bottommost_compression; } // If the user has specified a different compression level for each level, @@ -110,6 +111,24 @@ CompressionType GetCompressionType(const ImmutableCFOptions& ioptions, } } +CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions, + const VersionStorageInfo* vstorage, + int level, + const bool enable_compression) { + if (!enable_compression) { + return ioptions.compression_opts; + } + // If bottommost_compression is set and we are compacting to the + // bottommost level then we should use the specified compression options + // for the bottommost_compression.
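The FindIntraL0Compaction() hunk above also fixes the final acceptance test to use the last ratio that was still decreasing (compact_bytes_per_del_file) rather than the value that triggered the break. A worked standalone sketch of the heuristic, with hypothetical file sizes:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> file_sizes = {40, 10, 10, 10, 500};  // L0, newest first
  const size_t min_files_to_compact = 4;
  const uint64_t max_compact_bytes_per_del_file = 100;

  uint64_t compact_bytes = file_sizes[0];
  uint64_t best_bytes_per_del_file = UINT64_MAX;
  size_t span_len;
  for (span_len = 1; span_len < file_sizes.size(); ++span_len) {
    compact_bytes += file_sizes[span_len];
    uint64_t bytes_per_del_file = compact_bytes / span_len;
    if (bytes_per_del_file > best_bytes_per_del_file) {
      break;  // cost per deleted file started rising; stop extending the span
    }
    best_bytes_per_del_file = bytes_per_del_file;
  }
  // Spans: 50/1 -> 60/2 = 30 -> 70/3 = 23; adding the 500-byte file rises.
  if (span_len >= min_files_to_compact &&
      best_bytes_per_del_file < max_compact_bytes_per_del_file) {
    std::printf("compact first %zu files (%llu bytes per deleted file)\n",
                span_len,
                static_cast<unsigned long long>(best_bytes_per_del_file));
  }
  return 0;
}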
+ if (ioptions.bottommost_compression != kDisableCompressionOption && + level >= (vstorage->num_non_empty_levels() - 1) && + ioptions.bottommost_compression_opts.enabled) { + return ioptions.bottommost_compression_opts; + } + return ioptions.compression_opts; +} + CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) : ioptions_(ioptions), icmp_(icmp) {} @@ -199,9 +218,10 @@ void CompactionPicker::GetRange(const std::vector& inputs, assert(initialized); } -bool CompactionPicker::ExpandInputsToCleanCut(const std::string& cf_name, +bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs) { + CompactionInputFiles* inputs, + InternalKey** next_smallest) { // This isn't good compaction assert(!inputs->empty()); @@ -224,7 +244,8 @@ bool CompactionPicker::ExpandInputsToCleanCut(const std::string& cf_name, GetRange(*inputs, &smallest, &largest); inputs->clear(); vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files, - hint_index, &hint_index); + hint_index, &hint_index, true, + next_smallest); } while (inputs->size() > old_size); // we started off with inputs non-empty and the previous loop only grew @@ -292,25 +313,34 @@ Compaction* CompactionPicker::CompactFiles( VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, uint32_t output_path_id) { assert(input_files.size()); - - // TODO(rven ): we might be able to run concurrent level 0 compaction - // if the key ranges of the two compactions do not overlap, but for now - // we do not allow it. - if ((input_files[0].level == 0) && !level0_compactions_in_progress_.empty()) { - return nullptr; - } - // This compaction output could overlap with a running compaction - if (FilesRangeOverlapWithCompaction(input_files, output_level)) { - return nullptr; + // This compaction output should not overlap with a running compaction as + // `SanitizeCompactionInputFiles` should've checked earlier and db mutex + // shouldn't have been released since. + assert(!FilesRangeOverlapWithCompaction(input_files, output_level)); + + CompressionType compression_type; + if (compact_options.compression == kDisableCompressionOption) { + int base_level; + if (ioptions_.compaction_style == kCompactionStyleLevel) { + base_level = vstorage->base_level(); + } else { + base_level = 1; + } + compression_type = + GetCompressionType(ioptions_, vstorage, mutable_cf_options, + output_level, base_level); + } else { + // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType` + // without configurable `CompressionOptions`, which is inconsistent. 
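A simplified sketch of the new GetCompressionOptions() helper introduced above: the bottommost level gets its dedicated CompressionOptions only when the user both set a bottommost compression type and enabled the bottommost options. Plain structs stand in for the RocksDB types:

#include <cassert>

struct CompressionOptions { int level; bool enabled; };

struct FakeCfOptions {  // stand-in for ImmutableCFOptions
  CompressionOptions compression_opts = {6, false};
  CompressionOptions bottommost_compression_opts = {9, true};
  // Stands in for bottommost_compression != kDisableCompressionOption.
  bool bottommost_compression_set = true;
};

CompressionOptions PickCompressionOptions(const FakeCfOptions& opts, int level,
                                          int num_non_empty_levels,
                                          bool enable_compression) {
  if (!enable_compression) return opts.compression_opts;
  if (opts.bottommost_compression_set &&
      level >= num_non_empty_levels - 1 &&
      opts.bottommost_compression_opts.enabled) {
    return opts.bottommost_compression_opts;
  }
  return opts.compression_opts;
}

int main() {
  FakeCfOptions o;
  assert(PickCompressionOptions(o, 6, 7, true).level == 9);  // bottommost
  assert(PickCompressionOptions(o, 3, 7, true).level == 6);  // middle level
  return 0;
}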
+ compression_type = compact_options.compression; } - auto c = - new Compaction(vstorage, ioptions_, mutable_cf_options, input_files, - output_level, compact_options.output_file_size_limit, - mutable_cf_options.max_compaction_bytes, output_path_id, - compact_options.compression, /* grandparents */ {}, true); - - // If it's level 0 compaction, make sure we don't execute any other level 0 - // compactions in parallel + auto c = new Compaction( + vstorage, ioptions_, mutable_cf_options, input_files, output_level, + compact_options.output_file_size_limit, + mutable_cf_options.max_compaction_bytes, output_path_id, compression_type, + GetCompressionOptions(ioptions_, vstorage, output_level), + compact_options.max_subcompactions, + /* grandparents */ {}, true); RegisterCompaction(c); return c; } @@ -318,7 +348,7 @@ Compaction* CompactionPicker::CompactFiles( Status CompactionPicker::GetCompactionInputsFromFileNumbers( std::vector* input_files, std::unordered_set* input_set, const VersionStorageInfo* vstorage, - const CompactionOptions& compact_options) const { + const CompactionOptions& /*compact_options*/) const { if (input_set->size() == 0U) { return Status::InvalidArgument( "Compaction must include at least one file."); @@ -373,7 +403,7 @@ bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage, assert(level < NumberLevels()); vstorage->GetOverlappingInputs(level, smallest, largest, &inputs, - *level_index, level_index); + level_index ? *level_index : 0, level_index); return AreFilesInCompaction(inputs); } @@ -396,7 +426,10 @@ bool CompactionPicker::SetupOtherInputs( assert(output_level_inputs->empty()); const int input_level = inputs->level; const int output_level = output_level_inputs->level; - assert(input_level != output_level); + if (input_level == output_level) { + // no possibility of conflict + return true; + } // For now, we only support merging two levels, start level and output level. // We need to assert other levels are empty. 
@@ -481,7 +514,7 @@ bool CompactionPicker::SetupOtherInputs( ROCKS_LOG_INFO(ioptions_.info_log, "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt - "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 "bytes)\n", + "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n", cf_name.c_str(), input_level, inputs->size(), output_level_inputs->size(), inputs_size, output_level_inputs_size, expanded_inputs.size(), @@ -510,7 +543,8 @@ void CompactionPicker::GetGrandparents( Compaction* CompactionPicker::CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + uint32_t output_path_id, uint32_t max_subcompactions, + const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict) { // CompactionPickerFIFO has its own implementation of compact range assert(ioptions_.compaction_style != kCompactionStyleFIFO); @@ -572,11 +606,14 @@ Compaction* CompactionPicker::CompactRange( Compaction* c = new Compaction( vstorage, ioptions_, mutable_cf_options, std::move(inputs), - output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), + output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style), /* max_compaction_bytes */ LLONG_MAX, output_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, 1), - /* grandparents */ {}, /* is manual */ true); + GetCompressionOptions(ioptions_, vstorage, output_level), + max_subcompactions, /* grandparents */ {}, /* is manual */ true); RegisterCompaction(c); return c; } @@ -615,16 +652,18 @@ Compaction* CompactionPicker::CompactRange( uint64_t s = inputs[i]->compensated_file_size; total += s; if (total >= limit) { - **compaction_end = inputs[i + 1]->smallest; covering_the_whole_range = false; inputs.files.resize(i + 1); break; } } } - assert(output_path_id < static_cast(ioptions_.db_paths.size())); + assert(output_path_id < static_cast(ioptions_.cf_paths.size())); - if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs) == false) { + InternalKey key_storage; + InternalKey* next_smallest = &key_storage; + if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) == + false) { // manual compaction is now multi-threaded, so it can // happen that ExpandWhileOverlapping fails // we handle it higher in RunManualCompaction @@ -632,8 +671,10 @@ Compaction* CompactionPicker::CompactRange( return nullptr; } - if (covering_the_whole_range) { + if (covering_the_whole_range || !next_smallest) { *compaction_end = nullptr; + } else { + **compaction_end = *next_smallest; } CompactionInputFiles output_level_inputs; @@ -679,11 +720,16 @@ Compaction* CompactionPicker::CompactRange( GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents); Compaction* compaction = new Compaction( vstorage, ioptions_, mutable_cf_options, std::move(compaction_inputs), - output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), + output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + ioptions_.compaction_style, vstorage->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), mutable_cf_options.max_compaction_bytes, output_path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, vstorage->base_level()), - std::move(grandparents), /* is manual compaction */ true); + 
GetCompressionOptions(ioptions_, vstorage, output_level), + max_subcompactions, std::move(grandparents), + /* is manual compaction */ true); TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction); RegisterCompaction(compaction); @@ -730,10 +776,6 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( auto& levels = cf_meta.levels; auto comparator = icmp_->user_comparator(); - // TODO(yhchiang): If there is any input files of L1 or up and there - // is at least one L0 files. All L0 files older than the L0 file needs - // to be included. Otherwise, it is a false conditoin - // TODO(yhchiang): add is_adjustable to CompactionOptions // the smallest and largest key of the current compaction input @@ -794,6 +836,8 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( } last_included++; } + } else if (output_level > 0) { + last_included = static_cast<int>(current_files.size() - 1); } // include all files between the first and the last compaction input files. @@ -853,6 +897,11 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( } } } + if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) { + return Status::Aborted( + "A running compaction is writing to the same output level in an " + "overlapping key range"); + } return Status::OK(); } @@ -895,8 +944,8 @@ Status CompactionPicker::SanitizeCompactionInputFiles( // any currently-existing files. for (auto file_num : *input_files) { bool found = false; - for (auto level_meta : cf_meta.levels) { - for (auto file_meta : level_meta.files) { + for (const auto& level_meta : cf_meta.levels) { + for (const auto& file_meta : level_meta.files) { if (file_num == TableFileNameToNumber(file_meta.name)) { if (file_meta.being_compacted) { return Status::Aborted("Specified compaction input file " + @@ -947,8 +996,86 @@ void CompactionPicker::UnregisterCompaction(Compaction* c) { compactions_in_progress_.erase(c); } +void CompactionPicker::PickFilesMarkedForCompaction( + const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level, + int* output_level, CompactionInputFiles* start_level_inputs) { + if (vstorage->FilesMarkedForCompaction().empty()) { + return; + } + + auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) { + // If it's being compacted it has nothing to do here. + // If this assert() fails that means that some function marked some + // files as being_compacted, but didn't call ComputeCompactionScore() + assert(!level_file.second->being_compacted); + *start_level = level_file.first; + *output_level = + (*start_level == 0) ? vstorage->base_level() : *start_level + 1; + + if (*start_level == 0 && !level0_compactions_in_progress()->empty()) { + return false; + } + + start_level_inputs->files = {level_file.second}; + start_level_inputs->level = *start_level; + return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs); + }; + + // take a chance on a random file first + Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage)); + size_t random_file_index = static_cast<size_t>(rnd.Uniform( + static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size()))); + + if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) { + // found the compaction! + return; + } + + for (auto& level_file : vstorage->FilesMarkedForCompaction()) { + if (continuation(level_file)) { + // found the compaction!
+ return; + } + } + start_level_inputs->files.clear(); +} + +bool CompactionPicker::GetOverlappingL0Files( + VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index) { + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. + assert(level0_compactions_in_progress()->empty()); + InternalKey smallest, largest; + GetRange(*start_level_inputs, &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + start_level_inputs->files.clear(); + vstorage->GetOverlappingInputs(0, &smallest, &largest, + &(start_level_inputs->files)); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(*start_level_inputs, &smallest, &largest); + if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level, + parent_index)) { + return false; + } + assert(!start_level_inputs->files.empty()); + + return true; +} + bool LevelCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage) const { + if (!vstorage->ExpiredTtlFiles().empty()) { + return true; + } + if (!vstorage->BottommostFilesMarkedForCompaction().empty()) { + return true; + } if (!vstorage->FilesMarkedForCompaction().empty()) { return true; } @@ -1012,8 +1139,7 @@ class LevelCompactionBuilder { // otherwise, returns false. bool PickIntraL0Compaction(); - // If there is any file marked for compaction, put put it into inputs. - void PickFilesMarkedForCompaction(); + void PickExpiredTtlFiles(); const std::string& cf_name_; VersionStorageInfo* vstorage_; @@ -1041,8 +1167,8 @@ class LevelCompactionBuilder { static const int kMinFilesForIntraL0Compaction = 4; }; -void LevelCompactionBuilder::PickFilesMarkedForCompaction() { - if (vstorage_->FilesMarkedForCompaction().empty()) { +void LevelCompactionBuilder::PickExpiredTtlFiles() { + if (vstorage_->ExpiredTtlFiles().empty()) { return; } @@ -1055,8 +1181,9 @@ void LevelCompactionBuilder::PickFilesMarkedForCompaction() { output_level_ = (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (start_level_ == 0 && - !compaction_picker_->level0_compactions_in_progress()->empty()) { + if ((start_level_ == vstorage_->num_non_empty_levels() - 1) || + (start_level_ == 0 && + !compaction_picker_->level0_compactions_in_progress()->empty())) { return false; } @@ -1066,22 +1193,13 @@ void LevelCompactionBuilder::PickFilesMarkedForCompaction() { &start_level_inputs_); }; - // take a chance on a random file first - Random64 rnd(/* seed */ reinterpret_cast(vstorage_)); - size_t random_file_index = static_cast(rnd.Uniform( - static_cast(vstorage_->FilesMarkedForCompaction().size()))); - - if (continuation(vstorage_->FilesMarkedForCompaction()[random_file_index])) { - // found the compaction! - return; - } - - for (auto& level_file : vstorage_->FilesMarkedForCompaction()) { + for (auto& level_file : vstorage_->ExpiredTtlFiles()) { if (continuation(level_file)) { // found the compaction! 
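PickFilesMarkedForCompaction(), hoisted into CompactionPicker above, probes one randomly chosen marked file before falling back to a linear scan, so repeated calls don't always retry the same blocked candidate. A standalone sketch of that pattern, with a hypothetical usability check in place of ExpandInputsToCleanCut():

#include <cstdio>
#include <functional>
#include <random>
#include <vector>

int PickMarkedFile(const std::vector<int>& marked,
                   const std::function<bool(int)>& usable) {
  if (marked.empty()) return -1;
  std::mt19937_64 rnd(42);  // the real code seeds from a pointer value
  size_t random_index = rnd() % marked.size();
  if (usable(marked[random_index])) return marked[random_index];
  for (int f : marked) {
    if (usable(f)) return f;  // fall back to scanning the whole list
  }
  return -1;
}

int main() {
  std::vector<int> marked = {7, 11, 13};
  auto usable = [](int f) { return f != 7; };  // e.g. 7 is being compacted
  // Prints 11 or 13 depending on which file the random probe hits first.
  std::printf("picked file %d\n", PickMarkedFile(marked, usable));
  return 0;
}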
return; } } + start_level_inputs_.files.clear(); } @@ -1136,40 +1254,51 @@ void LevelCompactionBuilder::SetupInitialFiles() { // if we didn't find a compaction, check if there are any files marked for // compaction if (start_level_inputs_.empty()) { - is_manual_ = true; parent_index_ = base_index_ = -1; - PickFilesMarkedForCompaction(); + + // PickFilesMarkedForCompaction(); + compaction_picker_->PickFilesMarkedForCompaction( + cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_); if (!start_level_inputs_.empty()) { + is_manual_ = true; compaction_reason_ = CompactionReason::kFilesMarkedForCompaction; + return; + } + + size_t i; + for (i = 0; i < vstorage_->BottommostFilesMarkedForCompaction().size(); + ++i) { + auto& level_and_file = vstorage_->BottommostFilesMarkedForCompaction()[i]; + assert(!level_and_file.second->being_compacted); + start_level_inputs_.level = output_level_ = start_level_ = + level_and_file.first; + start_level_inputs_.files = {level_and_file.second}; + if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, + &start_level_inputs_)) { + break; + } + } + if (i == vstorage_->BottommostFilesMarkedForCompaction().size()) { + start_level_inputs_.clear(); + } else { + assert(!start_level_inputs_.empty()); + compaction_reason_ = CompactionReason::kBottommostFiles; + return; + } + + assert(start_level_inputs_.empty()); + PickExpiredTtlFiles(); + if (!start_level_inputs_.empty()) { + compaction_reason_ = CompactionReason::kTtl; } } } bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() { if (start_level_ == 0 && output_level_ != 0) { - // Two level 0 compaction won't run at the same time, so don't need to worry - // about files on level 0 being compacted. - assert(compaction_picker_->level0_compactions_in_progress()->empty()); - InternalKey smallest, largest; - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - start_level_inputs_.files.clear(); - vstorage_->GetOverlappingInputs(0, &smallest, &largest, - &start_level_inputs_.files); - - // If we include more L0 files in the same compaction run it can - // cause the 'smallest' and 'largest' key to get extended to a - // larger range. 
So, re-invoke GetRange to get the new key range - compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest); - if (compaction_picker_->IsRangeInCompaction( - vstorage_, &smallest, &largest, output_level_, &parent_index_)) { - return false; - } + return compaction_picker_->GetOverlappingL0Files( + vstorage_, &start_level_inputs_, output_level_, &parent_index_); } - assert(!start_level_inputs_.files.empty()); - return true; } @@ -1242,13 +1371,17 @@ Compaction* LevelCompactionBuilder::PickCompaction() { Compaction* LevelCompactionBuilder::GetCompaction() { auto c = new Compaction( vstorage_, ioptions_, mutable_cf_options_, std::move(compaction_inputs_), - output_level_, mutable_cf_options_.MaxFileSizeForLevel(output_level_), + output_level_, + MaxFileSizeForLevel(mutable_cf_options_, output_level_, + ioptions_.compaction_style, vstorage_->base_level(), + ioptions_.level_compaction_dynamic_level_bytes), mutable_cf_options_.max_compaction_bytes, GetPathId(ioptions_, mutable_cf_options_, output_level_), GetCompressionType(ioptions_, vstorage_, mutable_cf_options_, output_level_, vstorage_->base_level()), - std::move(grandparents_), is_manual_, start_level_score_, - false /* deletion_compaction */, compaction_reason_); + GetCompressionOptions(ioptions_, vstorage_, output_level_), + /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, + start_level_score_, false /* deletion_compaction */, compaction_reason_); // If it's level 0 compaction, make sure we don't execute any other level 0 // compactions in parallel @@ -1271,32 +1404,47 @@ uint32_t LevelCompactionBuilder::GetPathId( const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, int level) { uint32_t p = 0; - assert(!ioptions.db_paths.empty()); + assert(!ioptions.cf_paths.empty()); // size remaining in the most recent path - uint64_t current_path_size = ioptions.db_paths[0].target_size; + uint64_t current_path_size = ioptions.cf_paths[0].target_size; uint64_t level_size; int cur_level = 0; + // max_bytes_for_level_base denotes L1 size. + // We estimate L0 size to be the same as L1. level_size = mutable_cf_options.max_bytes_for_level_base; // Last path is the fallback - while (p < ioptions.db_paths.size() - 1) { + while (p < ioptions.cf_paths.size() - 1) { if (level_size <= current_path_size) { if (cur_level == level) { // Does desired level fit in this path? return p; } else { current_path_size -= level_size; - level_size = static_cast( - level_size * mutable_cf_options.max_bytes_for_level_multiplier); + if (cur_level > 0) { + if (ioptions.level_compaction_dynamic_level_bytes) { + // Currently, level_compaction_dynamic_level_bytes is ignored when + // multiple db paths are specified. https://github.com/facebook/ + // rocksdb/blob/master/db/column_family.cc. 
+ // Still, adding this check to avoid accidentally using + // max_bytes_for_level_multiplier_additional + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier); + } else { + level_size = static_cast<uint64_t>( + level_size * mutable_cf_options.max_bytes_for_level_multiplier * + mutable_cf_options.MaxBytesMultiplerAdditional(cur_level)); + } + } cur_level++; continue; } } p++; - current_path_size = ioptions.db_paths[p].target_size; + current_path_size = ioptions.cf_paths[p].target_size; } return p; } @@ -1400,195 +1548,4 @@ Compaction* LevelCompactionPicker::PickCompaction( return builder.PickCompaction(); } -#ifndef ROCKSDB_LITE -bool FIFOCompactionPicker::NeedsCompaction( - const VersionStorageInfo* vstorage) const { - const int kLevel0 = 0; - return vstorage->CompactionScore(kLevel0) >= 1; -} - -namespace { -uint64_t GetTotalFilesSize( - const std::vector<FileMetaData*>& files) { - uint64_t total_size = 0; - for (const auto& f : files) { - total_size += f->fd.file_size; - } - return total_size; -} -} // anonymous namespace - -Compaction* FIFOCompactionPicker::PickTTLCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - assert(ioptions_.compaction_options_fifo.ttl > 0); - - const int kLevel0 = 0; - const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0); - uint64_t total_size = GetTotalFilesSize(level_files); - - int64_t _current_time; - auto status = ioptions_.env->GetCurrentTime(&_current_time); - if (!status.ok()) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: Couldn't get current time: %s. " - "Not doing compactions based on TTL. ", - cf_name.c_str(), status.ToString().c_str()); - return nullptr; - } - const uint64_t current_time = static_cast<uint64_t>(_current_time); - - std::vector<CompactionInputFiles> inputs; - inputs.emplace_back(); - inputs[0].level = 0; - - // avoid underflow - if (current_time > ioptions_.compaction_options_fifo.ttl) { - for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - auto f = *ritr; - if (f->fd.table_reader != nullptr && - f->fd.table_reader->GetTableProperties() != nullptr) { - auto creation_time = - f->fd.table_reader->GetTableProperties()->creation_time; - if (creation_time == 0 || - creation_time >= - (current_time - ioptions_.compaction_options_fifo.ttl)) { - break; - } - total_size -= f->compensated_file_size; - inputs[0].files.push_back(f); - } - } - } - - // Return a nullptr and proceed to size-based FIFO compaction if: - // 1. there are no files older than ttl OR - // 2. there are a few files older than ttl, but deleting them will not bring - // the total size to be less than max_table_files_size threshold.
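A worked sketch of the GetPathId() sizing walk patched above: levels are assigned to cf_paths in order, with L0/L1 estimated at max_bytes_for_level_base and deeper levels grown by the multiplier (all numbers here are made up):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint64_t> path_target_sizes = {300, 3000, 0};  // last = fallback
  const uint64_t max_bytes_for_level_base = 100;  // estimated L0 and L1 size
  const double multiplier = 10.0;
  const int wanted_level = 2;

  uint32_t p = 0;
  uint64_t current_path_size = path_target_sizes[0];
  uint64_t level_size = max_bytes_for_level_base;
  int cur_level = 0;
  while (p < path_target_sizes.size() - 1) {
    if (level_size <= current_path_size) {
      if (cur_level == wanted_level) break;  // wanted level fits here
      current_path_size -= level_size;
      if (cur_level > 0) {  // L0 -> L1 keeps the base size estimate
        level_size = static_cast<uint64_t>(level_size * multiplier);
      }
      cur_level++;
      continue;
    }
    current_path_size = path_target_sizes[++p];
  }
  // L0 = 100 and L1 = 100 fit in path 0 (300); L2 = 1000 needs path 1.
  std::printf("level %d -> path %u\n", wanted_level, p);  // path 1
  return 0;
}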
- if (inputs[0].files.empty() || - total_size > ioptions_.compaction_options_fifo.max_table_files_size) { - return nullptr; - } - - for (const auto& f : inputs[0].files) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with creation time %" PRIu64 " for deletion", - cf_name.c_str(), f->fd.GetNumber(), - f->fd.table_reader->GetTableProperties()->creation_time); - } - - Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, {}, /* is manual */ false, vstorage->CompactionScore(0), - /* is deletion compaction */ true, CompactionReason::kFIFOTtl); - return c; -} - -Compaction* FIFOCompactionPicker::PickSizeCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - const int kLevel0 = 0; - const std::vector& level_files = vstorage->LevelFiles(kLevel0); - uint64_t total_size = GetTotalFilesSize(level_files); - - if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || - level_files.size() == 0) { - // total size not exceeded - if (ioptions_.compaction_options_fifo.allow_compaction && - level_files.size() > 0) { - CompactionInputFiles comp_inputs; - if (FindIntraL0Compaction( - level_files, - mutable_cf_options - .level0_file_num_compaction_trigger /* min_files_to_compact */, - mutable_cf_options.write_buffer_size, &comp_inputs)) { - Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, - 16 * 1024 * 1024 /* output file size limit */, - 0 /* max compaction bytes, not applicable */, - 0 /* output path ID */, mutable_cf_options.compression, {}, - /* is manual */ false, vstorage->CompactionScore(0), - /* is deletion compaction */ false, - CompactionReason::kFIFOReduceNumFiles); - return c; - } - } - - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 - ", max size %" PRIu64 "\n", - cf_name.c_str(), total_size, - ioptions_.compaction_options_fifo.max_table_files_size); - return nullptr; - } - - if (!level0_compactions_in_progress_.empty()) { - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] FIFO compaction: Already executing compaction. 
No need " - "to run parallel compactions since compactions are very fast", - cf_name.c_str()); - return nullptr; - } - - std::vector inputs; - inputs.emplace_back(); - inputs[0].level = 0; - - for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - auto f = *ritr; - total_size -= f->compensated_file_size; - inputs[0].files.push_back(f); - char tmp_fsize[16]; - AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with size %s for deletion", - cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); - if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { - break; - } - } - - Compaction* c = new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, - kNoCompression, {}, /* is manual */ false, vstorage->CompactionScore(0), - /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); - return c; -} - -Compaction* FIFOCompactionPicker::PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - assert(vstorage->num_levels() == 1); - - Compaction* c = nullptr; - if (ioptions_.compaction_options_fifo.ttl > 0) { - c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); - } - if (c == nullptr) { - c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); - } - RegisterCompaction(c); - return c; -} - -Compaction* FIFOCompactionPicker::CompactRange( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, bool* manual_conflict) { - assert(input_level == 0); - assert(output_level == 0); - *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); - Compaction* c = - PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); - log_buffer.FlushBufferToLog(); - return c; -} - -#endif // !ROCKSDB_LITE - } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/compaction_picker.h b/thirdparty/rocksdb/db/compaction_picker.h index f44139c2dd..c60d792852 100644 --- a/thirdparty/rocksdb/db/compaction_picker.h +++ b/thirdparty/rocksdb/db/compaction_picker.h @@ -58,7 +58,8 @@ class CompactionPicker { virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + uint32_t output_path_id, uint32_t max_subcompactions, + const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict); // The maximum allowed output level. Default value is NumberLevels() - 1. @@ -88,6 +89,10 @@ class CompactionPicker { // Takes a list of CompactionInputFiles and returns a (manual) Compaction // object. + // + // Caller must provide a set of input files that has been passed through + // `SanitizeCompactionInputFiles` earlier. The lock should not be released + // between that call and this one. Compaction* CompactFiles(const CompactionOptions& compact_options, const std::vector& input_files, int output_level, VersionStorageInfo* vstorage, @@ -146,7 +151,8 @@ class CompactionPicker { // Will return false if it is impossible to apply this compaction. 
bool ExpandInputsToCleanCut(const std::string& cf_name, VersionStorageInfo* vstorage, - CompactionInputFiles* inputs); + CompactionInputFiles* inputs, + InternalKey** next_smallest = nullptr); // Returns true if any one of the parent files are being compacted bool IsRangeInCompaction(VersionStorageInfo* vstorage, @@ -170,6 +176,15 @@ class CompactionPicker { const CompactionInputFiles& output_level_inputs, std::vector* grandparents); + void PickFilesMarkedForCompaction(const std::string& cf_name, + VersionStorageInfo* vstorage, + int* start_level, int* output_level, + CompactionInputFiles* start_level_inputs); + + bool GetOverlappingL0Files(VersionStorageInfo* vstorage, + CompactionInputFiles* start_level_inputs, + int output_level, int* parent_index); + // Register this compaction in the set of running compactions void RegisterCompaction(Compaction* c); @@ -220,41 +235,6 @@ class LevelCompactionPicker : public CompactionPicker { }; #ifndef ROCKSDB_LITE -class FIFOCompactionPicker : public CompactionPicker { - public: - FIFOCompactionPicker(const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icmp) - : CompactionPicker(ioptions, icmp) {} - - virtual Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, - LogBuffer* log_buffer) override; - - virtual Compaction* CompactRange( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, bool* manual_conflict) override; - - // The maximum allowed output level. Always returns 0. - virtual int MaxOutputLevel() const override { return 0; } - - virtual bool NeedsCompaction( - const VersionStorageInfo* vstorage) const override; - - private: - Compaction* PickTTLCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, - LogBuffer* log_buffer); - - Compaction* PickSizeCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* version, - LogBuffer* log_buffer); -}; - class NullCompactionPicker : public CompactionPicker { public: NullCompactionPicker(const ImmutableCFOptions& ioptions, @@ -263,36 +243,49 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, - LogBuffer* log_buffer) override { + Compaction* PickCompaction(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, + LogBuffer* /*log_buffer*/) override { return nullptr; } // Always return "nullptr" - Compaction* CompactRange(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - VersionStorageInfo* vstorage, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, const InternalKey* end, - InternalKey** compaction_end, - bool* manual_conflict) override { + Compaction* CompactRange(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + VersionStorageInfo* /*vstorage*/, + int /*input_level*/, int /*output_level*/, + uint32_t /*output_path_id*/, + uint32_t /*max_subcompactions*/, + const InternalKey* /*begin*/, + const InternalKey* /*end*/, + InternalKey** 
/*compaction_end*/,
+                           bool* /*manual_conflict*/) override {
     return nullptr;
   }

   // Always returns false.
   virtual bool NeedsCompaction(
-      const VersionStorageInfo* vstorage) const override {
+      const VersionStorageInfo* /*vstorage*/) const override {
     return false;
   }
 };
 #endif  // !ROCKSDB_LITE

+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+                           size_t min_files_to_compact,
+                           uint64_t max_compact_bytes_per_del_file,
+                           CompactionInputFiles* comp_inputs);
+
 CompressionType GetCompressionType(const ImmutableCFOptions& ioptions,
                                    const VersionStorageInfo* vstorage,
                                    const MutableCFOptions& mutable_cf_options,
                                    int level, int base_level,
                                    const bool enable_compression = true);

+CompressionOptions GetCompressionOptions(const ImmutableCFOptions& ioptions,
+                                         const VersionStorageInfo* vstorage,
+                                         int level,
+                                         const bool enable_compression = true);
+
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/compaction_picker_fifo.cc b/thirdparty/rocksdb/db/compaction_picker_fifo.cc
new file mode 100644
index 0000000000..9229b2cfb1
--- /dev/null
+++ b/thirdparty/rocksdb/db/compaction_picker_fifo.cc
@@ -0,0 +1,234 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
+#include <string>
+#include <vector>
+#include "db/column_family.h"
+#include "util/log_buffer.h"
+#include "util/string_util.h"
+
+namespace rocksdb {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+  uint64_t total_size = 0;
+  for (const auto& f : files) {
+    total_size += f->fd.file_size;
+  }
+  return total_size;
+}
+}  // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+    const VersionStorageInfo* vstorage) const {
+  const int kLevel0 = 0;
+  return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  assert(mutable_cf_options.ttl > 0);
+
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  int64_t _current_time;
+  auto status = ioptions_.env->GetCurrentTime(&_current_time);
+  if (!status.ok()) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: Couldn't get current time: %s. "
+                     "Not doing compactions based on TTL. ",
+                     cf_name.c_str(), status.ToString().c_str());
+    return nullptr;
+  }
+  const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+  std::vector<CompactionInputFiles> inputs;
+  inputs.emplace_back();
+  inputs[0].level = 0;
+
+  // avoid underflow
+  if (current_time > mutable_cf_options.ttl) {
+    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+      auto f = *ritr;
+      if (f->fd.table_reader != nullptr &&
+          f->fd.table_reader->GetTableProperties() != nullptr) {
+        auto creation_time =
+            f->fd.table_reader->GetTableProperties()->creation_time;
+        if (creation_time == 0 ||
+            creation_time >= (current_time - mutable_cf_options.ttl)) {
+          break;
+        }
+        total_size -= f->compensated_file_size;
+        inputs[0].files.push_back(f);
+      }
+    }
+  }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl OR
+  // 2. there are a few files older than ttl, but deleting them will not bring
+  //    the total size to be less than max_table_files_size threshold.
+  if (inputs[0].files.empty() ||
+      total_size >
+          mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+    return nullptr;
+  }
+
+  for (const auto& f : inputs[0].files) {
+    ROCKS_LOG_BUFFER(log_buffer,
+                     "[%s] FIFO compaction: picking file %" PRIu64
+                     " with creation time %" PRIu64 " for deletion",
+                     cf_name.c_str(), f->fd.GetNumber(),
+                     f->fd.table_reader->GetTableProperties()->creation_time);
+  }
+
+  Compaction* c = new Compaction(
+      vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
+      kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0,
+      {}, /* is manual */ false, vstorage->CompactionScore(0),
+      /* is deletion compaction */ true, CompactionReason::kFIFOTtl);
+  return c;
+}
+
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+    VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
+  const int kLevel0 = 0;
+  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  uint64_t total_size = GetTotalFilesSize(level_files);
+
+  if (total_size <=
+          mutable_cf_options.compaction_options_fifo.max_table_files_size ||
+      level_files.size() == 0) {
+    // total size not exceeded
+    if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+        level_files.size() > 0) {
+      CompactionInputFiles comp_inputs;
+      // try to prevent same files from being compacted multiple times, which
+      // could produce large files that may never TTL-expire. Achieve this by
+      // disallowing compactions with files larger than memtable (inflate its
+      // size by 10% to account for uncompressed L0 files that may have size
+      // slightly greater than memtable size limit).
+ size_t max_compact_bytes_per_del_file = + static_cast(MultiplyCheckOverflow( + static_cast(mutable_cf_options.write_buffer_size), + 1.1)); + if (FindIntraL0Compaction( + level_files, + mutable_cf_options + .level0_file_num_compaction_trigger /* min_files_to_compact */ + , + max_compact_bytes_per_del_file, &comp_inputs)) { + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0, + 16 * 1024 * 1024 /* output file size limit */, + 0 /* max compaction bytes, not applicable */, + 0 /* output path ID */, mutable_cf_options.compression, + ioptions_.compression_opts, 0 /* max_subcompactions */, {}, + /* is manual */ false, vstorage->CompactionScore(0), + /* is deletion compaction */ false, + CompactionReason::kFIFOReduceNumFiles); + return c; + } + } + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 + ", max size %" PRIu64 "\n", + cf_name.c_str(), total_size, + mutable_cf_options.compaction_options_fifo.max_table_files_size); + return nullptr; + } + + if (!level0_compactions_in_progress_.empty()) { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Already executing compaction. No need " + "to run parallel compactions since compactions are very fast", + cf_name.c_str()); + return nullptr; + } + + std::vector inputs; + inputs.emplace_back(); + inputs[0].level = 0; + + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { + auto f = *ritr; + total_size -= f->compensated_file_size; + inputs[0].files.push_back(f); + char tmp_fsize[16]; + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with size %s for deletion", + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= + mutable_cf_options.compaction_options_fifo.max_table_files_size) { + break; + } + } + + Compaction* c = new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0, + kNoCompression, ioptions_.compression_opts, /* max_subcompactions */ 0, + {}, /* is manual */ false, vstorage->CompactionScore(0), + /* is deletion compaction */ true, CompactionReason::kFIFOMaxSize); + return c; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + assert(vstorage->num_levels() == 1); + + Compaction* c = nullptr; + if (mutable_cf_options.ttl > 0) { + c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + } + if (c == nullptr) { + c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer); + } + RegisterCompaction(c); + return c; +} + +Compaction* FIFOCompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t /*output_path_id*/, uint32_t /*max_subcompactions*/, + const InternalKey* /*begin*/, const InternalKey* /*end*/, + InternalKey** compaction_end, bool* /*manual_conflict*/) { +#ifdef NDEBUG + (void)input_level; + (void)output_level; +#endif + assert(input_level == 0); + assert(output_level == 0); + *compaction_end = nullptr; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + Compaction* c = + PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); + log_buffer.FlushBufferToLog(); + return c; +} + +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git 
a/thirdparty/rocksdb/db/compaction_picker_fifo.h b/thirdparty/rocksdb/db/compaction_picker_fifo.h new file mode 100644 index 0000000000..015fd42ddb --- /dev/null +++ b/thirdparty/rocksdb/db/compaction_picker_fifo.h @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#ifndef ROCKSDB_LITE + +#include "db/compaction_picker.h" + +namespace rocksdb { +class FIFOCompactionPicker : public CompactionPicker { + public: + FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; + + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, uint32_t max_subcompactions, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end, bool* manual_conflict) override; + + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { return 0; } + + virtual bool NeedsCompaction( + const VersionStorageInfo* vstorage) const override; + + private: + Compaction* PickTTLCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); + + Compaction* PickSizeCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, + LogBuffer* log_buffer); +}; +} // namespace rocksdb +#endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/compaction_picker_test.cc b/thirdparty/rocksdb/db/compaction_picker_test.cc index bba2d073d8..31325c1289 100644 --- a/thirdparty/rocksdb/db/compaction_picker_test.cc +++ b/thirdparty/rocksdb/db/compaction_picker_test.cc @@ -4,10 +4,12 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/compaction_picker.h" + #include #include #include #include "db/compaction.h" +#include "db/compaction_picker_fifo.h" #include "db/compaction_picker_universal.h" #include "util/logging.h" @@ -20,7 +22,7 @@ namespace rocksdb { class CountingLogger : public Logger { public: using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { log_count++; } + void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; } size_t log_count; }; @@ -55,14 +57,16 @@ class CompactionPickerTest : public testing::Test { log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), file_num_(1), vstorage_(nullptr) { + // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of + // tests to cover. 
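// [Annotation, not part of the upstream patch.] The fixture pins
// compaction_pri to kByCompensatedSize (pick the file with the largest
// compensated size first); the CompactionPriMinOverlapping* tests below switch
// to kMinOverlappingRatio, which prefers the file whose bytes overlap the
// least data in the output level relative to its own compensated size.
// Hypothetical helper for intuition only, not a RocksDB API:

#include <cstdint>

double OverlappingRatio(uint64_t overlapping_bytes_in_output_level,
                        uint64_t compensated_file_size) {
  // a lower ratio means less write amplification, so that file is picked first
  return static_cast<double>(overlapping_bytes_in_output_level) /
         static_cast<double>(compensated_file_size);
}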
+ ioptions_.compaction_pri = kByCompensatedSize; fifo_options_.max_table_files_size = 1; mutable_cf_options_.RefreshDerivedOptions(ioptions_); - ioptions_.db_paths.emplace_back("dummy", + ioptions_.cf_paths.emplace_back("dummy", std::numeric_limits::max()); } - ~CompactionPickerTest() { - } + ~CompactionPickerTest() override {} void NewVersionStorage(int num_levels, CompactionStyle style) { DeleteVersionStorage(); @@ -81,16 +85,17 @@ class CompactionPickerTest : public testing::Test { void Add(int level, uint32_t file_number, const char* smallest, const char* largest, uint64_t file_size = 1, uint32_t path_id = 0, - SequenceNumber smallest_seq = 100, - SequenceNumber largest_seq = 100) { + SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100, + size_t compensated_file_size = 0) { assert(level < vstorage_->num_levels()); FileMetaData* f = new FileMetaData; f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); - f->smallest_seqno = smallest_seq; - f->largest_seqno = largest_seq; - f->compensated_file_size = file_size; + f->fd.smallest_seqno = smallest_seq; + f->fd.largest_seqno = largest_seq; + f->compensated_file_size = + (compensated_file_size != 0) ? compensated_file_size : file_size; f->refs = 0; vstorage_->AddFile(level, f); files_.emplace_back(f); @@ -175,6 +180,8 @@ TEST_F(CompactionPickerTest, Level1Trigger) { } TEST_F(CompactionPickerTest, Level1Trigger2) { + mutable_cf_options_.target_file_size_base = 10000000000; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); NewVersionStorage(6, kCompactionStyleLevel); Add(1, 66U, "150", "200", 1000000001U); Add(1, 88U, "201", "300", 1000000000U); @@ -191,13 +198,14 @@ TEST_F(CompactionPickerTest, Level1Trigger2) { ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); + ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize()); } TEST_F(CompactionPickerTest, LevelMaxScore) { NewVersionStorage(6, kCompactionStyleLevel); mutable_cf_options_.target_file_size_base = 10000000; - mutable_cf_options_.target_file_size_multiplier = 10; mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); Add(0, 1U, "150", "200", 1000000U); // Level 1 score 1.2 Add(1, 66U, "150", "200", 6000000U); @@ -218,6 +226,9 @@ TEST_F(CompactionPickerTest, LevelMaxScore) { ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(mutable_cf_options_.target_file_size_base + + mutable_cf_options_.target_file_size_base / 10, + compaction->OutputFilePreallocationSize()); } TEST_F(CompactionPickerTest, NeedsCompactionLevel) { @@ -383,10 +394,10 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { NewVersionStorage(1, kCompactionStyleUniversal); UniversalCompactionPicker universal_compaction_picker( ioptions_, &icmp_); + UpdateVersionStorageInfo(); // must return false when there's no files. ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), false); - UpdateVersionStorageInfo(); // verify the trigger given different number of L0 files. 
for (int i = 1; @@ -407,6 +418,7 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { ioptions_.allow_ingest_behind = true; ioptions_.num_levels = 3; UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); // must return false when there's no files. ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), false); @@ -437,9 +449,10 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { const uint64_t kFileSize = 100000; - ioptions_.compaction_options_universal.allow_trivial_move = true; + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; NewVersionStorage(1, kCompactionStyleUniversal); UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + UpdateVersionStorageInfo(); // must return false when there's no files. ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), false); @@ -468,7 +481,7 @@ TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) { TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) { const uint64_t kFileSize = 100000; - ioptions_.compaction_options_universal.allow_trivial_move = true; + mutable_cf_options_.compaction_options_universal.allow_trivial_move = true; UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); NewVersionStorage(3, kCompactionStyleUniversal); @@ -496,7 +509,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { const uint64_t kMaxSize = kFileSize * kFileCount / 2; fifo_options_.max_table_files_size = kMaxSize; - ioptions_.compaction_options_fifo = fifo_options_; + mutable_cf_options_.compaction_options_fifo = fifo_options_; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); UpdateVersionStorageInfo(); // must return false when there's no files. @@ -521,9 +534,10 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { NewVersionStorage(6, kCompactionStyleLevel); ioptions_.compaction_pri = kMinOverlappingRatio; - mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_base = 100000000000; mutable_cf_options_.target_file_size_multiplier = 10; mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); Add(2, 6U, "150", "179", 50000000U); Add(2, 7U, "180", "220", 50000000U); @@ -543,6 +557,8 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { ASSERT_EQ(1U, compaction->num_input_files(0)); // Pick file 8 because it overlaps with 0 files on level 3. ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); + // Compaction input size * 1.1 + ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize()); } TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { @@ -602,6 +618,35 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) { + NewVersionStorage(6, kCompactionStyleLevel); + ioptions_.compaction_pri = kMinOverlappingRatio; + mutable_cf_options_.max_bytes_for_level_base = 10000000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. + // Overlaps with file 26, 27. And the file is compensated so will be + // picked up. 
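// [Annotation, not part of the upstream patch.] Working the Add() calls below
// through the hypothetical OverlappingRatio() helper from the annotation
// above (the new trailing Add() argument is the compensated file size, and
// only the real, un-boosted sizes count as overlap in the output level):

#include <cassert>

void CheckMinOverlappingPick() {
  double r6 = OverlappingRatio(120000000, 180000000);  // file 6: ~0.67
  double r7 = OverlappingRatio(60000000, 60000000);    // file 7: 1.00
  double r8 = OverlappingRatio(60000000, 61000000);    // file 8: ~0.98
  assert(r6 < r8 && r8 < r7);  // lowest ratio wins: ASSERT_EQ(6U, ...) below
}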
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U); + Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28 + + Add(3, 26U, "160", "165", 60000000U); + // Boosted file size in output level is not considered. + Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U); + Add(3, 28U, "180", "400", 60000000U); + Add(3, 29U, "401", "500", 60000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber()); +} + // This test exhibits the bug where we don't properly reset parent_index in // PickCompaction() TEST_F(CompactionPickerTest, ParentIndexResetBug) { diff --git a/thirdparty/rocksdb/db/compaction_picker_universal.cc b/thirdparty/rocksdb/db/compaction_picker_universal.cc index 14533fbcdd..9291178585 100644 --- a/thirdparty/rocksdb/db/compaction_picker_universal.cc +++ b/thirdparty/rocksdb/db/compaction_picker_universal.cc @@ -35,7 +35,7 @@ namespace { // and the index of the file in that level struct InputFileInfo { - InputFileInfo() : f(nullptr) {} + InputFileInfo() : f(nullptr), level(0), index(0) {} FileMetaData* f; size_t level; @@ -97,17 +97,17 @@ void GetSmallestLargestSeqno(const std::vector& files, SequenceNumber* largest_seqno) { bool is_first = true; for (FileMetaData* f : files) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; - *smallest_seqno = f->smallest_seqno; - *largest_seqno = f->largest_seqno; + *smallest_seqno = f->fd.smallest_seqno; + *largest_seqno = f->fd.largest_seqno; } else { - if (f->smallest_seqno < *smallest_seqno) { - *smallest_seqno = f->smallest_seqno; + if (f->fd.smallest_seqno < *smallest_seqno) { + *smallest_seqno = f->fd.smallest_seqno; } - if (f->largest_seqno > *largest_seqno) { - *largest_seqno = f->largest_seqno; + if (f->fd.largest_seqno > *largest_seqno) { + *largest_seqno = f->fd.largest_seqno; } } } @@ -162,7 +162,13 @@ bool UniversalCompactionPicker::IsInputFilesNonOverlapping(Compaction* c) { bool UniversalCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage) const { const int kLevel0 = 0; - return vstorage->CompactionScore(kLevel0) >= 1; + if (vstorage->CompactionScore(kLevel0) >= 1) { + return true; + } + if (!vstorage->FilesMarkedForCompaction().empty()) { + return true; + } + return false; } void UniversalCompactionPicker::SortedRun::Dump(char* out_buf, @@ -204,7 +210,8 @@ void UniversalCompactionPicker::SortedRun::DumpSizeInfo( std::vector UniversalCompactionPicker::CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions) { + const VersionStorageInfo& vstorage, const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& mutable_cf_options) { std::vector ret; for (FileMetaData* f : vstorage.LevelFiles(0)) { ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size, @@ -218,7 +225,8 @@ UniversalCompactionPicker::CalculateSortedRuns( for (FileMetaData* f : vstorage.LevelFiles(level)) { total_compensated_size += f->compensated_file_size; total_size += f->fd.GetFileSize(); - if (ioptions.compaction_options_universal.allow_trivial_move == true) { + if 
(mutable_cf_options.compaction_options_universal.allow_trivial_move == + true) { if (f->being_compacted) { being_compacted = f->being_compacted; } @@ -227,7 +235,8 @@ UniversalCompactionPicker::CalculateSortedRuns( // non-zero level, all the files should share the same being_compacted // value. // This assumption is only valid when - // ioptions.compaction_options_universal.allow_trivial_move is false + // mutable_cf_options.compaction_options_universal.allow_trivial_move is + // false assert(is_first || f->being_compacted == being_compacted); } if (is_first) { @@ -245,18 +254,18 @@ UniversalCompactionPicker::CalculateSortedRuns( // Universal style of compaction. Pick files that are contiguous in // time-range to compact. -// Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { const int kLevel0 = 0; double score = vstorage->CompactionScore(kLevel0); std::vector sorted_runs = - CalculateSortedRuns(*vstorage, ioptions_); + CalculateSortedRuns(*vstorage, ioptions_, mutable_cf_options); if (sorted_runs.size() == 0 || - sorted_runs.size() < - (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger) { + (vstorage->FilesMarkedForCompaction().empty() && + sorted_runs.size() < (unsigned int)mutable_cf_options + .level0_file_num_compaction_trigger)) { ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: nothing to do\n", cf_name.c_str()); TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", @@ -270,64 +279,81 @@ Compaction* UniversalCompactionPicker::PickCompaction( cf_name.c_str(), sorted_runs.size(), vstorage->LevelSummary(&tmp)); // Check for size amplification first. - Compaction* c; - if ((c = PickCompactionToReduceSizeAmp(cf_name, mutable_cf_options, vstorage, - score, sorted_runs, log_buffer)) != - nullptr) { - ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: compacting for size amp\n", - cf_name.c_str()); - } else { - // Size amplification is within limits. Try reducing read - // amplification while maintaining file size ratios. - unsigned int ratio = ioptions_.compaction_options_universal.size_ratio; - - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, ratio, UINT_MAX, - sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Universal: compacting for size ratio\n", + Compaction* c = nullptr; + if (sorted_runs.size() >= + static_cast( + mutable_cf_options.level0_file_num_compaction_trigger)) { + if ((c = PickCompactionToReduceSizeAmp(cf_name, mutable_cf_options, + vstorage, score, sorted_runs, + log_buffer)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, "[%s] Universal: compacting for size amp\n", cf_name.c_str()); } else { - // Size amplification and file size ratios are within configured limits. - // If max read amplification is exceeding configured limits, then force - // compaction without looking at filesize ratios and try to reduce - // the number of files to fewer than level0_file_num_compaction_trigger. - // This is guaranteed by NeedsCompaction() - assert(sorted_runs.size() >= - static_cast( - mutable_cf_options.level0_file_num_compaction_trigger)); - // Get the total number of sorted runs that are not being compacted - int num_sr_not_compacted = 0; - for (size_t i = 0; i < sorted_runs.size(); i++) { - if (sorted_runs[i].being_compacted == false) { - num_sr_not_compacted++; + // Size amplification is within limits. 
Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = + mutable_cf_options.compaction_options_universal.size_ratio; + + if ((c = PickCompactionToReduceSortedRuns( + cf_name, mutable_cf_options, vstorage, score, ratio, UINT_MAX, + sorted_runs, log_buffer)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Universal: compacting for size ratio\n", + cf_name.c_str()); + } else { + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + // This is guaranteed by NeedsCompaction() + assert(sorted_runs.size() >= + static_cast( + mutable_cf_options.level0_file_num_compaction_trigger)); + // Get the total number of sorted runs that are not being compacted + int num_sr_not_compacted = 0; + for (size_t i = 0; i < sorted_runs.size(); i++) { + if (sorted_runs[i].being_compacted == false) { + num_sr_not_compacted++; + } } - } - // The number of sorted runs that are not being compacted is greater than - // the maximum allowed number of sorted runs - if (num_sr_not_compacted > - mutable_cf_options.level0_file_num_compaction_trigger) { - unsigned int num_files = - num_sr_not_compacted - - mutable_cf_options.level0_file_num_compaction_trigger + 1; - if ((c = PickCompactionToReduceSortedRuns( - cf_name, mutable_cf_options, vstorage, score, UINT_MAX, - num_files, sorted_runs, log_buffer)) != nullptr) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Universal: compacting for file num -- %u\n", - cf_name.c_str(), num_files); + // The number of sorted runs that are not being compacted is greater + // than the maximum allowed number of sorted runs + if (num_sr_not_compacted > + mutable_cf_options.level0_file_num_compaction_trigger) { + unsigned int num_files = + num_sr_not_compacted - + mutable_cf_options.level0_file_num_compaction_trigger + 1; + if ((c = PickCompactionToReduceSortedRuns( + cf_name, mutable_cf_options, vstorage, score, UINT_MAX, + num_files, sorted_runs, log_buffer)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Universal: compacting for file num -- %u\n", + cf_name.c_str(), num_files); + } } } } } + + if (c == nullptr) { + if ((c = PickDeleteTriggeredCompaction(cf_name, mutable_cf_options, + vstorage, score, sorted_runs, + log_buffer)) != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Universal: delete triggered compaction\n", + cf_name.c_str()); + } + } + if (c == nullptr) { TEST_SYNC_POINT_CALLBACK("UniversalCompactionPicker::PickCompaction:Return", nullptr); return nullptr; } - if (ioptions_.compaction_options_universal.allow_trivial_move == true) { + if (mutable_cf_options.compaction_options_universal.allow_trivial_move == + true) { c->set_is_trivial_move(IsInputFilesNonOverlapping(c)); } @@ -339,11 +365,11 @@ Compaction* UniversalCompactionPicker::PickCompaction( size_t level_index = 0U; if (c->start_level() == 0) { for (auto f : *c->inputs(0)) { - assert(f->smallest_seqno <= f->largest_seqno); + assert(f->fd.smallest_seqno <= f->fd.largest_seqno); if (is_first) { is_first = false; } - prev_smallest_seqno = f->smallest_seqno; + prev_smallest_seqno = f->fd.smallest_seqno; } level_index = 1U; } @@ -369,8 +395,8 @@ Compaction* UniversalCompactionPicker::PickCompaction( } #endif // update statistics - MeasureTime(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); + 
RecordInHistogram(ioptions_.statistics, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); RegisterCompaction(c); vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options); @@ -381,7 +407,8 @@ Compaction* UniversalCompactionPicker::PickCompaction( } uint32_t UniversalCompactionPicker::GetPathId( - const ImmutableCFOptions& ioptions, uint64_t file_size) { + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, uint64_t file_size) { // Two conditions need to be satisfied: // (1) the target path needs to be able to hold the file's size // (2) Total size left in this and previous paths need to be not @@ -398,12 +425,12 @@ uint32_t UniversalCompactionPicker::GetPathId( // that case. We need to improve it. uint64_t accumulated_size = 0; uint64_t future_size = - file_size * (100 - ioptions.compaction_options_universal.size_ratio) / - 100; + file_size * + (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100; uint32_t p = 0; - assert(!ioptions.db_paths.empty()); - for (; p < ioptions.db_paths.size() - 1; p++) { - uint64_t target_size = ioptions.db_paths[p].target_size; + assert(!ioptions.cf_paths.empty()); + for (; p < ioptions.cf_paths.size() - 1; p++) { + uint64_t target_size = ioptions.cf_paths[p].target_size; if (target_size > file_size && accumulated_size + (target_size - file_size) > future_size) { return p; @@ -423,9 +450,9 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( unsigned int max_number_of_files_to_compact, const std::vector& sorted_runs, LogBuffer* log_buffer) { unsigned int min_merge_width = - ioptions_.compaction_options_universal.min_merge_width; + mutable_cf_options.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - ioptions_.compaction_options_universal.max_merge_width; + mutable_cf_options.compaction_options_universal.max_merge_width; const SortedRun* sr = nullptr; bool done = false; @@ -492,7 +519,7 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( if (sz < static_cast(succeeding_sr->size)) { break; } - if (ioptions_.compaction_options_universal.stop_style == + if (mutable_cf_options.compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. @@ -535,7 +562,7 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( // size ratio of compression. 
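// [Annotation, not part of the upstream patch.] The loop above keeps
// absorbing the next sorted run while the accumulated candidate stays within
// size_ratio percent of it (RocksDB's universal size-ratio rule, assuming the
// usual definition). Minimal sketch:

#include <cstdint>

bool CandidateAbsorbsNextRun(uint64_t candidate_size, uint64_t next_run_size,
                             unsigned int size_ratio_percent) {
  const double sz = candidate_size * (100.0 + size_ratio_percent) / 100.0;
  return sz >= static_cast<double>(next_run_size);  // false -> stop extending
}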
bool enable_compression = true; int ratio_to_compress = - ioptions_.compaction_options_universal.compression_size_percent; + mutable_cf_options.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { uint64_t total_size = 0; for (auto& sorted_run : sorted_runs) { @@ -556,7 +583,8 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( for (unsigned int i = 0; i < first_index_after; i++) { estimated_total_size += sorted_runs[i].size; } - uint32_t path_id = GetPathId(ioptions_, estimated_total_size); + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options, estimated_total_size); int start_level = sorted_runs[start_index].level; int output_level; if (first_index_after == sorted_runs.size()) { @@ -597,17 +625,21 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSortedRuns( CompactionReason compaction_reason; if (max_number_of_files_to_compact == UINT_MAX) { - compaction_reason = CompactionReason::kUniversalSortedRunNum; - } else { compaction_reason = CompactionReason::kUniversalSizeRatio; + } else { + compaction_reason = CompactionReason::kUniversalSortedRunNum; } return new Compaction( vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, - mutable_cf_options.MaxFileSizeForLevel(output_level), LLONG_MAX, path_id, + MaxFileSizeForLevel(mutable_cf_options, output_level, + kCompactionStyleUniversal), + LLONG_MAX, path_id, GetCompressionType(ioptions_, vstorage, mutable_cf_options, start_level, 1, enable_compression), - /* grandparents */ {}, /* is manual */ false, score, - false /* deletion_compaction */, compaction_reason); + GetCompressionOptions(ioptions_, vstorage, start_level, + enable_compression), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score, false /* deletion_compaction */, compaction_reason); } // Look at overall size amplification. 
If size amplification @@ -621,14 +653,18 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( VersionStorageInfo* vstorage, double score, const std::vector& sorted_runs, LogBuffer* log_buffer) { // percentage flexibility while reducing size amplification - uint64_t ratio = - ioptions_.compaction_options_universal.max_size_amplification_percent; + uint64_t ratio = mutable_cf_options.compaction_options_universal + .max_size_amplification_percent; unsigned int candidate_count = 0; uint64_t candidate_size = 0; size_t start_index = 0; const SortedRun* sr = nullptr; + if (sorted_runs.back().being_compacted) { + return nullptr; + } + // Skip files that are already being compacted for (size_t loop = 0; loop < sorted_runs.size() - 1; loop++) { sr = &sorted_runs[loop]; @@ -700,7 +736,8 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( for (size_t loop = start_index; loop < sorted_runs.size(); loop++) { estimated_total_size += sorted_runs[loop].size; } - uint32_t path_id = GetPathId(ioptions_, estimated_total_size); + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options, estimated_total_size); int start_level = sorted_runs[start_index].level; std::vector inputs(vstorage->num_levels()); @@ -734,15 +771,137 @@ Compaction* UniversalCompactionPicker::PickCompactionToReduceSizeAmp( } return new Compaction( - vstorage, ioptions_, mutable_cf_options, std::move(inputs), - output_level, mutable_cf_options.MaxFileSizeForLevel(output_level), + vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + kCompactionStyleUniversal), /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, - GetCompressionType(ioptions_, vstorage, mutable_cf_options, - output_level, 1), - /* grandparents */ {}, /* is manual */ false, score, - false /* deletion_compaction */, + GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, + 1), + GetCompressionOptions(ioptions_, vstorage, output_level), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false, + score, false /* deletion_compaction */, CompactionReason::kUniversalSizeAmplification); } + +// Pick files marked for compaction. Typically, files are marked by +// CompactOnDeleteCollector due to the presence of tombstones. +Compaction* UniversalCompactionPicker::PickDeleteTriggeredCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, + const std::vector& /*sorted_runs*/, LogBuffer* /*log_buffer*/) { + CompactionInputFiles start_level_inputs; + int output_level; + std::vector inputs; + + if (vstorage->num_levels() == 1) { + // This is single level universal. Since we're basically trying to reclaim + // space by processing files marked for compaction due to high tombstone + // density, let's do the same thing as compaction to reduce size amp which + // has the same goals. + bool compact = false; + + start_level_inputs.level = 0; + start_level_inputs.files.clear(); + output_level = 0; + for (FileMetaData* f : vstorage->LevelFiles(0)) { + if (f->marked_for_compaction) { + compact = true; + } + if (compact) { + start_level_inputs.files.push_back(f); + } + } + if (start_level_inputs.size() <= 1) { + // If only the last file in L0 is marked for compaction, ignore it + return nullptr; + } + inputs.push_back(start_level_inputs); + } else { + int start_level; + + // For multi-level universal, the strategy is to make this look more like + // leveled. 
We pick one of the files marked for compaction and compact with + // overlapping files in the adjacent level. + PickFilesMarkedForCompaction(cf_name, vstorage, &start_level, &output_level, + &start_level_inputs); + if (start_level_inputs.empty()) { + return nullptr; + } + + // Pick the first non-empty level after the start_level + for (output_level = start_level + 1; output_level < vstorage->num_levels(); + output_level++) { + if (vstorage->NumLevelFiles(output_level) != 0) { + break; + } + } + + // If all higher levels are empty, pick the highest level as output level + if (output_level == vstorage->num_levels()) { + if (start_level == 0) { + output_level = vstorage->num_levels() - 1; + } else { + // If start level is non-zero and all higher levels are empty, this + // compaction will translate into a trivial move. Since the idea is + // to reclaim space and trivial move doesn't help with that, we + // skip compaction in this case and return nullptr + return nullptr; + } + } + if (ioptions_.allow_ingest_behind && + output_level == vstorage->num_levels() - 1) { + assert(output_level > 1); + output_level--; + } + + if (output_level != 0) { + if (start_level == 0) { + if (!GetOverlappingL0Files(vstorage, &start_level_inputs, output_level, + nullptr)) { + return nullptr; + } + } + + CompactionInputFiles output_level_inputs; + int parent_index = -1; + + output_level_inputs.level = output_level; + if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, + &start_level_inputs, &output_level_inputs, + &parent_index, -1)) { + return nullptr; + } + inputs.push_back(start_level_inputs); + if (!output_level_inputs.empty()) { + inputs.push_back(output_level_inputs); + } + if (FilesRangeOverlapWithCompaction(inputs, output_level)) { + return nullptr; + } + } else { + inputs.push_back(start_level_inputs); + } + } + + uint64_t estimated_total_size = 0; + // Use size of the output level as estimated file size + for (FileMetaData* f : vstorage->LevelFiles(output_level)) { + estimated_total_size += f->fd.GetFileSize(); + } + uint32_t path_id = + GetPathId(ioptions_, mutable_cf_options, estimated_total_size); + return new Compaction( + vstorage, ioptions_, mutable_cf_options, std::move(inputs), output_level, + MaxFileSizeForLevel(mutable_cf_options, output_level, + kCompactionStyleUniversal), + /* max_grandparent_overlap_bytes */ LLONG_MAX, path_id, + GetCompressionType(ioptions_, vstorage, mutable_cf_options, output_level, + 1), + GetCompressionOptions(ioptions_, vstorage, output_level), + /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ true, + score, false /* deletion_compaction */, + CompactionReason::kFilesMarkedForCompaction); +} } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/compaction_picker_universal.h b/thirdparty/rocksdb/db/compaction_picker_universal.h index 3f2bed3e62..375e5998e2 100644 --- a/thirdparty/rocksdb/db/compaction_picker_universal.h +++ b/thirdparty/rocksdb/db/compaction_picker_universal.h @@ -73,6 +73,11 @@ class UniversalCompactionPicker : public CompactionPicker { VersionStorageInfo* vstorage, double score, const std::vector& sorted_runs, LogBuffer* log_buffer); + Compaction* PickDeleteTriggeredCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, + const std::vector& sorted_runs, LogBuffer* log_buffer); + // Used in universal compaction when the enabled_trivial_move // option is set. Checks whether there are any overlapping files // in the input. 
Returns true if the input files are non @@ -80,11 +85,13 @@ class UniversalCompactionPicker : public CompactionPicker { bool IsInputFilesNonOverlapping(Compaction* c); static std::vector CalculateSortedRuns( - const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions); + const VersionStorageInfo& vstorage, const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); // Pick a path ID to place a newly generated file, with its estimated file // size. static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, uint64_t file_size); }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/comparator_db_test.cc b/thirdparty/rocksdb/db/comparator_db_test.cc index 28a2a5658e..a7ff587949 100644 --- a/thirdparty/rocksdb/db/comparator_db_test.cc +++ b/thirdparty/rocksdb/db/comparator_db_test.cc @@ -2,6 +2,7 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include #include #include @@ -26,24 +27,24 @@ class KVIter : public Iterator { public: explicit KVIter(const stl_wrappers::KVMap* map) : map_(map), iter_(map_->end()) {} - virtual bool Valid() const override { return iter_ != map_->end(); } - virtual void SeekToFirst() override { iter_ = map_->begin(); } - virtual void SeekToLast() override { + bool Valid() const override { return iter_ != map_->end(); } + void SeekToFirst() override { iter_ = map_->begin(); } + void SeekToLast() override { if (map_->empty()) { iter_ = map_->end(); } else { iter_ = map_->find(map_->rbegin()->first); } } - virtual void Seek(const Slice& k) override { + void Seek(const Slice& k) override { iter_ = map_->lower_bound(k.ToString()); } - virtual void SeekForPrev(const Slice& k) override { + void SeekForPrev(const Slice& k) override { iter_ = map_->upper_bound(k.ToString()); Prev(); } - virtual void Next() override { ++iter_; } - virtual void Prev() override { + void Next() override { ++iter_; } + void Prev() override { if (iter_ == map_->begin()) { iter_ = map_->end(); return; @@ -51,9 +52,9 @@ class KVIter : public Iterator { --iter_; } - virtual Slice key() const override { return iter_->first; } - virtual Slice value() const override { return iter_->second; } - virtual Status status() const override { return Status::OK(); } + Slice key() const override { return iter_->first; } + Slice value() const override { return iter_->second; } + Status status() const override { return Status::OK(); } private: const stl_wrappers::KVMap* const map_; @@ -170,9 +171,9 @@ class DoubleComparator : public Comparator { public: DoubleComparator() {} - virtual const char* Name() const override { return "DoubleComparator"; } + const char* Name() const override { return "DoubleComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { #ifndef CYGWIN double da = std::stod(a.ToString()); double db = std::stod(b.ToString()); @@ -188,19 +189,19 @@ class DoubleComparator : public Comparator { return -1; } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; class HashComparator : public Comparator { 
public: HashComparator() {} - virtual const char* Name() const override { return "HashComparator"; } + const char* Name() const override { return "HashComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { uint32_t ha = Hash(a.data(), a.size(), 66); uint32_t hb = Hash(b.data(), b.size(), 66); if (ha == hb) { @@ -211,19 +212,19 @@ class HashComparator : public Comparator { return -1; } } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; class TwoStrComparator : public Comparator { public: TwoStrComparator() {} - virtual const char* Name() const override { return "TwoStrComparator"; } + const char* Name() const override { return "TwoStrComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { assert(a.size() >= 2); assert(b.size() >= 2); size_t size_a1 = static_cast(a[0]); @@ -243,14 +244,16 @@ class TwoStrComparator : public Comparator { } return a2.compare(b2); } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; } // namespace -class ComparatorDBTest : public testing::Test { +class ComparatorDBTest + : public testing::Test, + virtual public ::testing::WithParamInterface { private: std::string dbname_; Env* env_; @@ -261,11 +264,15 @@ class ComparatorDBTest : public testing::Test { public: ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { comparator = BytewiseComparator(); - dbname_ = test::TmpDir() + "/comparator_db_test"; + dbname_ = test::PerThreadDBPath("comparator_db_test"); + BlockBasedTableOptions toptions; + toptions.format_version = GetParam(); + last_options_.table_factory.reset( + rocksdb::NewBlockBasedTableFactory(toptions)); EXPECT_OK(DestroyDB(dbname_, last_options_)); } - ~ComparatorDBTest() { + ~ComparatorDBTest() override { delete db_; EXPECT_OK(DestroyDB(dbname_, last_options_)); comparator = BytewiseComparator(); @@ -273,8 +280,12 @@ class ComparatorDBTest : public testing::Test { DB* GetDB() { return db_; } - void SetOwnedComparator(const Comparator* cmp) { - comparator_guard.reset(cmp); + void SetOwnedComparator(const Comparator* cmp, bool owner = true) { + if (owner) { + comparator_guard.reset(cmp); + } else { + comparator_guard.reset(); + } comparator = cmp; last_options_.comparator = cmp; } @@ -303,7 +314,12 @@ class ComparatorDBTest : public testing::Test { } }; -TEST_F(ComparatorDBTest, Bytewise) { +INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(ComparatorDBTest, Bytewise) { for (int rand_seed = 301; rand_seed < 306; rand_seed++) { DestroyAndReopen(); Random rnd(rand_seed); @@ -313,7 +329,7 @@ TEST_F(ComparatorDBTest, Bytewise) { } } -TEST_F(ComparatorDBTest, SimpleSuffixReverseComparator) { +TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) { 
SetOwnedComparator(new test::SimpleSuffixReverseComparator()); for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { @@ -339,8 +355,8 @@ TEST_F(ComparatorDBTest, SimpleSuffixReverseComparator) { } } -TEST_F(ComparatorDBTest, Uint64Comparator) { - SetOwnedComparator(test::Uint64Comparator()); +TEST_P(ComparatorDBTest, Uint64Comparator) { + SetOwnedComparator(test::Uint64Comparator(), false /* owner */); for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { Options* opt = GetOptions(); @@ -363,7 +379,7 @@ TEST_F(ComparatorDBTest, Uint64Comparator) { } } -TEST_F(ComparatorDBTest, DoubleComparator) { +TEST_P(ComparatorDBTest, DoubleComparator) { SetOwnedComparator(new DoubleComparator()); for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { @@ -388,7 +404,7 @@ TEST_F(ComparatorDBTest, DoubleComparator) { } } -TEST_F(ComparatorDBTest, HashComparator) { +TEST_P(ComparatorDBTest, HashComparator) { SetOwnedComparator(new HashComparator()); for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { @@ -407,7 +423,7 @@ TEST_F(ComparatorDBTest, HashComparator) { } } -TEST_F(ComparatorDBTest, TwoStrComparator) { +TEST_P(ComparatorDBTest, TwoStrComparator) { SetOwnedComparator(new TwoStrComparator()); for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { @@ -433,6 +449,209 @@ TEST_F(ComparatorDBTest, TwoStrComparator) { } } +TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) { + { + // different length + Slice s("abcxy"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxyz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // not last byte different + Slice s("abc1xyz"); + Slice t("abc2xyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + // same string + Slice s("abcxyz"); + Slice t("abcxyz"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxy"); + Slice t("abcxz"); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + Slice s("abcxz"); + Slice t("abcxy"); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xac"; + const char t_array[] = "\x50\x8a\xad"; + Slice s(s_array); + Slice t(t_array); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff"; + const char t_array[] = "\x50\x8b\x00"; + Slice s(s_array, 3); + Slice t(t_array, 3); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x00"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_TRUE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } + { + const char s_array[] = "\x50\x8a\xff\xff"; + const char t_array[] = "\x50\x8b\x00\x01"; + Slice s(s_array, 4); + Slice t(t_array, 4); + ASSERT_FALSE(BytewiseComparator()->IsSameLengthImmediateSuccessor(s, t)); + } +} + +TEST_P(ComparatorDBTest, FindShortestSeparator) { + std::string s1 = "abc1xyz"; + std::string s2 = "abc3xy"; + + BytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc2", s1); + + s1 = "abc5xyztt"; + + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc5", s1); + + s1 = "abc3"; + s2 = "abc2xy"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = "abc2xy"; + 
ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + s1 = "abc3xyz"; + s2 = "abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_EQ("abc3", s1); + + std::string old_s1 = s1 = "abc2xy"; + s2 = "abc2"; + ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2); + ASSERT_TRUE(old_s1 >= s1); + ASSERT_TRUE(s1 > s2); +} + +TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) { + // Char list for boundary cases. + std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}}; + Random rnd(301); + + for (int attempts = 0; attempts < 1000; attempts++) { + uint32_t size1 = rnd.Skewed(4); + uint32_t size2; + + if (rnd.OneIn(2)) { + // size2 to be random size + size2 = rnd.Skewed(4); + } else { + // size2 is within [-2, +2] of size1 + int diff = static_cast<int>(rnd.Uniform(5)) - 2; + int tmp_size2 = static_cast<int>(size1) + diff; + if (tmp_size2 < 0) { + tmp_size2 = 0; + } + size2 = static_cast<uint32_t>(tmp_size2); + } + + std::string s1; + std::string s2; + for (uint32_t i = 0; i < size1; i++) { + if (rnd.OneIn(2)) { + // Use random byte + s1 += static_cast<char>(rnd.Uniform(256)); + } else { + // Use one byte in char_list + char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]); + s1 += c; + } + } + + // First set s2 to be the same as s1, and then modify s2. + s2 = s1; + s2.resize(size2); + // We start from the back of the string + if (size2 > 0) { + uint32_t pos = size2 - 1; + do { + if (pos >= size1 || rnd.OneIn(4)) { + // For 1/4 chance, use random byte + s2[pos] = static_cast<char>(rnd.Uniform(256)); + } else if (rnd.OneIn(4)) { + // In 1/4 chance, stop here. + break; + } else { + // Create a char within [-2, +2] of the matching char of s1. + int diff = static_cast<int>(rnd.Uniform(5)) - 2; + // char may be signed or unsigned based on platform.
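+ // Casting through unsigned char first means a byte like 0xFF maps to + // 255 rather than -1 on signed-char platforms; the perturbed value is + // then clamped to [0, 255] before being written back.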
+ int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos])); + int s2_char = s1_char + diff; + if (s2_char < 0) { + s2_char = 0; + } + if (s2_char > 255) { + s2_char = 255; + } + s2[pos] = static_cast<char>(s2_char); + } + } while (pos-- != 0); + } + + // Test separators + for (int rev = 0; rev < 2; rev++) { + if (rev == 1) { + // switch s1 and s2 + std::string t = s1; + s1 = s2; + s2 = t; + } + std::string separator = s1; + BytewiseComparator()->FindShortestSeparator(&separator, s2); + std::string rev_separator = s1; + ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2); + + if (s1 == s2) { + ASSERT_EQ(s1, separator); + ASSERT_EQ(s2, rev_separator); + } else if (s1 < s2) { + ASSERT_TRUE(s1 <= separator); + ASSERT_TRUE(s2 > separator); + ASSERT_LE(separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, rev_separator); + } else { + ASSERT_TRUE(s1 >= rev_separator); + ASSERT_TRUE(s2 < rev_separator); + ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size())); + ASSERT_EQ(s1, separator); + } + } + + // Test successors + std::string succ = s1; + BytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ >= s1); + + succ = s1; + ReverseBytewiseComparator()->FindShortSuccessor(&succ); + ASSERT_TRUE(succ <= s1); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/convenience.cc b/thirdparty/rocksdb/db/convenience.cc index 8ee31cacab..71c237f60c 100644 --- a/thirdparty/rocksdb/db/convenience.cc +++ b/thirdparty/rocksdb/db/convenience.cc @@ -19,15 +19,23 @@ void CancelAllBackgroundWork(DB* db, bool wait) { } Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { + const Slice* begin, const Slice* end, + bool include_end) { + RangePtr range(begin, end); + return DeleteFilesInRanges(db, column_family, &range, 1, include_end); +} + +Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end) { return (static_cast_with_check<DBImpl, DB>(db->GetRootDB())) - ->DeleteFilesInRange(column_family, begin, end); + ->DeleteFilesInRanges(column_family, ranges, n, include_end); } Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - unique_ptr<RandomAccessFile> file; + std::unique_ptr<RandomAccessFile> file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); ImmutableCFOptions ioptions(options); @@ -38,12 +46,14 @@ Status VerifySstFileChecksum(const Options& options, } else { return s; } - unique_ptr<TableReader> table_reader; + std::unique_ptr<TableReader> table_reader; std::unique_ptr<RandomAccessFileReader> file_reader( new RandomAccessFileReader(std::move(file), file_path)); + const bool kImmortal = true; s = ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, env_options, internal_comparator, - false /* skip_filters */, -1 /* level */), + TableReaderOptions(ioptions, options.prefix_extractor.get(), env_options, + internal_comparator, false /* skip_filters */, + !kImmortal, -1 /* level */), std::move(file_reader), file_size, &table_reader, false /* prefetch_index_and_filter_in_cache */); if (!s.ok()) { diff --git a/thirdparty/rocksdb/db/corruption_test.cc b/thirdparty/rocksdb/db/corruption_test.cc index 56e157832c..1ccb1aa2b0 100644 --- a/thirdparty/rocksdb/db/corruption_test.cc +++ b/thirdparty/rocksdb/db/corruption_test.cc @@ -24,6 +24,8 @@ #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" +#include "table/block_based_table_builder.h" +#include
"table/meta_blocks.h" #include "util/filename.h" #include "util/string_util.h" #include "util/testharness.h" @@ -37,7 +39,7 @@ class CorruptionTest : public testing::Test { public: test::ErrorEnv env_; std::string dbname_; - shared_ptr tiny_cache_; + std::shared_ptr tiny_cache_; Options options_; DB* db_; @@ -48,7 +50,7 @@ class CorruptionTest : public testing::Test { tiny_cache_ = NewLRUCache(100, 4); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options_.env = &env_; - dbname_ = test::TmpDir() + "/corruption_test"; + dbname_ = test::PerThreadDBPath("corruption_test"); DestroyDB(dbname_, options_); db_ = nullptr; @@ -60,9 +62,9 @@ class CorruptionTest : public testing::Test { options_.create_if_missing = false; } - ~CorruptionTest() { - delete db_; - DestroyDB(dbname_, Options()); + ~CorruptionTest() override { + delete db_; + DestroyDB(dbname_, Options()); } void CloseDb() { @@ -333,9 +335,9 @@ TEST_F(CorruptionTest, TableFileIndexData) { Corrupt(kTableFile, -2000, 500); Reopen(); dbi = reinterpret_cast(db_); - // one full file should be readable, since only one was corrupted + // one full file may be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted - Check(5000, 5000); + Check(0, 5000); ASSERT_NOK(dbi->VerifyChecksum()); } @@ -467,6 +469,39 @@ TEST_F(CorruptionTest, UnrelatedKeys) { ASSERT_EQ(Value(1000, &tmp2).ToString(), v); } +TEST_F(CorruptionTest, RangeDeletionCorrupted) { + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b")); + ASSERT_OK(db_->Flush(FlushOptions())); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(static_cast(1), metadata.size()); + std::string filename = dbname_ + metadata[0].name; + + std::unique_ptr file; + ASSERT_OK(options_.env->NewRandomAccessFile(filename, &file, EnvOptions())); + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), filename)); + + uint64_t file_size; + ASSERT_OK(options_.env->GetFileSize(filename, &file_size)); + + BlockHandle range_del_handle; + ASSERT_OK(FindMetaBlock( + file_reader.get(), file_size, kBlockBasedTableMagicNumber, + ImmutableCFOptions(options_), kRangeDelBlock, &range_del_handle)); + + ASSERT_OK(TryReopen()); + CorruptFile(filename, static_cast(range_del_handle.offset()), 1); + // The test case does not fail on TryReopen because failure to preload table + // handlers is not considered critical. + ASSERT_OK(TryReopen()); + std::string val; + // However, it does fail on any read involving that file since that file + // cannot be opened with a corrupt range deletion meta-block. 
+ ASSERT_TRUE(db_->Get(ReadOptions(), "a", &val).IsCorruption()); +} + TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; @@ -485,7 +520,7 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { db_ = nullptr; if (iter == 0) { // corrupt file size - unique_ptr file; + std::unique_ptr file; env_.NewWritableFile(filename, &file, EnvOptions()); file->Append(Slice("corrupted sst")); file.reset(); @@ -510,7 +545,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/db/cuckoo_table_db_test.cc b/thirdparty/rocksdb/db/cuckoo_table_db_test.cc index e7c2d279a4..2d4487ff45 100644 --- a/thirdparty/rocksdb/db/cuckoo_table_db_test.cc +++ b/thirdparty/rocksdb/db/cuckoo_table_db_test.cc @@ -25,13 +25,13 @@ class CuckooTableDBTest : public testing::Test { public: CuckooTableDBTest() : env_(Env::Default()) { - dbname_ = test::TmpDir() + "/cuckoo_table_db_test"; + dbname_ = test::PerThreadDBPath("cuckoo_table_db_test"); EXPECT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; Reopen(); } - ~CuckooTableDBTest() { + ~CuckooTableDBTest() override { delete db_; EXPECT_OK(DestroyDB(dbname_, Options())); } @@ -241,7 +241,7 @@ TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) { // Write 28 values, each 10016 B ~ 10KB for (int idx = 0; idx < 28; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } dbfull()->TEST_WaitForFlushMemTable(); ASSERT_EQ("1", FilesPerLevel()); @@ -250,7 +250,7 @@ TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) { true /* disallow trivial move */); ASSERT_EQ("0,2", FilesPerLevel()); for (int idx = 0; idx < 28; ++idx) { - ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); + ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); } } @@ -271,14 +271,14 @@ TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { // Generate one more file in level-0, and should trigger level-0 compaction for (int idx = 0; idx < 11; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); + ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx)))); } dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_CompactRange(0, nullptr, nullptr); ASSERT_EQ("0,1", FilesPerLevel()); for (int idx = 0; idx < 11; ++idx) { - ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); + ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx))); } } @@ -333,7 +333,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/db/db_basic_test.cc b/thirdparty/rocksdb/db/db_basic_test.cc index 654a457ef5..c93a5e4364 100644 --- a/thirdparty/rocksdb/db/db_basic_test.cc +++ b/thirdparty/rocksdb/db/db_basic_test.cc @@ -6,9 +6,11 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+// #include #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" +#include "util/fault_injection_test_env.h" #if !defined(ROCKSDB_LITE) #include "util/sync_point.h" #endif @@ -41,7 +43,7 @@ TEST_F(DBBasicTest, ReadOnlyDB) { Close(); auto options = CurrentOptions(); - assert(options.env = env_); + assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); @@ -213,11 +215,11 @@ TEST_F(DBBasicTest, PutSingleDeleteGet) { ASSERT_EQ("v2", Get(1, "foo2")); ASSERT_OK(SingleDelete(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - // Skip HashCuckooRep as it does not support single delete. FIFO and - // universal compaction do not apply to the test case. Skip MergePut - // because single delete does not get removed when it encounters a merge. - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | - kSkipUniversalCompaction | kSkipMergePut)); + // Skip FIFO and universal compaction because they do not apply to the test + // case. Skip MergePut because single delete does not get removed when it + // encounters a merge. + } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); } TEST_F(DBBasicTest, EmptyFlush) { @@ -235,11 +237,11 @@ ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); - // Skip HashCuckooRep as it does not support single delete. FIFO and - // universal compaction do not apply to the test case. Skip MergePut - // because merges cannot be combined with single deletions. - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | - kSkipUniversalCompaction | kSkipMergePut)); + // Skip FIFO and universal compaction as they do not apply to the test + // case. Skip MergePut because merges cannot be combined with single + // deletions. + } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | + kSkipMergePut)); } TEST_F(DBBasicTest, GetFromVersions) { @@ -263,11 +265,6 @@ TEST_F(DBBasicTest, GetSnapshot) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); ASSERT_OK(Put(1, key, "v1")); const Snapshot* s1 = db_->GetSnapshot(); - if (option_config_ == kHashCuckoo) { - // Unsupported case.
- ASSERT_TRUE(s1 == nullptr); - break; - } ASSERT_OK(Put(1, key, "v2")); ASSERT_EQ("v2", Get(1, key)); ASSERT_EQ("v1", Get(1, key, s1)); @@ -508,7 +505,7 @@ TEST_F(DBBasicTest, Snapshot) { ASSERT_EQ(0U, GetNumSnapshots()); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); - } while (ChangeOptions(kSkipHashCuckoo)); + } while (ChangeOptions()); } #endif // ROCKSDB_LITE @@ -564,25 +561,24 @@ TEST_F(DBBasicTest, CompactBetweenSnapshots) { nullptr); ASSERT_EQ("sixth", Get(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); - // skip HashCuckooRep as it does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction)); + } while (ChangeOptions(kSkipFIFOCompaction)); } TEST_F(DBBasicTest, DBOpen_Options) { Options options = CurrentOptions(); - std::string dbname = test::TmpDir(env_) + "/db_options_test"; - ASSERT_OK(DestroyDB(dbname, options)); + Close(); + Destroy(options); // Does not exist, and create_if_missing == false: error DB* db = nullptr; options.create_if_missing = false; - Status s = DB::Open(options, dbname, &db); + Status s = DB::Open(options, dbname_, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); ASSERT_TRUE(db == nullptr); // Does not exist, and create_if_missing == true: OK options.create_if_missing = true; - s = DB::Open(options, dbname, &db); + s = DB::Open(options, dbname_, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -592,14 +588,14 @@ TEST_F(DBBasicTest, DBOpen_Options) { // Does exist, and error_if_exists == true: error options.create_if_missing = false; options.error_if_exists = true; - s = DB::Open(options, dbname, &db); + s = DB::Open(options, dbname_, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); ASSERT_TRUE(db == nullptr); // Does exist, and error_if_exists == false: OK options.create_if_missing = true; options.error_if_exists = false; - s = DB::Open(options, dbname, &db); + s = DB::Open(options, dbname_, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -793,7 +789,7 @@ TEST_F(DBBasicTest, ChecksumTest) { BlockBasedTableOptions table_options; Options options = CurrentOptions(); // change when new checksum type added - int max_checksum = static_cast(kxxHash); + int max_checksum = static_cast(kxxHash64); const int kNumPerFile = 2; // generate one table with each type of checksum @@ -808,7 +804,7 @@ TEST_F(DBBasicTest, ChecksumTest) { } // verify data with each type of checksum - for (int i = 0; i <= kxxHash; ++i) { + for (int i = 0; i <= kxxHash64; ++i) { table_options.checksum = static_cast(i); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); @@ -847,6 +843,291 @@ TEST_F(DBBasicTest, MmapAndBufferOptions) { } #endif +class TestEnv : public EnvWrapper { + public: + explicit TestEnv() : EnvWrapper(Env::Default()), + close_count(0) { } + + class TestLogger : public Logger { + public: + using Logger::Logv; + TestLogger(TestEnv *env_ptr) : Logger() { env = env_ptr; } + ~TestLogger() override { + if (!closed_) { + CloseHelper(); + } + } + void Logv(const char* /*format*/, va_list /*ap*/) override{}; + + protected: + Status CloseImpl() override { return CloseHelper(); } + + private: + Status CloseHelper() { + env->CloseCountInc();; + return Status::IOError(); + } + TestEnv *env; + }; + + void CloseCountInc() { close_count++; } + + int GetCloseCount() { return close_count; } + + Status NewLogger(const std::string& /*fname*/, + std::shared_ptr* result) override { + result->reset(new TestLogger(this)); + return Status::OK(); + } 
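+ // Loggers handed out by NewLogger() report back through CloseCountInc(), + // so the tests below can assert how many times Close() ran.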
+ + private: + int close_count; +}; + +TEST_F(DBBasicTest, DBClose) { + Options options = GetDefaultOptions(); + std::string dbname = test::PerThreadDBPath("db_close_test"); + ASSERT_OK(DestroyDB(dbname, options)); + + DB* db = nullptr; + TestEnv* env = new TestEnv(); + options.create_if_missing = true; + options.env = env; + Status s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + s = db->Close(); + ASSERT_EQ(env->GetCloseCount(), 1); + ASSERT_EQ(s, Status::IOError()); + + delete db; + ASSERT_EQ(env->GetCloseCount(), 1); + + // Do not call DB::Close() and ensure our logger Close() still gets called + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + delete db; + ASSERT_EQ(env->GetCloseCount(), 2); + + // Provide our own logger and ensure DB::Close() does not close it + options.info_log.reset(new TestEnv::TestLogger(env)); + options.create_if_missing = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + + s = db->Close(); + ASSERT_EQ(s, Status::OK()); + delete db; + ASSERT_EQ(env->GetCloseCount(), 2); + options.info_log.reset(); + ASSERT_EQ(env->GetCloseCount(), 3); + + delete options.env; +} + +TEST_F(DBBasicTest, DBCloseFlushError) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.manual_wal_flush = true; + options.write_buffer_size=100; + options.env = fault_injection_env.get(); + + Reopen(options); + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(Put("key3", "value3")); + fault_injection_env->SetFilesystemActive(false); + Status s = dbfull()->Close(); + fault_injection_env->SetFilesystemActive(true); + ASSERT_NE(s, Status::OK()); + + Destroy(options); +} + +TEST_F(DBBasicTest, MultiGetMultiCF) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + rocksdb::DBImpl* db = reinterpret_cast(db_); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (++get_sv_count == 2) { + // After MultiGet refs a couple of CFs, flush all CFs so MultiGet + // is forced to repeat the process + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val2")); + } + } + if (get_sv_count == 11) { + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + db->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector cfs; + std::vector keys; + std::vector values; + + for (int i = 0; i < 8; ++i) { + cfs.push_back(i); + keys.push_back("cf" + std::to_string(i) + "_key"); + } + + values = MultiGet(cfs, keys); + ASSERT_EQ(values.size(), 8); + for (unsigned int j = 0; j < values.size(); ++j) { + ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val2"); + } + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + 
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); + } +} + +TEST_F(DBBasicTest, MultiGetMultiCFMutex) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + int retries = 0; + bool last_try = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) { + last_try = true; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (last_try) { + return; + } + if (++get_sv_count == 2) { + ++retries; + get_sv_count = 0; + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put( + i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val" + std::to_string(retries))); + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector cfs; + std::vector keys; + std::vector values; + + for (int i = 0; i < 8; ++i) { + cfs.push_back(i); + keys.push_back("cf" + std::to_string(i) + "_key"); + } + + values = MultiGet(cfs, keys); + ASSERT_TRUE(last_try); + ASSERT_EQ(values.size(), 8); + for (unsigned int j = 0; j < values.size(); ++j) { + ASSERT_EQ(values[j], + "cf" + std::to_string(j) + "_val" + std::to_string(retries)); + } + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + } +} + +TEST_F(DBBasicTest, MultiGetMultiCFSnapshot) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", + "alyosha", "popovich"}, + options); + + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val")); + } + + int get_sv_count = 0; + rocksdb::DBImpl* db = reinterpret_cast(db_); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { + if (++get_sv_count == 2) { + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Flush(i)); + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val2")); + } + } + if (get_sv_count == 8) { + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + db->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_TRUE( + (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) || + (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete)); + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::vector cfs; + std::vector keys; + std::vector values; + + for (int i = 0; i < 8; ++i) { + cfs.push_back(i); + keys.push_back("cf" + std::to_string(i) + "_key"); + } + + const Snapshot* snapshot = db_->GetSnapshot(); + values = MultiGet(cfs, keys, snapshot); + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(values.size(), 8); + for (unsigned int j = 0; j < values.size(); ++j) { + ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); + } + for (int i = 0; i < 8; ++i) { + auto* cfd = reinterpret_cast( + reinterpret_cast(db_)->GetColumnFamilyHandle(i)) + ->cfd(); + ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_blob_index_test.cc 
b/thirdparty/rocksdb/db/db_blob_index_test.cc index e71b511df5..005a23d63b 100644 --- a/thirdparty/rocksdb/db/db_blob_index_test.cc +++ b/thirdparty/rocksdb/db/db_blob_index_test.cc @@ -64,7 +64,8 @@ class DBBlobIndexTest : public DBTestBase { read_options.snapshot = snapshot; PinnableSlice value; auto s = dbfull()->GetImpl(read_options, cfh(), key, &value, - nullptr /*value_found*/, is_blob_index); + nullptr /*value_found*/, nullptr /*callback*/, + is_blob_index); if (s.IsNotFound()) { return "NOT_FOUND"; } @@ -88,9 +89,9 @@ class DBBlobIndexTest : public DBTestBase { } ArenaWrappedDBIter* GetBlobIterator() { - return dbfull()->NewIteratorImpl(ReadOptions(), cfd(), - dbfull()->GetLatestSequenceNumber(), - true /*allow_blob*/); + return dbfull()->NewIteratorImpl( + ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), + nullptr /*read_callback*/, true /*allow_blob*/); } Options GetTestOptions() { diff --git a/thirdparty/rocksdb/db/db_block_cache_test.cc b/thirdparty/rocksdb/db/db_block_cache_test.cc index 169cadc85c..ad906dbcb5 100644 --- a/thirdparty/rocksdb/db/db_block_cache_test.cc +++ b/thirdparty/rocksdb/db/db_block_cache_test.cc @@ -47,7 +47,7 @@ class DBBlockCacheTest : public DBTestBase { return options; } - void InitTable(const Options& options) { + void InitTable(const Options& /*options*/) { std::string value(kValueSize, 'a'); for (size_t i = 0; i < kNumBlocks; i++) { ASSERT_OK(Put(ToString(i), value.c_str())); @@ -111,6 +111,31 @@ class DBBlockCacheTest : public DBTestBase { } }; +TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { + ReadOptions read_options; + read_options.fill_cache = false; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + ASSERT_EQ(0, cache->GetUsage()); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(0)); + ASSERT_LT(0, cache->GetUsage()); + delete iter; + iter = nullptr; + ASSERT_EQ(0, cache->GetUsage()); +} + TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { ReadOptions read_options; auto table_options = GetTableOptions(); @@ -148,7 +173,7 @@ TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { delete iter; iter = nullptr; - // Release interators and access cache again. + // Release iterators and access cache again. 
for (size_t i = 0; i < kNumBlocks - 1; i++) { iterators[i].reset(); CheckCacheCounters(options, 0, 0, 0, 0); @@ -280,6 +305,41 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } +// With fill_cache = false, fills up the cache, then iterates over the entire +// db, verify dummy entries inserted in `BlockBasedTable::NewDataBlockIterator` +// does not cause heap-use-after-free errors in COMPILE_WITH_ASAN=1 runs +TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) { + ReadOptions read_options; + read_options.fill_cache = false; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(10, 0, true); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + ASSERT_OK(Put("key1", "val1")); + ASSERT_OK(Put("key2", "val2")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key3", "val3")); + ASSERT_OK(Put("key4", "val4")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("key5", "val5")); + ASSERT_OK(Put("key6", "val6")); + ASSERT_OK(Flush()); + + Iterator* iter = nullptr; + + iter = db_->NewIterator(read_options); + iter->Seek(ToString(0)); + while (iter->Valid()) { + iter->Next(); + } + delete iter; + iter = nullptr; +} + TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -289,7 +349,7 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { // 200 bytes are enough to hold the first two blocks std::shared_ptr cache = NewLRUCache(200, 0, false); table_options.block_cache = cache; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + table_options.filter_policy.reset(NewBloomFilterPolicy(20, true)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); @@ -330,11 +390,14 @@ class MockCache : public LRUCache { static uint32_t high_pri_insert_count; static uint32_t low_pri_insert_count; - MockCache() : LRUCache(1 << 25, 0, false, 0.0) {} + MockCache() + : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/, + false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/) { + } - virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle, Priority priority) override { + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), Handle** handle, + Priority priority) override { if (priority == Priority::LOW) { low_pri_insert_count++; } else { @@ -568,6 +631,74 @@ TEST_F(DBBlockCacheTest, CompressedCache) { } } +TEST_F(DBBlockCacheTest, CacheCompressionDict) { + const int kNumFiles = 4; + const int kNumEntriesPerFile = 128; + const int kNumBytesPerEntry = 1024; + + // Try all the available libraries that support dictionary compression + std::vector compression_types; +#ifdef ZLIB + compression_types.push_back(kZlibCompression); +#endif // ZLIB +#if LZ4_VERSION_NUMBER >= 10400 + compression_types.push_back(kLZ4Compression); + compression_types.push_back(kLZ4HCCompression); +#endif // LZ4_VERSION_NUMBER >= 10400 +#if ZSTD_VERSION_NUMBER >= 500 + compression_types.push_back(kZSTD); +#endif // ZSTD_VERSION_NUMBER >= 500 + Random rnd(301); + for (auto compression_type : compression_types) { + Options options = CurrentOptions(); + options.compression = compression_type; + options.compression_opts.max_dict_bytes = 
4096; + options.create_if_missing = true; + options.num_levels = 2; + options.statistics = rocksdb::CreateDBStatistics(); + options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache.reset(new MockCache()); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + for (int i = 0; i < kNumFiles; ++i) { + ASSERT_EQ(i, NumTableFilesAtLevel(0, 0)); + for (int j = 0; j < kNumEntriesPerFile; ++j) { + std::string value = RandomString(&rnd, kNumBytesPerEntry); + ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str())); + } + ASSERT_OK(Flush()); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1)); + + // Seek to a key in a file. It should cause the SST's dictionary meta-block + // to be read. + RecordCacheCounters(options); + ASSERT_EQ(0, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); + ASSERT_EQ( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + ReadOptions read_options; + ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1))); + // Two blocks missed/added: dictionary and data block + // One block hit: index since it's prefetched + CheckCacheCounters(options, 2 /* expected_misses */, 1 /* expected_hits */, + 2 /* expected_inserts */, 0 /* expected_failures */); + ASSERT_EQ(1, + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD)); + ASSERT_GT( + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), + 0); + } +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_bloom_filter_test.cc b/thirdparty/rocksdb/db/db_bloom_filter_test.cc index e6248a0401..39dd20bb25 100644 --- a/thirdparty/rocksdb/db/db_bloom_filter_test.cc +++ b/thirdparty/rocksdb/db/db_bloom_filter_test.cc @@ -22,27 +22,51 @@ class DBBloomFilterTest : public DBTestBase { class DBBloomFilterTestWithParam : public DBTestBase, - public testing::WithParamInterface> { + public testing::WithParamInterface> { // public testing::WithParamInterface { protected: bool use_block_based_filter_; bool partition_filters_; + uint32_t format_version_; public: DBBloomFilterTestWithParam() : DBTestBase("/db_bloom_filter_tests") {} - ~DBBloomFilterTestWithParam() {} + ~DBBloomFilterTestWithParam() override {} void SetUp() override { use_block_based_filter_ = std::get<0>(GetParam()); partition_filters_ = std::get<1>(GetParam()); + format_version_ = std::get<2>(GetParam()); + } +}; + +class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {}; + +class SliceTransformLimitedDomainGeneric : public SliceTransform { + const char* Name() const override { + return "SliceTransformLimitedDomainGeneric"; + } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 5; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == 5; } }; // KeyMayExist can lead to a few false positives, but not false negatives. 
// To make test deterministic, use a much larger number of bits per key-20 than // bits in the key, so that false positives are eliminated -TEST_P(DBBloomFilterTestWithParam, KeyMayExist) { +TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) { do { ReadOptions ropts; std::string value; @@ -117,11 +141,79 @@ TEST_P(DBBloomFilterTestWithParam, KeyMayExist) { ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction)); } +TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { + for (bool partition_filters : {true, false}) { + Options options = last_options_; + options.prefix_extractor = + std::make_shared(); + options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + if (partition_filters) { + bbto.partition_filters = true; + bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + } + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + dbfull()->Flush(fo); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ( + 0, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + + ro.total_order_seek = true; + ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + get_perf_context()->Reset(); + } +} + TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { for (bool partition_filters : {true, false}) { Options options = last_options_; options.prefix_extractor.reset(NewFixedPrefixTransform(8)); options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); if (partition_filters) { @@ -160,6 +252,10 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { ro.total_order_seek = true; ASSERT_TRUE(db_->Get(ro, "foobarbar", &value).IsNotFound()); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + ASSERT_EQ( + 2, + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + get_perf_context()->Reset(); } } @@ -168,6 +264,7 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { Options options = 
last_options_; options.prefix_extractor.reset(NewFixedPrefixTransform(3)); options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); @@ -315,6 +412,14 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); ASSERT_EQ("bar", Get("barfoo")); ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_EQ(12, bloom_filter_useful_all_levels); + get_perf_context()->Reset(); } } @@ -334,6 +439,11 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } + table_options.format_version = format_version_; + if (format_version_ >= 4) { + // value delta encoding challenged more with index interval > 1 + table_options.index_block_restart_interval = 8; + } table_options.metadata_block_size = 32; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -389,15 +499,34 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { } while (ChangeCompactOptions()); } -INSTANTIATE_TEST_CASE_P(DBBloomFilterTestWithParam, DBBloomFilterTestWithParam, - ::testing::Values(std::make_tuple(true, false), - std::make_tuple(false, true), - std::make_tuple(false, false))); +#ifndef ROCKSDB_VALGRIND_RUN +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestDefFormatVersion, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatDef, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kDefaultFormatVersion), + std::make_tuple(false, true, test::kDefaultFormatVersion), + std::make_tuple(false, false, + test::kDefaultFormatVersion))); + +INSTANTIATE_TEST_CASE_P( + FormatLatest, DBBloomFilterTestWithParam, + ::testing::Values(std::make_tuple(true, false, test::kLatestFormatVersion), + std::make_tuple(false, true, test::kLatestFormatVersion), + std::make_tuple(false, false, + test::kLatestFormatVersion))); +#endif // ROCKSDB_VALGRIND_RUN TEST_F(DBBloomFilterTest, BloomFilterRate) { while (ChangeFilterOptions()) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"pikachu"}, options); const int maxKey = 10000; @@ -419,6 +548,10 @@ TEST_F(DBBloomFilterTest, BloomFilterRate) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); } ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); + ASSERT_GE( + (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful, + maxKey * 0.98); + get_perf_context()->Reset(); } } @@ -509,7 +642,7 @@ class WrappedBloom : public FilterPolicy { explicit WrappedBloom(int bits_per_key) : filter_(NewBloomFilterPolicy(bits_per_key)), counter_(0) {} - ~WrappedBloom() { delete filter_; } + ~WrappedBloom() override { delete filter_; } const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } @@ -653,6 +786,56 @@ TEST_F(DBBloomFilterTest, PrefixExtractorBlockFilter) { delete iter; } +TEST_F(DBBloomFilterTest, 
MemtableWholeKeyBloomFilter) { + // regression test for #2743. the range delete tombstones in memtable should + // be added even when Get() skips searching due to its prefix bloom filter + const int kMemtableSize = 1 << 20; // 1MB + const int kMemtablePrefixFilterSize = 1 << 13; // 8KB + const int kPrefixLen = 4; + Options options = CurrentOptions(); + options.memtable_prefix_bloom_size_ratio = + static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize; + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(kPrefixLen)); + options.write_buffer_size = kMemtableSize; + options.memtable_whole_key_filtering = false; + Reopen(options); + std::string key1("AAAABBBB"); + std::string key2("AAAACCCC"); // not in DB + std::string key3("AAAADDDD"); + std::string key4("AAAAEEEE"); + std::string value1("Value1"); + std::string value3("Value3"); + std::string value4("Value4"); + + ASSERT_OK(Put(key1, value1, WriteOptions())); + + // check memtable bloom stats + ASSERT_EQ("NOT_FOUND", Get(key2)); + ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count); + // same prefix, bloom filter false positive + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // enable whole key bloom filter + options.memtable_whole_key_filtering = true; + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key3, value3, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); + + // verify whole key filtering does not depend on prefix_extractor + options.prefix_extractor.reset(); + Reopen(options); + // check memtable bloom stats + ASSERT_OK(Put(key4, value4, WriteOptions())); + ASSERT_EQ("NOT_FOUND", Get(key2)); + // whole key bloom filter kicks in and determines it's a miss + ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count); + ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count); +} + #ifndef ROCKSDB_LITE class BloomStatsTestWithParam : public DBBloomFilterTest, @@ -690,7 +873,7 @@ class BloomStatsTestWithParam DestroyAndReopen(options_); } - ~BloomStatsTestWithParam() { + ~BloomStatsTestWithParam() override { get_perf_context()->Reset(); Destroy(options_); } @@ -764,7 +947,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_OK(Put(key1, value1, WriteOptions())); ASSERT_OK(Put(key3, value3, WriteOptions())); - unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions())); + std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions())); // check memtable bloom stats iter->Seek(key1); @@ -940,6 +1123,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.optimize_filters_for_hits = true; options.statistics = rocksdb::CreateDBStatistics(); + get_perf_context()->EnablePerLevelPerfContext(); CreateAndReopenWithCF({"mypikachu"}, options); int numkeys = 200000; @@ -986,6 +1170,14 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { // no bloom filter. Most keys will be checked against bloom filters twice.
ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2); ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2); + uint64_t bloom_filter_useful_all_levels = 0; + for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { + if (kv.second.bloom_filter_useful > 0) { + bloom_filter_useful_all_levels += kv.second.bloom_filter_useful; + } + } + ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2); + ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2); for (int i = 0; i < numkeys; i += 2) { ASSERT_EQ(Get(1, Key(i)), "val"); @@ -1057,10 +1249,10 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); CompactRangeOptions compact_options; @@ -1095,6 +1287,414 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(2 /* index and data block */, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + get_perf_context()->Reset(); +} + +int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) { + int count = 0; + for (iter->Seek(key); iter->Valid() && iter->status() == Status::OK(); + iter->Next()) { + count++; + } + return count; +} + +// use iterate_upper_bound to hint compatibility of existing bloom filters. +// The BF is considered compatible if 1) upper bound and seek key transform +// into the same string, or 2) the transformed seek key is of the same length +// as the upper bound and two keys are adjacent according to the comparator.
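+// For example, for an SST whose filter was built with a capped:4 extractor, +// seek key "abcdxx00" and upper bound "abce" are compatible: the transformed +// seek key is "abcd", and "abce" is its same-length immediate successor (see +// the cases exercised below).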
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewCappedPrefixTransform(4)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("abcdxxx0", "val1")); + ASSERT_OK(Put("abcdxxx1", "val2")); + ASSERT_OK(Put("abcdxxx2", "val3")); + ASSERT_OK(Put("abcdxxx3", "val4")); + dbfull()->Flush(FlushOptions()); + { + // prefix_extractor has not changed, BF will always be read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + } + { + Slice upper_bound("abcdzzzz"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.5")); + { + // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); + // should check bloom filter since upper bound meets requirement + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx01, abcey) is not valid bound since upper bound is too long for + // the BF in SST (capped:4) + Slice upper_bound("abcey"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); + // should skip bloom filter since upper bound is too long + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [abcdxx02, abcdy) is a valid bound since the prefix is the same + Slice upper_bound("abcdy"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); + // should check bloom filter since upper bound matches transformed seek + // key + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + { + // [aaaaaaaa, abce) is not a valid bound since 1) they 
don't share the + // same prefix, 2) the prefixes are not consecutive + Slice upper_bound("abce"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); + // should skip bloom filter since mismatch is found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); + { + // [abc, abd) is not a valid bound since the upper bound is too short + // for BF (capped:4) + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:4"}})); + { + // set back to capped:4 and verify BF is always read + Slice upper_bound("abd"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + iteration++; + } +} + +// Create multiple SST files each with a different prefix_extractor config, +// verify iterators can read all SST files using the latest config. 
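+// (The config is switched at runtime via DB::SetOptions, e.g. +// dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}); each SST keeps +// the name of the extractor it was built with, so readers consult a file's +// filter only when it is compatible with the current extractor.)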
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + Slice upper_bound("foz90000"); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + + // first SST with fixed:1 BF + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foq1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + read_options.iterate_upper_bound = &upper_bound; + std::unique_ptr<Iterator> iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(CountIter(iter, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 1 + iteration); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + // second SST with capped:3 BF + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + ASSERT_OK(Put("foq5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + { + // BF is capped:3 now + std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 2 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // both counters are incremented because BF is "not changed" for 1 of the + // 2 SST files, so filter is checked once and found no match.
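+ // (PREFIX_CHECKED rises by one for the capped:3 file, and PREFIX_USEFUL + // rises by one because that filter correctly rules the "gpk" prefix out.)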
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 3 + iteration * 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + } + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + // third SST with fixed:2 BF + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foq8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + { + // BF is fixed:2 now + std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); + // the first and last BF are checked + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 4 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); + // only last BF is checked and not found + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 5 + iteration * 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + } + + // iter_old can only see the first SST, so checked plus 1 + ASSERT_EQ(CountIter(iter_old, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 3); + // iter was created after the first SetOptions call so only full filter + // will check the filter + ASSERT_EQ(CountIter(iter, "foo"), 2); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 6 + iteration * 4); + + { + // keys in all three SSTs are visible to iterator + // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) + // so +2 for checked counter + std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 7 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 8 + iteration * 5); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + } + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_all, "foo"), 6); + // all three SST are checked because the current options has the same as + // the remaining SST (capped:3) + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 9 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + ASSERT_EQ(CountIter(iter_all, "gpk"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), + 10 + iteration * 7); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + } + // TODO(Zhongyi): Maybe also need to add Get calls to test point look up?
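+ // Reading note (an assumption, not part of the upstream comments): + // `iteration` is 0 on the block-based-builder pass and 1 on the full-filter + // pass, and options.statistics is recreated at the top of each pass. The + // `N + iteration * M` expectations therefore encode that the full-filter + // build registers extra BLOOM_FILTER_PREFIX_CHECKED hits for the same + // sequence of operations; counts do not accumulate across passes.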
+ iteration++; + } +} + +// Create a new column family in a running DB, change prefix_extractor +// dynamically, verify the iterator created on the new column family behaves +// as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options); + ReadOptions read_options; + read_options.prefix_same_as_start = true; + // create a new CF and set prefix_extractor dynamically + options.prefix_extractor.reset(NewCappedPrefixTransform(3)); + CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + ASSERT_OK(Put(2, "foo3", "bar3")); + ASSERT_OK(Put(2, "foo4", "bar4")); + ASSERT_OK(Put(2, "foo5", "bar5")); + ASSERT_OK(Put(2, "foq6", "bar6")); + ASSERT_OK(Put(2, "fpq7", "bar7")); + dbfull()->Flush(FlushOptions()); + { + std::unique_ptr<Iterator> iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK( + dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}})); + ASSERT_EQ(0, + strcmp(dbfull()->GetOptions(handles_[2]).prefix_extractor->Name(), + "rocksdb.FixedPrefix.2")); + { + std::unique_ptr<Iterator> iter( + db_->NewIterator(read_options, handles_[2])); + ASSERT_EQ(CountIter(iter, "foo"), 4); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[2])); + dbfull()->DestroyColumnFamilyHandle(handles_[2]); + handles_[2] = nullptr; + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + dbfull()->DestroyColumnFamilyHandle(handles_[1]); + handles_[1] = nullptr; + iteration++; + } +} + +// Verify it's possible to change prefix_extractor at runtime and iterators +// behave as expected +TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { + int iteration = 0; + for (bool use_block_based_builder : {true, false}) { + Options options; + options.create_if_missing = true; + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset( + NewBloomFilterPolicy(10, use_block_based_builder)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("fpa", "0")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + 
ASSERT_OK(Put("foo5", "bar5")); + ASSERT_OK(Put("fpb", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("foo6", "bar6")); + ASSERT_OK(Put("foo7", "bar7")); + ASSERT_OK(Put("foo8", "bar8")); + ASSERT_OK(Put("fpc", "2")); + dbfull()->Flush(FlushOptions()); + + ReadOptions read_options; + read_options.prefix_same_as_start = true; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + std::unique_ptr iter_old(db_->NewIterator(read_options)); + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); + ASSERT_EQ(0, strcmp(dbfull()->GetOptions().prefix_extractor->Name(), + "rocksdb.CappedPrefix.3")); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + // "fp*" should be skipped + ASSERT_EQ(CountIter(iter, "foo"), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + } + + // iterator created before should not be affected and see all keys + ASSERT_EQ(CountIter(iter_old, "foo"), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(CountIter(iter_old, "abc"), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + iteration++; + } } #endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/db_compaction_filter_test.cc b/thirdparty/rocksdb/db/db_compaction_filter_test.cc index 9f751f059f..37e80048e6 100644 --- a/thirdparty/rocksdb/db/db_compaction_filter_test.cc +++ b/thirdparty/rocksdb/db/db_compaction_filter_test.cc @@ -24,35 +24,72 @@ class DBTestCompactionFilter : public DBTestBase { DBTestCompactionFilter() : DBTestBase("/db_compaction_filter_test") {} }; +// Param variant of DBTestBase::ChangeCompactOptions +class DBTestCompactionFilterWithCompactParam + : public DBTestCompactionFilter, + public ::testing::WithParamInterface { + public: + DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() { + option_config_ = GetParam(); + Destroy(last_options_); + auto options = CurrentOptions(); + if (option_config_ == kDefault || option_config_ == kUniversalCompaction || + option_config_ == kUniversalCompactionMultiLevel) { + options.create_if_missing = true; + } + if (option_config_ == kLevelSubcompactions || + option_config_ == kUniversalSubcompactions) { + assert(options.max_subcompactions > 1); + } + TryReopen(options); + } +}; + +#ifndef ROCKSDB_VALGRIND_RUN +INSTANTIATE_TEST_CASE_P( + DBTestCompactionFilterWithCompactOption, + DBTestCompactionFilterWithCompactParam, + ::testing::Values(DBTestBase::OptionConfig::kDefault, + DBTestBase::OptionConfig::kUniversalCompaction, + DBTestBase::OptionConfig::kUniversalCompactionMultiLevel, + DBTestBase::OptionConfig::kLevelSubcompactions, + DBTestBase::OptionConfig::kUniversalSubcompactions)); +#else +// Run fewer cases in valgrind +INSTANTIATE_TEST_CASE_P(DBTestCompactionFilterWithCompactOption, + DBTestCompactionFilterWithCompactParam, + ::testing::Values(DBTestBase::OptionConfig::kDefault)); +#endif // ROCKSDB_VALGRIND_RUN + 
class KeepFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; return false; } - virtual const char* Name() const override { return "KeepFilter"; } + const char* Name() const override { return "KeepFilter"; } }; class DeleteFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; return true; } - virtual const char* Name() const override { return "DeleteFilter"; } + const char* Name() const override { return "DeleteFilter"; } }; class DeleteISFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { cfilter_count++; int i = std::stoi(key.ToString()); if (i > 5 && i <= 105) { @@ -61,23 +98,23 @@ class DeleteISFilter : public CompactionFilter { return false; } - virtual bool IgnoreSnapshots() const override { return true; } + bool IgnoreSnapshots() const override { return true; } - virtual const char* Name() const override { return "DeleteFilter"; } + const char* Name() const override { return "DeleteFilter"; } }; // Skip x if floor(x/10) is even, use range skips. Requires that keys are // zero-padded to length 10. 
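Concretely, if the key is "0000000023" then i = 23 and i / 10 = 2 is even, so the filter below answers kRemoveAndSkipUntil with the next decade boundary, dropping keys 20 through 29 in a single decision. The boundary arithmetic, isolated as a worked example (the values here are illustrative):

    int i = 23;  // as parsed from key "0000000023"
    char key_str[100];
    // Round down to the current decade, then step to the next one: 23 -> 30.
    snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
    // key_str now holds "0000000030", the zero-padded skip_until target.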
class SkipEvenFilter : public CompactionFilter { public: - virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, - const Slice& existing_value, std::string* new_value, - std::string* skip_until) const override { + Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/, + const Slice& /*existing_value*/, std::string* /*new_value*/, + std::string* skip_until) const override { cfilter_count++; int i = std::stoi(key.ToString()); if (i / 10 % 2 == 0) { char key_str[100]; - snprintf(key_str, sizeof(key), "%010d", i / 10 * 10 + 10); + snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10); *skip_until = key_str; ++cfilter_skips; return Decision::kRemoveAndSkipUntil; @@ -85,22 +122,22 @@ class SkipEvenFilter : public CompactionFilter { return Decision::kKeep; } - virtual bool IgnoreSnapshots() const override { return true; } + bool IgnoreSnapshots() const override { return true; } - virtual const char* Name() const override { return "DeleteFilter"; } + const char* Name() const override { return "DeleteFilter"; } }; class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { db_test->env_->addon_time_.fetch_add(1000); return true; } - virtual const char* Name() const override { return "DelayFilter"; } + const char* Name() const override { return "DelayFilter"; } private: DBTestBase* db_test; @@ -110,13 +147,13 @@ class ConditionalFilter : public CompactionFilter { public: explicit ConditionalFilter(const std::string* filtered_value) : filtered_value_(filtered_value) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { return value.ToString() == *filtered_value_; } - virtual const char* Name() const override { return "ConditionalFilter"; } + const char* Name() const override { return "ConditionalFilter"; } private: const std::string* filtered_value_; @@ -126,16 +163,15 @@ class ChangeFilter : public CompactionFilter { public: explicit ChangeFilter() {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* new_value, bool* value_changed) const override { assert(new_value != nullptr); *new_value = NEW_VALUE; *value_changed = true; return false; } - virtual const char* Name() const override { return "ChangeFilter"; } + const char* Name() const override { return "ChangeFilter"; } }; class KeepFilterFactory : public CompactionFilterFactory { @@ -146,7 +182,7 @@ class KeepFilterFactory : public CompactionFilterFactory { check_context_cf_id_(check_context_cf_id), compaction_filter_created_(false) {} - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + std::unique_ptr<CompactionFilter> CreateCompactionFilter( const CompactionFilter::Context& context) override { if (check_context_) { EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); @@ -161,7 +197,7 @@ class KeepFilterFactory : public CompactionFilterFactory { bool
compaction_filter_created() const { return compaction_filter_created_; } - virtual const char* Name() const override { return "KeepFilterFactory"; } + const char* Name() const override { return "KeepFilterFactory"; } bool check_context_; bool check_context_cf_id_; std::atomic_bool expect_full_compaction_; @@ -172,7 +208,7 @@ class KeepFilterFactory : public CompactionFilterFactory { class DeleteFilterFactory : public CompactionFilterFactory { public: - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + std::unique_ptr<CompactionFilter> CreateCompactionFilter( const CompactionFilter::Context& context) override { if (context.is_manual_compaction) { return std::unique_ptr<CompactionFilter>(new DeleteFilter()); @@ -181,13 +217,13 @@ class DeleteFilterFactory : public CompactionFilterFactory { } } - virtual const char* Name() const override { return "DeleteFilterFactory"; } + const char* Name() const override { return "DeleteFilterFactory"; } }; // Delete Filter Factory which ignores snapshots class DeleteISFilterFactory : public CompactionFilterFactory { public: - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + std::unique_ptr<CompactionFilter> CreateCompactionFilter( const CompactionFilter::Context& context) override { if (context.is_manual_compaction) { return std::unique_ptr<CompactionFilter>(new DeleteISFilter()); @@ -196,12 +232,12 @@ class DeleteISFilterFactory : public CompactionFilterFactory { } } - virtual const char* Name() const override { return "DeleteFilterFactory"; } + const char* Name() const override { return "DeleteFilterFactory"; } }; class SkipEvenFilterFactory : public CompactionFilterFactory { public: - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( + std::unique_ptr<CompactionFilter> CreateCompactionFilter( const CompactionFilter::Context& context) override { if (context.is_manual_compaction) { return std::unique_ptr<CompactionFilter>(new SkipEvenFilter()); @@ -210,18 +246,18 @@ class SkipEvenFilterFactory : public CompactionFilterFactory { } } - virtual const char* Name() const override { return "SkipEvenFilterFactory"; } + const char* Name() const override { return "SkipEvenFilterFactory"; } }; class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( - const CompactionFilter::Context& context) override { + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test)); } - virtual const char* Name() const override { return "DelayFilterFactory"; } + const char* Name() const override { return "DelayFilterFactory"; } private: DBTestBase* db_test; @@ -232,15 +268,13 @@ class ConditionalFilterFactory : public CompactionFilterFactory { explicit ConditionalFilterFactory(const Slice& filtered_value) : filtered_value_(filtered_value.ToString()) {} - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( - const CompactionFilter::Context& context) override { + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr<CompactionFilter>( new ConditionalFilter(&filtered_value_)); } - virtual const char* Name() const override { - return "ConditionalFilterFactory"; - } + const char* Name() const override { return "ConditionalFilterFactory"; } private: std::string filtered_value_; @@ -250,12 +284,12 @@ class ChangeFilterFactory : public CompactionFilterFactory { public: explicit ChangeFilterFactory() {} - virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter( - const CompactionFilter::Context& context) override { + std::unique_ptr<CompactionFilter> CreateCompactionFilter( + 
const CompactionFilter::Context& /*context*/) override { return std::unique_ptr<CompactionFilter>(new ChangeFilter()); } - virtual const char* Name() const override { return "ChangeFilterFactory"; } + const char* Name() const override { return "ChangeFilterFactory"; } }; #ifndef ROCKSDB_LITE @@ -301,14 +335,14 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { Arena arena; { InternalKeyComparator icmp(options.comparator); - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */); - ScopedArenaIterator iter( - dbfull()->NewInternalIterator(&arena, &range_del_agg, handles_[1])); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); total++; if (ikey.sequence != 0) { @@ -318,7 +352,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { } } ASSERT_EQ(total, 100000); - ASSERT_EQ(count, 1); + ASSERT_EQ(count, 0); // overwrite all the 100K keys once again. for (int i = 0; i < 100000; i++) { @@ -391,9 +425,10 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count = 0; { InternalKeyComparator icmp(options.comparator); - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */); - ScopedArenaIterator iter( - dbfull()->NewInternalIterator(&arena, &range_del_agg, handles_[1])); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { @@ -440,65 +475,63 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { } #endif // ROCKSDB_LITE -TEST_F(DBTestCompactionFilter, CompactionFilterWithValueChange) { - do { - Options options = CurrentOptions(); - options.num_levels = 3; - options.compaction_filter_factory = - std::make_shared<ChangeFilterFactory>(); - CreateAndReopenWithCF({"pikachu"}, options); - - // Write 100K+1 keys, these are written to a few files - // in L0. We do this so that the current snapshot points - // to the 100001 key. The compaction filter is not invoked - // on keys that are visible via a snapshot because we - // anyway cannot delete it. - const std::string value(10, 'x'); - for (int i = 0; i < 100001; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } +TEST_P(DBTestCompactionFilterWithCompactParam, + CompactionFilterWithValueChange) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>(); + CreateAndReopenWithCF({"pikachu"}, options); - // push all files to lower levels - ASSERT_OK(Flush(1)); - if (option_config_ != kUniversalCompactionMultiLevel && - option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - } + // Write 100K+1 keys, these are written to a few files + // in L0. We do this so that the current snapshot points + // to the 100001 key. The compaction filter is not invoked + // on keys that are visible via a snapshot because we + // anyway cannot delete it.
+ const std::string value(10, 'x'); + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } - // re-write all data again - for (int i = 0; i < 100001; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(1, key, value); - } + // push all files to lower levels + ASSERT_OK(Flush(1)); + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + } else { + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + } - // push all files to lower levels. This should - // invoke the compaction filter for all 100000 keys. - ASSERT_OK(Flush(1)); - if (option_config_ != kUniversalCompactionMultiLevel && - option_config_ != kUniversalSubcompactions) { - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); - } else { - dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, - nullptr); - } + // re-write all data again + for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + Put(1, key, value); + } - // verify that all keys now have the new value that - // was set by the compaction process. - for (int i = 0; i < 100001; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - std::string newvalue = Get(1, key); - ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); - } - } while (ChangeCompactOptions()); + // push all files to lower levels. This should + // invoke the compaction filter for all 100000 keys. + ASSERT_OK(Flush(1)); + if (option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]); + } else { + dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, + nullptr); + } + + // verify that all keys now have the new value that + // was set by the compaction process. 
+ for (int i = 0; i < 100001; i++) { + char key[100]; + snprintf(key, sizeof(key), "B%010d", i); + std::string newvalue = Get(1, key); + ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); + } } TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { @@ -610,14 +643,14 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { int total = 0; Arena arena; InternalKeyComparator icmp(options.comparator); - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */); - ScopedArenaIterator iter( - dbfull()->NewInternalIterator(&arena, &range_del_agg)); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + ScopedArenaIterator iter(dbfull()->NewInternalIterator( + &arena, &range_del_agg, kMaxSequenceNumber)); iter->SeekToFirst(); ASSERT_OK(iter->status()); while (iter->Valid()) { ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); total++; if (ikey.sequence != 0) { @@ -626,7 +659,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { iter->Next(); } ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); + ASSERT_EQ(count, 0); } } #endif // ROCKSDB_LITE @@ -661,44 +694,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { } #ifndef ROCKSDB_LITE -// Compaction filters should only be applied to records that are newer than the -// latest snapshot. This test inserts records and applies a delete filter. -TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) { - Options options = CurrentOptions(); - options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(); - options.disable_auto_compactions = true; - options.create_if_missing = true; - DestroyAndReopen(options); - - // Put some data. - const Snapshot* snapshot = nullptr; - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10; ++i) { - Put(ToString(table * 100 + i), "val"); - } - Flush(); - - if (table == 0) { - snapshot = db_->GetSnapshot(); - } - } - assert(snapshot != nullptr); - - cfilter_count = 0; - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // The filter should delete 10 records. - ASSERT_EQ(30U, cfilter_count); - - // Release the snapshot and compact again -> now all records should be - // removed. - db_->ReleaseSnapshot(snapshot); - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_EQ(0U, CountLiveFiles()); -} - -// Compaction filters should only be applied to records that are newer than the -// latest snapshot. However, if the compaction filter asks to ignore snapshots -// records newer than the snapshot will also be processed +// Compaction filters apply to all records, regardless of snapshots.
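With this version, a compaction filter that still reports IgnoreSnapshots() == false makes the compaction fail outright instead of silently sparing snapshot-visible keys; the IgnoreSnapshotsFalse test added near the end of this file exercises exactly that. From a caller's perspective the failure surfaces roughly like this (a sketch, assuming `db` is an open rocksdb::DB* with such a filter configured):

    rocksdb::Status s =
        db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
    if (s.IsNotSupported()) {
      // The configured CompactionFilter must override IgnoreSnapshots() to
      // return true before compactions involving it can proceed.
    }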
TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { std::string five = ToString(5); Options options = CurrentOptions(); @@ -739,7 +735,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { iter->Next(); } ASSERT_EQ(count, 6); - read_options.snapshot = 0; + read_options.snapshot = nullptr; std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options)); iter1->SeekToFirst(); count = 0; @@ -813,7 +809,7 @@ TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) { DestroyAndReopen(options); Put("0000000010", "v10"); - Put("0000000020", "v20"); // skipped + Put("0000000020", "v20"); // skipped Put("0000000050", "v50"); Flush(); @@ -836,6 +832,38 @@ TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) { EXPECT_EQ("v50", val); } +class TestNotSupportedFilter : public CompactionFilter { + public: + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { + return true; + } + + const char* Name() const override { return "NotSupported"; } + bool IgnoreSnapshots() const override { return false; } +}; + +TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) { + Options options = CurrentOptions(); + options.compaction_filter = new TestNotSupportedFilter(); + DestroyAndReopen(options); + + Put("a", "v10"); + Put("z", "v20"); + Flush(); + + Put("a", "v10"); + Put("z", "v20"); + Flush(); + + // Compaction should fail because IgnoreSnapshots() = false + EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr) + .IsNotSupported()); + + delete options.compaction_filter; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_compaction_test.cc b/thirdparty/rocksdb/db/db_compaction_test.cc index ca77d5b939..df51ef2ca2 100644 --- a/thirdparty/rocksdb/db/db_compaction_test.cc +++ b/thirdparty/rocksdb/db/db_compaction_test.cc @@ -8,11 +8,15 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/db_test_util.h" -#include "port/stack_trace.h" #include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/utilities/convenience.h" +#include "util/concurrent_task_limiter_impl.h" +#include "util/fault_injection_test_env.h" #include "util/sync_point.h" + namespace rocksdb { // SYNC_POINT is not supported in released Windows mode.
@@ -51,9 +55,9 @@ namespace { class FlushedFileCollector : public EventListener { public: FlushedFileCollector() {} - ~FlushedFileCollector() {} + ~FlushedFileCollector() override {} - virtual void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { + void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { std::lock_guard<std::mutex> lock(mutex_); flushed_files_.push_back(info.file_path); } @@ -74,6 +78,62 @@ class FlushedFileCollector : public EventListener { std::mutex mutex_; }; +class CompactionStatsCollector : public EventListener { +public: + CompactionStatsCollector() + : compaction_completed_(static_cast<int>(CompactionReason::kNumOfReasons)) { + for (auto& v : compaction_completed_) { + v.store(0); + } + } + + ~CompactionStatsCollector() override {} + + void OnCompactionCompleted(DB* /* db */, + const CompactionJobInfo& info) override { + int k = static_cast<int>(info.compaction_reason); + int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons); + assert(k >= 0 && k < num_of_reasons); + compaction_completed_[k]++; + } + + void OnExternalFileIngested( + DB* /* db */, const ExternalFileIngestionInfo& /* info */) override { + int k = static_cast<int>(CompactionReason::kExternalSstIngestion); + compaction_completed_[k]++; + } + + void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override { + int k = static_cast<int>(CompactionReason::kFlush); + compaction_completed_[k]++; + } + + int NumberOfCompactions(CompactionReason reason) const { + int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons); + int k = static_cast<int>(reason); + assert(k >= 0 && k < num_of_reasons); + return compaction_completed_.at(k).load(); + } + +private: + std::vector<std::atomic<int>> compaction_completed_; +}; + +class SstStatsCollector : public EventListener { + public: + SstStatsCollector() : num_ssts_creation_started_(0) {} + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /* info */) override { + ++num_ssts_creation_started_; + } + + int num_ssts_creation_started() { return num_ssts_creation_started_; } + + private: + std::atomic<int> num_ssts_creation_started_; +}; + static const int kCDTValueSize = 1000; static const int kCDTKeysPerBuffer = 4; static const int kCDTNumLevels = 8; @@ -154,6 +214,40 @@ void VerifyCompactionResult( #endif } +/* + * Verifies compaction stats of cfd are valid. + * + * For each level of cfd, its compaction stats are valid if + * 1) sum(stat.counts) == stat.count, and + * 2) stat.counts[i] == collector.NumberOfCompactions(i) + */ +void VerifyCompactionStats(ColumnFamilyData& cfd, + const CompactionStatsCollector& collector) { +#ifndef NDEBUG + InternalStats* internal_stats_ptr = cfd.internal_stats(); + ASSERT_TRUE(internal_stats_ptr != nullptr); + const std::vector<InternalStats::CompactionStats>& comp_stats = + internal_stats_ptr->TEST_GetCompactionStats(); + const int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons); + std::vector<int> counts(num_of_reasons, 0); + // Count the number of compactions caused by each CompactionReason across + // all levels. + for (const auto& stat : comp_stats) { + int sum = 0; + for (int i = 0; i < num_of_reasons; i++) { + counts[i] += stat.counts[i]; + sum += stat.counts[i]; + } + ASSERT_EQ(sum, stat.count); + } + // Verify InternalStats bookkeeping matches that of CompactionStatsCollector, + // assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) { + ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)), counts[i]); + } +#endif /* NDEBUG */ +} + const SstFileMetaData* PickFileRandomly( const ColumnFamilyMetaData& cf_meta, Random* rand, @@ -175,6 +269,7 @@ const SstFileMetaData* PickFileRandomly( } } // anonymous namespace +#ifndef ROCKSDB_VALGRIND_RUN // All the TEST_P tests run once with sub_compactions disabled (i.e. // options.max_subcompactions = 1) and once with it enabled TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { @@ -217,6 +312,85 @@ TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) { ASSERT_GT(db_size[0] / 3, db_size[1]); } } +#endif // ROCKSDB_VALGRIND_RUN + +TEST_P(DBCompactionTestWithParam, CompactionsPreserveDeletes) { + // For each options type we test the following + // - Enable preserve_deletes + // - write a bunch of keys and deletes + // - Set start_seqnum to the beginning; compact; check that keys are present + // - rewind start_seqnum way forward; compact; check that keys are gone + + for (int tid = 0; tid < 3; ++tid) { + Options options = DeletionTriggerOptions(CurrentOptions()); + options.max_subcompactions = max_subcompactions_; + options.preserve_deletes=true; + options.num_levels = 2; + + if (tid == 1) { + options.skip_stats_update_on_db_open = true; + } else if (tid == 2) { + // third pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + + DestroyAndReopen(options); + Random rnd(301); + // highlight the default; all deletes should be preserved + SetPreserveDeletesSequenceNumber(0); + + const int kTestSize = kCDTKeysPerBuffer; + std::vector<std::string> values; + for (int k = 0; k < kTestSize; ++k) { + values.push_back(RandomString(&rnd, kCDTValueSize)); + ASSERT_OK(Put(Key(k), values[k])); + } + + for (int k = 0; k < kTestSize; ++k) { + ASSERT_OK(Delete(Key(k))); + } + // to ensure we tackle all tombstones + CompactRangeOptions cro; + cro.change_level = true; + cro.target_level = 2; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->CompactRange(cro, nullptr, nullptr); + + // check that normal user iterator doesn't see anything + Iterator* db_iter = dbfull()->NewIterator(ReadOptions()); + int i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 0); + delete db_iter; + + // check that iterator that sees internal keys sees tombstones + ReadOptions ro; + ro.iter_start_seqnum=1; + db_iter = dbfull()->NewIterator(ro); + i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 4); + delete db_iter; + + // now all deletes should be gone + SetPreserveDeletesSequenceNumber(100000000); + dbfull()->CompactRange(cro, nullptr, nullptr); + + db_iter = dbfull()->NewIterator(ro); + i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + i++; + } + ASSERT_EQ(i, 0); + delete db_iter; + } +} TEST_F(DBCompactionTest, SkipStatsUpdateTest) { // This test verifies UpdateAccumulatedStats is not on @@ -239,6 +413,7 @@ TEST_F(DBCompactionTest, SkipStatsUpdateTest) { // Reopen the DB with stats-update disabled options.skip_stats_update_on_db_open = true; + options.max_open_files = 20; env_->random_file_open_counter_.store(0); Reopen(options); @@ -264,7 +439,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { Options options = CurrentOptions(); options.env = env_; options.new_table_reader_for_compaction_inputs = true; - options.max_open_files = 
100; + options.max_open_files = 20; options.level0_file_num_compaction_trigger = 3; DestroyAndReopen(options); Random rnd(301); @@ -282,7 +457,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "TableCache::GetTableReader:0", - [&](void* arg) { num_new_table_reader++; }); + [&](void* /*arg*/) { num_new_table_reader++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) { @@ -293,15 +468,16 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { Flush(); dbfull()->TEST_WaitForCompact(); // preloading iterator issues one table cache lookup and creates - // a new table reader. - ASSERT_EQ(num_table_cache_lookup, 1); + // a new table reader, if not preloaded. + int old_num_table_cache_lookup = num_table_cache_lookup; + ASSERT_GE(num_table_cache_lookup, 1); ASSERT_EQ(num_new_table_reader, 1); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(k), Get(Key(k))); // lookup iterator from table cache and no need to create a new one. - ASSERT_EQ(num_table_cache_lookup, 1); + ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2); ASSERT_EQ(num_new_table_reader, 0); } } @@ -314,7 +490,10 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { // a new table reader. One file is created for flush and one for compaction. // Compaction inputs make no table cache look-up for data/range deletion // iterators - ASSERT_EQ(num_table_cache_lookup, 2); + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 2); + int old_num_table_cache_lookup2 = num_table_cache_lookup; + // Create new iterator for: // (1) 1 for verifying flush results // (2) 3 for compaction input files @@ -324,7 +503,7 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup, 1); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3); ASSERT_EQ(num_new_table_reader, 0); num_table_cache_lookup = 0; @@ -336,14 +515,16 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) { db_->CompactRange(cro, nullptr, nullptr); // Only verifying compaction outputs issues one table cache lookup // for both data block and range deletion block. - ASSERT_EQ(num_table_cache_lookup, 1); + // May preload table cache too. + ASSERT_GE(num_table_cache_lookup, 1); + old_num_table_cache_lookup2 = num_table_cache_lookup; // One for compaction input, one for verifying compaction results.
ASSERT_EQ(num_new_table_reader, 2); num_table_cache_lookup = 0; num_new_table_reader = 0; ASSERT_EQ(Key(1), Get(Key(1))); - ASSERT_EQ(num_table_cache_lookup, 1); + ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2); ASSERT_EQ(num_new_table_reader, 0); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -838,7 +1019,7 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) { int32_t trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -895,10 +1076,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) { int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -994,10 +1175,10 @@ TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) { int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -1053,10 +1234,10 @@ TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); bool first = true; // Purpose of dependencies: // 4 -> 1: ensure the order of two non-trivial compactions @@ -1067,7 +1248,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) { {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"}, {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (first) { first = false; TEST_SYNC_POINT("DBCompaction::ManualPartial:4"); @@ -1198,17 +1379,17 @@ TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) { int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); bool first = true; bool second = true; rocksdb::SyncPoint::GetInstance()->LoadDependency( {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"}, {"DBCompaction::PartialFill:2", 
"DBCompaction::PartialFill:3"}}); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* arg) { + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) { if (first) { TEST_SYNC_POINT("DBCompaction::PartialFill:4"); first = false; @@ -1421,7 +1602,8 @@ TEST_F(DBCompactionTest, DeleteFileRange) { // Note that we don't delete level 0 files compact_options.change_level = true; compact_options.target_level = 1; - ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + dbfull()->TEST_WaitForCompact(); ASSERT_OK( DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr)); @@ -1439,15 +1621,185 @@ TEST_F(DBCompactionTest, DeleteFileRange) { ASSERT_GT(old_num_files, new_num_files); } +TEST_F(DBCompactionTest, DeleteFilesInRanges) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 4; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = RandomString(&rnd, value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,10", FilesPerLevel(0)); + + // file [0 => 100), [200 => 300), ... [800, 900) + for (auto i = 0; i < 10; i+=2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,10", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,10", FilesPerLevel(0)); + + // Delete files in range [0, 299] (inclusive) + { + auto begin_str1 = Key(0), end_str1 = Key(100); + auto begin_str2 = Key(100), end_str2 = Key(200); + auto begin_str3 = Key(200), end_str3 = Key(299); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size())); + ASSERT_EQ("0,3,7", FilesPerLevel(0)); + + // Keys [0, 300) should not exist. 
+ for (auto i = 0; i < 300; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete files in range [600, 999) (exclusive) + { + auto begin_str1 = Key(600), end_str1 = Key(800); + auto begin_str2 = Key(700), end_str2 = Key(900); + auto begin_str3 = Key(800), end_str3 = Key(999); + Slice begin1(begin_str1), end1(end_str1); + Slice begin2(begin_str2), end2(end_str2); + Slice begin3(begin_str3), end3(end_str3); + std::vector<RangePtr> ranges; + ranges.push_back(RangePtr(&begin1, &end1)); + ranges.push_back(RangePtr(&begin2, &end2)); + ranges.push_back(RangePtr(&begin3, &end3)); + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), + ranges.data(), ranges.size(), false)); + ASSERT_EQ("0,1,4", FilesPerLevel(0)); + + // Keys [600, 900) should not exist. + for (auto i = 600; i < 900; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 300; i < 600; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 900; i < 1000; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + } + + // Delete all files. + { + RangePtr range; + ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1)); + ASSERT_EQ("", FilesPerLevel(0)); + + for (auto i = 0; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + } +} + +TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) { + // regression test for #2833: groups of files whose user-keys overlap at the + // endpoints could be split by `DeleteFilesInRange`. This caused old data to + // reappear, either because a new version of the key was removed, or a range + // deletion was partially dropped. It could also cause non-overlapping + // invariant to be violated if the files dropped by DeleteFilesInRange were + // a subset of files that a range deletion spans. + const int kNumL0Files = 2; + const int kValSize = 8 << 10; // 8KB + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + options.target_file_size_base = 1 << 10; // 1KB + DestroyAndReopen(options); + + // The snapshot prevents key 1 from having its old version dropped. The low + // `target_file_size_base` ensures two keys will be in each output file. + const Snapshot* snapshot = nullptr; + Random rnd(301); + // The value indicates which flush the key belonged to, which is enough + // for us to determine the keys' relative ages. After L0 flushes finish, + // files look like: + // + // File 0: 0 -> vals[0], 1 -> vals[0] + // File 1: 1 -> vals[1], 2 -> vals[1] + // + // Then L0->L1 compaction happens, which outputs keys as follows: + // + // File 0: 0 -> vals[0], 1 -> vals[1] + // File 1: 1 -> vals[0], 2 -> vals[1] + // + // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that + // would cause `1 -> vals[0]` (an older key) to reappear. + std::string vals[kNumL0Files]; + for (int i = 0; i < kNumL0Files; ++i) { + vals[i] = RandomString(&rnd, kValSize); + Put(Key(i), vals[i]); + Put(Key(i + 1), vals[i]); + Flush(); + if (i == 0) { + snapshot = db_->GetSnapshot(); + } + } + dbfull()->TEST_WaitForCompact(); + + // Verify `DeleteFilesInRange` can't drop only file 0 which would cause + // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1); + Slice begin = begin_str, end = end_str; + ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end)); + ASSERT_EQ(vals[1], Get(Key(1))); + + db_->ReleaseSnapshot(snapshot); +} + TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); @@ -1735,6 +2087,125 @@ TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) { Destroy(options); } +TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + options.max_subcompactions = max_subcompactions_; + + std::vector<Options> option_vector; + option_vector.emplace_back(options); + ColumnFamilyOptions cf_opt1(options), cf_opt2(options); + // Configure CF1 specific paths. + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt1); + CreateColumnFamilies({"one"},option_vector[1]); + + // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt2); + CreateColumnFamilies({"two"},option_vector[2]); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + Random rnd(301); + int key_idx = 0; + int key_idx1 = 0; + int key_idx2 = 0; + + auto generate_file = [&]() { + GenerateNewFile(0, &rnd, &key_idx); + GenerateNewFile(1, &rnd, &key_idx1); + GenerateNewFile(2, &rnd, &key_idx2); + }; + + auto check_sstfilecount = [&](int path_id, int expected) { + ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path)); + }; + + auto check_filesperlevel = [&](const std::string& expected) { + ASSERT_EQ(expected, FilesPerLevel(0)); + ASSERT_EQ(expected, FilesPerLevel(1)); + ASSERT_EQ(expected, FilesPerLevel(2)); + }; + + auto check_getvalues = [&]() { + for (int i = 0; i < key_idx; i++) { + auto v = Get(0, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx1; i++) { + auto v = Get(1, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx2; i++) { + auto v = Get(2, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + }; + + // Check that default column family uses db_paths. + // And Column family "one" uses cf_paths. + + // First three 110KB files are not going to second path. + // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + generate_file(); + } + + // Another 110KB triggers a compaction to 400K file to fill up first path + generate_file(); + check_sstfilecount(1, 3); + + // (1, 4) + generate_file(); + check_filesperlevel("1,4"); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 1) + generate_file(); + check_filesperlevel("1,4,1"); + check_sstfilecount(2, 1); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + // (1, 4, 2) + generate_file(); + check_filesperlevel("1,4,2"); + check_sstfilecount(2, 2); + check_sstfilecount(1, 4); + check_sstfilecount(0, 1); + + check_getvalues(); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + check_getvalues(); + + Destroy(options, true); +} + TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { Random rnd(301); int max_key_level_insert = 200; @@ -2017,6 +2488,7 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { // Compaction range overlaps files Compact(1, "p1", "p9", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2032,6 +2504,7 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { // Compact just the new range Compact(1, "b", "f", 1); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,2", FilesPerLevel(1)); ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); @@ -2048,6 +2521,7 @@ TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) { compact_options.target_path_id = 1; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; 
db_->CompactRange(compact_options, handles_[1], nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -2325,16 +2799,16 @@ TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "Compaction::InputCompressionMatchesOutput:Matches", - [&](void* arg) { matches++; }); + [&](void* /*arg*/) { matches++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "Compaction::InputCompressionMatchesOutput:DidntMatch", - [&](void* arg) { didnt_match++; }); + [&](void* /*arg*/) { didnt_match++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial++; }); + [&](void* /*arg*/) { non_trivial++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); @@ -2490,21 +2964,46 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { dbfull()->TEST_WaitForCompact(); } +static std::string ShortKey(int i) { + assert(i < 10000); + char buf[100]; + snprintf(buf, sizeof(buf), "key%04d", i); + return std::string(buf); +} TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial_move++; }); + [&](void* /*arg*/) { non_trivial_move++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // The key size is guaranteed to be <= 8 + class ShortKeyComparator : public Comparator { + int Compare(const rocksdb::Slice& a, + const rocksdb::Slice& b) const override { + assert(a.size() <= 8); + assert(b.size() <= 8); + return BytewiseComparator()->Compare(a, b); + } + const char* Name() const override { return "ShortKeyComparator"; } + void FindShortestSeparator(std::string* start, + const rocksdb::Slice& limit) const override { + return BytewiseComparator()->FindShortestSeparator(start, limit); + } + void FindShortSuccessor(std::string* key) const override { + return BytewiseComparator()->FindShortSuccessor(key); + } + } short_key_cmp; Options options = CurrentOptions(); + options.target_file_size_base = 100000000; options.write_buffer_size = 100000000; options.max_subcompactions = max_subcompactions_; + options.comparator = &short_key_cmp; DestroyAndReopen(options); int32_t value_size = 10 * 1024; // 10 KB @@ -2514,7 +3013,7 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { // File with keys [ 0 => 99 ] for (int i = 0; i < 100; i++) { values.push_back(RandomString(&rnd, value_size)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -2531,7 +3030,7 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { // File with keys [ 100 => 199 ] for (int i = 100; i < 200; i++) { values.push_back(RandomString(&rnd, value_size)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -2549,7 +3048,7 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { // File with keys [ 200 => 299 ] for (int i = 
200; i < 300; i++) { values.push_back(RandomString(&rnd, value_size)); - ASSERT_OK(Put(Key(i), values[i])); + ASSERT_OK(Put(ShortKey(i), values[i])); } ASSERT_OK(Flush()); @@ -2567,7 +3066,7 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { ASSERT_EQ(non_trivial_move, 0); for (int i = 0; i < 300; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); + ASSERT_EQ(Get(ShortKey(i)), values[i]); } rocksdb::SyncPoint::GetInstance()->DisableProcessing(); @@ -2684,6 +3183,48 @@ TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) { ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound()); } +TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) { + const int kNumFilesTrigger = 3; + Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); + for (bool use_universal_compaction : {false, true}) { + Options options = CurrentOptions(); + if (use_universal_compaction) { + options.compaction_style = kCompactionStyleUniversal; + } else { + options.compaction_style = kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + } + options.num_levels = 4; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = kNumFilesTrigger; + // Trigger compaction if size amplification exceeds 110% + options.compaction_options_universal.max_size_amplification_percent = 110; + DestroyAndReopen(options); + + int num_bottom_pri_compactions = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BGWorkBottomCompaction", + [&](void* /*arg*/) { ++num_bottom_pri_compactions; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int num = 0; num < kNumFilesTrigger; num++) { + ASSERT_EQ(NumSortedRuns(), num); + int key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ(1, num_bottom_pri_compactions); + + // Verify that size amplification did occur + ASSERT_EQ(NumSortedRuns(), 1); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); +} + TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { // Deletions can be dropped when compacted to non-last level if they fall // outside the lower-level files' key-ranges. @@ -2724,52 +3265,874 @@ TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) { options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE)); } -INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, - ::testing::Values(std::make_tuple(1, true), - std::make_tuple(1, false), - std::make_tuple(4, true), - std::make_tuple(4, false))); - -TEST_P(DBCompactionDirectIOTest, DirectIO) { +TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) { + // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/ + // CompactFiles() had a bug where it failed to pick a compaction when an L0 + // compaction existed, but marked it as scheduled anyways. It'd never be + // unmarked as scheduled, so future compactions or DB close could hang. 
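// A note on the sync-point machinery this test (and most below) relies on: a
// dependency pair {"A", "B"} makes any thread that reaches marker "B" block
// until some thread has passed marker "A". A minimal sketch, with illustrative
// marker names rather than ones taken from this patch:
//
//   rocksdb::SyncPoint::GetInstance()->LoadDependency(
//       {{"Sketch:WriterDone", "Sketch:ReaderStart"}});
//   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
//   port::Thread writer([&]() {
//     // ... mutate DB state ...
//     TEST_SYNC_POINT("Sketch:WriterDone");  // unblocks the reader below
//   });
//   TEST_SYNC_POINT("Sketch:ReaderStart");   // waits until WriterDone passes
//   // ... observe the mutated state ...
//   writer.join();
//   rocksdb::SyncPoint::GetInstance()->DisableProcessing();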
+ const int kNumL0Files = 5; Options options = CurrentOptions(); - Destroy(options); - options.create_if_missing = true; - options.disable_auto_compactions = true; - options.use_direct_io_for_flush_and_compaction = GetParam(); - options.env = new MockEnv(Env::Default()); - Reopen(options); - bool readahead = false; - SyncPoint::GetInstance()->SetCallBack( - "TableCache::NewIterator:for_compaction", [&](void* arg) { - bool* use_direct_reads = static_cast<bool*>(arg); - ASSERT_EQ(*use_direct_reads, - options.use_direct_io_for_flush_and_compaction); - }); - SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { - bool* use_direct_writes = static_cast<bool*>(arg); - ASSERT_EQ(*use_direct_writes, - options.use_direct_io_for_flush_and_compaction); - }); - if (options.use_direct_io_for_flush_and_compaction) { - SyncPoint::GetInstance()->SetCallBack( - "SanitizeOptions:direct_io", [&](void* arg) { - readahead = true; - }); - } - SyncPoint::GetInstance()->EnableProcessing(); - CreateAndReopenWithCF({"pikachu"}, options); - MakeTables(3, "p", "q", 1); - ASSERT_EQ("1,1,1", FilesPerLevel(1)); - Compact(1, "p1", "p9"); - ASSERT_FALSE(readahead ^ options.use_direct_io_for_flush_and_compaction); - ASSERT_EQ("0,0,1", FilesPerLevel(1)); - Destroy(options); - delete options.env; -} + options.level0_file_num_compaction_trigger = kNumL0Files - 1; + options.max_background_compactions = 2; + DestroyAndReopen(options); -INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest, - testing::Bool()); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"LevelCompactionPicker::PickCompaction:Return", + "DBCompactionTest::CompactFilesPendingL0Bug:Picked"}, + {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + auto schedule_multi_compaction_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + // Files 0-3 will be included in an L0->L1 compaction. + // + // File 4 will be included in a call to CompactFiles() while the first + // compaction is running. + for (int i = 0; i < kNumL0Files - 1; ++i) { + ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move + ASSERT_OK(Put(Key(i + 1), "val")); + ASSERT_OK(Flush()); + } + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked"); + // file 4 flushed after 0-3 picked + ASSERT_OK(Put(Key(kNumL0Files), "val")); + ASSERT_OK(Flush()); + + // previously DB close would hang forever as this situation caused scheduled + // compactions count to never decrement to zero. + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size()); + std::vector<std::string> input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull() + ->CompactFiles(CompactionOptions(), input_filenames, + 0 /* output_level */)); + TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted"); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) { + // Regression test for bug of not pulling in L0 files that overlap the user- + // specified input files in time- and key-ranges.
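// CompactFiles() is driven by file names taken from the live metadata; the
// general pattern, sketched here with a hypothetical single L0 input file:
//
//   ColumnFamilyMetaData meta;
//   db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &meta);
//   std::vector<std::string> inputs;
//   inputs.push_back(meta.levels[0].files.front().name);
//   // rewrite the chosen file(s), placing the output in L1
//   ASSERT_OK(db_->CompactFiles(CompactionOptions(), inputs,
//                               1 /* output_level */));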
+ Put(Key(0), "old_val"); + Flush(); + Put(Key(0), "new_val"); + Flush(); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta); + ASSERT_GE(cf_meta.levels.size(), 2); + ASSERT_EQ(2, cf_meta.levels[0].files.size()); + + // Compacting {new L0 file, L1 file} should pull in the old L0 file since it + // overlaps in key-range and time-range. + std::vector<std::string> input_filenames; + input_filenames.push_back(cf_meta.levels[0].files.front().name); + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames, + 1 /* output_level */)); + ASSERT_EQ("new_val", Get(Key(0))); +} + +TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { + // bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released, we should see files with many + // such deletions undergo single-file compactions. + const int kNumKeysPerFile = 1024; + const int kNumLevelFiles = 4; + const int kValueSize = 128; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumLevelFiles; + // inflate it a bit to account for key/metadata overhead + options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100; + CreateAndReopenWithCF({"one"}, options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + if (i == kNumLevelFiles - 1) { + snapshot = db_->GetSnapshot(); + // delete every other key after grabbing a snapshot, so these deletions + // and the keys they cover can't be dropped until after the snapshot is + // released. + for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) { + ASSERT_OK(Delete(Key(j))); + } + } + Flush(); + if (i < kNumLevelFiles - 1) { + ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); + } + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1)); + + std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + // release snapshot and wait for compactions to finish. Single-file + // compactions should be triggered, which reduce the size of each bottom-level + // file without changing file count.
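// The snapshot lifecycle that drives this check, in isolation (a sketch; the
// key name is illustrative):
//
//   const Snapshot* snap = db_->GetSnapshot();  // pins current seqnums
//   ASSERT_OK(Delete("k"));  // tombstone must survive while snap is live
//   ASSERT_OK(Flush());
//   db_->ReleaseSnapshot(snap);  // unpins; bottommost files holding such
//                                // tombstones become compaction candidates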
+ db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kBottommostFiles); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->TEST_WaitForCompact(); + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); + + for (size_t i = 0; i < pre_release_metadata.size(); ++i) { + const auto& pre_file = pre_release_metadata[i]; + const auto& post_file = post_release_metadata[i]; + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // each file is smaller than it was before as it was rewritten without + // deletion markers/deleted keys. + ASSERT_LT(post_file.size, pre_file.size); + } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) { + const int kNumKeysPerFile = 32; + const int kNumLevelFiles = 2; + const int kValueSize = 1024; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.ttl = 24 * 60 * 60; // 24 hours + options.max_open_files = -1; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys. + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + env_->addon_time_.fetch_add(36 * 60 * 60); // 36 hours + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Just do a simple write + flush so that the Ttl expired files get + // compacted. + ASSERT_OK(Put("a", "1")); + Flush(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + dbfull()->TEST_WaitForCompact(); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + // Test dynamically changing ttl. + + env_->addon_time_.store(0); + DestroyAndReopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), RandomString(&rnd, kValueSize))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,2", FilesPerLevel()); + + // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j))); + } + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("2,0,0,2", FilesPerLevel()); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,0,2", FilesPerLevel()); + + // Move time forward by 12 hours, and make sure that compaction still doesn't + // trigger as ttl is set to 24 hours. + env_->addon_time_.fetch_add(12 * 60 * 60); + ASSERT_OK(Put("a", "1")); + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("1,2,0,2", FilesPerLevel()); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast<Compaction*>(arg); + ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Dynamically change ttl to 10 hours. + // This should trigger a ttl compaction, as 12 hours have already passed. + ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}})); + dbfull()->TEST_WaitForCompact(); + // All non-L0 files are deleted, as they contained only deleted data. + ASSERT_EQ("1", FilesPerLevel()); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // L0 file count going too high. + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as + // compaction trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + if (i == 0) { + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + } else { + options.level0_file_num_compaction_trigger = kNumL0FilesLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the auto compaction doesn't finish until manual compaction has + // had a chance to be delayed. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "CompactionJob::Run():End"}}); + } else { + // ensure the auto-compaction doesn't finish until manual compaction has + // continued without delay. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(Key(k), RandomString(&rnd, 1024))); + } + Flush(); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + db_->CompactRange(cro, nullptr, nullptr); + }); + + manual_compaction_thread.join(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual + // compaction only triggers flush after it's sure stall won't be triggered for + // immutable memtable count going too high.
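// The option under test, reduced to its smallest form: with
// allow_write_stall == false the pre-compaction flush is deferred until it
// can no longer push the DB into a write stall (a sketch):
//
//   CompactRangeOptions cro;
//   cro.allow_write_stall = false;  // wait out stall conditions first
//   ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));  // full key space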
+ const int kNumImmMemTableLimit = 8; + // i == 0: verifies normal case where stall is avoided by delay + // i == 1: verifies no delay in edge case where stall trigger is same as flush + // trigger, so stall can't be avoided + for (int i = 0; i < 2; ++i) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + // the delay limit is one less than the stop limit. This test focuses on + // avoiding delay limit, but this option sets stop limit, so add one. + options.max_write_buffer_number = kNumImmMemTableLimit + 1; + if (i == 1) { + options.min_write_buffer_number_to_merge = kNumImmMemTableLimit; + } + Reopen(options); + + if (i == 0) { + // ensure the flush doesn't finish until manual compaction has had a + // chance to be delayed. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "FlushJob::WriteLevel0Table"}}); + } else { + // ensure the flush doesn't finish until manual compaction has continued + // without delay. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::FlushMemTable:StallWaitDone", + "FlushJob::WriteLevel0Table"}}); + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) { + ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); + FlushOptions flush_opts; + flush_opts.wait = false; + flush_opts.allow_write_stall = true; + dbfull()->Flush(flush_opts); + } + + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + db_->CompactRange(cro, nullptr, nullptr); + }); + + manual_compaction_thread.join(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 0); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay + // does not hang if CF is dropped or DB is closed + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it + // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to + // simulate what happens during Close as we can't call Close (it + // blocks on the auto-compaction, making a cycle). + for (int i = 0; i < 2; ++i) { + CreateAndReopenWithCF({"one"}, options); + // The calls to close CF/DB wait until the manual compaction stalls. + // The auto-compaction waits until the manual compaction finishes to ensure + // the signal comes from closing CF/DB, not from compaction making progress. 
+ rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"}, + {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual", + "CompactionJob::Run():End"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + for (int j = 0; j < kNumL0FilesLimit - 1; ++j) { + for (int k = 0; k < 2; ++k) { + ASSERT_OK(Put(1, Key(k), RandomString(&rnd, 1024))); + } + Flush(1); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr) + .IsShutdownInProgress()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"); + if (i == 0) { + ASSERT_OK(db_->DropColumnFamily(handles_[1])); + } else { + dbfull()->CancelAllBackgroundWork(false /* wait */); + } + manual_compaction_thread.join(); + TEST_SYNC_POINT( + "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); + dbfull()->TEST_WaitForCompact(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} + +TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) { + // Verify that, when `CompactRangeOptions::allow_write_stall == false`, + // CompactRange skips its flush if the delay is long enough that the memtables + // existing at the beginning of the call have already been flushed. + const int kNumL0FilesTrigger = 4; + const int kNumL0FilesLimit = 8; + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = kNumL0FilesLimit; + options.level0_file_num_compaction_trigger = kNumL0FilesTrigger; + Reopen(options); + + Random rnd(301); + // The manual flush includes the memtable that was active when CompactRange + // began. So it unblocks CompactRange and precludes its flush. Throughout the + // test, stall conditions are upheld via high L0 file count. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"}, + {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush", + "DBImpl::FlushMemTable:StallWaitDone"}, + {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // used for the delayable flushes + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + for (int i = 0; i < kNumL0FilesLimit - 1; ++i) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + } + dbfull()->Flush(flush_opts); + } + auto manual_compaction_thread = port::Thread([this]() { + CompactRangeOptions cro; + cro.allow_write_stall = false; + db_->CompactRange(cro, nullptr, nullptr); + }); + + TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"); + Put(ToString(0), RandomString(&rnd, 1024)); + dbfull()->Flush(flush_opts); + Put(ToString(0), RandomString(&rnd, 1024)); + TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush"); + manual_compaction_thread.join(); + + // If CompactRange's flush was skipped, the final Put above will still be + // in the active memtable.
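// Memtable occupancy is observable through DB properties; the string and
// integer variants used by the assertions below, sketched:
//
//   std::string num_active;
//   db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_active);
//   uint64_t num_imm = 0;
//   db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, &num_imm);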
+ std::string num_keys_in_memtable; + db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable, &num_keys_in_memtable); + ASSERT_EQ(ToString(1), num_keys_in_memtable); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) { + // Verify memtable only gets flushed if it contains data overlapping the range + // provided to `CompactRange`. Tests all kinds of overlap/non-overlap. + const int kNumEndpointKeys = 5; + std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"}; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + Reopen(options); + + // One extra iteration for nullptr, which means left side of interval is + // unbounded. + for (int i = 0; i <= kNumEndpointKeys; ++i) { + Slice begin; + Slice* begin_ptr; + if (i == 0) { + begin_ptr = nullptr; + } else { + begin = keys[i - 1]; + begin_ptr = &begin; + } + // Start at `i` so right endpoint comes after left endpoint. One extra + // iteration for nullptr, which means right side of interval is unbounded. + for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) { + Slice end; + Slice* end_ptr; + if (j == kNumEndpointKeys) { + end_ptr = nullptr; + } else { + end = keys[j]; + end_ptr = &end; + } + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Put("d", "val")); + CompactRangeOptions compact_range_opts; + ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr)); + + uint64_t get_prop_tmp, num_memtable_entries = 0; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, + &get_prop_tmp)); + num_memtable_entries += get_prop_tmp; + if (begin_ptr == nullptr || end_ptr == nullptr || + (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) { + // In this case `CompactRange`'s range overlapped in some way with the + // memtable's range, so flush should've happened. Then "b" and "d" won't + // be in the memtable. + ASSERT_EQ(0, num_memtable_entries); + } else { + ASSERT_EQ(2, num_memtable_entries); + // flush anyways to prepare for next iteration + db_->Flush(FlushOptions()); + } + } + } +} + +TEST_F(DBCompactionTest, CompactionStatsTest) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + CompactionStatsCollector* collector = new CompactionStatsCollector(); + options.listeners.emplace_back(collector); + DestroyAndReopen(options); + + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 5000; j++) { + Put(std::to_string(j), std::string(1, 'A')); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + } + dbfull()->TEST_WaitForCompact(); + ColumnFamilyHandleImpl* cfh = + static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + + VerifyCompactionStats(*cfd, *collector); +} + +TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) { + // LSM setup: + // L1: [ba bz] + // L2: [a b] [c d] + // L3: [a b] [c d] + // + // Thread 1: Thread 2: + // Begin compacting all L2->L3 + // Compact [ba bz] L1->L3 + // End compacting all L2->L3 + // + // The compaction operation in thread 2 should be disallowed because the range + // overlaps with the compaction in thread 1, which also covers that range in + // L3.
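// The LSM shape in the diagram above is built with the DBTestBase helper
// MoveFilesToLevel(), one layer at a time (a sketch of a single layer):
//
//   ASSERT_OK(Put("a", "val"));
//   ASSERT_OK(Put("b", "val"));
//   ASSERT_OK(Flush());   // produces one L0 file spanning [a, b]
//   MoveFilesToLevel(2);  // push it down to L2 without rewriting keys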
+ Options options = CurrentOptions(); + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + Reopen(options); + + for (int level = 3; level >= 2; --level) { + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("c", "val")); + ASSERT_OK(Put("d", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + ASSERT_OK(Put("ba", "val")); + ASSERT_OK(Put("bz", "val")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:0", + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"}, + {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End", + "CompactFilesImpl:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + auto bg_thread = port::Thread([&]() { + // Thread 1 + std::vector<std::string> filenames = collector->GetFlushedFiles(); + filenames.pop_back(); + ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames, + 3 /* output_level */)); + }); + + // Thread 2 + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"); + std::string filename = collector->GetFlushedFiles().back(); + ASSERT_FALSE( + db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */) + .ok()); + TEST_SYNC_POINT( + "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End"); + + bg_thread.join(); +} + +TEST_F(DBCompactionTest, CompactionHasEmptyOutput) { + Options options = CurrentOptions(); + SstStatsCollector* collector = new SstStatsCollector(); + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(collector); + Reopen(options); + + // Make sure the L0 files overlap to prevent trivial move. + ASSERT_OK(Put("a", "val")); + ASSERT_OK(Put("b", "val")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("a")); + ASSERT_OK(Delete("b")); + ASSERT_OK(Flush()); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Expect one file creation to start for each flush, and zero for compaction + // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started()); +} + +TEST_F(DBCompactionTest, CompactionLimiter) { + const int kNumKeysPerFile = 10; + const int kMaxBackgroundThreads = 64; + + struct CompactionLimiter { + std::string name; + int limit_tasks; + int max_tasks; + int tasks; + std::shared_ptr<ConcurrentTaskLimiter> limiter; + }; + + std::vector<CompactionLimiter> limiter_settings; + limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr}); + limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr}); + limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr}); + + for (auto& ls : limiter_settings) { + ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks)); + } + + std::shared_ptr<ConcurrentTaskLimiter> unique_limiter( + NewConcurrentTaskLimiter("unique_limiter", -1)); + + const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", + "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; + const int cf_count = sizeof cf_names / sizeof cf_names[0]; + + std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter; + + Options options = CurrentOptions(); + options.write_buffer_size = 110 * 1024; // 110KB + options.arena_block_size = 4096; + options.num_levels = 3; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 64; + options.level0_stop_writes_trigger = 64; + options.max_background_jobs = kMaxBackgroundThreads; // Enough threads + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options.max_write_buffer_number = 10; // Enough memtables + DestroyAndReopen(options); + + std::vector<Options> option_vector; + option_vector.reserve(cf_count); + + for (int cf = 0; cf < cf_count; cf++) { + ColumnFamilyOptions cf_opt(options); + if (cf == 0) { + // "Default" CF doesn't use compaction limiter + cf_opt.compaction_thread_limiter = nullptr; + } else if (cf == 1) { + // "1" CF uses bypass compaction limiter + unique_limiter->SetMaxOutstandingTask(-1); + cf_opt.compaction_thread_limiter = unique_limiter; + } else { + // Assign limiter by mod + auto& ls = limiter_settings[cf % 3]; + cf_opt.compaction_thread_limiter = ls.limiter; + cf_to_limiter[cf_names[cf]] = &ls; + } + option_vector.emplace_back(DBOptions(options), cf_opt); + } + + for (int cf = 1; cf < cf_count; cf++) { + CreateColumnFamilies({cf_names[cf]}, option_vector[cf]); + } + + ReopenWithColumnFamilies(std::vector<std::string>(cf_names, + cf_names + cf_count), + option_vector); + + port::Mutex mutex; + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) { + const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName(); + auto iter = cf_to_limiter.find(cf_name); + if (iter != cf_to_limiter.end()) { + MutexLock l(&mutex); + ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks); + iter->second->max_tasks = std::max(iter->second->max_tasks, + iter->second->limit_tasks); + } + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) { + const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName(); + auto iter = cf_to_limiter.find(cf_name); + if (iter != cf_to_limiter.end()) { + MutexLock l(&mutex); + ASSERT_GE(--iter->second->tasks, 0); + } + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Block all compact threads in thread pool.
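// Blocking the pool means filling every slot with a sleeping task (a sketch
// for one low-priority thread):
//
//   test::SleepingBackgroundTask task;
//   env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &task,
//                  Env::LOW);  // occupies one compaction thread
//   task.WaitUntilSleeping();  // the slot is now held
//   // ... queue work that cannot run yet ...
//   task.WakeUp();
//   task.WaitUntilDone();      // slot released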
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4; + const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks; + env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH); + env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW); + + test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks]; + + // Block all compaction threads in thread pool. + for (size_t i = 0; i < kTotalCompactTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_compact_tasks[i], Env::LOW); + sleeping_compact_tasks[i].WaitUntilSleeping(); + } + + int keyIndex = 0; + + for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { + for (int cf = 0; cf < cf_count; cf++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + } + + for (int cf = 0; cf < cf_count; cf++) { + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + } + } + + // Enough L0 files to trigger compaction + for (int cf = 0; cf < cf_count; cf++) { + ASSERT_EQ(NumTableFilesAtLevel(0, cf), + options.level0_file_num_compaction_trigger); + } + + // Create more files for one column family, which triggers speed up + // condition, all compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(0, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(0, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 0)); + } + + // All CFs are pending compaction + ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW)); + + // Unblock all compaction threads + for (size_t i = 0; i < kTotalCompactTasks; i++) { + sleeping_compact_tasks[i].WakeUp(); + sleeping_compact_tasks[i].WaitUntilDone(); + } + + for (int cf = 0; cf < cf_count; cf++) { + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + } + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + // Max outstanding compact tasks reached limit + for (auto& ls : limiter_settings) { + ASSERT_EQ(ls.limit_tasks, ls.max_tasks); + ASSERT_EQ(0, ls.limiter->GetOutstandingTask()); + } + + // test manual compaction under a fully throttled limiter + int cf_test = 1; + unique_limiter->SetMaxOutstandingTask(0); + + // flush one more file to cf 1 + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf_test, Key(keyIndex++), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf_test, "", "")); + + dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]); + ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test)); + + Compact(cf_test, Key(0), Key(keyIndex)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); +} + +INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, + ::testing::Values(std::make_tuple(1, true), + std::make_tuple(1, false), + std::make_tuple(4, true), + std::make_tuple(4, false))); + +TEST_P(DBCompactionDirectIOTest, DirectIO) { + Options options = CurrentOptions(); + Destroy(options); + options.create_if_missing = true; + options.disable_auto_compactions = true; + options.use_direct_io_for_flush_and_compaction = GetParam(); + options.env = new MockEnv(Env::Default()); + Reopen(options); + bool readahead = false; + SyncPoint::GetInstance()->SetCallBack( + "TableCache::NewIterator:for_compaction", [&](void* arg) { + bool* use_direct_reads = static_cast<bool*>(arg);
ASSERT_EQ(*use_direct_reads, + options.use_direct_reads); + }); + SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { + bool* use_direct_writes = static_cast<bool*>(arg); + ASSERT_EQ(*use_direct_writes, + options.use_direct_io_for_flush_and_compaction); + }); + if (options.use_direct_io_for_flush_and_compaction) { + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions:direct_io", [&](void* /*arg*/) { + readahead = true; + }); + } + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu"}, options); + MakeTables(3, "p", "q", 1); + ASSERT_EQ("1,1,1", FilesPerLevel(1)); + Compact(1, "p1", "p9"); + ASSERT_EQ(readahead, options.use_direct_reads); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); + Destroy(options); + delete options.env; +} + +INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest, + testing::Bool()); class CompactionPriTest : public DBTestBase, public testing::WithParamInterface<uint32_t> { @@ -2821,6 +4184,167 @@ INSTANTIATE_TEST_CASE_P( CompactionPri::kOldestSmallestSeqFirst, CompactionPri::kMinOverlappingRatio)); +class NoopMergeOperator : public MergeOperator { + public: + NoopMergeOperator() {} + + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { + std::string val("bar"); + merge_out->new_value = val; + return true; + } + + const char* Name() const override { return "Noop"; } +}; + +TEST_F(DBCompactionTest, PartialManualCompaction) { + Options opts = CurrentOptions(); + opts.num_levels = 3; + opts.level0_file_num_compaction_trigger = 10; + opts.compression = kNoCompression; + opts.merge_operator.reset(new NoopMergeOperator()); + opts.target_file_size_base = 10240; + DestroyAndReopen(opts); + + Random rnd(301); + for (auto i = 0; i < 8; ++i) { + for (auto j = 0; j < 10; ++j) { + Merge("foo", RandomString(&rnd, 1024)); + } + Flush(); + } + + MoveFilesToLevel(2); + + std::string prop; + EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop)); + uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2; + ASSERT_OK(dbfull()->SetOptions( + {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}})); + + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + dbfull()->CompactRange(cro, nullptr, nullptr); +} + +TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) { + // Regression test for bug where manual compaction hangs forever when the DB + // is in read-only mode. Verify it now at least returns, despite failing. + const int kNumL0Files = 4; + std::unique_ptr<FaultInjectionTestEnv> mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.env = mock_env.get(); + DestroyAndReopen(opts); + + Random rnd(301); + for (int i = 0; i < kNumL0Files; ++i) { + // Make sure files are overlapping in key-range to prevent trivial move. + Put("key1", RandomString(&rnd, 1024)); + Put("key2", RandomString(&rnd, 1024)); + Flush(); + } + ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0)); + + // Enter read-only mode by failing a write. + mock_env->SetFilesystemActive(false); + // Make sure this is outside `CompactRange`'s range so that it doesn't fail + // early trying to flush memtable. + ASSERT_NOK(Put("key3", RandomString(&rnd, 1024))); + + // In the bug scenario, the first manual compaction would fail and forget to + // unregister itself, causing the second one to hang forever due to conflict + // with a non-running compaction.
+ CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + Slice begin_key("key1"); + Slice end_key("key2"); + ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key)); + ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key)); + + // Close before mock_env destruct. + Close(); +} + +// FixFileIngestionCompactionDeadlock tests and verifies that compaction and +// file ingestion do not cause deadlock in the event of write stall triggered +// by number of L0 files reaching level0_stop_writes_trigger. +TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { + const int kNumKeysPerFile = 100; + // Generate SST files. + Options options = CurrentOptions(); + + // Generate an external SST file containing a single key, i.e. 99 + std::string sst_files_dir = dbname_ + "/sst_files/"; + test::DestroyDir(env_, sst_files_dir); + ASSERT_OK(env_->CreateDir(sst_files_dir)); + SstFileWriter sst_writer(EnvOptions(), options); + const std::string sst_file_path = sst_files_dir + "test.sst"; + ASSERT_OK(sst_writer.Open(sst_file_path)); + ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value")); + ASSERT_OK(sst_writer.Finish()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter", + "BackgroundCallCompaction:0"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + options.write_buffer_size = 110 << 10; // 110KB + options.level0_file_num_compaction_trigger = + options.level0_stop_writes_trigger; + options.max_subcompactions = max_subcompactions_; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + DestroyAndReopen(options); + Random rnd(301); + + // Generate level0_stop_writes_trigger L0 files to trigger write stop + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + for (int j = 0; j != kNumKeysPerFile; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 990))); + } + if (0 == i) { + // When we reach here, the memtables have kNumKeysPerFile keys. Note that + // flush is not yet triggered. We need to write an extra key so that the + // write path will call PreprocessWrite and cause the previous key-value + // pairs to be flushed. After that, there will be the newest key in the + // memtable, and a bunch of L0 files. Since there is already one key in + // the memtable, then for i = 1, 2, ..., we do not have to write this + // extra key to trigger flush. + ASSERT_OK(Put("", "")); + } + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i + 1); + } + // When we reach this point, there will be level0_stop_writes_trigger L0 + // files and one extra key (99) in memory, which overlaps with the external + // SST file. Write stall triggers, and can be cleared only after compaction + // reduces the number of L0 files. + + // Compaction will also be triggered since we have reached the threshold for + // auto compaction. Note that compaction may begin after the following file + // ingestion thread and waits for ingestion to finish. + + // Thread to ingest file with overlapping key range with the current + // memtable. Consequently ingestion will trigger a flush. The flush MUST + // proceed without waiting for the write stall condition to clear, otherwise + // deadlock can happen.
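// For contrast, ingestion in its plain, unstalled form (a sketch reusing the
// sst_file_path written above):
//
//   IngestExternalFileOptions ifo;
//   ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
//
// When the ingested keys overlap the memtable, a flush is forced before the
// file is linked in; that flush is exactly the step which must not wait on
// the write stall.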
+ port::Thread ingestion_thr([&]() { + IngestExternalFileOptions ifo; + Status s = db_->IngestExternalFile({sst_file_path}, ifo); + ASSERT_OK(s); + }); + + // More write to trigger write stop + ingestion_thr.join(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Close(); +} + #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb @@ -2830,6 +4354,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else + (void) argc; + (void) argv; return 0; #endif } diff --git a/thirdparty/rocksdb/db/db_dynamic_level_test.cc b/thirdparty/rocksdb/db/db_dynamic_level_test.cc index f968e7fc05..8fac82851e 100644 --- a/thirdparty/rocksdb/db/db_dynamic_level_test.cc +++ b/thirdparty/rocksdb/db/db_dynamic_level_test.cc @@ -27,7 +27,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) { return; } // Use InMemoryEnv, or it would be too slow. - unique_ptr<Env> env(new MockEnv(env_)); + std::unique_ptr<Env> env(new MockEnv(env_)); const int kNKeys = 1000; int keys[kNKeys]; @@ -125,6 +125,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) { int kMaxKey = 1000000; Options options = CurrentOptions(); + options.compression = kNoCompression; options.create_if_missing = true; options.write_buffer_size = 20480; options.max_write_buffer_number = 2; @@ -167,8 +168,8 @@ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(4U, int_prop); - // Insert extra about 28K to L0. After they are compacted to L4, base level - // should be changed to L3. + // Insert extra about 28K to L0. After they are compacted to L4, the base + // level should be changed to L3. ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); @@ -189,13 +190,7 @@ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop)); ASSERT_EQ("0", str_prop); - // Trigger parallel compaction, and the first one would change the base - // level. - // Hold compaction jobs to make sure - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::Run():Start", - [&](void* arg) { env_->SleepForMicroseconds(100000); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Write even more data while leaving the base level at L3. ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); @@ -208,18 +203,12 @@ {"disable_auto_compactions", "false"}, })); Flush(); - // Wait for 200 milliseconds before proceeding compactions to make sure two - // parallel ones are executed. - env_->SleepForMicroseconds(200000); dbfull()->TEST_WaitForCompact(); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(3U, int_prop); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - // Trigger a condition that the compaction changes base level and L0->Lbase - // happens at the same time. - // We try to make last levels' targets to be 40K, 160K, 640K, add triggers - // another compaction from 40K->160K. + // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base + // level to 2. ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "true"}, })); @@ -229,23 +218,31 @@ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), RandomString(&rnd, 380))); } + + // Make sure that the compaction starts before the last bit of data is + // flushed, so that the base level isn't raised to L1.
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->SetOptions({ {"disable_auto_compactions", "false"}, })); + + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); Flush(); dbfull()->TEST_WaitForCompact(); ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop)); ASSERT_EQ(2U, int_prop); - - // A manual compaction will trigger the base level to become L2 - // Keep Writing data until base level changed 2->1. There will be L0->L2 - // compaction going on at the same time. rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + // Write more data until the base level changes to L1. There will be + // a manual compaction going on at the same time. rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"}, - {"DynamicLevelMaxBytesBase2:1", "CompactionJob::Run():End"}, + {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"}, + {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"}, {"DynamicLevelMaxBytesBase2:compact_range_finish", "FlushJob::WriteLevel0Table"}, }); @@ -257,12 +254,12 @@ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish"); }); - TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0"); + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); for (int i = 0; i < 2; i++) { ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), RandomString(&rnd, 380))); } - TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1"); + TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2"); Flush(); @@ -378,7 +375,7 @@ TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) { int non_trivial = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", - [&](void* arg) { non_trivial++; }); + [&](void* /*arg*/) { non_trivial++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Random rnd(301); @@ -501,6 +498,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else + (void) argc; + (void) argv; return 0; #endif } diff --git a/thirdparty/rocksdb/db/db_encryption_test.cc b/thirdparty/rocksdb/db/db_encryption_test.cc index 38eee56459..46ba411b6f 100644 --- a/thirdparty/rocksdb/db/db_encryption_test.cc +++ b/thirdparty/rocksdb/db/db_encryption_test.cc @@ -40,7 +40,7 @@ TEST_F(DBEncryptionTest, CheckEncrypted) { continue; } auto filePath = dbname_ + "/" + *it; - unique_ptr<SequentialFile> seqFile; + std::unique_ptr<SequentialFile> seqFile; auto envOptions = EnvOptions(CurrentOptions()); status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions); ASSERT_OK(status); diff --git a/thirdparty/rocksdb/db/db_filesnapshot.cc b/thirdparty/rocksdb/db/db_filesnapshot.cc index e266bf1ae1..ace0befb6d 100644 --- a/thirdparty/rocksdb/db/db_filesnapshot.cc +++ b/thirdparty/rocksdb/db/db_filesnapshot.cc @@ -44,7 +44,7 @@ Status DBImpl::EnableFileDeletions(bool force) { // Job id == 0 means that this is not our background process, but rather // user thread JobContext job_context(0); - bool should_purge_files = false; + bool file_deletion_enabled = false; { InstrumentedMutexLock l(&mutex_); if (force) { @@ -54,18 +54,18 @@ --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); -
should_purge_files = true; + file_deletion_enabled = true; FindObsoleteFiles(&job_context, true); - } else { - ROCKS_LOG_WARN( - immutable_db_options_.info_log, - "File Deletions Enable, but not really enabled. Counter: %d", - disable_delete_obsolete_files_); + bg_cv_.SignalAll(); } } - if (should_purge_files) { + if (file_deletion_enabled) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled"); PurgeObsoleteFiles(job_context); + } else { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "File Deletions Enable, but not really enabled. Counter: %d", + disable_delete_obsolete_files_); } job_context.Clean(); LogFlush(immutable_db_options_.info_log); @@ -73,7 +73,7 @@ } int DBImpl::IsFileDeletionsEnabled() const { - return disable_delete_obsolete_files_; + return !disable_delete_obsolete_files_; } Status DBImpl::GetLiveFiles(std::vector<std::string>& ret, @@ -86,19 +86,28 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret, if (flush_memtable) { // flush all dirty data to disk. Status status; - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - cfd->Ref(); + if (immutable_db_options_.atomic_flush) { + autovector<ColumnFamilyData*> cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); mutex_.Unlock(); - status = FlushMemTable(cfd, FlushOptions()); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); + status = AtomicFlushMemTables(cfds, FlushOptions(), + FlushReason::kGetLiveFiles); mutex_.Lock(); - cfd->Unref(); - if (!status.ok()) { - break; + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cfd->Ref(); + mutex_.Unlock(); + status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); + TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); + mutex_.Lock(); + cfd->Unref(); + if (!status.ok()) { + break; + } } } versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); @@ -125,7 +134,7 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret, // create names of the live files. The names are not absolute // paths, instead they are relative to dbname_; - for (auto live_file : live) { + for (const auto& live_file : live) { ret.push_back(MakeTableFileName("", live_file.GetNumber())); } @@ -141,6 +150,18 @@ } Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { + { + // If caller disabled deletions, this function should return files that are + // guaranteed not to be deleted until deletions are re-enabled. We need to + // wait for pending purges to finish since WalManager doesn't know which + // files are going to be purged. Additional purges won't be scheduled as + // long as deletions are disabled (so the below loop must terminate).
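// From the caller's side the counter behaves as a nesting disable/enable
// pair (a sketch; `db` stands for any open DB handle):
//
//   db->DisableFileDeletions();      // bump counter; obsolete files are kept
//   // ... e.g. copy SST/WAL files for a backup ...
//   db->EnableFileDeletions(false);  // decrement; purging resumes at zero
//   // EnableFileDeletions(true) instead forces the counter straight to zero.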
+ InstrumentedMutexLock l(&mutex_); + while (disable_delete_obsolete_files_ > 0 && + pending_purge_obsolete_files_ > 0) { + bg_cv_.Wait(); + } + } return wal_manager_.GetSortedWalFiles(files); } diff --git a/thirdparty/rocksdb/db/db_flush_test.cc b/thirdparty/rocksdb/db/db_flush_test.cc index 107e82467c..8a4d8fc63a 100644 --- a/thirdparty/rocksdb/db/db_flush_test.cc +++ b/thirdparty/rocksdb/db/db_flush_test.cc @@ -25,6 +25,12 @@ class DBFlushDirectIOTest : public DBFlushTest, DBFlushDirectIOTest() : DBFlushTest() {} }; +class DBAtomicFlushTest : public DBFlushTest, + public ::testing::WithParamInterface<bool> { + public: + DBAtomicFlushTest() : DBFlushTest() {} +}; + // We had issue when two background threads trying to flush at the same time, // only one of them get committed. The test verifies the issue is fixed. TEST_F(DBFlushTest, FlushWhileWritingManifest) { @@ -35,11 +41,12 @@ Reopen(options); FlushOptions no_wait; no_wait.wait = false; + no_wait.allow_write_stall = true; SyncPoint::GetInstance()->LoadDependency( {{"VersionSet::LogAndApply:WriteManifest", "DBFlushTest::FlushWhileWritingManifest:1"}, - {"MemTableList::InstallMemtableFlushResults:InProgress", + {"MemTableList::TryInstallMemtableFlushResults:InProgress", "VersionSet::LogAndApply:WriteManifestDone"}}); SyncPoint::GetInstance()->EnableProcessing(); @@ -55,6 +62,8 @@ #endif // ROCKSDB_LITE } +// Disable this test temporarily on Travis as it fails intermittently. +// Github issue: #4151 TEST_F(DBFlushTest, SyncFail) { std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( new FaultInjectionTestEnv(env_)); @@ -63,32 +72,68 @@ options.env = fault_injection_env.get(); SyncPoint::GetInstance()->LoadDependency( - {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, + {{"DBFlushTest::SyncFail:GetVersionRefCount:1", + "DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"}, + {"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", + "DBFlushTest::SyncFail:GetVersionRefCount:2"}, + {"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"}, {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}}); SyncPoint::GetInstance()->EnableProcessing(); - Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); Put("key", "value"); auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily()) ->cfd(); - int refs_before = cfd->current()->TEST_refs(); FlushOptions flush_options; flush_options.wait = false; ASSERT_OK(dbfull()->Flush(flush_options)); + // Flush installs a new super-version. Get the ref count after that. + auto current_before = cfd->current(); + int refs_before = cfd->current()->TEST_refs(); + TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:1"); + TEST_SYNC_POINT("DBFlushTest::SyncFail:GetVersionRefCount:2"); + int refs_after_picking_memtables = cfd->current()->TEST_refs(); + ASSERT_EQ(refs_before + 1, refs_after_picking_memtables); fault_injection_env->SetFilesystemActive(false); TEST_SYNC_POINT("DBFlushTest::SyncFail:1"); TEST_SYNC_POINT("DBFlushTest::SyncFail:2"); fault_injection_env->SetFilesystemActive(true); + // Now the background job will do the flush; wait for it. dbfull()->TEST_WaitForFlushMemTable(); #ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. #endif // ROCKSDB_LITE - // Flush job should release ref count to current version. + // Background flush job should release ref count to current version.
+ ASSERT_EQ(current_before, cfd->current()); ASSERT_EQ(refs_before, cfd->current()->TEST_refs()); Destroy(options); } +TEST_F(DBFlushTest, SyncSkip) { + Options options = CurrentOptions(); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"}, + {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(options); + Put("key", "value"); + + FlushOptions flush_options; + flush_options.wait = false; + ASSERT_OK(dbfull()->Flush(flush_options)); + + TEST_SYNC_POINT("DBFlushTest::SyncSkip:1"); + TEST_SYNC_POINT("DBFlushTest::SyncSkip:2"); + + // Now the background job will do the flush; wait for it. + dbfull()->TEST_WaitForFlushMemTable(); + + Destroy(options); +} + TEST_F(DBFlushTest, FlushInLowPriThreadPool) { // Verify setting an empty high-pri (flush) thread pool causes flushes to be // scheduled in the low-pri (compaction) thread pool. @@ -101,7 +146,7 @@ std::thread::id tid; int num_flushes = 0, num_compactions = 0; SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BGWorkFlush", [&](void* arg) { + "DBImpl::BGWorkFlush", [&](void* /*arg*/) { if (tid == std::thread::id()) { tid = std::this_thread::get_id(); } else { @@ -110,7 +155,7 @@ ++num_flushes; }); SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BGWorkCompaction", [&](void* arg) { + "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ASSERT_EQ(tid, std::this_thread::get_id()); ++num_compactions; }); @@ -126,6 +171,41 @@ ASSERT_EQ(1, num_compactions); } +TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) { + Options options = CurrentOptions(); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + Reopen(options); + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BGWorkFlush", + "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"}, + {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2", + "FlushJob::WriteLevel0Table"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put("key1", "value1")); + + port::Thread t([&]() { + // The call waits for flush to finish, i.e. with flush_options.wait = true. + ASSERT_OK(Flush()); + }); + + // Wait for flush start. + TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"); + // Insert a second memtable before the manual flush finishes. + // At the end of the manual flush job, it will check if further flush + // is needed, but it will not trigger flush of the second memtable because + // min_write_buffer_number_to_merge is not reached. + ASSERT_OK(Put("key2", "value2")); + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2"); + + // Manual flush should return, without waiting for flush indefinitely.
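// The FlushOptions knobs exercised throughout this file, sketched together:
//
//   FlushOptions fo;
//   fo.wait = false;              // return before the flush completes
//   fo.allow_write_stall = true;  // start even if a stall would result
//   ASSERT_OK(db_->Flush(fo));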
+ // Manual flush should return, without waiting for flush indefinitely. + t.join(); +} + TEST_P(DBFlushDirectIOTest, DirectIO) { Options options; options.create_if_missing = true; @@ -150,9 +230,269 @@ TEST_P(DBFlushDirectIOTest, DirectIO) { delete options.env; } +TEST_F(DBFlushTest, FlushError) { + Options options; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + options.write_buffer_size = 100; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.disable_auto_compactions = true; + options.env = fault_injection_env.get(); + Reopen(options); + + ASSERT_OK(Put("key1", "value1")); + ASSERT_OK(Put("key2", "value2")); + fault_injection_env->SetFilesystemActive(false); + Status s = dbfull()->TEST_SwitchMemtable(); + fault_injection_env->SetFilesystemActive(true); + Destroy(options); + ASSERT_NE(s, Status::OK()); +} + +TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { + // Regression test for bug where manual flush hangs forever when the DB + // is in read-only mode. Verify it now at least returns, despite failing. + Options options; + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + options.env = fault_injection_env.get(); + options.max_write_buffer_number = 2; + Reopen(options); + + // Trigger a first flush but don't let it run + ASSERT_OK(db_->PauseBackgroundWork()); + ASSERT_OK(Put("key1", "value1")); + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(db_->Flush(flush_opts)); + + // Write a key to the second memtable so we have something to flush later + // after the DB is in read-only mode. + ASSERT_OK(Put("key2", "value2")); + + // Let the first flush continue, hit an error, and put the DB in read-only + // mode. + fault_injection_env->SetFilesystemActive(false); + ASSERT_OK(db_->ContinueBackgroundWork()); + dbfull()->TEST_WaitForFlushMemTable(); +#ifndef ROCKSDB_LITE + uint64_t num_bg_errors; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBackgroundErrors, + &num_bg_errors)); + ASSERT_GT(num_bg_errors, 0); +#endif // ROCKSDB_LITE + + // In the bug scenario, triggering another flush would cause the second flush + // to hang forever. After the fix we expect it to return an error. + ASSERT_NOK(db_->Flush(FlushOptions())); + + Close(); +} + +TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam(); + options.write_buffer_size = (static_cast<uint64_t>(64) << 20); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts)); + } + std::vector<int> cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + cf_ids.emplace_back(static_cast<int>(i)); + } + ASSERT_OK(Flush(cf_ids)); + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } +} + +TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = GetParam();
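ManualAtomicFlush above exercises the new options.atomic_flush mode through the Flush() overload that takes a list of column family handles, so the listed memtables are persisted together or not at all. A sketch of the equivalent application-side call; the path and column family names are invented, and the multi-handle Flush() overload is assumed to be the one these tests introduce:

    #include <cassert>
    #include <vector>
    #include "rocksdb/db.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.create_missing_column_families = true;
      options.atomic_flush = true;  // flush listed CFs as one atomic unit

      std::vector<rocksdb::ColumnFamilyDescriptor> cfs = {
          {rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()},
          {"pikachu", rocksdb::ColumnFamilyOptions()}};
      std::vector<rocksdb::ColumnFamilyHandle*> handles;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s =
          rocksdb::DB::Open(options, "/tmp/atomic_flush_demo", cfs, &handles, &db);
      assert(s.ok());

      rocksdb::WriteOptions wopts;
      wopts.disableWAL = true;  // atomic flush is what keeps the CFs consistent
      for (auto* h : handles) {
        assert(db->Put(wopts, h, "key", "value").ok());
      }

      // Flush both column families together.
      s = db->Flush(rocksdb::FlushOptions(), handles);
      assert(s.ok());

      for (auto* h : handles) {
        db->DestroyColumnFamilyHandle(h);
      }
      delete db;
      return 0;
    }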
+ // 4KB so that we can easily trigger auto flush. + options.write_buffer_size = 4096; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCallFlush:FlushFinish:0", + "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts)); + } + // Keep writing to one of the column families to trigger auto flush. + for (int i = 0; i != 4000; ++i) { + ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/, + "key" + std::to_string(i), "value" + std::to_string(i), + wopts)); + } + + TEST_SYNC_POINT( + "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"); + if (options.atomic_flush) { + for (size_t i = 0; i != num_cfs - 1; ++i) { + auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } + } else { + for (size_t i = 0; i != num_cfs - 1; ++i) { + auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]); + ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty()); + } + } + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1", + "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"}, + {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2", + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast<int>(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + } + FlushOptions flush_opts; + flush_opts.wait = false; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2"); + for (auto* cfh : handles_) { + dbfull()->TEST_WaitForFlushMemTable(cfh); + } + for (size_t i = 0; i != num_cfs; ++i) { + auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]); + ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed()); + ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); + } + fault_injection_env->SetFilesystemActive(true); + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions
wopts; + wopts.disableWAL = true; + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + cf_ids.push_back(cf_id); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, + FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::BeforeDropCF"}, + {"DBAtomicFlushTest::AfterDropCF", + "DBImpl::BackgroundCallFlush:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + } + port::Thread user_thread([&]() { + TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF"); + }); + FlushOptions flush_opts; + flush_opts.wait = true; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + user_thread.join(); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options); + num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + Destroy(options); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); +INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool()); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_impl.cc b/thirdparty/rocksdb/db/db_impl.cc index d1bfe41e8c..8180564c2a 100644 --- a/thirdparty/rocksdb/db/db_impl.cc +++ b/thirdparty/rocksdb/db/db_impl.cc @@ -11,14 +11,12 @@ #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif -#include #include #ifdef OS_SOLARIS #include #endif #include -#include #include #include #include @@ -34,20 +32,21 @@ #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" +#include "db/error_handler.h" #include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/forward_iterator.h" +#include "db/in_memory_stats_history.h" #include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/malloc_stats.h" -#include "db/managed_iterator.h" #include "db/memtable.h" #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" -#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" #include "db/transaction_log_impl.h" @@ -63,7 +62,6 @@ #include "options/cf_options.h" #include "options/options_helper.h" #include "options/options_parser.h" -#include "port/likely.h" #include "port/port.h" #include "rocksdb/cache.h" 
#include "rocksdb/compaction_filter.h" @@ -72,9 +70,9 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" +#include "rocksdb/stats_history.h" #include "rocksdb/status.h" #include "rocksdb/table.h" -#include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" #include "table/block.h" #include "table/block_based_table_factory.h" @@ -101,7 +99,7 @@ namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); -void DumpRocksDBBuildVersion(Logger * log); +void DumpRocksDBBuildVersion(Logger* log); CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, @@ -110,7 +108,8 @@ CompressionType GetCompressionFlush( // optimization is used for leveled compaction. Otherwise the CPU and // latency overhead is not offset by saving much space. if (ioptions.compaction_style == kCompactionStyleUniversal) { - if (ioptions.compaction_options_universal.compression_size_percent < 0) { + if (mutable_cf_options.compaction_options_universal + .compression_size_percent < 0) { return mutable_cf_options.compression; } else { return kNoCompression; @@ -126,37 +125,44 @@ CompressionType GetCompressionFlush( namespace { void DumpSupportInfo(Logger* logger) { ROCKS_LOG_HEADER(logger, "Compression algorithms supported:"); - ROCKS_LOG_HEADER(logger, "\tSnappy supported: %d", Snappy_Supported()); - ROCKS_LOG_HEADER(logger, "\tZlib supported: %d", Zlib_Supported()); - ROCKS_LOG_HEADER(logger, "\tBzip supported: %d", BZip2_Supported()); - ROCKS_LOG_HEADER(logger, "\tLZ4 supported: %d", LZ4_Supported()); - ROCKS_LOG_HEADER(logger, "\tZSTD supported: %d", ZSTD_Supported()); - ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %d", - crc32c::IsFastCrc32Supported()); + for (auto& compression : OptionsHelper::compression_type_string_map) { + if (compression.second != kNoCompression && + compression.second != kDisableCompressionOption) { + ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(), + CompressionTypeSupported(compression.second)); + } + } + ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s", + crc32c::IsFastCrc32Supported().c_str()); } int64_t kDefaultLowPriThrottledRate = 2 * 1024 * 1024; -} // namespace +} // namespace -DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) +DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch, const bool batch_per_txn) : env_(options.env), dbname_(dbname), + own_info_log_(options.info_log == nullptr), initial_db_options_(SanitizeOptions(dbname, options)), immutable_db_options_(initial_db_options_), mutable_db_options_(initial_db_options_), stats_(immutable_db_options_.statistics.get()), - db_lock_(nullptr), mutex_(stats_, env_, DB_MUTEX_WAIT_MICROS, immutable_db_options_.use_adaptive_mutex), + default_cf_handle_(nullptr), + max_total_in_memory_state_(0), + env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), + env_options_for_compaction_(env_->OptimizeForCompactionTableWrite( + env_options_, immutable_db_options_)), + db_lock_(nullptr), shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), log_dir_synced_(false), log_empty_(true), - default_cf_handle_(nullptr), log_sync_cv_(&mutex_), total_log_size_(0), - max_total_in_memory_state_(0), is_snapshot_supported_(true), write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()), write_thread_(immutable_db_options_), @@ -177,23 +183,49 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) num_running_flushes_(0), 
bg_purge_scheduled_(0), disable_delete_obsolete_files_(0), + pending_purge_obsolete_files_(0), delete_obsolete_files_last_run_(env_->NowMicros()), last_stats_dump_time_microsec_(0), next_job_id_(1), has_unpersisted_data_(false), - unable_to_flush_oldest_log_(false), - env_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)), + unable_to_release_oldest_log_(false), num_running_ingest_file_(0), #ifndef ROCKSDB_LITE - wal_manager_(immutable_db_options_, env_options_), + wal_manager_(immutable_db_options_, env_options_, seq_per_batch), #endif // ROCKSDB_LITE event_logger_(immutable_db_options_.info_log.get()), bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false), - concurrent_prepare_(options.concurrent_prepare), - manual_wal_flush_(options.manual_wal_flush) { + two_write_queues_(options.two_write_queues), + manual_wal_flush_(options.manual_wal_flush), + seq_per_batch_(seq_per_batch), + batch_per_txn_(batch_per_txn), + // last_sequence_ is always maintained by the main queue that also writes + // to the memtable. When two_write_queues_ is disabled last seq in + // memtable is the same as last seq published to the readers. When it is + // enabled but seq_per_batch_ is disabled, last seq in memtable still + // indicates last published seq since wal-only writes that go to the 2nd + // queue do not consume a sequence number. Otherwise writes performed by + // the 2nd queue could change what is visible to the readers. In that + // case, last_seq_same_as_publish_seq_==false, and the 2nd queue maintains a + // separate variable to indicate the last published sequence. + last_seq_same_as_publish_seq_( + !(seq_per_batch && options.two_write_queues)), + // Since seq_per_batch_ is currently set only by WritePreparedTxn which + // requires a custom gc for compaction, we use that to set use_custom_gc_ + // as well. + use_custom_gc_(seq_per_batch), + shutdown_initiated_(false), + own_sfm_(options.sst_file_manager == nullptr), + preserve_deletes_(options.preserve_deletes), + closed_(false), + error_handler_(this, immutable_db_options_, &mutex_), + atomic_flush_install_cv_(&mutex_) { + // !batch_per_txn_ implies seq_per_batch_ because it is only unset for + // WriteUnprepared, which should use seq_per_batch_. + assert(batch_per_txn_ || seq_per_batch_); env_->GetAbsolutePath(dbname, &db_absolute_path_); // Reserve ten files or so for other uses and give the rest to TableCache. @@ -215,25 +247,175 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) immutable_db_options_.Dump(immutable_db_options_.info_log.get()); mutable_db_options_.Dump(immutable_db_options_.info_log.get()); DumpSupportInfo(immutable_db_options_.info_log.get());
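The last_seq_same_as_publish_seq_ comment above is easier to follow with a toy model: with two write queues and seq_per_batch, a sequence number can be consumed by the WAL-only queue before the corresponding write is readable, so readers must snapshot at the last published sequence rather than the last allocated one. A simplified illustration with invented names, not the actual DBImpl/VersionSet members:

    #include <atomic>
    #include <cstdint>

    // Toy model of allocated-vs-published sequence numbers.
    struct SequenceTracker {
      std::atomic<uint64_t> last_allocated{0};  // handed out to writers
      std::atomic<uint64_t> last_published{0};  // safe for readers to see

      uint64_t Allocate(uint64_t n) {  // a batch consumes n sequence numbers
        return last_allocated.fetch_add(n) + n;
      }
      void Publish(uint64_t seq) {  // called once the write is readable
        last_published.store(seq, std::memory_order_release);
      }
      uint64_t ReadSnapshot() const {
        // Readers must not see sequence numbers that are allocated but not
        // yet published, or they could observe a half-visible write.
        return last_published.load(std::memory_order_acquire);
      }
    };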
+ + // always open the DB with 0 here, which means if preserve_deletes_==true + // we won't drop any deletion markers until SetPreserveDeletesSequenceNumber() + // is called by the client and this seqnum is advanced. + preserve_deletes_seqnum_.store(0); +} + +Status DBImpl::Resume() { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB"); + + InstrumentedMutexLock db_mutex(&mutex_); + + if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) { + // Nothing to do + return Status::OK(); + } + + if (error_handler_.IsRecoveryInProgress()) { + // Don't allow a mix of manual and automatic recovery + return Status::Busy(); + } + + mutex_.Unlock(); + Status s = error_handler_.RecoverFromBGError(true); + mutex_.Lock(); + return s; +}
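Resume() gives applications a manual counterpart to automatic background-error recovery: it returns immediately if there is nothing to recover, refuses to race an in-flight automatic recovery with Status::Busy(), and otherwise hands off to ResumeImpl() below. A sketch of the caller's side; the failing write and the retry policy are illustrative only:

    #include "rocksdb/db.h"

    // After a background error puts the DB in read-only mode, try to recover.
    rocksdb::Status WriteWithRecovery(rocksdb::DB* db) {
      rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "key", "value");
      if (!s.ok()) {
        // Attempt manual recovery: flush what the WAL can no longer
        // guarantee, clear the background error, reschedule compactions.
        rocksdb::Status r = db->Resume();
        if (r.ok()) {
          s = db->Put(rocksdb::WriteOptions(), "key", "value");  // retry once
        }
      }
      return s;
    }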
+ +// This function implements the guts of recovery from a background error. It +// is eventually called for both manual as well as automatic recovery. It does +// the following - +// 1. Wait for currently scheduled background flush/compaction to exit, in +// order to avoid inadvertently causing an error and thinking recovery failed +// 2. Flush memtables if there's any data for all the CFs. This may result in +// another error, which will be saved by error_handler_ and reported later +// as the recovery status +// 3. Find and delete any obsolete files +// 4. Schedule compactions if needed for all the CFs. This is needed as the +// flush in the prior step might have been a no-op for some CFs, which +// means a new super version wouldn't have been installed +Status DBImpl::ResumeImpl() { + mutex_.AssertHeld(); + WaitForBackgroundWork(); + + Status bg_error = error_handler_.GetBGError(); + Status s; + if (shutdown_initiated_) { + // Returning shutdown status to SFM during auto recovery will cause it + // to abort the recovery and allow the shutdown to progress + s = Status::ShutdownInProgress(); + } + if (s.ok() && bg_error.severity() > Status::Severity::kHardError) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "DB resume requested but failed due to Fatal/Unrecoverable error"); + s = bg_error; + } + + // We cannot guarantee consistency of the WAL. So force flush Memtables of + // all the column families + if (s.ok()) { + FlushOptions flush_opts; + // We allow flush to stall writes since we are trying to resume from error. + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + autovector<ColumnFamilyData*> cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + s = AtomicFlushMemTables(cfds, flush_opts, FlushReason::kErrorRecovery); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + cfd->Ref(); + mutex_.Unlock(); + s = FlushMemTable(cfd, flush_opts, FlushReason::kErrorRecovery); + mutex_.Lock(); + cfd->Unref(); + if (!s.ok()) { + break; + } + } + } + if (!s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "DB resume requested but failed due to Flush failure [%s]", + s.ToString().c_str()); + } + } + + JobContext job_context(0); + FindObsoleteFiles(&job_context, true); + if (s.ok()) { + s = error_handler_.ClearBGError(); + } + mutex_.Unlock(); + + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + + if (s.ok()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); + } + mutex_.Lock(); + // Check for shutdown again before scheduling further compactions, + // since we released and re-acquired the lock above + if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + if (s.ok()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + SchedulePendingCompaction(cfd); + } + MaybeScheduleFlushOrCompaction(); + } + + // Wake up any waiters - in this case, it could be the shutdown thread + bg_cv_.SignalAll(); + + // No need to check BGError again. If something happened, event listener would + // be notified and the operation causing it would have failed + return s; +} + +void DBImpl::WaitForBackgroundWork() { + // Wait for background work to finish + while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_) { + bg_cv_.Wait(); + } } // Will lock the mutex_, will wait for completion if wait is true void DBImpl::CancelAllBackgroundWork(bool wait) { - InstrumentedMutexLock l(&mutex_); - ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown: canceling all background work"); + if (thread_dump_stats_ != nullptr) { + thread_dump_stats_->cancel(); + thread_dump_stats_.reset(); + } + if (thread_persist_stats_ != nullptr) { + thread_persist_stats_->cancel(); + thread_persist_stats_.reset(); + } + InstrumentedMutexLock l(&mutex_); if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && !mutable_db_options_.avoid_flush_during_shutdown) { - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - cfd->Ref(); - mutex_.Unlock(); - FlushMemTable(cfd, FlushOptions()); - mutex_.Lock(); - cfd->Unref(); + if (immutable_db_options_.atomic_flush) { + autovector<ColumnFamilyData*> cfds; + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { + cfd->Ref(); + mutex_.Unlock(); + FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); + mutex_.Lock(); + cfd->Unref(); + } } } versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); @@ -244,14 +426,20 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { if (!wait) { return; } - // Wait for background work to finish - while (bg_bottom_compaction_scheduled_ ||
bg_compaction_scheduled_ || - bg_flush_scheduled_) { + WaitForBackgroundWork(); +} + +Status DBImpl::CloseHelper() { + // Guarantee that there is no background error recovery in progress before + // continuing with the shutdown + mutex_.Lock(); + shutdown_initiated_ = true; + error_handler_.CancelErrorRecovery(); + while (error_handler_.IsRecoveryInProgress()) { bg_cv_.Wait(); } -} + mutex_.Unlock(); -DBImpl::~DBImpl() { // CancelAllBackgroundWork called with false means we just set the shutdown // marker. After this we do a variant of the waiting and unschedule work // (to consider: moving all the waiting into CancelAllBackgroundWork(true)) @@ -260,6 +448,7 @@ DBImpl::~DBImpl() { env_->UnSchedule(this, Env::Priority::BOTTOM); int compactions_unscheduled = env_->UnSchedule(this, Env::Priority::LOW); int flushes_unscheduled = env_->UnSchedule(this, Env::Priority::HIGH); + Status ret; mutex_.Lock(); bg_bottom_compaction_scheduled_ -= bottom_compactions_unscheduled; bg_compaction_scheduled_ -= compactions_unscheduled; @@ -267,17 +456,24 @@ DBImpl::~DBImpl() { // Wait for background work to finish while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_ || bg_purge_scheduled_) { + bg_flush_scheduled_ || bg_purge_scheduled_ || + pending_purge_obsolete_files_ || + error_handler_.IsRecoveryInProgress()) { TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob"); bg_cv_.Wait(); } + TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished", + &files_grabbed_for_purge_); EraseThreadStatusDbInfo(); flush_scheduler_.Clear(); while (!flush_queue_.empty()) { - auto cfd = PopFirstFromFlushQueue(); - if (cfd->Unref()) { - delete cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->Unref()) { + delete cfd; + } } } while (!compaction_queue_.empty()) { @@ -321,7 +517,19 @@ DBImpl::~DBImpl() { delete l; } for (auto& log : logs_) { - log.ClearWriter(); + uint64_t log_number = log.writer->get_log_number(); + Status s = log.ClearWriter(); + if (!s.ok()) { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to Sync WAL file %s with error -- %s", + LogFileName(immutable_db_options_.wal_dir, log_number).c_str(), + s.ToString().c_str()); + // Retain the first error + if (ret.ok()) { + ret = s; + } + } } logs_.clear(); @@ -354,6 +562,34 @@ DBImpl::~DBImpl() { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); LogFlush(immutable_db_options_.info_log); + +#ifndef ROCKSDB_LITE + // If the sst_file_manager was allocated by us during DB::Open(), call + // Close() on it before closing the info_log.
Otherwise, background thread + // in SstFileManagerImpl might try to log something + if (immutable_db_options_.sst_file_manager && own_sfm_) { + auto sfm = static_cast<SstFileManagerImpl*>( + immutable_db_options_.sst_file_manager.get()); + sfm->Close(); + } +#endif // ROCKSDB_LITE + + if (immutable_db_options_.info_log && own_info_log_) { + Status s = immutable_db_options_.info_log->Close(); + if (ret.ok()) { + ret = s; + } + } + return ret; +} + +Status DBImpl::CloseImpl() { return CloseHelper(); } + +DBImpl::~DBImpl() { + if (!closed_) { + closed_ = true; + CloseHelper(); + } } void DBImpl::MaybeIgnoreError(Status* s) const { @@ -378,71 +614,180 @@ const Status DBImpl::CreateArchivalDirectory() { void DBImpl::PrintStatistics() { auto dbstats = immutable_db_options_.statistics.get(); if (dbstats) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, "STATISTICS:\n %s", + ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s", dbstats->ToString().c_str()); } } -void DBImpl::MaybeDumpStats() { - mutex_.Lock(); - unsigned int stats_dump_period_sec = - mutable_db_options_.stats_dump_period_sec; - mutex_.Unlock(); - if (stats_dump_period_sec == 0) return; - - const uint64_t now_micros = env_->NowMicros(); +void DBImpl::StartTimedTasks() { + unsigned int stats_dump_period_sec = 0; + unsigned int stats_persist_period_sec = 0; + { + InstrumentedMutexLock l(&mutex_); + stats_dump_period_sec = mutable_db_options_.stats_dump_period_sec; + if (stats_dump_period_sec > 0) { + if (!thread_dump_stats_) { + thread_dump_stats_.reset(new rocksdb::RepeatableThread( + [this]() { DBImpl::DumpStats(); }, "dump_st", env_, + stats_dump_period_sec * 1000000)); + } + } + stats_persist_period_sec = mutable_db_options_.stats_persist_period_sec; + if (stats_persist_period_sec > 0) { + if (!thread_persist_stats_) { + thread_persist_stats_.reset(new rocksdb::RepeatableThread( + [this]() { DBImpl::PersistStats(); }, "pst_st", env_, + stats_persist_period_sec * 1000000)); + } + } + } +}
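With MaybeDumpStats gone, DumpStats() and PersistStats() now run on RepeatableThread timers driven by stats_dump_period_sec and stats_persist_period_sec. A sketch of enabling the periodic dump from the application side; the path and intervals are invented:

    #include <cassert>
    #include "rocksdb/db.h"
    #include "rocksdb/statistics.h"

    int main() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.statistics = rocksdb::CreateDBStatistics();  // collect tickers
      options.stats_dump_period_sec = 600;  // dump to the info log every 10 min

      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/stats_demo", &db);
      assert(s.ok());

      // The period can also be changed on a live DB.
      s = db->SetDBOptions({{"stats_dump_period_sec", "60"}});
      assert(s.ok());

      delete db;
      return 0;
    }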
- if (last_stats_dump_time_microsec_ + stats_dump_period_sec * 1000000 <= - now_micros) { - // Multiple threads could race in here simultaneously. - // However, the last one will update last_stats_dump_time_microsec_ - // atomically. We could see more than one dump during one dump - // period in rare cases. - last_stats_dump_time_microsec_ = now_micros; +// estimate the total size of stats_history_ +size_t DBImpl::EstiamteStatsHistorySize() const { + size_t size_total = + sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>); + if (stats_history_.size() == 0) return size_total; + size_t size_per_slice = + sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>); + // non-empty map, stats_history_.begin() guaranteed to exist + std::map<std::string, uint64_t> sample_slice(stats_history_.begin()->second); + for (const auto& pairs : sample_slice) { + size_per_slice += + pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second); + } + size_total = size_per_slice * stats_history_.size(); + return size_total; +} +void DBImpl::PersistStats() { + TEST_SYNC_POINT("DBImpl::PersistStats:Entry"); #ifndef ROCKSDB_LITE - const DBPropertyInfo* cf_property_info = - GetPropertyInfo(DB::Properties::kCFStats); - assert(cf_property_info != nullptr); - const DBPropertyInfo* db_property_info = - GetPropertyInfo(DB::Properties::kDBStats); - assert(db_property_info != nullptr); - - std::string stats; - { - InstrumentedMutexLock l(&mutex_); - default_cf_internal_stats_->GetStringProperty( - *db_property_info, DB::Properties::kDBStats, &stats); - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, - &stats); + if (shutdown_initiated_) { + return; + } + uint64_t now_micros = env_->NowMicros(); + Statistics* statistics = immutable_db_options_.statistics.get(); + if (!statistics) { + return; + } + size_t stats_history_size_limit = 0; + { + InstrumentedMutexLock l(&mutex_); + stats_history_size_limit = mutable_db_options_.stats_history_buffer_size; + } + + // TODO(Zhongyi): also persist immutable_db_options_.statistics + { + std::map<std::string, uint64_t> stats_map; + if (!statistics->getTickerMap(&stats_map)) { + return; + } + InstrumentedMutexLock l(&stats_history_mutex_); + // calculate the delta from last time + if (stats_slice_initialized_) { + std::map<std::string, uint64_t> stats_delta; + for (const auto& stat : stats_map) { + if (stats_slice_.find(stat.first) != stats_slice_.end()) { + stats_delta[stat.first] = stat.second - stats_slice_[stat.first]; } } - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->initialized()) { - cfd->internal_stats()->GetStringProperty( - *cf_property_info, DB::Properties::kCFFileHistogram, &stats); - } + stats_history_[now_micros] = stats_delta; + } + stats_slice_initialized_ = true; + std::swap(stats_slice_, stats_map); + TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied"); + + // delete older stats snapshots to control memory consumption + bool purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + while (purge_needed && !stats_history_.empty()) { + stats_history_.erase(stats_history_.begin()); + purge_needed = EstiamteStatsHistorySize() > stats_history_size_limit; + } + } + // TODO: persist stats to disk +#endif // !ROCKSDB_LITE +} + +bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map<std::string, uint64_t>* stats_map) { + assert(new_time); + assert(stats_map); + if (!new_time || !stats_map) return false; + // lock when searching for start_time + { + InstrumentedMutexLock l(&stats_history_mutex_); + auto it = stats_history_.lower_bound(start_time); + if (it != stats_history_.end() && it->first < end_time) { + // make a copy for timestamp and stats_map + *new_time = it->first; + *stats_map = it->second; + return true; + } else { + return false; + } + } +} + +Status DBImpl::GetStatsHistory( + uint64_t
start_time, uint64_t end_time, + std::unique_ptr* stats_iterator) { + if (!stats_iterator) { + return Status::InvalidArgument("stats_iterator not preallocated."); + } + stats_iterator->reset( + new InMemoryStatsHistoryIterator(start_time, end_time, this)); + return (*stats_iterator)->status(); +} + +void DBImpl::DumpStats() { + TEST_SYNC_POINT("DBImpl::DumpStats:1"); +#ifndef ROCKSDB_LITE + const DBPropertyInfo* cf_property_info = + GetPropertyInfo(DB::Properties::kCFStats); + assert(cf_property_info != nullptr); + const DBPropertyInfo* db_property_info = + GetPropertyInfo(DB::Properties::kDBStats); + assert(db_property_info != nullptr); + + std::string stats; + if (shutdown_initiated_) { + return; + } + { + InstrumentedMutexLock l(&mutex_); + default_cf_internal_stats_->GetStringProperty( + *db_property_info, DB::Properties::kDBStats, &stats); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFStatsNoFileHistogram, &stats); } } - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "------- DUMPING STATS -------"); - ROCKS_LOG_WARN(immutable_db_options_.info_log, "%s", stats.c_str()); - if (immutable_db_options_.dump_malloc_stats) { - stats.clear(); - DumpMallocStats(&stats); - if (!stats.empty()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "------- Malloc STATS -------"); - ROCKS_LOG_WARN(immutable_db_options_.info_log, "%s", stats.c_str()); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->initialized()) { + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFFileHistogram, &stats); } } + } + TEST_SYNC_POINT("DBImpl::DumpStats:2"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- DUMPING STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + if (immutable_db_options_.dump_malloc_stats) { + stats.clear(); + DumpMallocStats(&stats); + if (!stats.empty()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "------- Malloc STATS -------"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); + } + } #endif // !ROCKSDB_LITE - PrintStatistics(); - } + PrintStatistics(); } void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { @@ -455,7 +800,16 @@ void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) { } } -Directory* DBImpl::Directories::GetDataDir(size_t path_id) { +Directory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { + assert(cfd); + Directory* ret_dir = cfd->GetDataDir(path_id); + if (ret_dir == nullptr) { + return directories_.GetDataDir(path_id); + } + return ret_dir; +} + +Directory* DBImpl::Directories::GetDataDir(size_t path_id) const { assert(path_id < data_dirs_.size()); Directory* ret_dir = data_dirs_[path_id].get(); if (ret_dir == nullptr) { @@ -465,9 +819,12 @@ Directory* DBImpl::Directories::GetDataDir(size_t path_id) { return ret_dir; } -Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, +Status DBImpl::SetOptions( + ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { #ifdef ROCKSDB_LITE + (void)column_family; + (void)options_map; return Status::NotSupported("Not supported in ROCKSDB LITE"); #else auto* cfd = reinterpret_cast(column_family)->cfd(); @@ -481,7 +838,7 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, MutableCFOptions new_options; Status s; Status persist_options_status; - WriteThread::Writer w; + SuperVersionContext sv_context(/* create_superversion 
*/ true); { InstrumentedMutexLock l(&mutex_); s = cfd->SetOptions(options_map); @@ -494,18 +851,18 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. - auto* old_sv = - InstallSuperVersionAndScheduleWork(cfd, nullptr, new_options); - delete old_sv; + InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); persist_options_status = WriteOptionsFile( false /*need_mutex_lock*/, true /*need_enter_write_thread*/); + bg_cv_.SignalAll(); } } + sv_context.Clean(); - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "SetOptions() on column family [%s], inputs:", - cfd->GetName().c_str()); + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str()); for (const auto& o : options_map) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(), o.second.c_str()); @@ -529,6 +886,7 @@ Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, Status DBImpl::SetDBOptions( const std::unordered_map& options_map) { #ifdef ROCKSDB_LITE + (void)options_map; return Status::NotSupported("Not supported in ROCKSDB LITE"); #else if (options_map.empty()) { @@ -540,7 +898,7 @@ Status DBImpl::SetDBOptions( MutableDBOptions new_options; Status s; Status persist_options_status; - WriteThread::Writer w; + bool wal_changed = false; WriteContext write_context; { InstrumentedMutexLock l(&mutex_); @@ -553,17 +911,60 @@ Status DBImpl::SetDBOptions( new_options.max_background_compactions, Env::Priority::LOW); MaybeScheduleFlushOrCompaction(); } - - write_controller_.set_max_delayed_write_rate(new_options.delayed_write_rate); + if (new_options.stats_dump_period_sec != + mutable_db_options_.stats_dump_period_sec) { + if (thread_dump_stats_) { + mutex_.Unlock(); + thread_dump_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_dump_period_sec > 0) { + thread_dump_stats_.reset(new rocksdb::RepeatableThread( + [this]() { DBImpl::DumpStats(); }, "dump_st", env_, + new_options.stats_dump_period_sec * 1000000)); + } else { + thread_dump_stats_.reset(); + } + } + if (new_options.stats_persist_period_sec != + mutable_db_options_.stats_persist_period_sec) { + if (thread_persist_stats_) { + mutex_.Unlock(); + thread_persist_stats_->cancel(); + mutex_.Lock(); + } + if (new_options.stats_persist_period_sec > 0) { + thread_persist_stats_.reset(new rocksdb::RepeatableThread( + [this]() { DBImpl::PersistStats(); }, "pst_st", env_, + new_options.stats_persist_period_sec * 1000000)); + } else { + thread_persist_stats_.reset(); + } + } + write_controller_.set_max_delayed_write_rate( + new_options.delayed_write_rate); table_cache_.get()->SetCapacity(new_options.max_open_files == -1 ? 
TableCache::kInfiniteCapacity : new_options.max_open_files - 10); - + wal_changed = mutable_db_options_.wal_bytes_per_sync != + new_options.wal_bytes_per_sync; + if (new_options.bytes_per_sync == 0) { + new_options.bytes_per_sync = 1024 * 1024; + } mutable_db_options_ = new_options; - + env_options_for_compaction_ = EnvOptions( + BuildDBOptions(immutable_db_options_, mutable_db_options_)); + env_options_for_compaction_ = env_->OptimizeForCompactionTableWrite( + env_options_for_compaction_, immutable_db_options_); + versions_->ChangeEnvOptions(mutable_db_options_); + env_options_for_compaction_ = env_->OptimizeForCompactionTableRead( + env_options_for_compaction_, immutable_db_options_); + env_options_for_compaction_.compaction_readahead_size = + mutable_db_options_.compaction_readahead_size; + WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); - if (total_log_size_ > GetMaxTotalWalSize()) { - Status purge_wal_status = HandleWALFull(&write_context); + if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) { + Status purge_wal_status = SwitchWAL(&write_context); if (!purge_wal_status.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Unable to purge WAL files in SetDBOptions() -- %s", @@ -602,8 +1003,9 @@ Status DBImpl::SetDBOptions( } // return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level) { +int DBImpl::FindMinimumEmptyLevelFitting( + ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/, + int level) { mutex_.AssertHeld(); const auto* vstorage = cfd->current()->storage_info(); int minimum_level = level; @@ -621,7 +1023,7 @@ int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, } Status DBImpl::FlushWAL(bool sync) { - { + if (manual_wal_flush_) { // We need to lock log_write_mutex_ since logs_ might change concurrently InstrumentedMutexLock wl(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; @@ -629,12 +1031,20 @@ Status DBImpl::FlushWAL(bool sync) { if (!s.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", s.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(s); + // whether sync or not, we should abort the rest of function upon error + return s; } if (!sync) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false"); return s; } } + if (!sync) { + return Status::OK(); + } // sync = true ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true"); return SyncWAL(); @@ -702,12 +1112,29 @@ Status DBImpl::SyncWAL() { return status; } -void DBImpl::MarkLogsSynced( - uint64_t up_to, bool synced_dir, const Status& status) { +Status DBImpl::LockWAL() { + log_write_mutex_.Lock(); + auto cur_log_writer = logs_.back().writer; + auto status = cur_log_writer->WriteBuffer(); + if (!status.ok()) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s", + status.ToString().c_str()); + // In case there is a fs error we should set it globally to prevent the + // future writes + WriteStatusCheck(status); + } + return status; +} + +Status DBImpl::UnlockWAL() { + log_write_mutex_.Unlock(); + return Status::OK(); +} + +void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir, + const Status& status) { mutex_.AssertHeld(); - if (synced_dir && - logfile_number_ == up_to && - status.ok()) { + if (synced_dir && logfile_number_ == up_to && status.ok()) { log_dir_synced_ = true; } for 
(auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) { @@ -715,6 +1142,8 @@ void DBImpl::MarkLogsSynced( assert(log.getting_synced); if (status.ok() && logs_.size() > 1) { logs_to_free_.push_back(log.ReleaseWriter()); + // To modify logs_ both mutex_ and log_write_mutex_ must be held + InstrumentedMutexLock l(&log_write_mutex_); it = logs_.erase(it); } else { log.getting_synced = false; @@ -730,8 +1159,21 @@ SequenceNumber DBImpl::GetLatestSequenceNumber() const { return versions_->LastSequence(); } +void DBImpl::SetLastPublishedSequence(SequenceNumber seq) { + versions_->SetLastPublishedSequence(seq); +} + +bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) { + if (seqnum > preserve_deletes_seqnum_.load()) { + preserve_deletes_seqnum_.store(seqnum); + return true; + } else { + return false; + } +} + InternalIterator* DBImpl::NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { @@ -745,8 +1187,8 @@ InternalIterator* DBImpl::NewInternalIterator( SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version, arena, - range_del_agg); + return NewInternalIterator(roptions, cfd, super_version, arena, range_del_agg, + sequence); } void DBImpl::SchedulePurge() { @@ -768,16 +1210,14 @@ void DBImpl::BackgroundCallPurge() { if (!purge_queue_.empty()) { auto purge_file = purge_queue_.begin(); auto fname = purge_file->fname; + auto dir_to_sync = purge_file->dir_to_sync; auto type = purge_file->type; auto number = purge_file->number; - auto path_id = purge_file->path_id; auto job_id = purge_file->job_id; purge_queue_.pop_front(); mutex_.Unlock(); - Status file_deletion_status; - DeleteObsoleteFileImpl(file_deletion_status, job_id, fname, type, number, - path_id); + DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number); mutex_.Lock(); } else { assert(!logs_to_free_queue_.empty()); @@ -813,7 +1253,7 @@ struct IterState { bool background_purge; }; -static void CleanupIteratorState(void* arg1, void* arg2) { +static void CleanupIteratorState(void* arg1, void* /*arg2*/) { IterState* state = reinterpret_cast(arg1); if (state->super_version->Unref()) { @@ -850,10 +1290,12 @@ static void CleanupIteratorState(void* arg1, void* arg2) { } } // namespace -InternalIterator* DBImpl::NewInternalIterator( - const ReadOptions& read_options, ColumnFamilyData* cfd, - SuperVersion* super_version, Arena* arena, - RangeDelAggregator* range_del_agg) { +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence) { InternalIterator* internal_iter; assert(arena != nullptr); assert(range_del_agg != nullptr); @@ -861,16 +1303,16 @@ InternalIterator* DBImpl::NewInternalIterator( MergeIteratorBuilder merge_iter_builder( &cfd->internal_comparator(), arena, !read_options.total_order_seek && - cfd->ioptions()->prefix_extractor != nullptr); + super_version->mutable_cf_options.prefix_extractor != nullptr); // Collect iterator for mutable mem merge_iter_builder.AddIterator( super_version->mem->NewIterator(read_options, arena)); - std::unique_ptr range_del_iter; + std::unique_ptr range_del_iter; Status s; if (!read_options.ignore_range_deletions) { 
range_del_iter.reset( - super_version->mem->NewRangeTombstoneIterator(read_options)); - s = range_del_agg->AddTombstones(std::move(range_del_iter)); + super_version->mem->NewRangeTombstoneIterator(read_options, sequence)); + range_del_agg->AddTombstones(std::move(range_del_iter)); } // Collect all needed child iterators for immutable memtables if (s.ok()) { @@ -880,6 +1322,7 @@ InternalIterator* DBImpl::NewInternalIterator( range_del_agg); } } + TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s); if (s.ok()) { // Collect iterators for files in L0 - Ln if (read_options.read_tier != kMemtableTier) { @@ -889,12 +1332,15 @@ InternalIterator* DBImpl::NewInternalIterator( internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version, - read_options.background_purge_on_iterator_cleanup); + read_options.background_purge_on_iterator_cleanup || + immutable_db_options_.avoid_unnecessary_blocking_io); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); return internal_iter; + } else { + CleanupSuperVersion(super_version); } - return NewErrorInternalIterator(s); + return NewErrorInternalIterator(s, arena); } ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { @@ -910,14 +1356,24 @@ Status DBImpl::Get(const ReadOptions& read_options, Status DBImpl::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, bool* value_found, - bool* is_blob_index) { + ReadCallback* callback, bool* is_blob_index) { assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_GET); PERF_TIMER_GUARD(get_snapshot_time); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + // TODO: This mutex should be removed later, to improve performance when + // tracing is enabled. + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -926,8 +1382,19 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, SequenceNumber snapshot; if (read_options.snapshot != nullptr) { - snapshot = reinterpret_cast( - read_options.snapshot)->number_; + // Note: In WritePrepared txns this is not necessary but not harmful + // either. Because prep_seq > snapshot => commit_seq > snapshot so if + // a snapshot is specified we should be fine with skipping seq numbers + // that are greater than that. + // + // In WriteUnprepared, we cannot set snapshot in the lookup key because we + // may skip uncommitted data that should be visible to the transaction for + // reading own writes. + snapshot = + reinterpret_cast(read_options.snapshot)->number_; + if (callback) { + snapshot = std::max(snapshot, callback->max_visible_seq()); + } } else { // Since we get and reference the super version before getting // the snapshot number, without a mutex protection, it is possible @@ -935,18 +1402,20 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, // data for this snapshot is available. But it will contain all // the data available in the super version we have, which is also // a valid snapshot to read from. - // We shouldn't get snapshot before finding and referencing the - // super versipon because a flush happening in between may compact - // away data for the snapshot, but the snapshot is earlier than the - // data overwriting it, so users may see wrong results. 
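The ordering rule spelled out in the comment above (reference the super version first, only then read the last sequence) is what an explicit snapshot buys the caller for free. A sketch of pinning reads to a snapshot through the public API; the key name is invented:

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    void ReadAtSnapshot(rocksdb::DB* db) {
      const rocksdb::Snapshot* snap = db->GetSnapshot();

      rocksdb::ReadOptions ro;
      ro.snapshot = snap;  // pin all reads below to snap's sequence number

      std::string value;
      rocksdb::Status s = db->Get(ro, "key", &value);
      // Writes that landed after GetSnapshot() are not visible here,
      // regardless of flushes or compactions running concurrently.

      db->ReleaseSnapshot(snap);  // let compaction drop old versions again
      assert(s.ok() || s.IsNotFound());
    }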
- snapshot = versions_->LastSequence(); + // We shouldn't get snapshot before finding and referencing the super + // version because a flush happening in between may compact away data for + // the snapshot, but the snapshot is earlier than the data overwriting it, + // so users may see wrong results. + snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); } TEST_SYNC_POINT("DBImpl::GetImpl:3"); TEST_SYNC_POINT("DBImpl::GetImpl:4"); // Prepare to store a list of merge operations if merge occurs. MergeContext merge_context; - RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot); + SequenceNumber max_covering_tombstone_seq = 0; Status s; // First look in the memtable, then in the immutable memtable (if any). @@ -960,26 +1429,29 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, bool done = false; if (!skip_memtable) { if (sv->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options, is_blob_index)) { + &max_covering_tombstone_seq, read_options, callback, + is_blob_index)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && sv->imm->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options, is_blob_index)) { + &max_covering_tombstone_seq, read_options, callback, + is_blob_index)) { done = true; pinnable_val->PinSelf(); RecordTick(stats_, MEMTABLE_HIT); } if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, sv); return s; } } if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(read_options, lkey, pinnable_val, &s, &merge_context, - &range_del_agg, value_found, nullptr, nullptr, - is_blob_index); + &max_covering_tombstone_seq, value_found, nullptr, nullptr, + callback, is_blob_index); RecordTick(stats_, MEMTABLE_MISS); } @@ -989,10 +1461,13 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, ReturnAndCleanupSuperVersion(cfd, sv); RecordTick(stats_, NUMBER_KEYS_READ); - size_t size = pinnable_val->size(); - RecordTick(stats_, BYTES_READ, size); - MeasureTime(stats_, BYTES_PER_READ, size); - PERF_COUNTER_ADD(get_read_bytes, size); + size_t size = 0; + if (s.ok()) { + size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + RecordInHistogram(stats_, BYTES_PER_READ, size); } return s; } @@ -1001,7 +1476,7 @@ std::vector DBImpl::MultiGet( const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); StopWatch sw(env_, stats_, DB_MULTIGET); PERF_TIMER_GUARD(get_snapshot_time); @@ -1010,31 +1485,96 @@ std::vector DBImpl::MultiGet( struct MultiGetColumnFamilyData { ColumnFamilyData* cfd; SuperVersion* super_version; + MultiGetColumnFamilyData(ColumnFamilyData* cf, SuperVersion* sv) + : cfd(cf), super_version(sv) {} }; - std::unordered_map multiget_cf_data; - // fill up and allocate outside of mutex + std::unordered_map multiget_cf_data( + column_family.size()); for (auto cf : column_family) { auto cfh = reinterpret_cast(cf); auto cfd = cfh->cfd(); if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) { - auto mgcfd = new MultiGetColumnFamilyData(); - mgcfd->cfd = cfd; - multiget_cf_data.insert({cfd->GetID(), mgcfd}); + multiget_cf_data.emplace(cfd->GetID(), + MultiGetColumnFamilyData(cfd, nullptr)); } } - mutex_.Lock(); - if (read_options.snapshot != nullptr) 
{ - snapshot = reinterpret_cast<const SnapshotImpl*>( - read_options.snapshot)->number_; - } else { - snapshot = versions_->LastSequence(); - } - for (auto mgd_iter : multiget_cf_data) { - mgd_iter.second->super_version = - mgd_iter.second->cfd->GetSuperVersion()->Ref(); + bool last_try = false; + { + // If we end up with the same issue of memtable getting sealed during 2 + // consecutive retries, it means the write rate is very high. In that case + // it's probably ok to take the mutex on the 3rd try so we can succeed for + // sure + static const int num_retries = 3; + for (auto i = 0; i < num_retries; ++i) { + last_try = (i == num_retries - 1); + bool retry = false; + + if (i > 0) { + for (auto mgd_iter = multiget_cf_data.begin(); + mgd_iter != multiget_cf_data.end(); ++mgd_iter) { + auto super_version = mgd_iter->second.super_version; + auto cfd = mgd_iter->second.cfd; + if (super_version != nullptr) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } + mgd_iter->second.super_version = nullptr; + } + } + + if (read_options.snapshot == nullptr) { + if (last_try) { + TEST_SYNC_POINT("DBImpl::MultiGet::LastTry"); + // We're close to max number of retries. For the last retry, + // acquire the lock so we're sure to succeed + mutex_.Lock(); + } + snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + snapshot = reinterpret_cast<const SnapshotImpl*>(read_options.snapshot) + ->number_; + } + + for (auto mgd_iter = multiget_cf_data.begin(); + mgd_iter != multiget_cf_data.end(); ++mgd_iter) { + if (!last_try) { + mgd_iter->second.super_version = + GetAndRefSuperVersion(mgd_iter->second.cfd); + } else { + mgd_iter->second.super_version = + mgd_iter->second.cfd->GetSuperVersion()->Ref(); + } + TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); + if (read_options.snapshot != nullptr || last_try) { + // If user passed a snapshot, then we don't care if a memtable is + // sealed or compaction happens because the snapshot would ensure + // that older key versions are kept around. If this is the last + // retry, then we have the lock so nothing bad can happen + continue; + } + // We could get the earliest sequence number for the whole list of + // memtables, which will include immutable memtables as well, but that + // might be tricky to maintain in case we decide, in future, to do + // memtable compaction. + if (!last_try) { + auto seq = + mgd_iter->second.super_version->mem->GetEarliestSequenceNumber(); + if (seq > snapshot) { + retry = true; + break; + } + } + } + if (!retry) { + if (last_try) { + mutex_.Unlock(); + } + break; + } + } } - mutex_.Unlock(); // Contain a list of merge operations if merge occurs. MergeContext merge_context; @@ -1052,6 +1592,7 @@ std::vector<Status> DBImpl::MultiGet( // First look in the memtable, then in the immutable memtable (if any). // s is both in/out. When in, s could either be OK or MergeInProgress. // merge_operands will contain the sequence of merges in the latter case.
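All of the retry machinery above stays invisible to callers; MultiGet() keeps its simple batched interface and still reads every key at one consistent sequence number. A usage sketch with invented keys:

    #include <string>
    #include <vector>
    #include "rocksdb/db.h"

    void BatchedLookup(rocksdb::DB* db) {
      std::vector<rocksdb::Slice> keys = {"key1", "key2", "key3"};
      std::vector<std::string> values;

      // One status per key; a miss is reported as IsNotFound(), not an error.
      std::vector<rocksdb::Status> statuses =
          db->MultiGet(rocksdb::ReadOptions(), keys, &values);

      for (size_t i = 0; i < keys.size(); ++i) {
        if (statuses[i].ok()) {
          // values[i] holds the value read at the shared snapshot.
        }
      }
    }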
+ size_t num_found = 0; for (size_t i = 0; i < num_keys; ++i) { merge_context.Clear(); Status& s = stat_list[i]; @@ -1059,38 +1600,39 @@ std::vector DBImpl::MultiGet( LookupKey lkey(keys[i], snapshot); auto cfh = reinterpret_cast(column_family[i]); - RangeDelAggregator range_del_agg(cfh->cfd()->internal_comparator(), - snapshot); + SequenceNumber max_covering_tombstone_seq = 0; auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID()); assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; - auto super_version = mgd->super_version; + auto super_version = mgd.super_version; bool skip_memtable = (read_options.read_tier == kPersistedTier && has_unpersisted_data_.load(std::memory_order_relaxed)); bool done = false; if (!skip_memtable) { if (super_version->mem->Get(lkey, value, &s, &merge_context, - &range_del_agg, read_options)) { + &max_covering_tombstone_seq, read_options)) { done = true; - // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + RecordTick(stats_, MEMTABLE_HIT); } else if (super_version->imm->Get(lkey, value, &s, &merge_context, - &range_del_agg, read_options)) { + &max_covering_tombstone_seq, + read_options)) { done = true; - // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + RecordTick(stats_, MEMTABLE_HIT); } } if (!done) { PinnableSlice pinnable_val; PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, &pinnable_val, &s, - &merge_context, &range_del_agg); + &merge_context, &max_covering_tombstone_seq); value->assign(pinnable_val.data(), pinnable_val.size()); - // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? + RecordTick(stats_, MEMTABLE_MISS); } if (s.ok()) { bytes_read += value->size(); + num_found++; } } @@ -1098,28 +1640,19 @@ std::vector DBImpl::MultiGet( PERF_TIMER_GUARD(get_post_process_time); autovector superversions_to_delete; - // TODO(icanadi) do we need lock here or just around Cleanup()? 
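The per-key loop above preserves LSM lookup order: the active memtable first, then the immutable memtables, then the SST files in the current version, stopping at the first definitive answer (this is also what the MEMTABLE_HIT and MEMTABLE_MISS tickers now count). A toy model of that ordering with plain std::map stand-ins, not the real MemTable/Version types:

    #include <map>
    #include <optional>
    #include <string>
    #include <vector>

    // Toy LSM lookup: newer data shadows older data.
    struct ToyLsm {
      std::map<std::string, std::string> mem;               // active memtable
      std::vector<std::map<std::string, std::string>> imm;  // newest first
      std::map<std::string, std::string> sst;               // everything flushed

      std::optional<std::string> Get(const std::string& key) const {
        // A hit in mem or imm is a "memtable hit".
        if (auto it = mem.find(key); it != mem.end()) return it->second;
        for (const auto& m : imm) {
          if (auto it = m.find(key); it != m.end()) return it->second;
        }
        // Otherwise fall through to the flushed data ("memtable miss").
        if (auto it = sst.find(key); it != sst.end()) return it->second;
        return std::nullopt;
      }
    };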
- mutex_.Lock(); for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; - if (mgd->super_version->Unref()) { - mgd->super_version->Cleanup(); - superversions_to_delete.push_back(mgd->super_version); + if (!last_try) { + ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); + } else { + mgd.cfd->GetSuperVersion()->Unref(); } } - mutex_.Unlock(); - - for (auto td : superversions_to_delete) { - delete td; - } - for (auto mgd : multiget_cf_data) { - delete mgd.second; - } - RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); + RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); - MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); + RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read); PERF_COUNTER_ADD(multiget_read_bytes, bytes_read); PERF_TIMER_STOP(get_post_process_time); @@ -1205,10 +1738,22 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, if (s.ok() && immutable_db_options_.allow_concurrent_memtable_write) { s = CheckConcurrentWritesSupported(cf_options); } + if (s.ok()) { + s = CheckCFPathsSupported(initial_db_options_, cf_options); + } + if (s.ok()) { + for (auto& cf_path : cf_options.cf_paths) { + s = env_->CreateDirIfMissing(cf_path.path); + if (!s.ok()) { + break; + } + } + } if (!s.ok()) { return s; } + SuperVersionContext sv_context(/* create_superversion */ true); { InstrumentedMutexLock l(&mutex_); @@ -1235,13 +1780,19 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, &cf_options); write_thread_.ExitUnbatched(&w); } + if (s.ok()) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + s = cfd->AddDirectories(); + } if (s.ok()) { single_column_family_mode_ = false; auto* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); assert(cfd != nullptr); - delete InstallSuperVersionAndScheduleWork( - cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, + *cfd->GetLatestMutableCFOptions()); if (!cfd->mem()->IsSnapshotSupported()) { is_snapshot_supported_ = false; @@ -1260,6 +1811,7 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, } } // InstrumentedMutexLock l(&mutex_) + sv_context.Clean(); // this is outside the mutex if (s.ok()) { NewThreadStatusCfInfo( @@ -1322,8 +1874,8 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { // we drop column family from a single write thread WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); - s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -1344,6 +1896,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { } is_snapshot_supported_ = new_is_snapshot_supported; } + bg_cv_.SignalAll(); } if (s.ok()) { @@ -1372,7 +1925,7 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, *value_found = true; } ReadOptions roptions = read_options; - roptions.read_tier = kBlockCacheTier; // read from block cache only + roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; auto s = GetImpl(roptions, column_family, key, &pinnable_val, value_found); value->assign(pinnable_val.data(), pinnable_val.size()); @@ -1385,55 +1938,59 @@ bool 
DBImpl::KeyMayExist(const ReadOptions& read_options, Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + Iterator* result = nullptr; if (read_options.read_tier == kPersistedTier) { return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } + // if iterator wants internal keys, we can only proceed if + // we can guarantee the deletes haven't been processed yet + if (immutable_db_options_.preserve_deletes && + read_options.iter_start_seqnum > 0 && + read_options.iter_start_seqnum < preserve_deletes_seqnum_.load()) { + return NewErrorIterator(Status::InvalidArgument( + "Iterator requested internal keys which are too old and are not" + " guaranteed to be preserved, try larger iter_start_seqnum opt.")); + } auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - if (read_options.managed) { -#ifdef ROCKSDB_LITE - // not supported in lite version - return NewErrorIterator(Status::InvalidArgument( - "Managed Iterators not supported in RocksDBLite.")); -#else - if ((read_options.tailing) || (read_options.snapshot != nullptr) || - (is_snapshot_supported_)) { - return new ManagedIterator(this, read_options, cfd); - } - // Managed iter not supported - return NewErrorIterator(Status::InvalidArgument( - "Managed Iterators not supported without snapshots.")); -#endif - } else if (read_options.tailing) { + ReadCallback* read_callback = nullptr; // No read callback provided. + if (read_options.tailing) { #ifdef ROCKSDB_LITE // not supported in lite version - return nullptr; + result = nullptr; + #else SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto iter = new ForwardIterator(this, read_options, cfd, sv); - return NewDBIterator( - env_, read_options, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations); + result = NewDBIterator( + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, + this, cfd); #endif } else { - SequenceNumber latest_snapshot = versions_->LastSequence(); - auto snapshot = - read_options.snapshot != nullptr - ? reinterpret_cast(read_options.snapshot) - ->number_ - : latest_snapshot; - return NewIteratorImpl(read_options, cfd, snapshot); + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterator is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? 
read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); } - // To stop compiler from complaining - return nullptr; + return result; } ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, ColumnFamilyData* cfd, SequenceNumber snapshot, - bool allow_blob) { + ReadCallback* read_callback, + bool allow_blob, + bool allow_refresh) { SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); // Try to generate a DB iterator tree in continuous memory area to be @@ -1479,14 +2036,14 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), snapshot, + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number, ((read_options.snapshot != nullptr) ? nullptr : this), - cfd, allow_blob); + sv->version_number, read_callback, this, cfd, allow_blob, + ((read_options.snapshot != nullptr) ? false : allow_refresh)); InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); + db_iter->GetRangeDelAggregator(), snapshot); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; @@ -1496,55 +2053,44 @@ Status DBImpl::NewIterators( const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } if (read_options.read_tier == kPersistedTier) { return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); } + ReadCallback* read_callback = nullptr; // No read callback provided. iterators->clear(); iterators->reserve(column_families.size()); - if (read_options.managed) { -#ifdef ROCKSDB_LITE - return Status::InvalidArgument( - "Managed interator not supported in RocksDB lite"); -#else - if ((!read_options.tailing) && (read_options.snapshot == nullptr) && - (!is_snapshot_supported_)) { - return Status::InvalidArgument( - "Managed interator not supported without snapshots"); - } - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ManagedIterator(this, read_options, cfd); - iterators->push_back(iter); - } -#endif - } else if (read_options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( - "Tailing interator not supported in RocksDB lite"); + "Tailing iterator not supported in RocksDB lite"); #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto iter = new ForwardIterator(this, read_options, cfd, sv); iterators->push_back(NewDBIterator( - env_, read_options, *cfd->ioptions(), cfd->user_comparator(), iter, - kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations)); + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, + cfd->user_comparator(), iter, kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_callback, this, cfd)); } #endif } else { - SequenceNumber latest_snapshot = versions_->LastSequence(); - auto snapshot = - read_options.snapshot != nullptr - ? 
reinterpret_cast(read_options.snapshot) - ->number_ - : latest_snapshot; - + // Note: no need to consider the special case of + // last_seq_same_as_publish_seq_==false since NewIterators is overridden in + // WritePreparedTxnDB + auto snapshot = read_options.snapshot != nullptr + ? read_options.snapshot->GetSequenceNumber() + : versions_->LastSequence(); for (size_t i = 0; i < column_families.size(); ++i) { - auto* cfd = reinterpret_cast( - column_families[i])->cfd(); - iterators->push_back(NewIteratorImpl(read_options, cfd, snapshot)); + auto* cfd = + reinterpret_cast(column_families[i])->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, snapshot, read_callback)); } } @@ -1559,36 +2105,93 @@ const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { } #endif // ROCKSDB_LITE -const Snapshot* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary) { +SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock) { int64_t unix_time = 0; env_->GetCurrentTime(&unix_time); // Ignore error SnapshotImpl* s = new SnapshotImpl; - InstrumentedMutexLock l(&mutex_); + if (lock) { + mutex_.Lock(); + } // returns null if the underlying memtable does not support snapshot. if (!is_snapshot_supported_) { + if (lock) { + mutex_.Unlock(); + } delete s; return nullptr; } - return snapshots_.New(s, versions_->LastSequence(), unix_time, - is_write_conflict_boundary); + auto snapshot_seq = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + SnapshotImpl* snapshot = + snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary); + if (lock) { + mutex_.Unlock(); + } + return snapshot; +} + +namespace { +typedef autovector CfdList; +bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { + for (const ColumnFamilyData* t : list) { + if (t == cfd) { + return true; + } + } + return false; } +} // namespace void DBImpl::ReleaseSnapshot(const Snapshot* s) { const SnapshotImpl* casted_s = reinterpret_cast(s); { InstrumentedMutexLock l(&mutex_); snapshots_.Delete(casted_s); + uint64_t oldest_snapshot; + if (snapshots_.empty()) { + oldest_snapshot = last_seq_same_as_publish_seq_ + ? versions_->LastSequence() + : versions_->LastPublishedSequence(); + } else { + oldest_snapshot = snapshots_.oldest()->number_; + } + // Avoid to go through every column family by checking a global threshold + // first. + if (oldest_snapshot > bottommost_files_mark_threshold_) { + CfdList cf_scheduled; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); + if (!cfd->current() + ->storage_info() + ->BottommostFilesMarkedForCompaction() + .empty()) { + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + cf_scheduled.push_back(cfd); + } + } + + // Calculate a new threshold, skipping those CFs where compactions are + // scheduled. We do not do the same pass as the previous loop because + // mutex might be unlocked during the loop, making the result inaccurate. 
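For orientation, a minimal caller-side sketch of the public snapshot API whose release path this hunk reworks; the helper name, key, and assertion are illustrative, not part of the patch.

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"

// Hypothetical helper: pin a snapshot, read through it, then release it.
void SnapshotLifecycle(rocksdb::DB* db) {
  const rocksdb::Snapshot* snap = db->GetSnapshot();  // pins a sequence number
  rocksdb::ReadOptions ro;
  ro.snapshot = snap;  // reads observe the DB as of snap->GetSequenceNumber()
  std::string value;
  rocksdb::Status s = db->Get(ro, "some-key", &value);
  assert(s.ok() || s.IsNotFound());
  // Releasing the oldest snapshot raises the oldest-snapshot bound; the hunk
  // above then checks whether bottommost files became eligible for compaction.
  db->ReleaseSnapshot(snap);
}
```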
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (CfdListContains(cf_scheduled, cfd)) { + continue; + } + new_bottommost_files_mark_threshold = std::min( + new_bottommost_files_mark_threshold, + cfd->current()->storage_info()->bottommost_files_mark_threshold()); + } + bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; + } } delete casted_s; } -bool DBImpl::HasActiveSnapshotInRange(SequenceNumber lower_bound, - SequenceNumber upper_bound) { - InstrumentedMutexLock l(&mutex_); - return snapshots_.HasSnapshotInRange(lower_bound, upper_bound); -} - #ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { @@ -1635,13 +2238,9 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, #endif // ROCKSDB_LITE -const std::string& DBImpl::GetName() const { - return dbname_; -} +const std::string& DBImpl::GetName() const { return dbname_; } -Env* DBImpl::GetEnv() const { - return env_; -} +Env* DBImpl::GetEnv() const { return env_; } Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { InstrumentedMutexLock l(&mutex_); @@ -1674,6 +2273,13 @@ bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, InstrumentedMutexLock l(&mutex_); return cfd->internal_stats()->GetStringProperty(*property_info, property, value); + } else if (property_info->handle_string_dbimpl) { + std::string tmp_value; + bool ret_value = (this->*(property_info->handle_string_dbimpl))(&tmp_value); + if (ret_value) { + *value = tmp_value; + } + return ret_value; } // Shouldn't reach here since exactly one of handle_string and handle_int // should be non-nullptr. @@ -1683,7 +2289,7 @@ bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, - std::map* value) { + std::map* value) { const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); auto cfd = reinterpret_cast(column_family)->cfd(); @@ -1740,6 +2346,16 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, } } +bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { + assert(value != nullptr); + Statistics* statistics = immutable_db_options_.statistics.get(); + if (!statistics) { + return false; + } + *value = statistics->ToString(); + return true; +} + #ifndef ROCKSDB_LITE Status DBImpl::ResetStats() { InstrumentedMutexLock l(&mutex_); @@ -1796,21 +2412,23 @@ SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) { return GetAndRefSuperVersion(cfd); } +void DBImpl::CleanupSuperVersion(SuperVersion* sv) { + // Release SuperVersion + if (sv->Unref()) { + { + InstrumentedMutexLock l(&mutex_); + sv->Cleanup(); + } + delete sv; + RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); + } + RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); +} + void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { - bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); - - if (unref_sv) { - // Release SuperVersion - if (sv->Unref()) { - { - InstrumentedMutexLock l(&mutex_); - sv->Cleanup(); - } - delete sv; - RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS); - } - RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES); + if (!cfd->ReturnThreadLocalSuperVersion(sv)) { + CleanupSuperVersion(sv); } } @@ -1839,17 +2457,18 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { } // REQUIRED: 
mutex is NOT held. -ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( +std::unique_ptr DBImpl::GetColumnFamilyHandleUnlocked( uint32_t column_family_id) { - ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); - InstrumentedMutexLock l(&mutex_); - if (!cf_memtables->Seek(column_family_id)) { + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id); + if (cfd == nullptr) { return nullptr; } - return cf_memtables->GetColumnFamilyHandle(); + return std::unique_ptr( + new ColumnFamilyHandleImpl(cfd, this, &mutex_)); } void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, @@ -1920,9 +2539,8 @@ void DBImpl::ReleaseFileNumberFromPendingOutputs( #ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( - SequenceNumber seq, unique_ptr* iter, + SequenceNumber seq, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) { - RecordTick(stats_, GET_UPDATES_SINCE_CALLS); if (seq > versions_->LastSequence()) { return Status::NotFound("Requested sequence not yet written in the db"); @@ -1950,8 +2568,7 @@ Status DBImpl::DeleteFile(std::string name) { name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = - env_->DeleteFile(immutable_db_options_.wal_dir + "/" + name.c_str()); + status = wal_manager_.DeleteFile(name, number); if (!status.ok()) { ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed -- %s.\n", name.c_str(), @@ -2013,8 +2630,9 @@ Status DBImpl::DeleteFile(std::string name) { status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - cfd, &job_context, *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); } FindObsoleteFiles(&job_context, false); } // lock released here @@ -2029,56 +2647,62 @@ Status DBImpl::DeleteFile(std::string name) { return status; } -Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { +Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end) { Status status; auto cfh = reinterpret_cast(column_family); ColumnFamilyData* cfd = cfh->cfd(); VersionEdit edit; - std::vector deleted_files; + std::set deleted_files; JobContext job_context(next_job_id_.fetch_add(1), true); { InstrumentedMutexLock l(&mutex_); Version* input_version = cfd->current(); auto* vstorage = input_version->storage_info(); - for (int i = 1; i < cfd->NumberLevels(); i++) { - if (vstorage->LevelFiles(i).empty() || - !vstorage->OverlapInLevel(i, begin, end)) { - continue; - } - std::vector level_files; - InternalKey begin_storage, end_storage, *begin_key, *end_key; - if (begin == nullptr) { - begin_key = nullptr; - } else { - begin_storage.SetMaxPossibleForUserKey(*begin); - begin_key = &begin_storage; - } - if (end == nullptr) { - end_key = nullptr; - } else { - end_storage.SetMinPossibleForUserKey(*end); - end_key = &end_storage; - } + for (size_t r = 0; r < n; r++) { + auto begin = ranges[r].start, end = ranges[r].limit; + for (int i = 1; i < cfd->NumberLevels(); i++) { + if (vstorage->LevelFiles(i).empty() || + !vstorage->OverlapInLevel(i, begin, end)) { + continue; + } + std::vector level_files; + InternalKey begin_storage, end_storage, *begin_key, *end_key; + if (begin == nullptr) { + begin_key = nullptr; + 
} else { + begin_storage.SetMinPossibleForUserKey(*begin); + begin_key = &begin_storage; + } + if (end == nullptr) { + end_key = nullptr; + } else { + end_storage.SetMaxPossibleForUserKey(*end); + end_key = &end_storage; + } - vstorage->GetOverlappingInputs(i, begin_key, end_key, &level_files, -1, - nullptr, false); - FileMetaData* level_file; - for (uint32_t j = 0; j < level_files.size(); j++) { - level_file = level_files[j]; - if (((begin == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->smallest.user_key(), *begin) >= 0)) && - ((end == nullptr) || - (cfd->internal_comparator().user_comparator()->Compare( - level_file->largest.user_key(), *end) <= 0))) { + vstorage->GetCleanInputsWithinInterval( + i, begin_key, end_key, &level_files, -1 /* hint_index */, + nullptr /* file_index */); + FileMetaData* level_file; + for (uint32_t j = 0; j < level_files.size(); j++) { + level_file = level_files[j]; if (level_file->being_compacted) { continue; } + if (deleted_files.find(level_file) != deleted_files.end()) { + continue; + } + if (!include_end && end != nullptr && + cfd->user_comparator()->Compare(level_file->largest.user_key(), + *end) == 0) { + continue; + } edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(i, level_file->fd.GetNumber()); - deleted_files.push_back(level_file); + deleted_files.insert(level_file); level_file->being_compacted = true; } } @@ -2091,8 +2715,9 @@ Status DBImpl::DeleteFilesInRange(ColumnFamilyHandle* column_family, status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - cfd, &job_context, *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); } for (auto* deleted_file : deleted_files) { deleted_file->being_compacted = false; @@ -2116,9 +2741,8 @@ void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { versions_->GetLiveFilesMetaData(metadata); } -void DBImpl::GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* cf_meta) { +void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { assert(column_family); auto* cfd = reinterpret_cast(column_family)->cfd(); auto* sv = GetAndRefSuperVersion(cfd); @@ -2164,15 +2788,16 @@ Status DBImpl::CheckConsistency() { Status DBImpl::GetDbIdentity(std::string& identity) const { std::string idfilename = IdentityFileName(dbname_); const EnvOptions soptions; - unique_ptr id_file_reader; + std::unique_ptr id_file_reader; Status s; { - unique_ptr idfile; + std::unique_ptr idfile; s = env_->NewSequentialFile(idfilename, &idfile, soptions); if (!s.ok()) { return s; } - id_file_reader.reset(new SequentialFileReader(std::move(idfile))); + id_file_reader.reset( + new SequentialFileReader(std::move(idfile), idfilename)); } uint64_t file_size; @@ -2180,7 +2805,8 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { if (!s.ok()) { return s; } - char* buffer = reinterpret_cast(alloca(file_size)); + char* buffer = + reinterpret_cast(alloca(static_cast(file_size))); Slice id; s = id_file_reader->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { @@ -2195,31 +2821,31 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { } // Default implementation -- returns not supported status -Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, - const std::string& 
column_family_name, - ColumnFamilyHandle** handle) { +Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/, + const std::string& /*column_family_name*/, + ColumnFamilyHandle** /*handle*/) { return Status::NotSupported(""); } Status DB::CreateColumnFamilies( - const ColumnFamilyOptions& cf_options, - const std::vector& column_family_names, - std::vector* handles) { + const ColumnFamilyOptions& /*cf_options*/, + const std::vector& /*column_family_names*/, + std::vector* /*handles*/) { return Status::NotSupported(""); } Status DB::CreateColumnFamilies( - const std::vector& column_families, - std::vector* handles) { + const std::vector& /*column_families*/, + std::vector* /*handles*/) { return Status::NotSupported(""); } -Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) { +Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) { return Status::NotSupported(""); } Status DB::DropColumnFamilies( - const std::vector& column_families) { + const std::vector& /*column_families*/) { return Status::NotSupported(""); } @@ -2228,7 +2854,15 @@ Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) { return Status::OK(); } -DB::~DB() { } +DB::~DB() {} + +Status DBImpl::Close() { + if (!closed_) { + closed_ = true; + return CloseImpl(); + } + return Status::OK(); +} Status DB::ListColumnFamilies(const DBOptions& db_options, const std::string& name, @@ -2236,14 +2870,17 @@ Status DB::ListColumnFamilies(const DBOptions& db_options, return VersionSet::ListColumnFamilies(column_families, name, db_options.env); } -Snapshot::~Snapshot() { -} +Snapshot::~Snapshot() {} -Status DestroyDB(const std::string& dbname, const Options& options) { - const ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); +Status DestroyDB(const std::string& dbname, const Options& options, + const std::vector& column_families) { + ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); Env* env = soptions.env; std::vector filenames; + // Reset the logger because it holds a handle to the + // log file and prevents cleanup and directory removal + soptions.info_log.reset(); // Ignore error in case directory does not exist env->GetChildren(dbname, &filenames); @@ -2254,15 +2891,15 @@ Status DestroyDB(const std::string& dbname, const Options& options) { uint64_t number; FileType type; InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname); - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) && + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) && type != kDBLockFile) { // Lock file will be deleted at end Status del; - std::string path_to_delete = dbname + "/" + filenames[i]; + std::string path_to_delete = dbname + "/" + fname; if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); - } else if (type == kTableFile) { - del = DeleteSSTFile(&soptions, path_to_delete, 0); + } else if (type == kTableFile || type == kLogFile) { + del = DeleteDBFile(&soptions, path_to_delete, dbname); } else { del = env->DeleteFile(path_to_delete); } @@ -2272,59 +2909,84 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } } - for (size_t path_id = 0; path_id < options.db_paths.size(); path_id++) { - const auto& db_path = options.db_paths[path_id]; - env->GetChildren(db_path.path, &filenames); - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == 
kTableFile) { // Lock file will be deleted at end - std::string table_path = db_path.path + "/" + filenames[i]; - Status del = DeleteSSTFile(&soptions, table_path, - static_cast(path_id)); - if (result.ok() && !del.ok()) { - result = del; + std::vector paths; + + for (const auto& path : options.db_paths) { + paths.emplace_back(path.path); + } + for (const auto& cf : column_families) { + for (const auto& path : cf.options.cf_paths) { + paths.emplace_back(path.path); + } + } + + // Remove duplicate paths. + // Note that we compare only the actual paths but not path ids. + // This reason is that same path can appear at different path_ids + // for different column families. + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + + for (const auto& path : paths) { + if (env->GetChildren(path, &filenames).ok()) { + for (const auto& fname : filenames) { + if (ParseFileName(fname, &number, &type) && + type == kTableFile) { // Lock file will be deleted at end + std::string table_path = path + "/" + fname; + Status del = DeleteDBFile(&soptions, table_path, dbname); + if (result.ok() && !del.ok()) { + result = del; + } } } + env->DeleteDir(path); } } std::vector walDirFiles; std::string archivedir = ArchivalDirectory(dbname); + bool wal_dir_exists = false; if (dbname != soptions.wal_dir) { - env->GetChildren(soptions.wal_dir, &walDirFiles); + wal_dir_exists = env->GetChildren(soptions.wal_dir, &walDirFiles).ok(); archivedir = ArchivalDirectory(soptions.wal_dir); } - // Delete log files in the WAL dir - for (const auto& file : walDirFiles) { - if (ParseFileName(file, &number, &type) && type == kLogFile) { - Status del = env->DeleteFile(LogFileName(soptions.wal_dir, number)); - if (result.ok() && !del.ok()) { - result = del; + // Archive dir may be inside wal dir or dbname and should be + // processed and removed before those otherwise we have issues + // removing them + std::vector archiveFiles; + if (env->GetChildren(archivedir, &archiveFiles).ok()) { + // Delete archival files. + for (const auto& file : archiveFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = + DeleteDBFile(&soptions, archivedir + "/" + file, archivedir); + if (result.ok() && !del.ok()) { + result = del; + } } } + env->DeleteDir(archivedir); } - std::vector archiveFiles; - env->GetChildren(archivedir, &archiveFiles); - // Delete archival files. 
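The path list that DestroyDB builds above is collapsed with the standard sort-then-unique idiom; a standalone sketch (the function name is illustrative):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Duplicates are compared by path text only; path ids are deliberately
// ignored, since the same path may appear under different ids across CFs.
void DedupPaths(std::vector<std::string>* paths) {
  std::sort(paths->begin(), paths->end());
  paths->erase(std::unique(paths->begin(), paths->end()), paths->end());
}
```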
- for (size_t i = 0; i < archiveFiles.size(); ++i) { - if (ParseFileName(archiveFiles[i], &number, &type) && - type == kLogFile) { - Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]); - if (result.ok() && !del.ok()) { - result = del; + // Delete log files in the WAL dir + if (wal_dir_exists) { + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = + DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number), + soptions.wal_dir); + if (result.ok() && !del.ok()) { + result = del; + } } } + env->DeleteDir(soptions.wal_dir); } - // ignore case where no archival directory is present - env->DeleteDir(archivedir); - env->UnlockFile(lock); // Ignore error since state is already gone env->DeleteFile(lockname); env->DeleteDir(dbname); // Ignore error in case dir contains other files - env->DeleteDir(soptions.wal_dir); } return result; } @@ -2386,6 +3048,9 @@ Status DBImpl::WriteOptionsFile(bool need_mutex_lock, s.ToString().c_str()); } } +#else + (void)need_mutex_lock; + (void)need_enter_write_thread; #endif // !ROCKSDB_LITE return Status::OK(); } @@ -2445,31 +3110,36 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { #ifndef ROCKSDB_LITE Status s; - versions_->options_file_number_ = versions_->NewFileNumber(); + uint64_t options_file_number = versions_->NewFileNumber(); std::string options_file_name = - OptionsFileName(GetName(), versions_->options_file_number_); + OptionsFileName(GetName(), options_file_number); // Retry if the file name happen to conflict with an existing one. s = GetEnv()->RenameFile(file_name, options_file_name); + if (s.ok()) { + InstrumentedMutexLock l(&mutex_); + versions_->options_file_number_ = options_file_number; + } - DeleteObsoleteOptionsFiles(); + if (0 == disable_delete_obsolete_files_) { + DeleteObsoleteOptionsFiles(); + } return s; #else + (void)file_name; return Status::OK(); #endif // !ROCKSDB_LITE } #ifdef ROCKSDB_USING_THREAD_STATUS -void DBImpl::NewThreadStatusCfInfo( - ColumnFamilyData* cfd) const { +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const { if (immutable_db_options_.enable_thread_tracking) { ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), cfd->ioptions()->env); } } -void DBImpl::EraseThreadStatusCfInfo( - ColumnFamilyData* cfd) const { +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const { if (immutable_db_options_.enable_thread_tracking) { ThreadStatusUtil::EraseColumnFamilyInfo(cfd); } @@ -2482,21 +3152,16 @@ void DBImpl::EraseThreadStatusDbInfo() const { } #else -void DBImpl::NewThreadStatusCfInfo( - ColumnFamilyData* cfd) const { -} +void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} -void DBImpl::EraseThreadStatusCfInfo( - ColumnFamilyData* cfd) const { -} +void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {} -void DBImpl::EraseThreadStatusDbInfo() const { -} +void DBImpl::EraseThreadStatusDbInfo() const {} #endif // ROCKSDB_USING_THREAD_STATUS // // A global method that can dump out the build version -void DumpRocksDBBuildVersion(Logger * log) { +void DumpRocksDBBuildVersion(Logger* log) { #if !defined(IOS_CROSS_COMPILE) // if we compile with Xcode, we don't run build_detect_version, so we don't // generate util/build_version.cc @@ -2504,6 +3169,8 @@ void DumpRocksDBBuildVersion(Logger * log) { ROCKSDB_MINOR, ROCKSDB_PATCH); ROCKS_LOG_HEADER(log, "Git sha %s", rocksdb_build_git_sha); ROCKS_LOG_HEADER(log, "Compile date %s", 
rocksdb_build_compile_date); +#else + (void)log; // ignore "-Wunused-parameter" #endif } @@ -2530,8 +3197,7 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, bool* is_blob_index) { Status s; MergeContext merge_context; - RangeDelAggregator range_del_agg(sv->mem->GetInternalKeyComparator(), - kMaxSequenceNumber); + SequenceNumber max_covering_tombstone_seq = 0; ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -2541,8 +3207,8 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, *found_record_for_key = false; // Check if there is a record for this key in the latest memtable - sv->mem->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq, - read_options, is_blob_index); + sv->mem->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, + seq, read_options, nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -2560,8 +3226,8 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, } // Check if there is a record for this key in the immutable memtables - sv->imm->Get(lkey, nullptr, &s, &merge_context, &range_del_agg, seq, - read_options, is_blob_index); + sv->imm->Get(lkey, nullptr, &s, &merge_context, &max_covering_tombstone_seq, + seq, read_options, nullptr /*read_callback*/, is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -2579,8 +3245,9 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, } // Check if there is a record for this key in the immutable memtables - sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, &range_del_agg, - seq, read_options, is_blob_index); + sv->imm->GetFromHistory(lkey, nullptr, &s, &merge_context, + &max_covering_tombstone_seq, seq, read_options, + is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading memtable. @@ -2603,63 +3270,143 @@ Status DBImpl::GetLatestSequenceForKey(SuperVersion* sv, const Slice& key, if (!cache_only) { // Check tables sv->current->Get(read_options, lkey, nullptr, &s, &merge_context, - &range_del_agg, nullptr /* value_found */, - found_record_for_key, seq, is_blob_index); + &max_covering_tombstone_seq, nullptr /* value_found */, + found_record_for_key, seq, nullptr /*read_callback*/, + is_blob_index); if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) { // unexpected error reading SST files ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Unexpected status returned from Version::Get: %s\n", s.ToString().c_str()); - - return s; } } - return Status::OK(); + return s; } Status DBImpl::IngestExternalFile( ColumnFamilyHandle* column_family, const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) { - Status status; - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); + IngestExternalFileArg arg; + arg.column_family = column_family; + arg.external_files = external_files; + arg.options = ingestion_options; + return IngestExternalFiles({arg}); +} - // Ingest should immediately fail if ingest_behind is requested, - // but the DB doesn't support it. 
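GetLatestSequenceForKey above probes sources from newest to oldest and stops at the first hit. A toy model of that ordering with stand-in probes (everything here is illustrative; in DBImpl the probes are sv->mem->Get, sv->imm->Get, sv->imm->GetFromHistory, and sv->current->Get):

```cpp
#include <cstdint>
#include <utility>

using Probe = std::pair<bool, uint64_t>;  // (found, sequence number)

// Stand-ins for the real probes; the bodies are placeholders.
static Probe CheckMem() { return {false, 0}; }      // active memtable
static Probe CheckImm() { return {false, 0}; }      // immutable memtables
static Probe CheckHistory() { return {false, 0}; }  // flushed-but-retained
static Probe CheckTables() { return {true, 42}; }   // SST files

Probe LatestSeqForKey(bool cache_only) {
  // Newer sources win, so the chain stops at the first hit.
  for (Probe (*probe)() : {CheckMem, CheckImm, CheckHistory}) {
    Probe p = probe();
    if (p.first) {
      return p;
    }
  }
  // SST files are consulted only when cache_only == false.
  return cache_only ? Probe{false, 0} : CheckTables();
}
```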
- if (ingestion_options.ingest_behind) { - if (!immutable_db_options_.allow_ingest_behind) { +Status DBImpl::IngestExternalFiles( + const std::vector& args) { + if (args.empty()) { + return Status::InvalidArgument("ingestion arg list is empty"); + } + { + std::unordered_set unique_cfhs; + for (const auto& arg : args) { + if (arg.column_family == nullptr) { + return Status::InvalidArgument("column family handle is null"); + } else if (unique_cfhs.count(arg.column_family) > 0) { + return Status::InvalidArgument( + "ingestion args have duplicate column families"); + } + unique_cfhs.insert(arg.column_family); + } + } + // Ingest multiple external SST files atomically. + size_t num_cfs = args.size(); + for (size_t i = 0; i != num_cfs; ++i) { + if (args[i].external_files.empty()) { + char err_msg[128] = {0}; + snprintf(err_msg, 128, "external_files[%zu] is empty", i); + return Status::InvalidArgument(err_msg); + } + } + for (const auto& arg : args) { + const IngestExternalFileOptions& ingest_opts = arg.options; + if (ingest_opts.ingest_behind && + !immutable_db_options_.allow_ingest_behind) { return Status::InvalidArgument( - "Can't ingest_behind file in DB with allow_ingest_behind=false"); + "can't ingest_behind file in DB with allow_ingest_behind=false"); } } - ExternalSstFileIngestionJob ingestion_job(env_, versions_.get(), cfd, - immutable_db_options_, env_options_, - &snapshots_, ingestion_options); - + // TODO (yanqin) maybe handle the case in which column_families have + // duplicates std::list::iterator pending_output_elem; - { + size_t total = 0; + for (const auto& arg : args) { + total += arg.external_files.size(); + } + uint64_t next_file_number = 0; + Status status = ReserveFileNumbersBeforeIngestion( + static_cast(args[0].column_family)->cfd(), total, + &pending_output_elem, &next_file_number); + if (!status.ok()) { InstrumentedMutexLock l(&mutex_); - if (!bg_error_.ok()) { - // Don't ingest files when there is a bg_error - return bg_error_; - } - - // Make sure that bg cleanup wont delete the files that we are ingesting - pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); + return status; } - status = ingestion_job.Prepare(external_files); + std::vector ingestion_jobs; + for (const auto& arg : args) { + auto* cfd = static_cast(arg.column_family)->cfd(); + ingestion_jobs.emplace_back(env_, versions_.get(), cfd, + immutable_db_options_, env_options_, + &snapshots_, arg.options); + } + std::vector> exec_results; + for (size_t i = 0; i != num_cfs; ++i) { + exec_results.emplace_back(false, Status::OK()); + } + // TODO(yanqin) maybe make jobs run in parallel + for (size_t i = 1; i != num_cfs; ++i) { + uint64_t start_file_number = + next_file_number + args[i - 1].external_files.size(); + auto* cfd = + static_cast(args[i].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + exec_results[i].second = ingestion_jobs[i].Prepare( + args[i].external_files, start_file_number, super_version); + exec_results[i].first = true; + CleanupSuperVersion(super_version); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0"); + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"); + { + auto* cfd = + static_cast(args[0].column_family)->cfd(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + exec_results[0].second = ingestion_jobs[0].Prepare( + args[0].external_files, next_file_number, super_version); + exec_results[0].first = 
true; + CleanupSuperVersion(super_version); + } + for (const auto& exec_result : exec_results) { + if (!exec_result.second.ok()) { + status = exec_result.second; + break; + } + } if (!status.ok()) { + for (size_t i = 0; i != num_cfs; ++i) { + if (exec_results[i].first) { + ingestion_jobs[i].Cleanup(status); + } + } + InstrumentedMutexLock l(&mutex_); + ReleaseFileNumberFromPendingOutputs(pending_output_elem); return status; } + std::vector sv_ctxs; + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs.emplace_back(true /* create_superversion */); + } + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0"); + TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1"); TEST_SYNC_POINT("DBImpl::AddFile:Start"); { - // Lock db mutex InstrumentedMutexLock l(&mutex_); TEST_SYNC_POINT("DBImpl::AddFile:MutexLock"); @@ -2667,84 +3414,180 @@ Status DBImpl::IngestExternalFile( WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); WriteThread::Writer nonmem_w; - if (concurrent_prepare_) { + if (two_write_queues_) { nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); } - num_running_ingest_file_++; + num_running_ingest_file_ += static_cast(num_cfs); + TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter"); - // We cannot ingest a file into a dropped CF - if (cfd->IsDropped()) { - status = Status::InvalidArgument( - "Cannot ingest an external file into a dropped CF"); + bool at_least_one_cf_need_flush = false; + std::vector need_flush(num_cfs, false); + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + // TODO (yanqin) investigate whether we should abort ingestion or + // proceed with other non-dropped column families. + status = Status::InvalidArgument( + "cannot ingest an external file into a dropped CF"); + break; + } + bool tmp = false; + status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion()); + need_flush[i] = tmp; + at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp); + if (!status.ok()) { + break; + } } + TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush", + &at_least_one_cf_need_flush); - // Figure out if we need to flush the memtable first - if (status.ok()) { - bool need_flush = false; - status = ingestion_job.NeedsFlush(&need_flush); - TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush", - &need_flush); - if (status.ok() && need_flush) { + if (status.ok() && at_least_one_cf_need_flush) { + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + if (immutable_db_options_.atomic_flush) { + autovector cfds_to_flush; + SelectColumnFamiliesForAtomicFlush(&cfds_to_flush); mutex_.Unlock(); - status = FlushMemTable(cfd, FlushOptions(), true /* writes_stopped */); + status = AtomicFlushMemTables(cfds_to_flush, flush_opts, + FlushReason::kExternalFileIngestion, + true /* writes_stopped */); mutex_.Lock(); + } else { + for (size_t i = 0; i != num_cfs; ++i) { + if (need_flush[i]) { + mutex_.Unlock(); + auto* cfd = + static_cast(args[i].column_family) + ->cfd(); + status = FlushMemTable(cfd, flush_opts, + FlushReason::kExternalFileIngestion, + true /* writes_stopped */); + mutex_.Lock(); + if (!status.ok()) { + break; + } + } + } } } - - // Run the ingestion job + // Run ingestion jobs. 
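A minimal caller-side sketch of single-file ingestion through the public API (the file path is illustrative). When the memtable overlaps the file's key range, the code above forces a flush first:

```cpp
#include "rocksdb/db.h"

rocksdb::Status IngestOne(rocksdb::DB* db) {
  rocksdb::IngestExternalFileOptions opts;
  opts.move_files = true;  // hard-link instead of copying when possible
  return db->IngestExternalFile({"/tmp/batch1.sst"}, opts);  // default CF
}
```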
if (status.ok()) { - status = ingestion_job.Run(); + for (size_t i = 0; i != num_cfs; ++i) { + status = ingestion_jobs[i].Run(); + if (!status.ok()) { + break; + } + } } - - // Install job edit [Mutex will be unlocked here] - auto mutable_cf_options = cfd->GetLatestMutableCFOptions(); if (status.ok()) { + bool should_increment_last_seqno = + ingestion_jobs[0].ShouldIncrementLastSequence(); +#ifndef NDEBUG + for (size_t i = 1; i != num_cfs; ++i) { + assert(should_increment_last_seqno == + ingestion_jobs[i].ShouldIncrementLastSequence()); + } +#endif + if (should_increment_last_seqno) { + const SequenceNumber last_seqno = versions_->LastSequence(); + versions_->SetLastAllocatedSequence(last_seqno + 1); + versions_->SetLastPublishedSequence(last_seqno + 1); + versions_->SetLastSequence(last_seqno + 1); + } + autovector cfds_to_commit; + autovector mutable_cf_options_list; + autovector> edit_lists; + uint32_t num_entries = 0; + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (cfd->IsDropped()) { + continue; + } + cfds_to_commit.push_back(cfd); + mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); + autovector edit_list; + edit_list.push_back(ingestion_jobs[i].edit()); + edit_lists.push_back(edit_list); + ++num_entries; + } + // Mark the version edits as an atomic group if the number of version + // edits exceeds 1. + if (cfds_to_commit.size() > 1) { + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + } status = - versions_->LogAndApply(cfd, *mutable_cf_options, ingestion_job.edit(), - &mutex_, directories_.GetDbDir()); + versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + edit_lists, &mutex_, directories_.GetDbDir()); } + if (status.ok()) { - delete InstallSuperVersionAndScheduleWork(cfd, nullptr, - *mutable_cf_options); + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i], + *cfd->GetLatestMutableCFOptions()); +#ifndef NDEBUG + if (0 == i && num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0"); + TEST_SYNC_POINT( + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"); + } +#endif // !NDEBUG + } + } } // Resume writes to the DB - if (concurrent_prepare_) { + if (two_write_queues_) { nonmem_write_thread_.ExitUnbatched(&nonmem_w); } write_thread_.ExitUnbatched(&w); - // Update stats if (status.ok()) { - ingestion_job.UpdateStats(); + for (auto& job : ingestion_jobs) { + job.UpdateStats(); + } } - ReleaseFileNumberFromPendingOutputs(pending_output_elem); - - num_running_ingest_file_--; - if (num_running_ingest_file_ == 0) { + num_running_ingest_file_ -= static_cast(num_cfs); + if (0 == num_running_ingest_file_) { bg_cv_.SignalAll(); } - TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock"); } // mutex_ is unlocked here // Cleanup - ingestion_job.Cleanup(status); - + for (size_t i = 0; i != num_cfs; ++i) { + sv_ctxs[i].Clean(); + // This may rollback jobs that have completed successfully. This is + // intended for atomicity. 
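The atomic group committed above is what makes multi-CF ingestion all-or-nothing. A caller-side sketch (handles and file paths are illustrative):

```cpp
#include "rocksdb/db.h"

rocksdb::Status IngestAtomically(rocksdb::DB* db,
                                 rocksdb::ColumnFamilyHandle* cf1,
                                 rocksdb::ColumnFamilyHandle* cf2) {
  rocksdb::IngestExternalFileArg a1, a2;
  a1.column_family = cf1;
  a1.external_files = {"/tmp/cf1.sst"};
  a2.column_family = cf2;
  a2.external_files = {"/tmp/cf2.sst"};
  // Either both CFs see their files or neither does; failed jobs roll back.
  return db->IngestExternalFiles({a1, a2});
}
```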
+ ingestion_jobs[i].Cleanup(status); + } if (status.ok()) { - NotifyOnExternalFileIngested(cfd, ingestion_job); + for (size_t i = 0; i != num_cfs; ++i) { + auto* cfd = + static_cast(args[i].column_family)->cfd(); + if (!cfd->IsDropped()) { + NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]); + } + } } - return status; } Status DBImpl::VerifyChecksum() { Status s; - Options options; - EnvOptions env_options; std::vector cfd_list; { InstrumentedMutexLock l(&mutex_); @@ -2761,13 +3604,20 @@ Status DBImpl::VerifyChecksum() { } for (auto& sv : sv_list) { VersionStorageInfo* vstorage = sv->current->storage_info(); + ColumnFamilyData* cfd = sv->current->cfd(); + Options opts; + { + InstrumentedMutexLock l(&mutex_); + opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_), + cfd->GetLatestCFOptions()); + } for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); j++) { const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; - std::string fname = TableFileName(immutable_db_options_.db_paths, + std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = rocksdb::VerifySstFileChecksum(options, env_options, fname); + s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname); } } if (!s.ok()) { @@ -2783,7 +3633,7 @@ Status DBImpl::VerifyChecksum() { } } for (auto cfd : cfd_list) { - cfd->Unref(); + cfd->Unref(); } } return s; @@ -2791,7 +3641,6 @@ Status DBImpl::VerifyChecksum() { void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.empty()) { return; } @@ -2807,8 +3656,6 @@ void DBImpl::NotifyOnExternalFileIngested( listener->OnExternalFileIngested(this, info); } } - -#endif } void DBImpl::WaitForIngestFile() { @@ -2818,6 +3665,77 @@ void DBImpl::WaitForIngestFile() { } } +Status DBImpl::StartTrace(const TraceOptions& trace_options, + std::unique_ptr&& trace_writer) { + InstrumentedMutexLock lock(&trace_mutex_); + tracer_.reset(new Tracer(env_, trace_options, std::move(trace_writer))); + return Status::OK(); +} + +Status DBImpl::EndTrace() { + InstrumentedMutexLock lock(&trace_mutex_); + Status s; + if (tracer_ != nullptr) { + s = tracer_->Close(); + tracer_.reset(); + } else { + return Status::IOError("No trace file to close"); + } + return s; +} + +Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeek(cf_id, key); + } + } + return s; +} + +Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, + const Slice& key) { + Status s; + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + s = tracer_->IteratorSeekForPrev(cf_id, key); + } + } + return s; +} + +Status DBImpl::ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::list::iterator* pending_output_elem, + uint64_t* next_file_number) { + Status s; + SuperVersionContext dummy_sv_ctx(true /* create_superversion */); + assert(nullptr != pending_output_elem); + assert(nullptr != next_file_number); + InstrumentedMutexLock l(&mutex_); + if (error_handler_.IsDBStopped()) { + // Do not ingest files when there is a bg_error + return error_handler_.GetBGError(); + } + *pending_output_elem = CaptureCurrentFileNumberInPendingOutputs(); + *next_file_number = 
versions_->FetchAddFileNumber(static_cast(num)); + auto cf_options = cfd->GetLatestMutableCFOptions(); + VersionEdit dummy_edit; + // If crash happen after a hard link established, Recover function may + // reuse the file number that has already assigned to the internal file, + // and this will overwrite the external file. To protect the external + // file, we have to make sure the file number will never being reused. + s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, + directories_.GetDbDir()); + if (s.ok()) { + InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); + } + dummy_sv_ctx.Clean(); + return s; +} #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_impl.h b/thirdparty/rocksdb/db/db_impl.h index f1730f9adb..e834e0fbec 100644 --- a/thirdparty/rocksdb/db/db_impl.h +++ b/thirdparty/rocksdb/db/db_impl.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -23,11 +22,18 @@ #include "db/column_family.h" #include "db/compaction_job.h" #include "db/dbformat.h" +#include "db/error_handler.h" +#include "db/event_helpers.h" #include "db/external_sst_file_ingestion_job.h" #include "db/flush_job.h" #include "db/flush_scheduler.h" #include "db/internal_stats.h" #include "db/log_writer.h" +#include "db/logs_with_prep_tracker.h" +#include "db/pre_release_callback.h" +#include "db/range_del_aggregator.h" +#include "db/read_callback.h" +#include "db/snapshot_checker.h" #include "db/snapshot_impl.h" #include "db/version_edit.h" #include "db/wal_manager.h" @@ -41,24 +47,29 @@ #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/status.h" +#include "rocksdb/trace_reader_writer.h" #include "rocksdb/transaction_log.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" #include "util/autovector.h" #include "util/event_logger.h" #include "util/hash.h" +#include "util/repeatable_thread.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/trace_replay.h" namespace rocksdb { +class Arena; class ArenaWrappedDBIter; +class InMemoryStatsHistoryIterator; class MemTable; class TableCache; +class TaskLimiterToken; class Version; class VersionEdit; class VersionSet; -class Arena; class WriteCallback; struct JobContext; struct ExternalSstFileInfo; @@ -66,9 +77,13 @@ struct MemTableInfo; class DBImpl : public DB { public: - DBImpl(const DBOptions& options, const std::string& dbname); + DBImpl(const DBOptions& options, const std::string& dbname, + const bool seq_per_batch = false, const bool batch_per_txn = true); virtual ~DBImpl(); + using DB::Resume; + virtual Status Resume() override; + // Implementations of the DB interface using DB::Put; virtual Status Put(const WriteOptions& options, @@ -99,7 +114,8 @@ class DBImpl : public DB { // Note: 'value_found' from KeyMayExist propagates here Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, - bool* value_found = nullptr, bool* is_blob_index = nullptr); + bool* value_found = nullptr, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); using DB::MultiGet; virtual std::vector MultiGet( @@ -142,7 +158,9 @@ class DBImpl : public DB { ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, ColumnFamilyData* cfd, SequenceNumber snapshot, - bool allow_blob = false); + ReadCallback* read_callback, + bool allow_blob = false, + bool allow_refresh = true); virtual const Snapshot* GetSnapshot() override; virtual void 
ReleaseSnapshot(const Snapshot* snapshot) override; @@ -150,9 +168,9 @@ class DBImpl : public DB { virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) override; using DB::GetMapProperty; - virtual bool GetMapProperty(ColumnFamilyHandle* column_family, - const Slice& property, - std::map<std::string, std::string>* value) override; + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map<std::string, std::string>* value) override; using DB::GetIntProperty; virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) override; @@ -160,10 +178,9 @@ class DBImpl : public DB { virtual bool GetAggregatedIntProperty(const Slice& property, uint64_t* aggregated_value) override; using DB::GetApproximateSizes; - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - uint8_t include_flags - = INCLUDE_FILES) override; + virtual void GetApproximateSizes( + ColumnFamilyHandle* column_family, const Range* range, int n, + uint64_t* sizes, uint8_t include_flags = INCLUDE_FILES) override; using DB::GetApproximateMemTableStats; virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family, const Range& range, @@ -175,11 +192,13 @@ class DBImpl : public DB { const Slice* begin, const Slice* end) override; using DB::CompactFiles; - virtual Status CompactFiles(const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector<std::string>& input_file_names, - const int output_level, - const int output_path_id = -1) override; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override; virtual Status PauseBackgroundWork() override; virtual Status ContinueBackgroundWork() override; @@ -211,14 +230,32 @@ class DBImpl : public DB { using DB::Flush; virtual Status Flush(const FlushOptions& options, ColumnFamilyHandle* column_family) override; + virtual Status Flush( + const FlushOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families) override; virtual Status FlushWAL(bool sync) override; + bool TEST_WALBufferIsEmpty(bool lock = true); virtual Status SyncWAL() override; + virtual Status LockWAL() override; + virtual Status UnlockWAL() override; virtual SequenceNumber GetLatestSequenceNumber() const override; + virtual SequenceNumber GetLastPublishedSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastPublishedSequence(); + } + } + // REQUIRES: joined the main write queue if two_write_queues is disabled, and + // the second write queue otherwise. + virtual void SetLastPublishedSequence(SequenceNumber seq); + // Returns LastSequence in last_seq_same_as_publish_seq_ + // mode and LastAllocatedSequence otherwise. This is useful when visibility + // depends also on data written to the WAL but not to the memtable. + SequenceNumber TEST_GetLastVisibleSequence() const; - // Whether there is an active snapshot in range [lower_bound, upper_bound). 
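A caller-side sketch of GetApproximateSizes as declared above; the key range is illustrative, and the flag assumes the SizeApproximationFlags enumerator is reachable as DB::INCLUDE_FILES, as the include_flags default suggests.

```cpp
#include <cstdint>

#include "rocksdb/db.h"

uint64_t ApproxBytes(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  rocksdb::Range r("a", "z");  // the estimate covers keys in [start, limit)
  uint64_t size = 0;
  db->GetApproximateSizes(cf, &r, 1, &size, rocksdb::DB::INCLUDE_FILES);
  return size;
}
```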
- bool HasActiveSnapshotInRange(SequenceNumber lower_bound, - SequenceNumber upper_bound); + virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) override; #ifndef ROCKSDB_LITE using DB::ResetStats; @@ -233,12 +270,13 @@ class DBImpl : public DB { virtual Status GetSortedWalFiles(VectorLogPtr& files) override; virtual Status GetUpdatesSince( - SequenceNumber seq_number, unique_ptr* iter, - const TransactionLogIterator::ReadOptions& - read_options = TransactionLogIterator::ReadOptions()) override; + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) override; virtual Status DeleteFile(std::string name) override; - Status DeleteFilesInRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end); + Status DeleteFilesInRanges(ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); virtual void GetLiveFilesMetaData( std::vector* metadata) override; @@ -247,9 +285,8 @@ class DBImpl : public DB { // Status::NotFound() will be returned if the current DB does not have // any column family match the specified name. // TODO(yhchiang): output parameter is placed in the end in this codebase. - virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle* column_family, - ColumnFamilyMetaData* metadata) override; + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; Status SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) override; @@ -310,8 +347,21 @@ class DBImpl : public DB { const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) override; + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector& args) override; + virtual Status VerifyChecksum() override; + using DB::StartTrace; + virtual Status StartTrace( + const TraceOptions& options, + std::unique_ptr&& trace_writer) override; + + using DB::EndTrace; + virtual Status EndTrace() override; + Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key); + Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key); #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot @@ -328,17 +378,21 @@ class DBImpl : public DB { Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, - const Slice* begin, const Slice* end, - bool exclusive, + uint32_t max_subcompactions, const Slice* begin, + const Slice* end, bool exclusive, bool disallow_trivial_move = false); // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. 
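GetUpdatesSince, declared above, exposes the WAL as an iterator of write batches. A sketch of tailing it (the helper name is illustrative; error handling is elided):

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

void ReplayFrom(rocksdb::DB* db, rocksdb::SequenceNumber since) {
  std::unique_ptr<rocksdb::TransactionLogIterator> it;
  if (!db->GetUpdatesSince(since, &it).ok()) {
    return;
  }
  for (; it->Valid(); it->Next()) {
    rocksdb::BatchResult res = it->GetBatch();  // sequence + WriteBatch
    (void)res.sequence;  // sequence number of the first update in the batch
  }
}
```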
InternalIterator* NewInternalIterator( - Arena* arena, RangeDelAggregator* range_del_agg, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, ColumnFamilyHandle* column_family = nullptr); + LogsWithPrepTracker* logs_with_prep_tracker() { + return &logs_with_prep_tracker_; + } + #ifndef NDEBUG // Extra methods (for testing) that are not in the public DB interface // Implemented in db_impl_debug.cc @@ -348,11 +402,9 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family = nullptr, bool disallow_trivial_move = false); - void TEST_HandleWALFull(); + void TEST_SwitchWAL(); - bool TEST_UnableToFlushOldestLog() { - return unable_to_flush_oldest_log_; - } + bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; } bool TEST_IsLogGettingFlushed() { return alive_log_files_.begin()->getting_flushed; @@ -361,23 +413,28 @@ class DBImpl : public DB { Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr); // Force current memtable contents to be flushed. - Status TEST_FlushMemTable(bool wait = true, + Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false, ColumnFamilyHandle* cfh = nullptr); // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); // Wait for any compaction - Status TEST_WaitForCompact(); + // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this + // is only for the special test of CancelledCompactions + Status TEST_WaitForCompact(bool waitUnscheduled = false); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(ColumnFamilyHandle* column_family = - nullptr); + int64_t TEST_MaxNextLevelOverlappingBytes( + ColumnFamilyHandle* column_family = nullptr); // Return the current manifest file no. uint64_t TEST_Current_Manifest_FileNo(); + // Returns the number that'll be assigned to the next file that's created. + uint64_t TEST_Current_Next_FileNo(); + // get total level0 file size. Only for testing. uint64_t TEST_GetLevel0TotalSize(); @@ -419,9 +476,16 @@ class DBImpl : public DB { uint64_t TEST_FindMinLogContainingOutstandingPrep(); uint64_t TEST_FindMinPrepLogReferencedByMemTable(); + size_t TEST_PreparedSectionCompletedSize(); + size_t TEST_LogsWithPrepSize(); int TEST_BGCompactionsAllowed() const; int TEST_BGFlushesAllowed() const; + size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + void TEST_WaitForDumpStatsRun(std::function callback) const; + void TEST_WaitForPersistStatsRun(std::function callback) const; + bool TEST_IsPersistentStatsEnabled() const; + size_t TEST_EstiamteStatsHistorySize() const; #endif // NDEBUG @@ -443,6 +507,14 @@ class DBImpl : public DB { uint64_t MinLogNumberToKeep(); + // Returns the lower bound file number for SSTs that won't be deleted, even if + // they're obsolete. This lower bound is used internally to prevent newly + // created flush/compaction output files from being deleted before they're + // installed. This technique avoids the need for tracking the exact numbers of + // files pending creation, although it prevents more files than necessary from + // being deleted. + uint64_t MinObsoleteSstNumberToKeep(); + // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. 
// If force == false and the last call was less than @@ -452,10 +524,12 @@ class DBImpl : public DB { bool no_full_scan = false); // Diffs the files listed in filenames and those that do not - // belong to live files are posibly removed. Also, removes all the + // belong to live files are possibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. - void PurgeObsoleteFiles(const JobContext& background_contet, + // If FindObsoleteFiles() was run, we need to also run + // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true + void PurgeObsoleteFiles(JobContext& background_context, bool schedule_only = false); void SchedulePurge(); @@ -481,6 +555,9 @@ class DBImpl : public DB { // mutex is held. SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id); + // Un-reference the super version and clean it up if it is the last reference. + void CleanupSuperVersion(SuperVersion* sv); + // Un-reference the super version and return it to thread local cache if // needed. If it is the last reference of the super version, clean it up // after un-referencing it. @@ -497,7 +574,8 @@ class DBImpl : public DB { ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); // Same as above, should be called without mutex held and not on write thread. - ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked( + uint32_t column_family_id); // Returns the number of currently running flushes. // REQUIREMENT: mutex_ must be held when calling this function. @@ -515,24 +593,58 @@ class DBImpl : public DB { const WriteController& write_controller() { return write_controller_; } - InternalIterator* NewInternalIterator(const ReadOptions&, - ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena, - RangeDelAggregator* range_del_agg); + InternalIterator* NewInternalIterator( + const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence); // hollow transactions shell used for recovery. // these will then be passed to TransactionDB so that // locks can be reacquired before writing can resume. struct RecoveredTransaction { - uint64_t log_number_; std::string name_; - WriteBatch* batch_; + bool unprepared_; + + struct BatchInfo { + uint64_t log_number_; + // TODO(lth): For unprepared, the memory usage here can be big for + // unprepared transactions. This is only useful for rollbacks, and we + // can in theory just keep keyset for that. + WriteBatch* batch_; + // Number of sub-batches. A new sub-batch is created if txn attempts to + // insert a duplicate key,seq to memtable. This is currently used in + // WritePreparedTxn/WriteUnpreparedTxn. + size_t batch_cnt_; + }; + + // This maps the seq of the first key in the batch to BatchInfo, which + // contains WriteBatch and other information relevant to the batch. + // + // For WriteUnprepared, batches_ can have size greater than 1, but for + // other write policies, it must be of size 1.
+ std::map batches_; + explicit RecoveredTransaction(const uint64_t log, const std::string& name, - WriteBatch* batch) - : log_number_(log), name_(name), batch_(batch) {} + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared) + : name_(name), unprepared_(unprepared) { + batches_[seq] = {log, batch, batch_cnt}; + } - ~RecoveredTransaction() { delete batch_; } + ~RecoveredTransaction() { + for (auto& it : batches_) { + delete it.second.batch_; + } + } + + void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch, + size_t batch_cnt, bool unprepared) { + assert(batches_.count(seq) == 0); + batches_[seq] = {log_number, batch, batch_cnt}; + // Prior state must be unprepared, since the prepare batch must be the + // last batch. + assert(unprepared_); + unprepared_ = unprepared; + } }; bool allow_2pc() const { return immutable_db_options_.allow_2pc; } @@ -552,9 +664,21 @@ class DBImpl : public DB { } void InsertRecoveredTransaction(const uint64_t log, const std::string& name, - WriteBatch* batch) { - recovered_transactions_[name] = new RecoveredTransaction(log, name, batch); - MarkLogAsContainingPrepSection(log); + WriteBatch* batch, SequenceNumber seq, + size_t batch_cnt, bool unprepared_batch) { + // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple + // times for every unprepared batch encountered during recovery. + // + // If the transaction is prepared, then the last call to + // InsertRecoveredTransaction will have unprepared_batch = false. + auto rtxn = recovered_transactions_.find(name); + if (rtxn == recovered_transactions_.end()) { + recovered_transactions_[name] = new RecoveredTransaction( + log, name, batch, seq, batch_cnt, unprepared_batch); + } else { + rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch); + } + logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log); } void DeleteRecoveredTransaction(const std::string& name) { @@ -562,7 +686,10 @@ class DBImpl : public DB { assert(it != recovered_transactions_.end()); auto* trx = it->second; recovered_transactions_.erase(it); - MarkLogAsHavingPrepSectionFlushed(trx->log_number_); + for (const auto& info : trx->batches_) { + logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed( + info.second.log_number_); + } delete trx; } @@ -574,25 +701,84 @@ class DBImpl : public DB { recovered_transactions_.clear(); } - void MarkLogAsHavingPrepSectionFlushed(uint64_t log); - void MarkLogAsContainingPrepSection(uint64_t log); void AddToLogsToFreeQueue(log::Writer* log_writer) { logs_to_free_queue_.push_back(log_writer); } + + void SetSnapshotChecker(SnapshotChecker* snapshot_checker); + + // Fill JobContext with snapshot information needed by flush and compaction. + void GetSnapshotContext(JobContext* job_context, + std::vector* snapshot_seqs, + SequenceNumber* earliest_write_conflict_snapshot, + SnapshotChecker** snapshot_checker); + + // Not thread-safe. + void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback); + InstrumentedMutex* mutex() { return &mutex_; } Status NewDB(); + // This is to be used only by internal rocksdb classes. 
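The reworked RecoveredTransaction above keys every recovered write batch by the sequence number of its first key, and AddBatch() insists the transaction was still unprepared before appending. A self-contained sketch of the same bookkeeping, with a trivial stand-in for WriteBatch (all names here are illustrative, not rocksdb's):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>

// Toy stand-in; the real code stores WriteBatch* keyed by SequenceNumber.
struct FakeBatch { std::string payload; };

struct RecoveredTxnSketch {
  struct BatchInfo {
    uint64_t log_number;
    FakeBatch* batch;
    size_t batch_cnt;  // number of duplicate-key-free sub-batches
  };
  std::map<uint64_t, BatchInfo> batches;  // first seq of batch -> info
  bool unprepared = true;

  // Mirrors AddBatch(): the prepare batch arrives last, so the state must
  // still be "unprepared" whenever another batch is appended.
  void Add(uint64_t seq, uint64_t log, FakeBatch* b, size_t cnt,
           bool still_unprepared) {
    assert(unprepared && batches.count(seq) == 0);
    batches[seq] = BatchInfo{log, b, cnt};
    unprepared = still_unprepared;
  }

  ~RecoveredTxnSketch() {
    for (auto& kv : batches) delete kv.second.batch;
  }
};

int main() {
  RecoveredTxnSketch txn;
  txn.Add(10, 3, new FakeBatch{"put a=1"}, 1, true);   // unprepared batch
  txn.Add(12, 3, new FakeBatch{"prepare"}, 1, false);  // final prepare batch
  return 0;
}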
+ static Status Open(const DBOptions& db_options, const std::string& name, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn); + + virtual Status Close() override; + + static Status CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr<Directory>* directory); + + // Given a time window, return an iterator for accessing stats history + Status GetStatsHistory( + uint64_t start_time, uint64_t end_time, + std::unique_ptr<StatsHistoryIterator>* stats_iterator) override; + + // find stats map from stats_history_ with smallest timestamp in + // the range of [start_time, end_time) + bool FindStatsByTime(uint64_t start_time, uint64_t end_time, + uint64_t* new_time, + std::map<std::string, uint64_t>* stats_map); + protected: Env* const env_; const std::string dbname_; - unique_ptr versions_; + std::unique_ptr<VersionSet> versions_; + // Flag to check whether we allocated and own the info log file + bool own_info_log_; const DBOptions initial_db_options_; const ImmutableDBOptions immutable_db_options_; MutableDBOptions mutable_db_options_; Statistics* stats_; std::unordered_map<std::string, RecoveredTransaction*> recovered_transactions_; + std::unique_ptr<Tracer> tracer_; + InstrumentedMutex trace_mutex_; + + // State below is protected by mutex_ + // With two_write_queues enabled, some of the variables that are accessed during + // WriteToWAL need different synchronization: log_empty_, alive_log_files_, + // logs_, logfile_number_. Refer to the definition of each variable below for + // more description. + mutable InstrumentedMutex mutex_; + + ColumnFamilyHandleImpl* default_cf_handle_; + InternalStats* default_cf_internal_stats_; + + // only used for dynamically adjusting max_total_wal_size. it is a sum of + // [write_buffer_size * max_write_buffer_number] over all column families + uint64_t max_total_in_memory_state_; + // If true, we have only one (default) column family. We use this to optimize + // some code-paths + bool single_column_family_mode_; + + // The options to access storage files + const EnvOptions env_options_; + + // Additional options for compaction and flush + EnvOptions env_options_for_compaction_; // Except in DB::Open(), WriteOptionsFile can only be called when: // Persist options to options file. @@ -614,8 +800,12 @@ class DBImpl : public DB { const MutableCFOptions& mutable_cf_options, int job_id, TableProperties prop); - void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, - Compaction *c, const Status &st, + void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, int job_id); + + void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c, + const Status& st, const CompactionJobStats& job_stats, int job_id); void NotifyOnMemTableSealed(ColumnFamilyData* cfd, @@ -632,10 +822,27 @@ class DBImpl : public DB { void EraseThreadStatusDbInfo() const; + // If disable_memtable is set the application logic must guarantee that the + // batch will still be skipped from memtable during the recovery. An exception + // to this is seq_per_batch_ mode, in which, since each batch already takes one + // seq, it is ok for the batch to write to memtable during recovery as long as + // it only takes one sequence number: i.e., no duplicate keys. + // In WriteCommitted it is guaranteed since disable_memtable is used for + // prepare batch which will be written to memtable later during the commit, + // and in WritePrepared it is guaranteed since it will be used only for WAL + // markers which will never be written to memtable.
If the commit marker is + // accompanied by CommitTimeWriteBatch that is not written to memtable as + // long as it has no duplicate keys, it does not violate the one-seq-per-batch + // policy. + // batch_cnt is expected to be non-zero in seq_per_batch mode and + // indicates the number of sub-batches. A sub-batch is a subset of the write + // batch that does not have duplicate keys. Status WriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - bool disable_memtable = false, uint64_t* seq_used = nullptr); + bool disable_memtable = false, uint64_t* seq_used = nullptr, + size_t batch_cnt = 0, + PreReleaseCallback* pre_release_callback = nullptr); Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, @@ -643,40 +850,69 @@ bool disable_memtable = false, uint64_t* seq_used = nullptr); + // batch_cnt is expected to be non-zero in seq_per_batch mode and indicates + // the number of sub-batches. A sub-batch is a subset of the write batch that + // does not have duplicate keys. Status WriteImplWALOnly(const WriteOptions& options, WriteBatch* updates, WriteCallback* callback = nullptr, uint64_t* log_used = nullptr, uint64_t log_ref = 0, - uint64_t* seq_used = nullptr); + uint64_t* seq_used = nullptr, size_t batch_cnt = 0, + PreReleaseCallback* pre_release_callback = nullptr); - uint64_t FindMinLogContainingOutstandingPrep(); - uint64_t FindMinPrepLogReferencedByMemTable(); + // write cached_recoverable_state_ to memtable if it is not empty + // The writer must be the leader in write_thread_ and holding mutex_ + Status WriteRecoverableState(); + + // Actual implementation of Close() + Status CloseImpl(); + + // Recover the descriptor from persistent storage. May do a significant + // amount of work to recover recently logged updates. Any changes to + // be made to the descriptor are added to *edit.
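The batch_cnt parameter documented above counts sub-batches: maximal stretches of a write batch with no repeated user key, since a memtable cannot absorb two values for the same (key, seq). A hedged sketch of how such a count can be derived, deliberately independent of the real WriteBatch iteration machinery:

#include <cstddef>
#include <set>
#include <string>
#include <vector>

// Count sub-batches: start a new sub-batch whenever a key repeats within the
// current one, mirroring the "subset with no duplicate keys" definition.
size_t CountSubBatches(const std::vector<std::string>& keys) {
  size_t sub_batches = keys.empty() ? 0 : 1;
  std::set<std::string> seen;
  for (const auto& k : keys) {
    if (!seen.insert(k).second) {  // duplicate -> close current sub-batch
      ++sub_batches;
      seen.clear();
      seen.insert(k);
    }
  }
  return sub_batches;
}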
+ virtual Status Recover( + const std::vector& column_families, + bool read_only = false, bool error_if_log_file_exist = false, + bool error_if_data_exists_in_logs = false); private: friend class DB; + friend class ErrorHandler; friend class InternalStats; friend class PessimisticTransaction; + friend class TransactionBaseImpl; friend class WriteCommittedTxn; friend class WritePreparedTxn; + friend class WritePreparedTxnDB; + friend class WriteBatchWithIndex; + friend class WriteUnpreparedTxnDB; + friend class WriteUnpreparedTxn; + #ifndef ROCKSDB_LITE friend class ForwardIterator; #endif friend struct SuperVersion; friend class CompactedDBImpl; + friend class DBTest_ConcurrentFlushWAL_Test; + friend class DBTest_MixedSlowdownOptionsStop_Test; + friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; #ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; + friend class WriteCallbackTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; + friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; #endif struct CompactionState; struct WriteContext { - autovector superversions_to_free_; + SuperVersionContext superversion_context; autovector memtables_to_free_; + explicit WriteContext(bool create_superversion = false) + : superversion_context(create_superversion) {} + ~WriteContext() { - for (auto& sv : superversions_to_free_) { - delete sv; - } + superversion_context.Clean(); for (auto& m : memtables_to_free_) { delete m; } @@ -686,12 +922,7 @@ class DBImpl : public DB { struct PrepickedCompaction; struct PurgeFileInfo; - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. Any changes to - // be made to the descriptor are added to *edit. - Status Recover(const std::vector& column_families, - bool read_only = false, bool error_if_log_file_exist = false, - bool error_if_data_exists_in_logs = false); + Status ResumeImpl(); void MaybeIgnoreError(Status* s) const; @@ -706,9 +937,9 @@ class DBImpl : public DB { // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); // Delete obsolete files and log status and information of file deletion - void DeleteObsoleteFileImpl(Status file_deletion_status, int job_id, - const std::string& fname, FileType type, - uint64_t number, uint32_t path_id); + void DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, FileType type, + uint64_t number); // Background process needs to call // auto x = CaptureCurrentFileNumberInPendingOutputs() @@ -732,11 +963,54 @@ class DBImpl : public DB { Status SyncClosedLogs(JobContext* job_context); // Flush the in-memory write buffer to storage. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. - Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, - bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer); + // log-file/memtable and writes a new descriptor iff successful. Then + // installs a new super version for the column family. 
+ Status FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, + SuperVersionContext* superversion_context, + std::vector<SequenceNumber>& snapshot_seqs, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + Env::Priority thread_pri); + + // Argument required by background flush thread. + struct BGFlushArg { + BGFlushArg() + : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {} + BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id, + SuperVersionContext* superversion_context) + : cfd_(cfd), + max_memtable_id_(max_memtable_id), + superversion_context_(superversion_context) {} + + // Column family to flush. + ColumnFamilyData* cfd_; + // Maximum ID of memtable to flush. In this column family, memtables with + // IDs smaller than this value must be flushed before this flush completes. + uint64_t max_memtable_id_; + // Pointer to a SuperVersionContext object. After flush completes, RocksDB + // installs a new superversion for the column family. This operation + // requires a SuperVersionContext object (currently embedded in JobContext). + SuperVersionContext* superversion_context_; + }; + + // Argument passed to flush thread. + struct FlushThreadArg { + DBImpl* db_; + + Env::Priority thread_pri_; + }; + + // Flush the memtables of (multiple) column families to multiple files on + // persistent storage. + Status FlushMemTablesToOutputFiles( + const autovector<BGFlushArg>& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); + + Status AtomicFlushMemTablesToOutputFiles( + const autovector<BGFlushArg>& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri); // REQUIRES: log_numbers are sorted in ascending order Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers, @@ -750,6 +1024,12 @@ class DBImpl : public DB { Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit); + // Restore alive_log_files_ and total_log_size_ after recovery. + // It needs to run only when there's no flush during recovery + // (e.g. avoid_flush_during_recovery=true). May also trigger flush + // in case total_log_size > max_total_wal_size. + Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers); + // num_bytes: for slowdown case, delay time is calculated based on // `num_bytes` going through. Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options); @@ -761,15 +1041,44 @@ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); + void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds); + // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, - bool writes_stopped = false); + FlushReason flush_reason, bool writes_stopped = false); + + Status AtomicFlushMemTables( + const autovector<ColumnFamilyData*>& column_family_datas, + const FlushOptions& options, FlushReason flush_reason, + bool writes_stopped = false); + + // Wait until flushing this column family won't stall writes + Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, + bool* flush_needed); + + // Wait for memtable flushed. + // If flush_memtable_id is non-null, wait until the memtable with the ID + // gets flushed. Otherwise, wait until the column family doesn't have any + // memtables pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume + // from background error. + Status WaitForFlushMemTable(ColumnFamilyData* cfd, + const uint64_t* flush_memtable_id = nullptr, + bool resuming_from_bg_err = false) { + return WaitForFlushMemTables({cfd}, {flush_memtable_id}, + resuming_from_bg_err); + } + // Wait for memtables to be flushed for multiple column families. + Status WaitForFlushMemTables( + const autovector<ColumnFamilyData*>& cfds, + const autovector<const uint64_t*>& flush_memtable_ids, + bool resuming_from_bg_err); - // Wait for memtable flushed - Status WaitForFlushMemTable(ColumnFamilyData* cfd); + // REQUIRES: mutex locked and in write thread. + void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds); // REQUIRES: mutex locked - Status HandleWALFull(WriteContext* write_context); + Status SwitchWAL(WriteContext* write_context); // REQUIRES: mutex locked Status HandleWriteBufferFull(WriteContext* write_context); @@ -779,7 +1088,8 @@ WriteContext* write_context); WriteBatch* MergeBatch(const WriteThread::WriteGroup& write_group, - WriteBatch* tmp_batch, size_t* write_with_wal); + WriteBatch* tmp_batch, size_t* write_with_wal, + WriteBatch** to_be_cached_state); Status WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, uint64_t* log_used, uint64_t* log_size); @@ -791,10 +1101,10 @@ Status ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, uint64_t* log_used, SequenceNumber* last_sequence, - int total_count); + size_t seq_inc); // Used by WriteImpl to update bg_error_ if paranoid check is enabled. - void WriteCallbackStatusCheck(const Status& status); + void WriteStatusCheck(const Status& status); // Used by WriteImpl to update bg_error_ in case of memtable insert error. void MemTableInsertStatusCheck(const Status& memtable_insert_status); @@ -804,8 +1114,10 @@ Status CompactFilesImpl(const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector<std::string>& input_file_names, + std::vector<std::string>* const output_file_names, const int output_level, int output_path_id, - JobContext* job_context, LogBuffer* log_buffer); + JobContext* job_context, LogBuffer* log_buffer, + CompactionJobInfo* compaction_job_info); // Wait for current IngestExternalFile() calls to finish. // REQUIRES: mutex_ held @@ -820,36 +1132,71 @@ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); void MaybeScheduleFlushOrCompaction(); - void SchedulePendingFlush(ColumnFamilyData* cfd); + + // A flush request specifies the column families to flush as well as the + // largest memtable id to persist for each column family. Once all the + // memtables whose IDs are smaller than or equal to this per-column-family + // specified value have been flushed, this flush request is considered to + // have completed its work of flushing this column family. After completing + // the work for all column families in this request, this flush is considered + // complete.
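The wait side of this flush pipeline (WaitForFlushMemTable/WaitForFlushMemTables above) blocks until each tracked column family has flushed past a requested memtable id, giving up on background errors unless the caller is resuming from one. A reduced model of that contract, using a plain condition variable in place of the DB mutex machinery (the FlushRequest typedef it feeds follows right after this sketch):

#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

// Hedged sketch: one slot per column family holding the id of its earliest
// still-unflushed memtable; flush threads bump the slots and notify.
struct FlushWaiterSketch {
  std::mutex mu;
  std::condition_variable cv;
  std::vector<uint64_t> earliest_unflushed_id;
  bool bg_error = false;

  // Returns true iff everything requested was flushed; false on bg error
  // (unless the caller is itself resuming from a background error).
  bool Wait(const std::vector<uint64_t>& flush_ids, bool resuming_from_bg_err) {
    std::unique_lock<std::mutex> lk(mu);
    cv.wait(lk, [&] {
      if (bg_error && !resuming_from_bg_err) return true;  // give up early
      for (size_t i = 0; i < flush_ids.size(); ++i) {
        if (earliest_unflushed_id[i] <= flush_ids[i]) return false;
      }
      return true;  // all requested memtables are flushed
    });
    return !(bg_error && !resuming_from_bg_err);
  }
};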
+ typedef std::vector<std::pair<ColumnFamilyData*, uint64_t>> FlushRequest; + + void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds, + FlushRequest* req); + + void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason); + void SchedulePendingCompaction(ColumnFamilyData* cfd); - void SchedulePendingPurge(std::string fname, FileType type, uint64_t number, - uint32_t path_id, int job_id); + void SchedulePendingPurge(std::string fname, std::string dir_to_sync, + FileType type, uint64_t number, int job_id); static void BGWorkCompaction(void* arg); // Runs a pre-chosen universal compaction involving bottom level in a // separate, bottom-pri thread pool. static void BGWorkBottomCompaction(void* arg); - static void BGWorkFlush(void* db); + static void BGWorkFlush(void* arg); static void BGWorkPurge(void* arg); - static void UnscheduleCallback(void* arg); + static void UnscheduleCompactionCallback(void* arg); + static void UnscheduleFlushCallback(void* arg); void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, - Env::Priority bg_thread_pri); - void BackgroundCallFlush(); + Env::Priority thread_pri); + void BackgroundCallFlush(Env::Priority thread_pri); void BackgroundCallPurge(); Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer, - PrepickedCompaction* prepicked_compaction); + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, - LogBuffer* log_buffer); + LogBuffer* log_buffer, FlushReason* reason, + Env::Priority thread_pri); + + bool EnoughRoomForCompaction(ColumnFamilyData* cfd, + const std::vector<CompactionInputFiles>& inputs, + bool* sfm_bookkeeping, LogBuffer* log_buffer); + + // Request compaction tasks token from compaction thread limiter. + // It always succeeds if force = true or the limiter is disabled. + bool RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr<TaskLimiterToken>* token, + LogBuffer* log_buffer); + + // Schedule background tasks + void StartTimedTasks(); void PrintStatistics(); + size_t EstiamteStatsHistorySize() const; + + // persist stats to column family "_persistent_stats" + void PersistStats(); + // dump rocksdb.stats to LOG - void MaybeDumpStats(); + void DumpStats(); // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, - const MutableCFOptions& mutable_cf_options, int level); + const MutableCFOptions& mutable_cf_options, + int level); // Move the files in the input level to the target level. // If target_level < 0, automatically calculate the minimum level that could @@ -859,33 +1206,39 @@ // helper functions for adding and removing from flush & compaction queues void AddToCompactionQueue(ColumnFamilyData* cfd); ColumnFamilyData* PopFirstFromCompactionQueue(); - void AddToFlushQueue(ColumnFamilyData* cfd); - ColumnFamilyData* PopFirstFromFlushQueue(); + FlushRequest PopFirstFromFlushQueue(); + + // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue( + std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer); // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, const Status& status); - const Snapshot* GetSnapshotImpl(bool is_write_conflict_boundary); + SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary, + bool lock = true); uint64_t GetMaxTotalWalSize() const; + Directory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const; + + Status CloseHelper(); + + void WaitForBackgroundWork(); + // table_cache_ provides its own synchronization std::shared_ptr<Cache> table_cache_; // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; + // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_ + InstrumentedMutex stats_history_mutex_; // In addition to mutex_, log_write_mutex_ protects writes to logs_ and - // logfile_number_. With concurrent_prepare it also protects alive_log_files_, + // logfile_number_. With two_write_queues it also protects alive_log_files_, // and log_empty_. Refer to the definition of each variable below for more // details. InstrumentedMutex log_write_mutex_; - // State below is protected by mutex_ - // With concurrent_prepare enabled, some of the variables that accessed during - // WriteToWAL need different synchronization: log_empty_, alive_log_files_, - // logs_, logfile_number_. Refer to the definition of each variable below for - // more description. - mutable InstrumentedMutex mutex_; std::atomic<bool> shutting_down_; // This condition variable is signaled on these conditions: @@ -897,6 +1250,10 @@ // (i.e. whenever a flush is done, even if it didn't make any progress) // * whenever there is an error in background purge, flush or compaction // * whenever num_running_ingest_file_ goes to 0. + // * whenever pending_purge_obsolete_files_ goes to 0. + // * whenever disable_delete_obsolete_files_ goes to 0. + // * whenever SetOptions successfully updates options. + // * whenever a column family is dropped. InstrumentedCondVar bg_cv_; // Writes are protected by locking both mutex_ and log_write_mutex_, and reads // must be under either mutex_ or log_write_mutex_. Since after ::Open, @@ -904,21 +1261,19 @@ // from the same write_thread_ without any locks. uint64_t logfile_number_; std::deque<uint64_t> - log_recycle_files; // a list of log files that we can recycle + log_recycle_files_; // a list of log files that we can recycle bool log_dir_synced_; - // Without concurrent_prepare, read and writes to log_empty_ are protected by + // Without two_write_queues, read and writes to log_empty_ are protected by // mutex_. Since it is currently updated/read only in write_thread_, it can be // accessed from the same write_thread_ without any locks. With - // concurrent_prepare writes, where it can be updated in different threads, + // two_write_queues writes, where it can be updated in different threads, // read and writes are protected by log_write_mutex_ instead. This is to avoid // expensive mutex_ lock during WAL write, which updates log_empty_.
bool log_empty_; - ColumnFamilyHandleImpl* default_cf_handle_; - InternalStats* default_cf_internal_stats_; - unique_ptr column_family_memtables_; + + std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_; struct LogFileNumberSize { - explicit LogFileNumberSize(uint64_t _number) - : number(_number) {} + explicit LogFileNumberSize(uint64_t _number) : number(_number) {} void AddSize(uint64_t new_size) { size += new_size; } uint64_t number; uint64_t size = 0; @@ -934,22 +1289,24 @@ writer = nullptr; return w; } - void ClearWriter() { + Status ClearWriter() { + Status s = writer->WriteBuffer(); delete writer; writer = nullptr; + return s; } uint64_t number; // Visual Studio doesn't support deque's member to be noncopyable because - // of a unique_ptr as a member. + // of a std::unique_ptr as a member. log::Writer* writer; // own // true for some prefix of logs_ bool getting_synced = false; }; - // Without concurrent_prepare, read and writes to alive_log_files_ are + // Without two_write_queues, read and writes to alive_log_files_ are // protected by mutex_. However since back() is never popped, and push_back() // is done only from write_thread_, the same thread can access the item - // reffered by back() without mutex_. With concurrent_prepare_, writes + // referred to by back() without mutex_. With two_write_queues_, writes // are protected by locking both mutex_ and log_write_mutex_, and reads must // be under either mutex_ or log_write_mutex_. std::deque<LogFileNumberSize> alive_log_files_; @@ -969,19 +1326,29 @@ std::deque<LogWriterNumber> logs_; // Signaled when getting_synced becomes false for some of the logs_. InstrumentedCondVar log_sync_cv_; + // This is the app-level state that is written to the WAL but will be used + // only during recovery. Using this feature enables not writing the state to + // memtable on normal writes and hence improving the throughput. Each new + // write of the state will replace the previous state entirely even if the + // keys in the two consecutive states do not overlap. + // It is protected by log_write_mutex_ when two_write_queues_ is enabled. + // Otherwise only the head of write_thread_ can access it. + WriteBatch cached_recoverable_state_; + std::atomic<bool> cached_recoverable_state_empty_ = {true}; std::atomic<uint64_t> total_log_size_; - // only used for dynamically adjusting max_total_wal_size. it is a sum of - // [write_buffer_size * max_write_buffer_number] over all column families - uint64_t max_total_in_memory_state_; - // If true, we have only one (default) column family. We use this to optimize - // some code-paths - bool single_column_family_mode_; + // If this is non-empty, we need to delete these log files in background // threads. Protected by db mutex. autovector<log::Writer*> logs_to_free_; bool is_snapshot_supported_; + std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_; + + std::map<std::string, uint64_t> stats_slice_; + + bool stats_slice_initialized_ = false; + // Class to maintain directories for all database paths other than main one.
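The alive_log_files_ deque above leans on a narrow invariant: only the write thread pushes, and back() is never popped, so the current WAL's size can be bumped without taking mutex_. A reduced model of that accounting (names mirror the struct above; the locking commentary is a summary of the comments, not extra guarantees):

#include <cstdint>
#include <deque>

struct LogFileNumberSizeSketch {
  explicit LogFileNumberSizeSketch(uint64_t n) : number(n) {}
  void AddSize(uint64_t s) { size += s; }
  uint64_t number;
  uint64_t size = 0;
};

// Write thread only: record bytes appended to the live WAL via back().
void OnWalAppend(std::deque<LogFileNumberSizeSketch>& alive_logs,
                 uint64_t bytes) {
  alive_logs.back().AddSize(bytes);
}

// Any thread holding the appropriate mutex: total WAL footprint, the input
// to the max_total_wal_size switch-WAL decision.
uint64_t TotalWalBytes(const std::deque<LogFileNumberSizeSketch>& alive_logs) {
  uint64_t total = 0;
  for (const auto& l : alive_logs) total += l.size;
  return total;
}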
class Directories { public: @@ -989,7 +1356,7 @@ class DBImpl : public DB { const std::string& wal_dir, const std::vector& data_paths); - Directory* GetDataDir(size_t path_id); + Directory* GetDataDir(size_t path_id) const; Directory* GetWalDir() { if (wal_dir_) { @@ -1004,9 +1371,6 @@ class DBImpl : public DB { std::unique_ptr db_dir_; std::vector> data_dirs_; std::unique_ptr wal_dir_; - - Status CreateAndNewDirectory(Env* env, const std::string& dirname, - std::unique_ptr* directory) const; }; Directories directories_; @@ -1021,7 +1385,7 @@ class DBImpl : public DB { WriteController write_controller_; - unique_ptr low_pri_write_rate_limiter_; + std::unique_ptr low_pri_write_rate_limiter_; // Size of the last batch group. In slowdown mode, next write needs to // sleep if it uses up the quota. @@ -1048,13 +1412,13 @@ class DBImpl : public DB { // purge_queue_ struct PurgeFileInfo { std::string fname; + std::string dir_to_sync; FileType type; uint64_t number; - uint32_t path_id; int job_id; - PurgeFileInfo(std::string fn, FileType t, uint64_t num, uint32_t pid, + PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num, int jid) - : fname(fn), type(t), number(num), path_id(pid), job_id(jid) {} + : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {} }; // flush_queue_ and compaction_queue_ hold column families that we need to @@ -1070,14 +1434,14 @@ class DBImpl : public DB { // compacted. Consumers of these queues are flush and compaction threads. When // column family is put on this queue, we increase unscheduled_flushes_ and // unscheduled_compactions_. When these variables are bigger than zero, that - // means we need to schedule background threads for compaction and thread. + // means we need to schedule background threads for flush and compaction. // Once the background threads are scheduled, we decrease unscheduled_flushes_ // and unscheduled_compactions_. That way we keep track of number of // compaction and flush threads we need to schedule. This scheduling is done // in MaybeScheduleFlushOrCompaction() // invariant(column family present in flush_queue_ <==> // ColumnFamilyData::pending_flush_ == true) - std::deque flush_queue_; + std::deque flush_queue_; // invariant(column family present in compaction_queue_ <==> // ColumnFamilyData::pending_compaction_ == true) std::deque compaction_queue_; @@ -1085,6 +1449,10 @@ class DBImpl : public DB { // A queue to store filenames of the files to be purged std::deque purge_queue_; + // A vector to store the file numbers that have been assigned to certain + // JobContext. Current implementation tracks ssts only. + std::vector files_grabbed_for_purge_; + // A queue to store log writers to close std::deque logs_to_free_queue_; int unscheduled_flushes_; @@ -1117,15 +1485,15 @@ class DBImpl : public DB { uint32_t output_path_id; Status status; bool done; - bool in_progress; // compaction request being processed? - bool incomplete; // only part of requested range compacted - bool exclusive; // current behavior of only one manual - bool disallow_trivial_move; // Force actual compaction to run - const InternalKey* begin; // nullptr means beginning of key range - const InternalKey* end; // nullptr means end of key range - InternalKey* manual_end; // how far we are compacting - InternalKey tmp_storage; // Used to keep track of compaction progress - InternalKey tmp_storage1; // Used to keep track of compaction progress + bool in_progress; // compaction request being processed? 
+ bool incomplete; // only part of requested range compacted + bool exclusive; // current behavior of only one manual + bool disallow_trivial_move; // Force actual compaction to run + const InternalKey* begin; // nullptr means beginning of key range + const InternalKey* end; // nullptr means end of key range + InternalKey* manual_end; // how far we are compacting + InternalKey tmp_storage; // Used to keep track of compaction progress + InternalKey tmp_storage1; // Used to keep track of compaction progress }; struct PrepickedCompaction { // background compaction takes ownership of `compaction`. @@ -1133,6 +1501,8 @@ class DBImpl : public DB { // caller retains ownership of `manual_compaction_state` as it is reused // across background compactions. ManualCompactionState* manual_compaction_state; // nullptr if non-manual + // task limiter token is requested during compaction picking. + std::unique_ptr task_token; }; std::deque manual_compaction_dequeue_; @@ -1143,9 +1513,6 @@ class DBImpl : public DB { PrepickedCompaction* prepicked_compaction; }; - // Have we encountered a background error in paranoid mode? - Status bg_error_; - // shall we disable deletion of obsolete files // if 0 the deletion is enabled. // if non-zero, files will not be getting deleted @@ -1154,6 +1521,10 @@ class DBImpl : public DB { // without any synchronization int disable_delete_obsolete_files_; + // Number of times FindObsoleteFiles has found deletable files and the + // corresponding call to PurgeObsoleteFiles has not yet finished. + int pending_purge_obsolete_files_; + // last time when DeleteObsoleteFiles with full scan was executed. Originaly // initialized with startup time. uint64_t delete_obsolete_files_last_run_; @@ -1171,12 +1542,12 @@ class DBImpl : public DB { std::atomic has_unpersisted_data_; // if an attempt was made to flush all column families that - // the oldest log depends on but uncommited data in the oldest + // the oldest log depends on but uncommitted data in the oldest // log prevents the log from being released. // We must attempt to free the dependent memtables again // at a later time after the transaction in the oldest // log is fully commited. - bool unable_to_flush_oldest_log_; + bool unable_to_release_oldest_log_; static const int KEEP_LOG_FILE_NUM = 1000; // MSVC version 1800 still does not have constexpr for ::max() @@ -1184,9 +1555,6 @@ class DBImpl : public DB { std::string db_absolute_path_; - // The options to access storage files - const EnvOptions env_options_; - // Number of running IngestExternalFile() calls. // REQUIRES: mutex held int num_running_ingest_file_; @@ -1210,27 +1578,27 @@ class DBImpl : public DB { // Indicate DB was opened successfully bool opened_successfully_; - // minimum log number still containing prepared data. - // this is used by FindObsoleteFiles to determine which - // flushed logs we must keep around because they still - // contain prepared data which has not been flushed or rolled back - std::priority_queue, std::greater> - min_log_with_prep_; - - // to be used in conjunction with min_log_with_prep_. - // once a transaction with data in log L is committed or rolled back - // rather than removing the value from the heap we add that value - // to prepared_section_completed_ which maps LOG -> instance_count - // since a log could contain multiple prepared sections - // - // when trying to determine the minimum log still active we first - // consult min_log_with_prep_. 
while that root value maps to - // a value > 0 in prepared_section_completed_ we decrement the - // instance_count for that log and pop the root value in - // min_log_with_prep_. This will work the same as a min_heap - // where we are deleteing arbitrary elements and the up heaping. - std::unordered_map<uint64_t, uint64_t> prepared_section_completed_; - std::mutex prep_heap_mutex_; + // The min threshold to trigger bottommost compaction for removing + // garbage, among all column families. + SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; + + LogsWithPrepTracker logs_with_prep_tracker_; + + // Callback for compaction to check if a key is visible to a snapshot. + // REQUIRES: mutex held + std::unique_ptr<SnapshotChecker> snapshot_checker_; + + // Callback for when the cached_recoverable_state_ is written to memtable + // Only to be set during initialization + std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_; + + // handle for scheduling stats dumping at fixed intervals + // REQUIRES: mutex locked + std::unique_ptr<rocksdb::RepeatableThread> thread_dump_stats_; + + // handle for scheduling stats snapshotting at fixed intervals + // REQUIRES: mutex locked + std::unique_ptr<rocksdb::RepeatableThread> thread_persist_stats_; // No copying allowed DBImpl(const DBImpl&); @@ -1238,24 +1606,20 @@ // Background threads call this function, which is just a wrapper around // the InstallSuperVersion() function. Background threads carry - // job_context which can have new_superversion already + // sv_context which can have new_superversion already // allocated. - void InstallSuperVersionAndScheduleWorkWrapper( - ColumnFamilyData* cfd, JobContext* job_context, - const MutableCFOptions& mutable_cf_options); - // All ColumnFamily state changes go through this function. Here we analyze // the new state and we schedule background work if we detect that the new // state needs flush or compaction. - SuperVersion* InstallSuperVersionAndScheduleWork( - ColumnFamilyData* cfd, SuperVersion* new_sv, + void InstallSuperVersionAndScheduleWork( + ColumnFamilyData* cfd, SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options); #ifndef ROCKSDB_LITE using DB::GetPropertiesOfAllTables; - virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, - TablePropertiesCollection* props) - override; + virtual Status GetPropertiesOfAllTables( + ColumnFamilyHandle* column_family, + TablePropertiesCollection* props) override; virtual Status GetPropertiesOfTablesInRange( ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; @@ -1265,6 +1629,7 @@ bool GetIntPropertyInternal(ColumnFamilyData* cfd, const DBPropertyInfo& property_info, bool is_locked, uint64_t* value); + bool GetPropertyHandleOptionsStatistics(std::string* value); bool HasPendingManualCompaction(); bool HasExclusiveManualCompaction(); @@ -1273,17 +1638,88 @@ bool ShouldntRunManualCompaction(ManualCompactionState* m); bool HaveManualCompaction(ColumnFamilyData* cfd); bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); +#ifndef ROCKSDB_LITE + void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& compaction_job_stats, + const int job_id, const Version* current, + CompactionJobInfo* compaction_job_info) const; + // Reserve the next 'num' file numbers for to-be-ingested external SST files, + // and return the current file_number in 'next_file_number'.
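The block removed above describes the technique now encapsulated by LogsWithPrepTracker: a min-heap of prep-section log numbers plus a completion-count map, with removals applied lazily when the minimum is queried. A generic, self-contained sketch of that lazy-deletion pattern (a model of the idea, not the tracker's actual code):

#include <cstdint>
#include <functional>
#include <queue>
#include <unordered_map>
#include <vector>

class LazyMinHeapSketch {
 public:
  // One push per prepared section written to `log`.
  void MarkContainsPrep(uint64_t log) { heap_.push(log); }
  // One count per prepared section of `log` committed or rolled back.
  void MarkPrepFlushed(uint64_t log) { completed_[log]++; }

  // 0 means no log still holds outstanding prepared data.
  uint64_t MinLogWithOutstandingPrep() {
    while (!heap_.empty()) {
      uint64_t top = heap_.top();
      auto it = completed_.find(top);
      if (it == completed_.end()) return top;  // still outstanding
      heap_.pop();  // lazily retire one completed section of this log
      if (--it->second == 0) completed_.erase(it);
    }
    return 0;
  }

 private:
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      heap_;
  std::unordered_map<uint64_t, uint64_t> completed_;
};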
+ // Write a version edit to the MANIFEST. + Status ReserveFileNumbersBeforeIngestion( + ColumnFamilyData* cfd, uint64_t num, + std::list<uint64_t>::iterator* pending_output_elem, + uint64_t* next_file_number); +#endif //! ROCKSDB_LITE + + bool ShouldPurge(uint64_t file_number) const; + void MarkAsGrabbedForPurge(uint64_t file_number); size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; + Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; } - // When set, we use a seprate queue for writes that dont write to memtable. In - // 2PC these are the writes at Prepare phase. - const bool concurrent_prepare_; + // When set, we use a separate queue for writes that don't write to memtable. + // In 2PC these are the writes at Prepare phase. + const bool two_write_queues_; const bool manual_wal_flush_; + // Increase the sequence number after writing each batch, whether memtable is + // disabled for that or not. Otherwise the sequence number is increased after + // writing each key into memtable. This implies that when disable_memtable is + // set, the seq is not increased at all. + // + // Default: false + const bool seq_per_batch_; + // This determines during recovery whether we expect one writebatch per + // recovered transaction, or potentially multiple writebatches per + // transaction. For WriteUnprepared, this is set to false, since multiple + // batches can exist per transaction. + // + // Default: true + const bool batch_per_txn_; + // LastSequence also indicates last published sequence visible to the + // readers. Otherwise LastPublishedSequence should be used. + const bool last_seq_same_as_publish_seq_; + // It indicates that a customized gc algorithm must be used for + // flush/compaction and if it is not provided via SnapshotChecker, we should + // disable gc to be safe. + const bool use_custom_gc_; + // Flag to indicate that the DB instance shutdown has been initiated. This is + // different from the shutting_down_ atomic in that it is set at the beginning + // of the shutdown sequence, specifically in order to prevent any background + // error recovery from going on in parallel. The latter, shutting_down_, + // is set a little later during the shutdown after scheduling memtable + // flushes. + std::atomic<bool> shutdown_initiated_; + // Flag to indicate whether sst_file_manager object was allocated in + // DB::Open() or passed to us + bool own_sfm_; + + // Clients must periodically call SetPreserveDeletesSequenceNumber() + // to advance this seqnum. Default value is 0 which means ALL deletes are + // preserved. Note that this has no effect if DBOptions.preserve_deletes + // is set to false. + std::atomic<SequenceNumber> preserve_deletes_seqnum_; + const bool preserve_deletes_; + + // Flag to check whether Close() has been called on this DB + bool closed_; + + ErrorHandler error_handler_; + + // Condition variable to coordinate installation of atomic flush results. + // With atomic flush, each bg thread installs the result of flushing multiple + // column families, and different threads can flush different column + // families. It's difficult to rely on one thread to perform batch + // installation for all threads. This is different from the non-atomic flush + // case. + // atomic_flush_install_cv_ makes sure that threads install atomic flush + // results sequentially. Flush results of memtables with lower IDs get + // installed to MANIFEST first.
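The preserve_deletes_seqnum_/preserve_deletes_ pair above backs the public SetPreserveDeletesSequenceNumber() API seen earlier in this header. A hedged usage sketch; the option name DBOptions::preserve_deletes and the bool return convention are recalled from this RocksDB generation and should be treated as assumptions:

#include <cassert>
#include "rocksdb/db.h"

// Periodically advance the preserve-deletes cutoff; deletes with seqnums at
// or above the cutoff survive compaction. Assumes the DB was opened with
// DBOptions::preserve_deletes = true, otherwise the call has no effect.
void AdvancePreserveDeletesCutoff(rocksdb::DB* db) {
  bool ok =
      db->SetPreserveDeletesSequenceNumber(db->GetLatestSequenceNumber());
  assert(ok);  // false if the seqnum would move backwards
}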
+ InstrumentedCondVar atomic_flush_install_cv_; }; -extern Options SanitizeOptions(const std::string& db, - const Options& src); +extern Options SanitizeOptions(const std::string& db, const Options& src); extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); @@ -1291,6 +1727,25 @@ extern CompressionType GetCompressionFlush( const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options); +// Return the earliest log file to keep after the memtable flush is +// finalized. +// `cfd_to_flush` is the column family whose memtable (specified in +// `memtables_to_flush`) will be flushed and thus will not depend on any WAL +// file. +// The function is only applicable to 2pc mode. +extern uint64_t PrecomputeMinLogNumberToKeep( + VersionSet* vset, const ColumnFamilyData& cfd_to_flush, + autovector edit_list, + const autovector& memtables_to_flush, + LogsWithPrepTracker* prep_tracker); + +// `cfd_to_flush` is the column family whose memtable will be flushed and thus +// will not depend on any WAL file. nullptr means no memtable is being flushed. +// The function is only applicable to 2pc mode. +extern uint64_t FindMinPrepLogReferencedByMemTable( + VersionSet* vset, const ColumnFamilyData* cfd_to_flush, + const autovector& memtables_to_flush); + // Fix user-supplied options to be reasonable template static void ClipToRange(T* ptr, V minvalue, V maxvalue) { diff --git a/thirdparty/rocksdb/db/db_impl_compaction_flush.cc b/thirdparty/rocksdb/db/db_impl_compaction_flush.cc index 3e686fe703..f208b873dd 100644 --- a/thirdparty/rocksdb/db/db_impl_compaction_flush.cc +++ b/thirdparty/rocksdb/db/db_impl_compaction_flush.cc @@ -14,15 +14,73 @@ #include #include "db/builder.h" +#include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_updater.h" #include "monitoring/thread_status_util.h" +#include "util/concurrent_task_limiter_impl.h" #include "util/sst_file_manager_impl.h" #include "util/sync_point.h" namespace rocksdb { + +bool DBImpl::EnoughRoomForCompaction( + ColumnFamilyData* cfd, const std::vector& inputs, + bool* sfm_reserved_compact_space, LogBuffer* log_buffer) { + // Check if we have enough room to do the compaction + bool enough_room = true; +#ifndef ROCKSDB_LITE + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm) { + // Pass the current bg_error_ to SFM so it can decide what checks to + // perform. 
If this DB instance hasn't seen any error yet, the SFM can be + // optimistic and not do disk space checks + enough_room = + sfm->EnoughRoomForCompaction(cfd, inputs, error_handler_.GetBGError()); + if (enough_room) { + *sfm_reserved_compact_space = true; + } + } +#else + (void)cfd; + (void)inputs; + (void)sfm_reserved_compact_space; +#endif // ROCKSDB_LITE + if (!enough_room) { + // Just in case tests want to change the value of enough_room + TEST_SYNC_POINT_CALLBACK( + "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room); + ROCKS_LOG_BUFFER(log_buffer, + "Cancelled compaction because not enough room"); + RecordTick(stats_, COMPACTION_CANCELLED, 1); + } + return enough_room; +} + +bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force, + std::unique_ptr* token, + LogBuffer* log_buffer) { + assert(*token == nullptr); + auto limiter = static_cast( + cfd->ioptions()->compaction_thread_limiter.get()); + if (limiter == nullptr) { + return true; + } + *token = limiter->GetToken(force); + if (*token != nullptr) { + ROCKS_LOG_BUFFER(log_buffer, + "Thread limiter [%s] increase [%s] compaction task, " + "force: %s, tasks after: %d", + limiter->GetName().c_str(), cfd->GetName().c_str(), + force ? "true" : "false", limiter->GetOutstandingTask()); + return true; + } + return false; +} + Status DBImpl::SyncClosedLogs(JobContext* job_context) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); mutex_.AssertHeld(); @@ -49,6 +107,9 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) { "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, log->get_log_number()); s = log->file()->Sync(immutable_db_options_.use_fsync); + if (!s.ok()) { + break; + } } if (s.ok()) { s = directories_.GetWalDir()->Fsync(); @@ -60,14 +121,7 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) { // "number < current_log_number". MarkLogsSynced(current_log_number - 1, true, s); if (!s.ok()) { - Status new_bg_error = s; - // may temporarily unlock and lock the mutex. 
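The RequestCompactionToken() implementation above gates compactions behind a per-column-family thread limiter: a token is handed out while capacity remains (or unconditionally when force is set, as for manual compactions) and is returned when dropped. A reduced RAII model of that idea, not rocksdb's ConcurrentTaskLimiterImpl:

#include <atomic>
#include <memory>

class LimiterSketch {
 public:
  explicit LimiterSketch(int max_tasks) : max_(max_tasks), outstanding_(0) {}

  class Token {
   public:
    explicit Token(LimiterSketch* l) : limiter_(l) {}
    ~Token() { limiter_->outstanding_.fetch_sub(1); }  // release on scope exit
   private:
    LimiterSketch* limiter_;
  };

  // force = true always grants a token, mirroring the manual-compaction path.
  std::unique_ptr<Token> GetToken(bool force) {
    int prev = outstanding_.fetch_add(1);
    if (force || prev < max_) return std::unique_ptr<Token>(new Token(this));
    outstanding_.fetch_sub(1);  // over the cap: undo the claim and refuse
    return nullptr;
  }

 private:
  const int max_;
  std::atomic<int> outstanding_;
};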
- EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kFlush, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; - } + error_handler_.SetBGError(s, BackgroundErrorReason::kFlush); TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed"); return s; } @@ -77,26 +131,31 @@ Status DBImpl::SyncClosedLogs(JobContext* job_context) { Status DBImpl::FlushMemTableToOutputFile( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, - bool* made_progress, JobContext* job_context, LogBuffer* log_buffer) { + bool* made_progress, JobContext* job_context, + SuperVersionContext* superversion_context, + std::vector& snapshot_seqs, + SequenceNumber earliest_write_conflict_snapshot, + SnapshotChecker* snapshot_checker, LogBuffer* log_buffer, + Env::Priority thread_pri) { mutex_.AssertHeld(); assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); - SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); - FlushJob flush_job( - dbname_, cfd, immutable_db_options_, mutable_cf_options, env_options_, - versions_.get(), &mutex_, &shutting_down_, snapshot_seqs, - earliest_write_conflict_snapshot, job_context, log_buffer, - directories_.GetDbDir(), directories_.GetDataDir(0U), + dbname_, cfd, immutable_db_options_, mutable_cf_options, + nullptr /* memtable_id */, env_options_for_compaction_, versions_.get(), + &mutex_, &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), + GetDataDir(cfd, 0U), GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, - &event_logger_, mutable_cf_options.report_bg_io_stats); + &event_logger_, mutable_cf_options.report_bg_io_stats, + true /* sync_output_directory */, true /* write_manifest */, thread_pri); FileMetaData file_meta; + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); flush_job.PickMemTable(); + TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:AfterPickMemtables"); #ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. @@ -106,7 +165,7 @@ Status DBImpl::FlushMemTableToOutputFile( Status s; if (logfile_number_ > 0 && - versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 0) { + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1) { // If there are more than one column families, we need to make sure that // all the log files except the most recent one are synced. Otherwise if // the host crashes after flushing and before WAL is persistent, the @@ -114,6 +173,8 @@ Status DBImpl::FlushMemTableToOutputFile( // other column families are missing. // SyncClosedLogs() may unlock and re-lock the db_mutex. s = SyncClosedLogs(job_context); + } else { + TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip"); } // Within flush_job.Run, rocksdb may call event listener to notify @@ -123,16 +184,16 @@ Status DBImpl::FlushMemTableToOutputFile( // and EventListener callback will be called when the db_mutex // is unlocked by the current thread. 
if (s.ok()) { - s = flush_job.Run(&file_meta); + s = flush_job.Run(&logs_with_prep_tracker_, &file_meta); } else { flush_job.Cancel(); } if (s.ok()) { - InstallSuperVersionAndScheduleWorkWrapper(cfd, job_context, - mutable_cf_options); + InstallSuperVersionAndScheduleWork(cfd, superversion_context, + mutable_cf_options); if (made_progress) { - *made_progress = 1; + *made_progress = true; } VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", @@ -140,18 +201,9 @@ Status DBImpl::FlushMemTableToOutputFile( cfd->current()->storage_info()->LevelSummary(&tmp)); } - if (!s.ok() && !s.IsShutdownInProgress() && - immutable_db_options_.paranoid_checks && bg_error_.ok()) { + if (!s.ok() && !s.IsShutdownInProgress()) { Status new_bg_error = s; - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kFlush, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - // if a bad error happened (not ShutdownInProgress), paranoid_checks is - // true, and the error isn't handled by callback, mark DB read-only - bg_error_ = new_bg_error; - } + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } if (s.ok()) { #ifndef ROCKSDB_LITE @@ -163,24 +215,324 @@ Status DBImpl::FlushMemTableToOutputFile( if (sfm) { // Notify sst_file_manager that a new file was added std::string file_path = MakeTableFileName( - immutable_db_options_.db_paths[0].path, file_meta.fd.GetNumber()); + cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber()); sfm->OnAddFile(file_path); - if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { - Status new_bg_error = Status::IOError("Max allowed space was reached"); + if (sfm->IsMaxAllowedSpaceReached()) { + Status new_bg_error = + Status::SpaceLimit("Max allowed space was reached"); TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", &new_bg_error); - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kFlush, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); + } + } +#endif // ROCKSDB_LITE + } + return s; +} + +Status DBImpl::FlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) { + if (immutable_db_options_.atomic_flush) { + return AtomicFlushMemTablesToOutputFiles( + bg_flush_args, made_progress, job_context, log_buffer, thread_pri); + } + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + Status status; + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + SuperVersionContext* superversion_context = arg.superversion_context_; + Status s = FlushMemTableToOutputFile( + cfd, mutable_cf_options, made_progress, job_context, + superversion_context, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, log_buffer, thread_pri); + if (!s.ok()) { + status = s; + if (!s.IsShutdownInProgress()) { + // At this point, DB is not shutting down, nor is cfd dropped. + // Something is wrong, thus we break out of the loop. 
+ break; + } + } + } + return status; +} + +/* + * Atomically flushes multiple column families. + * + * For each column family, all memtables with ID smaller than or equal to the + * ID specified in bg_flush_args will be flushed. Only after all column + * families finish flush will this function commit to MANIFEST. If any of the + * column families are not flushed successfully, this function does not have + * any side-effect on the state of the database. + */ +Status DBImpl::AtomicFlushMemTablesToOutputFiles( + const autovector& bg_flush_args, bool* made_progress, + JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) { + mutex_.AssertHeld(); + + autovector cfds; + for (const auto& arg : bg_flush_args) { + cfds.emplace_back(arg.cfd_); + } + +#ifndef NDEBUG + for (const auto cfd : cfds) { + assert(cfd->imm()->NumNotFlushed() != 0); + assert(cfd->imm()->IsFlushPending()); + } +#endif /* !NDEBUG */ + + std::vector snapshot_seqs; + SequenceNumber earliest_write_conflict_snapshot; + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); + + autovector distinct_output_dirs; + autovector distinct_output_dir_paths; + std::vector jobs; + std::vector all_mutable_cf_options; + int num_cfs = static_cast(cfds.size()); + all_mutable_cf_options.reserve(num_cfs); + for (int i = 0; i < num_cfs; ++i) { + auto cfd = cfds[i]; + Directory* data_dir = GetDataDir(cfd, 0U); + const std::string& curr_path = cfd->ioptions()->cf_paths[0].path; + + // Add to distinct output directories if eligible. Use linear search. Since + // the number of elements in the vector is not large, performance should be + // tolerable. + bool found = false; + for (const auto& path : distinct_output_dir_paths) { + if (path == curr_path) { + found = true; + break; + } + } + if (!found) { + distinct_output_dir_paths.emplace_back(curr_path); + distinct_output_dirs.emplace_back(data_dir); + } + + all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); + const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); + jobs.emplace_back( + dbname_, cfd, immutable_db_options_, mutable_cf_options, + max_memtable_id, env_options_for_compaction_, versions_.get(), &mutex_, + &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, job_context, log_buffer, directories_.GetDbDir(), + data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, + false /* sync_output_directory */, false /* write_manifest */, + thread_pri); + jobs.back().PickMemTable(); + } + + std::vector file_meta(num_cfs); + Status s; + assert(num_cfs == static_cast(jobs.size())); + +#ifndef ROCKSDB_LITE + for (int i = 0; i != num_cfs; ++i) { + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); + // may temporarily unlock and lock the mutex. + NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, + job_context->job_id, jobs[i].GetTableProperties()); + } +#endif /* !ROCKSDB_LITE */ + + if (logfile_number_ > 0) { + // TODO (yanqin) investigate whether we should sync the closed logs for + // single column family case. + s = SyncClosedLogs(job_context); + } + + // exec_status stores the execution status of flush_jobs as + // + autovector> exec_status; + for (int i = 0; i != num_cfs; ++i) { + // Initially all jobs are not executed, with status OK. 
+ exec_status.emplace_back(false, Status::OK()); + } + + if (s.ok()) { + // TODO (yanqin): parallelize jobs with threads. + for (int i = 1; i != num_cfs; ++i) { + exec_status[i].second = + jobs[i].Run(&logs_with_prep_tracker_, &file_meta[i]); + exec_status[i].first = true; + } + if (num_cfs > 1) { + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1"); + TEST_SYNC_POINT( + "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); + } + exec_status[0].second = + jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]); + exec_status[0].first = true; + + Status error_status; + for (const auto& e : exec_status) { + if (!e.second.ok()) { + s = e.second; + if (!e.second.IsShutdownInProgress()) { + // If a flush job did not return OK, and the CF is not dropped, and + // the DB is not shutting down, then we have to return this result to + // caller later. + error_status = e.second; + } + } + } + + s = error_status.ok() ? s : error_status; + } + + if (s.ok() || s.IsShutdownInProgress()) { + // Sync on all distinct output directories. + for (auto dir : distinct_output_dirs) { + if (dir != nullptr) { + s = dir->Fsync(); + if (!s.ok()) { + break; + } + } + } + } + + if (s.ok()) { + auto wait_to_install_func = [&]() { + bool ready = true; + for (size_t i = 0; i != cfds.size(); ++i) { + const auto& mems = jobs[i].GetMemTables(); + if (cfds[i]->IsDropped()) { + // If the column family is dropped, then do not wait. + continue; + } else if (!mems.empty() && + cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) { + // If a flush job needs to install the flush result for mems and + // mems[0] is not the earliest memtable, it means another thread must + // be installing flush results for the same column family, then the + // current thread needs to wait. + ready = false; + break; + } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <= + bg_flush_args[i].max_memtable_id_) { + // If a flush job does not need to install flush results, then it has + // to wait until all memtables up to max_memtable_id_ (inclusive) are + // installed. + ready = false; + break; + } + } + return ready; + }; + + bool resuming_from_bg_err = error_handler_.IsDBStopped(); + while ((!error_handler_.IsDBStopped() || + error_handler_.GetRecoveryError().ok()) && + !wait_to_install_func()) { + atomic_flush_install_cv_.Wait(); + } + + s = resuming_from_bg_err ? 
error_handler_.GetRecoveryError()
+                             : error_handler_.GetBGError();
+  }

+  if (s.ok()) {
+    autovector<ColumnFamilyData*> tmp_cfds;
+    autovector<const autovector<MemTable*>*> mems_list;
+    autovector<const MutableCFOptions*> mutable_cf_options_list;
+    autovector<FileMetaData*> tmp_file_meta;
+    for (int i = 0; i != num_cfs; ++i) {
+      const auto& mems = jobs[i].GetMemTables();
+      if (!cfds[i]->IsDropped() && !mems.empty()) {
+        tmp_cfds.emplace_back(cfds[i]);
+        mems_list.emplace_back(&mems);
+        mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+        tmp_file_meta.emplace_back(&file_meta[i]);
+      }
+    }

+    s = InstallMemtableAtomicFlushResults(
+        nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+        versions_.get(), &mutex_, tmp_file_meta,
+        &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer);
+  }

+  if (s.ok() || s.IsShutdownInProgress()) {
+    assert(num_cfs ==
+           static_cast<int>(job_context->superversion_contexts.size()));
+    for (int i = 0; i != num_cfs; ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      InstallSuperVersionAndScheduleWork(cfds[i],
+                                         &job_context->superversion_contexts[i],
+                                         all_mutable_cf_options[i]);
+      VersionStorageInfo::LevelSummaryStorage tmp;
+      ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+                       cfds[i]->GetName().c_str(),
+                       cfds[i]->current()->storage_info()->LevelSummary(&tmp));
+    }
+    if (made_progress) {
+      *made_progress = true;
+    }
+#ifndef ROCKSDB_LITE
+    auto sfm = static_cast<SstFileManagerImpl*>(
+        immutable_db_options_.sst_file_manager.get());
+    for (int i = 0; i != num_cfs; ++i) {
+      if (cfds[i]->IsDropped()) {
+        continue;
+      }
+      NotifyOnFlushCompleted(cfds[i], &file_meta[i], all_mutable_cf_options[i],
+                             job_context->job_id, jobs[i].GetTableProperties());
+      if (sfm) {
+        std::string file_path = MakeTableFileName(
+            cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+        sfm->OnAddFile(file_path);
+        if (sfm->IsMaxAllowedSpaceReached() &&
+            error_handler_.GetBGError().ok()) {
+          Status new_bg_error =
+              Status::SpaceLimit("Max allowed space was reached");
+          error_handler_.SetBGError(new_bg_error,
+                                    BackgroundErrorReason::kFlush);
        }
      }
    }
#endif  // ROCKSDB_LITE
  }
+
+  // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+  // it is not because of CF drop.
+  if (!s.ok() && !s.IsShutdownInProgress()) {
+    // Have to cancel the flush jobs that have NOT executed because we need to
+    // unref the versions.
+    for (int i = 0; i != num_cfs; ++i) {
+      if (!exec_status[i].first) {
+        jobs[i].Cancel();
+      }
+    }
+    for (int i = 0; i != num_cfs; ++i) {
+      if (exec_status[i].first && exec_status[i].second.ok()) {
+        auto& mems = jobs[i].GetMemTables();
+        cfds[i]->imm()->RollbackMemtableFlush(mems,
+                                              file_meta[i].fd.GetNumber());
+      }
+    }
+    Status new_bg_error = s;
+    error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+  }
+
+  return s;
 }

@@ -205,18 +557,20 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
   mutex_.Unlock();
   {
     FlushJobInfo info;
+    info.cf_id = cfd->GetID();
     info.cf_name = cfd->GetName();
     // TODO(yhchiang): make db_paths dynamic in case flush does not
     // go to L0 in the future.
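
// Sketch of the install-ordering wait above, using std::condition_variable in
// place of the DB's atomic_flush_install_cv_. A flusher may only install its
// results once its memtables are the earliest not-yet-installed ones for the
// column family; otherwise it blocks until another thread catches up. All
// names are illustrative, not RocksDB API.
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct InstallGate {
  std::mutex mu;
  std::condition_variable cv;
  uint64_t next_to_install = 1;  // smallest memtable ID not yet installed

  // Blocks until memtables [first_id, last_id] are next in line.
  void InstallInOrder(uint64_t first_id, uint64_t last_id) {
    std::unique_lock<std::mutex> lock(mu);
    cv.wait(lock, [&] { return next_to_install == first_id; });
    // ... commit the flush result (e.g. to the MANIFEST) here ...
    next_to_install = last_id + 1;
    cv.notify_all();  // wake flushers waiting on later memtable ranges
  }
};
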
- info.file_path = MakeTableFileName(immutable_db_options_.db_paths[0].path, + info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_meta->fd.GetNumber()); info.thread_id = env_->GetThreadID(); info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; + info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { listener->OnFlushBegin(this, info); } @@ -224,6 +578,12 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, mutex_.Lock(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. +#else + (void)cfd; + (void)file_meta; + (void)mutable_cf_options; + (void)job_id; + (void)prop; #endif // ROCKSDB_LITE } @@ -249,18 +609,20 @@ void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, mutex_.Unlock(); { FlushJobInfo info; + info.cf_id = cfd->GetID(); info.cf_name = cfd->GetName(); // TODO(yhchiang): make db_paths dynamic in case flush does not // go to L0 in the future. - info.file_path = MakeTableFileName(immutable_db_options_.db_paths[0].path, + info.file_path = MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_meta->fd.GetNumber()); info.thread_id = env_->GetThreadID(); info.job_id = job_id; info.triggered_writes_slowdown = triggered_writes_slowdown; info.triggered_writes_stop = triggered_writes_stop; - info.smallest_seqno = file_meta->smallest_seqno; - info.largest_seqno = file_meta->largest_seqno; + info.smallest_seqno = file_meta->fd.smallest_seqno; + info.largest_seqno = file_meta->fd.largest_seqno; info.table_properties = prop; + info.flush_reason = cfd->GetFlushReason(); for (auto listener : immutable_db_options_.listeners) { listener->OnFlushCompleted(this, info); } @@ -268,24 +630,57 @@ void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd, mutex_.Lock(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. +#else + (void)cfd; + (void)file_meta; + (void)mutable_cf_options; + (void)job_id; + (void)prop; #endif // ROCKSDB_LITE } Status DBImpl::CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { - if (options.target_path_id >= immutable_db_options_.db_paths.size()) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } - auto cfh = reinterpret_cast(column_family); - auto cfd = cfh->cfd(); bool exclusive = options.exclusive_manual_compaction; - Status s = FlushMemTable(cfd, FlushOptions()); - if (!s.ok()) { - LogFlush(immutable_db_options_.info_log); - return s; + bool flush_needed = true; + if (begin != nullptr && end != nullptr) { + // TODO(ajkr): We could also optimize away the flush in certain cases where + // one/both sides of the interval are unbounded. But it requires more + // changes to RangesOverlapWithMemtables. 
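
// The flush-avoidance above hinges on a range-vs-memtable overlap test. A
// simplified stand-in for RangesOverlapWithMemtables over plain string keys,
// assuming each memtable can report its smallest and largest user key (all
// names here are illustrative, not RocksDB API):
#include <string>
#include <vector>

struct KeyRange {
  std::string start, limit;  // inclusive bounds, like Range{*begin, *end}
};

bool RangesOverlap(const KeyRange& query,
                   const std::vector<KeyRange>& memtable_bounds) {
  for (const auto& m : memtable_bounds) {
    // Closed intervals [a,b] and [c,d] overlap iff a <= d and c <= b.
    if (query.start <= m.limit && m.start <= query.limit) {
      return true;  // flush needed: the compaction would miss memtable data
    }
  }
  return false;  // no overlap: CompactRange may skip the pre-flush entirely
}
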
+ Range range(*begin, *end); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + cfd->RangesOverlapWithMemtables({range}, super_version, &flush_needed); + CleanupSuperVersion(super_version); + } + + Status s; + if (flush_needed) { + FlushOptions fo; + fo.allow_write_stall = options.allow_write_stall; + if (immutable_db_options_.atomic_flush) { + autovector cfds; + mutex_.Lock(); + SelectColumnFamiliesForAtomicFlush(&cfds); + mutex_.Unlock(); + s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction, + false /* writes_stopped */); + } else { + s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction, + false /* writes_stopped*/); + } + if (!s.ok()) { + LogFlush(immutable_db_options_.info_log); + return s; + } } int max_level_with_files = 0; @@ -311,7 +706,7 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, } s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, final_output_level, options.target_path_id, - begin, end, exclusive); + options.max_subcompactions, begin, end, exclusive); } else { for (int level = 0; level <= max_level_with_files; level++) { int output_level; @@ -345,7 +740,8 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, } } s = RunManualCompaction(cfd, level, output_level, options.target_path_id, - begin, end, exclusive); + options.max_subcompactions, begin, end, + exclusive); if (!s.ok()) { break; } @@ -384,13 +780,21 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, return s; } -Status DBImpl::CompactFiles( - const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id) { +Status DBImpl::CompactFiles(const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id, + std::vector* const output_file_names, + CompactionJobInfo* compaction_job_info) { #ifdef ROCKSDB_LITE - // not supported in lite version + (void)compact_options; + (void)column_family; + (void)input_file_names; + (void)output_level; + (void)output_path_id; + (void)output_file_names; + (void)compaction_job_info; + // not supported in lite version return Status::NotSupported("Not supported in ROCKSDB LITE"); #else if (column_family == nullptr) { @@ -406,7 +810,7 @@ Status DBImpl::CompactFiles( immutable_db_options_.info_log.get()); // Perform CompactFiles - SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2"); { InstrumentedMutexLock l(&mutex_); @@ -414,15 +818,16 @@ Status DBImpl::CompactFiles( // IngestExternalFile() calls to finish. 
WaitForIngestFile(); - s = CompactFilesImpl(compact_options, cfd, sv->current, - input_file_names, output_level, - output_path_id, &job_context, &log_buffer); - } - if (sv->Unref()) { - mutex_.Lock(); - sv->Cleanup(); - mutex_.Unlock(); - delete sv; + // We need to get current after `WaitForIngestFile`, because + // `IngestExternalFile` may add files that overlap with `input_file_names` + auto* current = cfd->current(); + current->Ref(); + + s = CompactFilesImpl(compact_options, cfd, current, input_file_names, + output_file_names, output_level, output_path_id, + &job_context, &log_buffer, compaction_job_info); + + current->Unref(); } // Find and delete obsolete files @@ -436,7 +841,8 @@ Status DBImpl::CompactFiles( } // release the mutex // delete unnecessary files if any, this is done outside the mutex - if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { // Have to flush the info logs before bg_compaction_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is // released, the deconstructor of DB can kick in and destroy all the @@ -458,8 +864,9 @@ Status DBImpl::CompactFiles( Status DBImpl::CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, - const int output_level, int output_path_id, JobContext* job_context, - LogBuffer* log_buffer) { + std::vector* const output_file_names, const int output_level, + int output_path_id, JobContext* job_context, LogBuffer* log_buffer, + CompactionJobInfo* compaction_job_info) { mutex_.AssertHeld(); if (shutting_down_.load(std::memory_order_acquire)) { @@ -467,7 +874,7 @@ Status DBImpl::CompactFilesImpl( } std::unordered_set input_set; - for (auto file_name : input_file_names) { + for (const auto& file_name : input_file_names) { input_set.insert(TableFileNameToNumber(file_name)); } @@ -477,7 +884,7 @@ Status DBImpl::CompactFilesImpl( version->GetColumnFamilyMetaData(&cf_meta); if (output_path_id < 0) { - if (immutable_db_options_.db_paths.size() == 1U) { + if (cfd->ioptions()->cf_paths.size() == 1U) { output_path_id = 0; } else { return Status::NotSupported( @@ -499,57 +906,61 @@ Status DBImpl::CompactFilesImpl( return s; } - for (auto inputs : input_files) { + for (const auto& inputs : input_files) { if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) { return Status::Aborted( "Some of the necessary compaction input " "files are already being compacted"); } } + bool sfm_reserved_compact_space = false; + // First check if we have enough room to do the compaction + bool enough_room = EnoughRoomForCompaction( + cfd, input_files, &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // m's vars will get set properly at the end of this function, + // as long as status == CompactionTooLarge + return Status::CompactionTooLarge(); + } // At this point, CompactFiles will be run. bg_compaction_scheduled_++; - unique_ptr c; + std::unique_ptr c; assert(cfd->compaction_picker()); c.reset(cfd->compaction_picker()->CompactFiles( compact_options, input_files, output_level, version->storage_info(), *cfd->GetLatestMutableCFOptions(), output_path_id)); - if (!c) { - return Status::Aborted("Another Level 0 compaction is running"); - } + // we already sanitized the set of input files and checked for conflicts + // without releasing the lock, so we're guaranteed a compaction can be formed. 
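
// CompactFilesImpl keys its bookkeeping on file numbers parsed out of SST
// file names, as in the input_set built above. A minimal stand-in parser,
// assuming the usual "<number>.sst" naming; FileNameToNumber and BuildInputSet
// are illustrative, not the real TableFileNameToNumber:
#include <cstdint>
#include <string>
#include <unordered_set>
#include <vector>

uint64_t FileNameToNumber(const std::string& name) {
  // "000123.sst" -> 123; returns 0 for names that do not parse.
  size_t dot = name.find('.');
  if (dot == std::string::npos) return 0;
  uint64_t n = 0;
  for (size_t i = 0; i < dot; ++i) {
    if (name[i] < '0' || name[i] > '9') return 0;
    n = n * 10 + static_cast<uint64_t>(name[i] - '0');
  }
  return n;
}

std::unordered_set<uint64_t> BuildInputSet(
    const std::vector<std::string>& input_file_names) {
  std::unordered_set<uint64_t> input_set;
  for (const auto& file_name : input_file_names) {
    input_set.insert(FileNameToNumber(file_name));
  }
  return input_set;
}
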
+ assert(c != nullptr); + c->SetInputVersion(version); // deletion compaction currently not allowed in CompactFiles. assert(!c->deletion_compaction()); + std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); assert(is_snapshot_supported_ || snapshots_.empty()); + CompactionJobStats compaction_job_stats; CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, env_options_, - versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, - snapshot_seqs, earliest_write_conflict_snapshot, table_cache_, - &event_logger_, c->mutable_cf_options()->paranoid_file_checks, + job_context->job_id, c.get(), immutable_db_options_, + env_options_for_compaction_, versions_.get(), &shutting_down_, + preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), stats_, &mutex_, + &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_checker, table_cache_, &event_logger_, + c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - nullptr); // Here we pass a nullptr for CompactionJobStats because - // CompactFiles does not trigger OnCompactionCompleted(), - // which is the only place where CompactionJobStats is - // returned. The idea of not triggering OnCompationCompleted() - // is that CompactFiles runs in the caller thread, so the user - // should always know when it completes. As a result, it makes - // less sense to notify the users something they should already - // know. - // - // In the future, if we would like to add CompactionJobStats - // support for CompactFiles, we should have CompactFiles API - // pass a pointer of CompactionJobStats as the out-value - // instead of using EventListener. 
+ &compaction_job_stats, Env::Priority::USER); // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already @@ -570,13 +981,27 @@ Status DBImpl::CompactFilesImpl( Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); } c->ReleaseCompactionFiles(s); +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + if (compaction_job_info != nullptr) { + BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats, + job_context->job_id, version, compaction_job_info); + } + if (status.ok()) { // Done } else if (status.IsShutdownInProgress()) { @@ -586,15 +1011,15 @@ Status DBImpl::CompactFilesImpl( "[%s] [JOB %d] Compaction error: %s", c->column_family_data()->GetName().c_str(), job_context->job_id, status.ToString().c_str()); - if (immutable_db_options_.paranoid_checks && bg_error_.ok()) { - Status new_bg_error = status; - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kCompaction, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; - } + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + } + + if (output_file_names != nullptr) { + for (const auto newf : c->edit()->GetNewFiles()) { + (*output_file_names) + .push_back(TableFileName(c->immutable_cf_options()->cf_paths, + newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); } } @@ -604,6 +1029,8 @@ Status DBImpl::CompactFilesImpl( if (bg_compaction_scheduled_ == 0) { bg_cv_.SignalAll(); } + MaybeScheduleFlushOrCompaction(); + TEST_SYNC_POINT("CompactFilesImpl:End"); return status; } @@ -637,21 +1064,23 @@ Status DBImpl::ContinueBackgroundWork() { return Status::OK(); } -void DBImpl::NotifyOnCompactionCompleted( - ColumnFamilyData* cfd, Compaction *c, const Status &st, - const CompactionJobStats& compaction_job_stats, - const int job_id) { +void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, + const Status& st, + const CompactionJobStats& job_stats, + int job_id) { #ifndef ROCKSDB_LITE - if (immutable_db_options_.listeners.size() == 0U) { + if (immutable_db_options_.listeners.empty()) { return; } mutex_.AssertHeld(); if (shutting_down_.load(std::memory_order_acquire)) { return; } + Version* current = cfd->current(); + current->Ref(); // release lock while notifying events mutex_.Unlock(); - TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info; info.cf_name = cfd->GetName(); @@ -660,18 +1089,18 @@ void DBImpl::NotifyOnCompactionCompleted( info.job_id = job_id; info.base_input_level = c->start_level(); info.output_level = c->output_level(); - info.stats = compaction_job_stats; + info.stats = job_stats; info.table_properties = c->GetOutputTableProperties(); info.compaction_reason = c->compaction_reason(); info.compression = 
c->output_compression(); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { - auto fn = TableFileName(immutable_db_options_.db_paths, + auto fn = TableFileName(c->immutable_cf_options()->cf_paths, fmd->fd.GetNumber(), fmd->fd.GetPathId()); info.input_files.push_back(fn); if (info.table_properties.count(fn) == 0) { std::shared_ptr tp; - auto s = cfd->current()->GetTableProperties(&tp, fmd, &fn); + auto s = current->GetTableProperties(&tp, fmd, &fn); if (s.ok()) { info.table_properties[fn] = tp; } @@ -679,17 +1108,59 @@ void DBImpl::NotifyOnCompactionCompleted( } } for (const auto newf : c->edit()->GetNewFiles()) { - info.output_files.push_back(TableFileName(immutable_db_options_.db_paths, - newf.second.fd.GetNumber(), - newf.second.fd.GetPathId())); + info.output_files.push_back(TableFileName( + c->immutable_cf_options()->cf_paths, newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); } + for (auto listener : immutable_db_options_.listeners) { + listener->OnCompactionBegin(this, info); + } + } + mutex_.Lock(); + current->Unref(); +#else + (void)cfd; + (void)c; + (void)st; + (void)job_stats; + (void)job_id; +#endif // ROCKSDB_LITE +} + +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id) { +#ifndef ROCKSDB_LITE + if (immutable_db_options_.listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + Version* current = cfd->current(); + current->Ref(); + // release lock while notifying events + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); + { + CompactionJobInfo info; + BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, + &info); for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionCompleted(this, info); } } mutex_.Lock(); + current->Unref(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. 
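
// Both notification paths above follow the same shape: pin the current
// Version, drop the DB mutex while user listeners run, then re-lock and
// unpin. A condensed sketch of that pattern with illustrative types; the
// caller is assumed to hold db_mutex on entry:
#include <mutex>

struct VersionSketch {
  int refs = 1;
  void Ref() { ++refs; }
  void Unref() { --refs; }  // the real Version deletes itself at zero refs
};

template <typename Listener>
void NotifyOutsideLock(std::mutex& db_mutex, VersionSketch* current,
                       Listener&& notify) {
  current->Ref();     // keep the version alive across the unlock
  db_mutex.unlock();  // user callbacks must not run under the DB mutex
  notify(*current);   // e.g. listener->OnCompactionCompleted(...)
  db_mutex.lock();
  current->Unref();   // drop the pin under the mutex again
}
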
+#else + (void)cfd; + (void)c; + (void)st; + (void)compaction_job_stats; + (void)job_id; #endif // ROCKSDB_LITE } @@ -701,8 +1172,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { return Status::InvalidArgument("Target level exceeds number of levels"); } - std::unique_ptr superversion_to_free; - std::unique_ptr new_superversion(new SuperVersion()); + SuperVersionContext sv_context(/* create_superversion */ true); Status status; @@ -748,7 +1218,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, @@ -757,8 +1227,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, directories_.GetDbDir()); - superversion_to_free.reset(InstallSuperVersionAndScheduleWork( - cfd, new_superversion.release(), mutable_cf_options)); + InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); @@ -770,6 +1239,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { } } + sv_context.Clean(); refitting_level_ = false; return status; @@ -780,25 +1250,83 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { return cfh->cfd()->NumberLevels(); } -int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { +int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) { return 0; } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); InstrumentedMutexLock l(&mutex_); - return cfh->cfd()->GetSuperVersion()-> - mutable_cf_options.level0_stop_writes_trigger; + return cfh->cfd() + ->GetSuperVersion() + ->mutable_cf_options.level0_stop_writes_trigger; } Status DBImpl::Flush(const FlushOptions& flush_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return FlushMemTable(cfh->cfd(), flush_options); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.", + cfh->GetName().c_str()); + Status s; + if (immutable_db_options_.atomic_flush) { + s = AtomicFlushMemTables({cfh->cfd()}, flush_options, + FlushReason::kManualFlush); + } else { + s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush); + } + + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] Manual flush finished, status: %s\n", + cfh->GetName().c_str(), s.ToString().c_str()); + return s; +} + +Status DBImpl::Flush(const FlushOptions& flush_options, + const std::vector& column_families) { + Status s; + if (!immutable_db_options_.atomic_flush) { + for (auto cfh : column_families) { + s = Flush(flush_options, cfh); + if (!s.ok()) { + break; + } + } + } else { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush start.\n" + "=====Column families:====="); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + autovector cfds; + 
std::for_each(column_families.begin(), column_families.end(), + [&cfds](ColumnFamilyHandle* elem) { + auto cfh = static_cast(elem); + cfds.emplace_back(cfh->cfd()); + }); + s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Manual atomic flush finished, status: %s\n" + "=====Column families:=====", + s.ToString().c_str()); + for (auto cfh : column_families) { + auto cfhi = static_cast(cfh); + ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", + cfhi->GetName().c_str()); + } + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "=====End of column families list====="); + } + return s; } Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, uint32_t output_path_id, + uint32_t max_subcompactions, const Slice* begin, const Slice* end, bool exclusive, bool disallow_trivial_move) { assert(input_level == ColumnFamilyData::kCompactAllLevels || @@ -826,7 +1354,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.begin = nullptr; } else { - begin_storage.SetMaxPossibleForUserKey(*begin); + begin_storage.SetMinPossibleForUserKey(*begin); manual.begin = &begin_storage; } if (end == nullptr || @@ -834,7 +1362,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.end = nullptr; } else { - end_storage.SetMinPossibleForUserKey(*end); + end_storage.SetMaxPossibleForUserKey(*end); manual.end = &end_storage; } @@ -874,21 +1402,24 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual compaction starting", cfd->GetName().c_str()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, + immutable_db_options_.info_log.get()); // We don't check bg_error_ here, because if we get the error in compaction, // the compaction will set manual.status to bg_error_ and set manual.done to // true. while (!manual.done) { assert(HasPendingManualCompaction()); manual_conflict = false; - Compaction* compaction; + Compaction* compaction = nullptr; if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || scheduled || - ((manual.manual_end = &manual.tmp_storage1) && + (((manual.manual_end = &manual.tmp_storage1) != nullptr) && ((compaction = manual.cfd->CompactRange( *manual.cfd->GetLatestMutableCFOptions(), manual.input_level, - manual.output_level, manual.output_path_id, manual.begin, - manual.end, &manual.manual_end, &manual_conflict)) == nullptr) && - manual_conflict)) { + manual.output_level, manual.output_path_id, max_subcompactions, + manual.begin, manual.end, &manual.manual_end, + &manual_conflict)) == nullptr && + manual_conflict))) { // exclusive manual compactions should not see a conflict during // CompactRange assert(!exclusive || !manual_conflict); @@ -910,14 +1441,20 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->manual_compaction_state = &manual; ca->prepicked_compaction->compaction = compaction; + if (!RequestCompactionToken( + cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) { + // Don't throttle manual compaction, only count outstanding tasks. 
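
// RequestCompactionToken above consults a per-CF compaction thread limiter;
// manual compactions are counted but never refused. A toy counting limiter
// with the same acquire-or-requeue flavor (illustrative only, not the real
// RocksDB limiter):
class ToyLimiter {
 public:
  explicit ToyLimiter(int max_tasks) : max_tasks_(max_tasks) {}

  // bypass == true models a manual compaction: always admitted, still counted.
  bool TryAcquire(bool bypass) {
    if (!bypass && outstanding_ >= max_tasks_) {
      return false;  // throttled; caller requeues the column family
    }
    ++outstanding_;
    return true;
  }
  void Release() { --outstanding_; }  // the real token releases via RAII

 private:
  int max_tasks_;
  int outstanding_ = 0;
};
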
+ assert(false); + } manual.incomplete = false; bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCallback); + &DBImpl::UnscheduleCompactionCallback); scheduled = true; } } + log_buffer.FlushBufferToLog(); assert(!manual.in_progress); assert(HasPendingManualCompaction()); RemoveManualCompaction(&manual); @@ -925,64 +1462,290 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, return manual.status; } +void DBImpl::GenerateFlushRequest(const autovector& cfds, + FlushRequest* req) { + assert(req != nullptr); + req->reserve(cfds.size()); + for (const auto cfd : cfds) { + if (nullptr == cfd) { + // cfd may be null, see DBImpl::ScheduleFlushes + continue; + } + uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(); + req->emplace_back(cfd, max_memtable_id); + } +} + Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& flush_options, - bool writes_stopped) { + FlushReason flush_reason, bool writes_stopped) { Status s; + uint64_t flush_memtable_id = 0; + if (!flush_options.allow_write_stall) { + bool flush_needed = true; + s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); + TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone"); + if (!s.ok() || !flush_needed) { + return s; + } + } + FlushRequest flush_req; { WriteContext context; InstrumentedMutexLock guard_lock(&mutex_); - if (cfd->imm()->NumNotFlushed() == 0 && cfd->mem()->IsEmpty()) { - // Nothing to flush - return Status::OK(); - } - WriteThread::Writer w; if (!writes_stopped) { write_thread_.EnterUnbatched(&w, &mutex_); } - // SwitchMemtable() will release and reacquire mutex - // during execution - s = SwitchMemtable(cfd, &context); + if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { + s = SwitchMemtable(cfd, &context); + } + + if (s.ok()) { + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); + } + } + + if (s.ok() && !flush_req.empty()) { + for (auto& elem : flush_req) { + ColumnFamilyData* loop_cfd = elem.first; + loop_cfd->imm()->FlushRequested(); + } + SchedulePendingFlush(flush_req, flush_reason); + MaybeScheduleFlushOrCompaction(); + } if (!writes_stopped) { write_thread_.ExitUnbatched(&w); } + } - cfd->imm()->FlushRequested(); + if (s.ok() && flush_options.wait) { + autovector cfds; + autovector flush_memtable_ids; + for (auto& iter : flush_req) { + cfds.push_back(iter.first); + flush_memtable_ids.push_back(&(iter.second)); + } + s = WaitForFlushMemTables(cfds, flush_memtable_ids, + (flush_reason == FlushReason::kErrorRecovery)); + } + TEST_SYNC_POINT("FlushMemTableFinished"); + return s; +} - // schedule flush - SchedulePendingFlush(cfd); - MaybeScheduleFlushOrCompaction(); +// Flush all elments in 'column_family_datas' +// and atomically record the result to the MANIFEST. 
+Status DBImpl::AtomicFlushMemTables(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const FlushOptions& flush_options, FlushReason flush_reason,
+    bool writes_stopped) {
+  Status s;
+  if (!flush_options.allow_write_stall) {
+    int num_cfs_to_flush = 0;
+    for (auto cfd : column_family_datas) {
+      bool flush_needed = true;
+      s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+      if (!s.ok()) {
+        return s;
+      } else if (flush_needed) {
+        ++num_cfs_to_flush;
+      }
+    }
+    if (0 == num_cfs_to_flush) {
+      return s;
+    }
+  }
+  FlushRequest flush_req;
+  autovector<ColumnFamilyData*> cfds;
+  {
+    WriteContext context;
+    InstrumentedMutexLock guard_lock(&mutex_);
+
+    WriteThread::Writer w;
+    if (!writes_stopped) {
+      write_thread_.EnterUnbatched(&w, &mutex_);
+    }
+
+    for (auto cfd : column_family_datas) {
+      if (cfd->IsDropped()) {
+        continue;
+      }
+      if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+          !cached_recoverable_state_empty_.load()) {
+        cfds.emplace_back(cfd);
+      }
+    }
+    for (auto cfd : cfds) {
+      if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) {
+        continue;
+      }
+      cfd->Ref();
+      s = SwitchMemtable(cfd, &context);
+      cfd->Unref();
+      if (!s.ok()) {
+        break;
+      }
+    }
+    if (s.ok()) {
+      AssignAtomicFlushSeq(cfds);
+      for (auto cfd : cfds) {
+        cfd->imm()->FlushRequested();
+      }
+      GenerateFlushRequest(cfds, &flush_req);
+      SchedulePendingFlush(flush_req, flush_reason);
+      MaybeScheduleFlushOrCompaction();
+    }
+
+    if (!writes_stopped) {
+      write_thread_.ExitUnbatched(&w);
+    }
+  }
+  TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");

   if (s.ok() && flush_options.wait) {
-    // Wait until the compaction completes
-    s = WaitForFlushMemTable(cfd);
+    autovector<const uint64_t*> flush_memtable_ids;
+    for (auto& iter : flush_req) {
+      flush_memtable_ids.push_back(&(iter.second));
+    }
+    s = WaitForFlushMemTables(cfds, flush_memtable_ids,
+                              (flush_reason == FlushReason::kErrorRecovery));
   }
   return s;
 }

-Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
-  Status s;
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause write stall, for example if one memtable is being flushed already.
+// This method tries to avoid write stall (similar to CompactRange() behavior):
+// it emulates how the SuperVersion / LSM would change if flush happens, checks
+// it against various constraints and delays flush if it'd cause write stall.
+// Caller should check status and flush_needed to see if flush already happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+                                                 bool* flush_needed) {
+  {
+    *flush_needed = true;
+    InstrumentedMutexLock l(&mutex_);
+    uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+    WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+    do {
+      if (write_stall_condition != WriteStallCondition::kNormal) {
+        // Same error handling as user writes: Don't wait if there's a
+        // background error, even if it's a soft error.
We might wait here + // indefinitely as the pending flushes/compactions may never finish + // successfully, resulting in the stall condition lasting indefinitely + if (error_handler_.IsBGWorkStopped()) { + return error_handler_.GetBGError(); + } + + TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait"); + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "[%s] WaitUntilFlushWouldNotStallWrites" + " waiting on stall conditions to clear", + cfd->GetName().c_str()); + bg_cv_.Wait(); + } + if (cfd->IsDropped() || shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + uint64_t earliest_memtable_id = + std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID()); + if (earliest_memtable_id > orig_active_memtable_id) { + // We waited so long that the memtable we were originally waiting on was + // flushed. + *flush_needed = false; + return Status::OK(); + } + + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + const auto* vstorage = cfd->current()->storage_info(); + + // Skip stalling check if we're below auto-flush and auto-compaction + // triggers. If it stalled in these conditions, that'd mean the stall + // triggers are so low that stalling is needed for any background work. In + // that case we shouldn't wait since background work won't be scheduled. + if (cfd->imm()->NumNotFlushed() < + cfd->ioptions()->min_write_buffer_number_to_merge && + vstorage->l0_delay_trigger_count() < + mutable_cf_options.level0_file_num_compaction_trigger) { + break; + } + + // check whether one extra immutable memtable or an extra L0 file would + // cause write stalling mode to be entered. It could still enter stall + // mode due to pending compaction bytes, but that's less common + write_stall_condition = + ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + 1, + vstorage->l0_delay_trigger_count() + 1, + vstorage->estimated_compaction_needed_bytes(), mutable_cf_options) + .first; + } while (write_stall_condition != WriteStallCondition::kNormal); + } + return Status::OK(); +} + +// Wait for memtables to be flushed for multiple column families. +// let N = cfds.size() +// for i in [0, N), +// 1) if flush_memtable_ids[i] is not null, then the memtables with lower IDs +// have to be flushed for THIS column family; +// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column +// family have to be flushed. +// Finish waiting when ALL column families finish flushing memtables. +// resuming_from_bg_err indicates whether the caller is trying to resume from +// background error or in normal processing. +Status DBImpl::WaitForFlushMemTables( + const autovector& cfds, + const autovector& flush_memtable_ids, + bool resuming_from_bg_err) { + int num = static_cast(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); - while (cfd->imm()->NumNotFlushed() > 0 && bg_error_.ok()) { + // If the caller is trying to resume from bg error, then + // error_handler_.IsDBStopped() is true. + while (resuming_from_bg_err || !error_handler_.IsDBStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { return Status::ShutdownInProgress(); } - if (cfd->IsDropped()) { - // FlushJob cannot flush a dropped CF, if we did not break here - // we will loop forever since cfd->imm()->NumNotFlushed() will never - // drop to zero + // If an error has occurred during resumption, then no need to wait. 
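
// The wait loop below finishes once every requested column family is either
// dropped or has flushed past its target memtable ID. A condensed version of
// that predicate, with illustrative types:
#include <cstdint>
#include <vector>

struct CfWaitState {
  bool dropped = false;
  uint64_t earliest_imm_id = 0;  // like imm()->GetEarliestMemTableID()
  int num_not_flushed = 0;       // like imm()->NumNotFlushed()
};

// flush_targets[i] == nullptr means "wait for all memtables of CF i".
bool DoneWaiting(const std::vector<CfWaitState>& cfs,
                 const std::vector<const uint64_t*>& flush_targets) {
  size_t done = 0;
  for (size_t i = 0; i < cfs.size(); ++i) {
    if (cfs[i].dropped || cfs[i].num_not_flushed == 0 ||
        (flush_targets[i] != nullptr &&
         cfs[i].earliest_imm_id > *flush_targets[i])) {
      ++done;
    }
  }
  return done == cfs.size();  // every CF is dropped or has finished
}
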
+ if (!error_handler_.GetRecoveryError().ok()) { + break; + } + // Number of column families that have been dropped. + int num_dropped = 0; + // Number of column families that have finished flush. + int num_finished = 0; + for (int i = 0; i < num; ++i) { + if (cfds[i]->IsDropped()) { + ++num_dropped; + } else if (cfds[i]->imm()->NumNotFlushed() == 0 || + (flush_memtable_ids[i] != nullptr && + cfds[i]->imm()->GetEarliestMemTableID() > + *flush_memtable_ids[i])) { + ++num_finished; + } + } + if (1 == num_dropped && 1 == num) { return Status::InvalidArgument("Cannot flush a dropped CF"); } + // Column families involved in this flush request have either been dropped + // or finished flush. Then it's time to finish waiting. + if (num_dropped + num_finished == num) { + break; + } bg_cv_.Wait(); } - if (!bg_error_.ok()) { - s = bg_error_; + Status s; + // If not resuming from bg error, and an error has caused the DB to stop, + // then report the bg error to caller. + if (!resuming_from_bg_err && error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); } return s; } @@ -1010,18 +1773,27 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { if (bg_work_paused_ > 0) { // we paused the background work return; + } else if (error_handler_.IsBGWorkStopped() && + !error_handler_.IsRecoveryInProgress()) { + // There has been a hard error and this call is not part of the recovery + // sequence. Bail out here so we don't get into an endless loop of + // scheduling BG work which will again call this function + return; } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions return; } auto bg_job_limits = GetBGJobLimits(); bool is_flush_pool_empty = - env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; + env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && bg_flush_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::HIGH; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this, + &DBImpl::UnscheduleFlushCallback); } // special case -- if high-pri (flush) thread pool is empty, then schedule @@ -1030,20 +1802,30 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { while (unscheduled_flushes_ > 0 && bg_flush_scheduled_ + bg_compaction_scheduled_ < bg_job_limits.max_flushes) { - unscheduled_flushes_--; bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); + FlushThreadArg* fta = new FlushThreadArg; + fta->db_ = this; + fta->thread_pri_ = Env::Priority::LOW; + env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this, + &DBImpl::UnscheduleFlushCallback); } } if (bg_compaction_paused_ > 0) { // we paused the background compaction return; + } else if (error_handler_.IsBGWorkStopped()) { + // Compaction is not part of the recovery sequence from a hard error. We + // might get here because recovery might do a flush and install a new + // super version, which will try to schedule pending compactions. Bail + // out here and let the higher level recovery handle compactions + return; } if (HasExclusiveManualCompaction()) { // only manual compactions are allowed to run. 
don't schedule automatic // compactions + TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict"); return; } @@ -1055,7 +1837,7 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { bg_compaction_scheduled_++; unscheduled_compactions_--; env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this, - &DBImpl::UnscheduleCallback); + &DBImpl::UnscheduleCompactionCallback); } } @@ -1091,63 +1873,92 @@ DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes, } void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { - assert(!cfd->pending_compaction()); + assert(!cfd->queued_for_compaction()); cfd->Ref(); compaction_queue_.push_back(cfd); - cfd->set_pending_compaction(true); + cfd->set_queued_for_compaction(true); } ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { assert(!compaction_queue_.empty()); auto cfd = *compaction_queue_.begin(); compaction_queue_.pop_front(); - assert(cfd->pending_compaction()); - cfd->set_pending_compaction(false); + assert(cfd->queued_for_compaction()); + cfd->set_queued_for_compaction(false); return cfd; } -void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { - assert(!cfd->pending_flush()); - cfd->Ref(); - flush_queue_.push_back(cfd); - cfd->set_pending_flush(true); -} - -ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { +DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); - auto cfd = *flush_queue_.begin(); + FlushRequest flush_req = flush_queue_.front(); + assert(unscheduled_flushes_ >= static_cast(flush_req.size())); + unscheduled_flushes_ -= static_cast(flush_req.size()); flush_queue_.pop_front(); - assert(cfd->pending_flush()); - cfd->set_pending_flush(false); + // TODO: need to unset flush reason? + return flush_req; +} + +ColumnFamilyData* DBImpl::PickCompactionFromQueue( + std::unique_ptr* token, LogBuffer* log_buffer) { + assert(!compaction_queue_.empty()); + assert(*token == nullptr); + autovector throttled_candidates; + ColumnFamilyData* cfd = nullptr; + while (!compaction_queue_.empty()) { + auto first_cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(first_cfd->queued_for_compaction()); + if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) { + throttled_candidates.push_back(first_cfd); + continue; + } + cfd = first_cfd; + cfd->set_queued_for_compaction(false); + break; + } + // Add throttled compaction candidates back to queue in the original order. 
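
// Why the reverse iteration just below preserves order: pushing the throttled
// candidates back to the front in reverse restores exactly the sequence in
// which they were popped. A self-contained demonstration (illustrative only):
#include <cassert>
#include <deque>
#include <vector>

int main() {
  std::deque<int> queue = {4, 5};          // entries still in the queue
  std::vector<int> throttled = {1, 2, 3};  // popped front-to-back, all throttled
  for (auto it = throttled.rbegin(); it != throttled.rend(); ++it) {
    queue.push_front(*it);  // pushes 3, then 2, then 1
  }
  assert((queue == std::deque<int>{1, 2, 3, 4, 5}));
  return 0;
}
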
+  for (auto iter = throttled_candidates.rbegin();
+       iter != throttled_candidates.rend(); ++iter) {
+    compaction_queue_.push_front(*iter);
+  }
   return cfd;
 }

-void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) {
-  if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) {
-    AddToFlushQueue(cfd);
-    ++unscheduled_flushes_;
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+                                  FlushReason flush_reason) {
+  if (flush_req.empty()) {
+    return;
+  }
+  for (auto& iter : flush_req) {
+    ColumnFamilyData* cfd = iter.first;
+    cfd->Ref();
+    cfd->SetFlushReason(flush_reason);
   }
+  unscheduled_flushes_ += static_cast<int>(flush_req.size());
+  flush_queue_.push_back(flush_req);
 }

 void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
-  if (!cfd->pending_compaction() && cfd->NeedsCompaction()) {
+  if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
     AddToCompactionQueue(cfd);
     ++unscheduled_compactions_;
   }
 }

-void DBImpl::SchedulePendingPurge(std::string fname, FileType type,
-                                  uint64_t number, uint32_t path_id,
-                                  int job_id) {
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+                                  FileType type, uint64_t number, int job_id) {
   mutex_.AssertHeld();
-  PurgeFileInfo file_info(fname, type, number, path_id, job_id);
+  PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
   purge_queue_.push_back(std::move(file_info));
 }

-void DBImpl::BGWorkFlush(void* db) {
-  IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+void DBImpl::BGWorkFlush(void* arg) {
+  FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+
+  IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
   TEST_SYNC_POINT("DBImpl::BGWorkFlush");
-  reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
+  reinterpret_cast<DBImpl*>(fta.db_)->BackgroundCallFlush(fta.thread_pri_);
   TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
 }

@@ -1182,7 +1993,7 @@ void DBImpl::BGWorkPurge(void* db) {
   TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
 }

-void DBImpl::UnscheduleCallback(void* arg) {
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
   CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
   delete reinterpret_cast<CompactionArg*>(arg);
   if (ca.prepicked_compaction != nullptr) {
@@ -1191,62 +2002,93 @@ void DBImpl::UnscheduleCallback(void* arg) {
     }
     delete ca.prepicked_compaction;
   }
-  TEST_SYNC_POINT("DBImpl::UnscheduleCallback");
+  TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+  delete reinterpret_cast<FlushThreadArg*>(arg);
+  TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
 }

 Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
-                               LogBuffer* log_buffer) {
+                               LogBuffer* log_buffer, FlushReason* reason,
+                               Env::Priority thread_pri) {
   mutex_.AssertHeld();

-  Status status = bg_error_;
-  if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
-    status = Status::ShutdownInProgress();
+  Status status;
+  *reason = FlushReason::kOthers;
+  // If BG work is stopped due to an error, but a recovery is in progress,
+  // that means this flush is part of the recovery.
So allow it to go through + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } + } else if (!error_handler_.IsRecoveryInProgress()) { + status = error_handler_.GetBGError(); } if (!status.ok()) { return status; } - ColumnFamilyData* cfd = nullptr; + autovector bg_flush_args; + std::vector& superversion_contexts = + job_context->superversion_contexts; while (!flush_queue_.empty()) { // This cfd is already referenced - auto first_cfd = PopFirstFromFlushQueue(); - - if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { - // can't flush this CF, try next one - if (first_cfd->Unref()) { - delete first_cfd; + const FlushRequest& flush_req = PopFirstFromFlushQueue(); + superversion_contexts.clear(); + superversion_contexts.reserve(flush_req.size()); + + for (const auto& iter : flush_req) { + ColumnFamilyData* cfd = iter.first; + if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + if (cfd->Unref()) { + delete cfd; + } + continue; } - continue; + superversion_contexts.emplace_back(SuperVersionContext(true)); + bg_flush_args.emplace_back(cfd, iter.second, + &(superversion_contexts.back())); + } + if (!bg_flush_args.empty()) { + break; } - - // found a flush! - cfd = first_cfd; - break; } - if (cfd != nullptr) { - const MutableCFOptions mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); + if (!bg_flush_args.empty()) { auto bg_job_limits = GetBGJobLimits(); - ROCKS_LOG_BUFFER( - log_buffer, - "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots available %d, " - "flush slots scheduled %d, compaction slots scheduled %d", - cfd->GetName().c_str(), bg_job_limits.max_flushes, - bg_job_limits.max_compactions, bg_flush_scheduled_, - bg_compaction_scheduled_); - status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, - job_context, log_buffer); - if (cfd->Unref()) { - delete cfd; + for (const auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + ROCKS_LOG_BUFFER( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available " + "%d, " + "flush slots scheduled %d, compaction slots scheduled %d", + cfd->GetName().c_str(), bg_job_limits.max_flushes, + bg_job_limits.max_compactions, bg_flush_scheduled_, + bg_compaction_scheduled_); + } + status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress, + job_context, log_buffer, thread_pri); + // All the CFDs in the FlushReq must have the same flush reason, so just + // grab the first one + *reason = bg_flush_args[0].cfd_->GetFlushReason(); + for (auto& arg : bg_flush_args) { + ColumnFamilyData* cfd = arg.cfd_; + if (cfd->Unref()) { + delete cfd; + arg.cfd_ = nullptr; + } } } return status; } -void DBImpl::BackgroundCallFlush() { +void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { bool made_progress = false; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -1261,15 +2103,18 @@ void DBImpl::BackgroundCallFlush() { auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + FlushReason reason; - Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer); - if (!s.ok() && !s.IsShutdownInProgress()) { + Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, + &reason, thread_pri); + if (!s.ok() && !s.IsShutdownInProgress() && + reason != FlushReason::kErrorRecovery) { // Wait a little 
bit before retrying background flush in // case this is an environmental problem and we do not want to // chew up resources for failed flushes for the duration of // the problem. uint64_t error_cnt = - default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); ROCKS_LOG_ERROR(immutable_db_options_.info_log, @@ -1282,14 +2127,17 @@ void DBImpl::BackgroundCallFlush() { mutex_.Lock(); } + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0"); ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); // If flush failed, we want to delete all temporary files that we might have // created. Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); // delete unnecessary files if any, this is done outside the mutex - if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); // Have to flush the info logs before bg_flush_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is // released, the deconstructor of DB can kick in and destroy all the @@ -1302,12 +2150,14 @@ void DBImpl::BackgroundCallFlush() { job_context.Clean(); mutex_.Lock(); } + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); assert(num_running_flushes_ > 0); num_running_flushes_--; bg_flush_scheduled_--; // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + atomic_flush_install_cv_.SignalAll(); bg_cv_.SignalAll(); // IMPORTANT: there should be no code after calling SignalAll. This call may // signal the DB destructor that it's OK to proceed with destruction. In @@ -1321,7 +2171,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, bool made_progress = false; JobContext job_context(next_job_id_.fetch_add(1), true); TEST_SYNC_POINT("BackgroundCallCompaction:0"); - MaybeDumpStats(); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); { @@ -1340,9 +2189,14 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, bg_bottom_compaction_scheduled_) || (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_)); Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer, - prepicked_compaction); + prepicked_compaction, bg_thread_pri); TEST_SYNC_POINT("BackgroundCallCompaction:1"); - if (!s.ok() && !s.IsShutdownInProgress()) { + if (s.IsBusy()) { + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + env_->SleepForMicroseconds(10000); // prevent hot loop + mutex_.Lock(); + } else if (!s.ok() && !s.IsShutdownInProgress()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to // chew up resources for failed compactions for the duration of @@ -1367,9 +2221,11 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, // have created (they might not be all recorded in job_context in case of a // failure). 
Thus, we force full scan in FindObsoleteFiles() FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress()); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); // delete unnecessary files if any, this is done outside the mutex - if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); // Have to flush the info logs before bg_compaction_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is @@ -1379,6 +2235,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, log_buffer.FlushBufferToLog(); if (job_context.HaveSomethingToDelete()) { PurgeObsoleteFiles(job_context); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles"); } job_context.Clean(); mutex_.Lock(); @@ -1400,7 +2257,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, if (made_progress || (bg_compaction_scheduled_ == 0 && bg_bottom_compaction_scheduled_ == 0) || - HasPendingManualCompaction()) { + HasPendingManualCompaction() || unscheduled_compactions_ == 0) { // signal if // * made_progress -- need to wakeup DelayWrite // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl @@ -1419,7 +2276,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, Status DBImpl::BackgroundCompaction(bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, - PrepickedCompaction* prepicked_compaction) { + PrepickedCompaction* prepicked_compaction, + Env::Priority thread_pri) { ManualCompactionState* manual_compaction = prepicked_compaction == nullptr ? nullptr @@ -1429,7 +2287,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); bool is_manual = (manual_compaction != nullptr); - unique_ptr c; + std::unique_ptr c; if (prepicked_compaction != nullptr && prepicked_compaction->compaction != nullptr) { c.reset(prepicked_compaction->compaction); @@ -1441,9 +2299,18 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, is_manual && manual_compaction->disallow_trivial_move; CompactionJobStats compaction_job_stats; - Status status = bg_error_; - if (status.ok() && shutting_down_.load(std::memory_order_acquire)) { - status = Status::ShutdownInProgress(); + Status status; + if (!error_handler_.IsBGWorkStopped()) { + if (shutting_down_.load(std::memory_order_acquire)) { + status = Status::ShutdownInProgress(); + } + } else { + status = error_handler_.GetBGError(); + // If we get here, it means a hard error happened after this compaction + // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got + // a chance to execute. 
Since we didn't pop a cfd from the compaction + // queue, increment unscheduled_compactions_ + unscheduled_compactions_++; } if (!status.ok()) { @@ -1453,6 +2320,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, manual_compaction->in_progress = false; manual_compaction = nullptr; } + if (c) { + c->ReleaseCompactionFiles(status); + c.reset(); + } return status; } @@ -1461,8 +2332,11 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, manual_compaction->in_progress = true; } + std::unique_ptr task_token; + // InternalKey manual_end_storage; // InternalKey* manual_end = &manual_end_storage; + bool sfm_reserved_compact_space = false; if (is_manual) { ManualCompactionState* m = manual_compaction; assert(m->in_progress); @@ -1476,19 +2350,32 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)")); } else { - ROCKS_LOG_BUFFER( - log_buffer, - "[%s] Manual compaction from level-%d to level-%d from %s .. " - "%s; will stop at %s\n", - m->cfd->GetName().c_str(), m->input_level, c->output_level(), - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), - ((m->done || m->manual_end == nullptr) - ? "(end)" - : m->manual_end->DebugString().c_str())); + // First check if we have enough room to do the compaction + bool enough_room = EnoughRoomForCompaction( + m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c.reset(); + // m's vars will get set properly at the end of this function, + // as long as status == CompactionTooLarge + status = Status::CompactionTooLarge(); + } else { + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Manual compaction from level-%d to level-%d from %s .. " + "%s; will stop at %s\n", + m->cfd->GetName().c_str(), m->input_level, c->output_level(), + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? m->end->DebugString().c_str() : "(end)"), + ((m->done || m->manual_end == nullptr) + ? "(end)" + : m->manual_end->DebugString().c_str())); + } } } else if (!is_prepicked && !compaction_queue_.empty()) { - if (HaveManualCompaction(compaction_queue_.front())) { + if (HasExclusiveManualCompaction()) { // Can't compact right now, but try again later TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict"); @@ -1498,17 +2385,23 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, return Status::OK(); } - // cfd is referenced here - auto cfd = PopFirstFromCompactionQueue(); + auto cfd = PickCompactionFromQueue(&task_token, log_buffer); + if (cfd == nullptr) { + // Can't find any executable task from the compaction queue. + // All tasks have been throttled by compaction thread limiter. + ++unscheduled_compactions_; + return Status::Busy(); + } + // We unreference here because the following code will take a Ref() on // this cfd if it is going to use it (Compaction class holds a // reference). // This will all happen under a mutex so we don't have to be afraid of // somebody else deleting it. if (cfd->Unref()) { - delete cfd; // This was the last reference of the column family, so no need to // compact. 
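The unscheduled_compactions_++ above is queue accounting: this thread was counted as scheduled work when MaybeScheduleFlushOrCompaction() dispatched it, so bailing out before popping a column family must hand the unit of work back. A rough sketch of that invariant, with hypothetical names (not RocksDB's actual bookkeeping):

// Illustrative scheduler bookkeeping: a queued work unit is "owned" by the
// thread that pops it; aborting before the pop must hand the unit back by
// re-incrementing the pending counter, or it is silently lost.
struct CompactionAccounting {
  int unscheduled_compactions = 0;  // counterpart of unscheduled_compactions_
  bool bg_work_stopped = false;     // set by an error handler

  // Returns true if a unit was actually consumed.
  bool BeginOne() {
    if (bg_work_stopped) {
      ++unscheduled_compactions;  // scheduled but never executed: give it back
      return false;
    }
    // ... pop a column family from the queue and run its compaction ...
    return true;
  }
};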
+ delete cfd; return Status::OK(); } @@ -1525,27 +2418,48 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); + if (c != nullptr) { - // update statistics - MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); - // There are three things that can change compaction score: - // 1) When flush or compaction finish. This case is covered by - // InstallSuperVersionAndScheduleWork - // 2) When MutableCFOptions changes. This case is also covered by - // InstallSuperVersionAndScheduleWork, because this is when the new - // options take effect. - // 3) When we Pick a new compaction, we "remove" those files being - // compacted from the calculation, which then influences compaction - // score. Here we check if we need the new compaction even without the - // files that are currently being compacted. If we need another - // compaction, we might be able to execute it in parallel, so we add it - // to the queue and schedule a new thread. - if (cfd->NeedsCompaction()) { - // Yes, we need more compactions! + bool enough_room = EnoughRoomForCompaction( + cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer); + + if (!enough_room) { + // Then don't do the compaction + c->ReleaseCompactionFiles(status); + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); AddToCompactionQueue(cfd); ++unscheduled_compactions_; - MaybeScheduleFlushOrCompaction(); + + c.reset(); + // Don't need to sleep here, because BackgroundCallCompaction + // will sleep if !s.ok() + status = Status::CompactionTooLarge(); + } else { + // update statistics + RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersionAndScheduleWork + // 2) When MutableCFOptions changes. This case is also covered by + // InstallSuperVersionAndScheduleWork, because this is when the new + // options take effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add + // it to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); + } } } } @@ -1557,6 +2471,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } else if (c->deletion_compaction()) { // TODO(icanadi) Do we want to honor snapshots here? i.e. 
not delete old // file if there is alive snapshot pointing to it + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); assert(c->num_input_files(1) == 0); assert(c->level() == 0); assert(c->column_family_data()->ioptions()->compaction_style == @@ -1564,20 +2480,28 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats.num_input_files = c->num_input_files(0); + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } status = versions_->LogAndApply(c->column_family_data(), *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); } else if (!trivial_move_disallowed && c->IsTrivialMove()) { TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); // Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. ThreadStatusUtil::SetColumnFamily( @@ -1587,6 +2511,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, compaction_job_stats.num_input_files = c->num_input_files(0); + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + // Move files to next level int32_t moved_files = 0; int64_t moved_bytes = 0; @@ -1599,14 +2526,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->edit()->DeleteFile(c->level(l), f->fd.GetNumber()); c->edit()->AddFile(c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, - f->largest, f->smallest_seqno, f->largest_seqno, - f->marked_for_compaction); - - ROCKS_LOG_BUFFER(log_buffer, "[%s] Moving #%" PRIu64 - " to level-%d %" PRIu64 " bytes\n", - c->column_family_data()->GetName().c_str(), - f->fd.GetNumber(), c->output_level(), - f->fd.GetFileSize()); + f->largest, f->fd.smallest_seqno, + f->fd.largest_seqno, f->marked_for_compaction); + + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->output_level(), f->fd.GetFileSize()); ++moved_files; moved_bytes += f->fd.GetFileSize(); } @@ -1616,8 +2543,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, *c->mutable_cf_options(), c->edit(), &mutex_, directories_.GetDbDir()); // Use latest MutableCFOptions - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); VersionStorageInfo::LevelSummaryStorage tmp; c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(), @@ -1639,9 +2567,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Clear Instrument ThreadStatusUtil::ResetThreadStatus(); - } else if 
(c->column_family_data()->ioptions()->compaction_style == - kCompactionStyleUniversal && - !is_prepicked && c->output_level() > 0 && + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); + } else if (!is_prepicked && c->output_level() > 0 && c->output_level() == c->column_family_data() ->current() @@ -1649,39 +2577,48 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ->MaxOutputLevel( immutable_db_options_.allow_ingest_behind) && env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) { - // Forward universal compactions involving last level to the bottom pool - // if it exists, such that long-running compactions can't block short- - // lived ones, like L0->L0s. + // Forward compactions involving last level to the bottom pool if it exists, + // such that compactions unlikely to contribute to write stalls can be + // delayed or deprioritized. TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool"); CompactionArg* ca = new CompactionArg; ca->db = this; ca->prepicked_compaction = new PrepickedCompaction; ca->prepicked_compaction->compaction = c.release(); ca->prepicked_compaction->manual_compaction_state = nullptr; + // Transfer requested token, so it doesn't need to do it again. + ca->prepicked_compaction->task_token = std::move(task_token); ++bg_bottom_compaction_scheduled_; env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, - this, &DBImpl::UnscheduleCallback); + this, &DBImpl::UnscheduleCompactionCallback); } else { - int output_level __attribute__((unused)) = c->output_level(); + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction", + c->column_family_data()); + int output_level __attribute__((__unused__)); + output_level = c->output_level(); TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial", &output_level); - + std::vector snapshot_seqs; SequenceNumber earliest_write_conflict_snapshot; - std::vector snapshot_seqs = - snapshots_.GetAll(&earliest_write_conflict_snapshot); - + SnapshotChecker* snapshot_checker; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, &snapshot_checker); assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( - job_context->job_id, c.get(), immutable_db_options_, env_options_, - versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, - &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot, - table_cache_, &event_logger_, - c->mutable_cf_options()->paranoid_file_checks, + job_context->job_id, c.get(), immutable_db_options_, + env_options_for_compaction_, versions_.get(), &shutting_down_, + preserve_deletes_seqnum_.load(), log_buffer, directories_.GetDbDir(), + GetDataDir(c->column_family_data(), c->output_path_id()), stats_, + &mutex_, &error_handler_, snapshot_seqs, + earliest_write_conflict_snapshot, snapshot_checker, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->report_bg_io_stats, dbname_, - &compaction_job_stats); + &compaction_job_stats, thread_pri); compaction_job.Prepare(); + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); + mutex_.Unlock(); compaction_job.Run(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); @@ -1689,39 +2626,59 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, status = 
compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - c->column_family_data(), job_context, *c->mutable_cf_options()); + InstallSuperVersionAndScheduleWork(c->column_family_data(), + &job_context->superversion_contexts[0], + *c->mutable_cf_options()); } *made_progress = true; + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", + c->column_family_data()); } if (c != nullptr) { c->ReleaseCompactionFiles(status); *made_progress = true; - NotifyOnCompactionCompleted( - c->column_family_data(), c.get(), status, - compaction_job_stats, job_context->job_id); + +#ifndef ROCKSDB_LITE + // Need to make sure SstFileManager does its bookkeeping + auto sfm = static_cast( + immutable_db_options_.sst_file_manager.get()); + if (sfm && sfm_reserved_compact_space) { + sfm->OnCompactionCompletion(c.get()); + } +#endif // ROCKSDB_LITE + + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, + compaction_job_stats, job_context->job_id); } - // this will unref its input_version and column_family_data - c.reset(); - if (status.ok()) { + if (status.ok() || status.IsCompactionTooLarge()) { // Done } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (immutable_db_options_.paranoid_checks && bg_error_.ok()) { - Status new_bg_error = status; - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kCompaction, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; + error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction); + if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) { + // Put this cfd back in the compaction queue so we can retry after some + // time + auto cfd = c->column_family_data(); + assert(cfd != nullptr); + // Since this compaction failed, we need to recompute the score so it + // takes the original input files into account + c->column_family_data() + ->current() + ->storage_info() + ->ComputeCompactionScore(*(c->immutable_cf_options()), + *(c->mutable_cf_options())); + if (!cfd->queued_for_compaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; } } } + // this will unref its input_version and column_family_data + c.reset(); if (is_manual) { ManualCompactionState* m = manual_compaction; @@ -1757,7 +2714,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, m->begin = &m->tmp_storage; m->incomplete = true; } - m->in_progress = false; // not being processed anymore + m->in_progress = false; // not being processed anymore } TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish"); return status; @@ -1856,30 +2813,59 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { return true; } -// JobContext gets created and destructed outside of the lock -- -// we -// use this convinently to: +#ifndef ROCKSDB_LITE +void DBImpl::BuildCompactionJobInfo( + const ColumnFamilyData* cfd, Compaction* c, const Status& st, + const CompactionJobStats& compaction_job_stats, const int job_id, + const Version* current, CompactionJobInfo* compaction_job_info) const { + assert(compaction_job_info != nullptr); + compaction_job_info->cf_id = cfd->GetID(); + compaction_job_info->cf_name = cfd->GetName(); + compaction_job_info->status = st; + compaction_job_info->thread_id = 
env_->GetThreadID();
+  compaction_job_info->job_id = job_id;
+  compaction_job_info->base_input_level = c->start_level();
+  compaction_job_info->output_level = c->output_level();
+  compaction_job_info->stats = compaction_job_stats;
+  compaction_job_info->table_properties = c->GetOutputTableProperties();
+  compaction_job_info->compaction_reason = c->compaction_reason();
+  compaction_job_info->compression = c->output_compression();
+  for (size_t i = 0; i < c->num_input_levels(); ++i) {
+    for (const auto fmd : *c->inputs(i)) {
+      auto fn = TableFileName(c->immutable_cf_options()->cf_paths,
+                              fmd->fd.GetNumber(), fmd->fd.GetPathId());
+      compaction_job_info->input_files.push_back(fn);
+      if (compaction_job_info->table_properties.count(fn) == 0) {
+        std::shared_ptr<const TableProperties> tp;
+        auto s = current->GetTableProperties(&tp, fmd, &fn);
+        if (s.ok()) {
+          compaction_job_info->table_properties[fn] = tp;
+        }
+      }
+    }
+  }
+  for (const auto& newf : c->edit()->GetNewFiles()) {
+    compaction_job_info->output_files.push_back(
+        TableFileName(c->immutable_cf_options()->cf_paths,
+                      newf.second.fd.GetNumber(), newf.second.fd.GetPathId()));
+  }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
 // * malloc one SuperVersion() outside of the lock -- new_superversion
 // * delete SuperVersion()s outside of the lock -- superversions_to_free
 //
 // However, if InstallSuperVersionAndScheduleWork() gets called twice with the
-// same job_context, we can't reuse the SuperVersion() that got
+// same sv_context, we can't reuse the SuperVersion() that got
 // malloced because
 // first call already used it. In that rare case, we take a hit and create a
 // new SuperVersion() inside of the mutex. We do similar thing
 // for superversion_to_free
-void DBImpl::InstallSuperVersionAndScheduleWorkWrapper(
-    ColumnFamilyData* cfd, JobContext* job_context,
-    const MutableCFOptions& mutable_cf_options) {
-  mutex_.AssertHeld();
-  SuperVersion* old_superversion = InstallSuperVersionAndScheduleWork(
-      cfd, job_context->new_superversion, mutable_cf_options);
-  job_context->new_superversion = nullptr;
-  job_context->superversions_to_free.push_back(old_superversion);
-}
-SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
-    ColumnFamilyData* cfd, SuperVersion* new_sv,
+void DBImpl::InstallSuperVersionAndScheduleWork(
+    ColumnFamilyData* cfd, SuperVersionContext* sv_context,
     const MutableCFOptions& mutable_cf_options) {
   mutex_.AssertHeld();
@@ -1891,20 +2877,98 @@ SuperVersion* DBImpl::InstallSuperVersionAndScheduleWork(
         old_sv->mutable_cf_options.max_write_buffer_number;
   }
-  auto* old = cfd->InstallSuperVersion(
-      new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
+  // this branch is unlikely to step in
+  if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+    sv_context->NewSuperVersion();
+  }
+  cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
+
+  // There may be a small data race here. The snapshot tricking bottommost
+  // compaction may already be released here. But assuming there will always be
+  // newer snapshot created and released frequently, the compaction will be
+  // triggered soon anyway.
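The comment above describes a pre-allocation pattern: the expensive SuperVersion is malloced outside the lock, and only in the rare case where the context was already consumed is a fresh one allocated under the mutex. A compact sketch of that idea, assuming illustrative types (SuperVersionLike, SvContextSketch):

#include <memory>
#include <mutex>

struct SuperVersionLike {};  // stand-in for the real SuperVersion

struct SvContextSketch {
  std::unique_ptr<SuperVersionLike> new_superversion;  // filled outside lock
};

void InstallSketch(std::mutex& mu, SvContextSketch* ctx) {
  std::lock_guard<std::mutex> guard(mu);
  if (ctx->new_superversion == nullptr) {
    // Rare path: a previous install already consumed the preallocation, so
    // take the allocation hit under the mutex, as the comment above explains.
    ctx->new_superversion.reset(new SuperVersionLike());
  }
  // ... publish *ctx->new_superversion; release the replaced object later,
  // outside the lock ...
}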
+  bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+  for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+    bottommost_files_mark_threshold_ = std::min(
+        bottommost_files_mark_threshold_,
+        my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+  }
 
   // Whenever we install new SuperVersion, we might need to issue new flushes or
   // compactions.
-  SchedulePendingFlush(cfd);
   SchedulePendingCompaction(cfd);
   MaybeScheduleFlushOrCompaction();
 
   // Update max_total_in_memory_state_
-  max_total_in_memory_state_ =
-      max_total_in_memory_state_ - old_memtable_size +
-      mutable_cf_options.write_buffer_size *
-      mutable_cf_options.max_write_buffer_number;
-  return old;
+  max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+                               mutable_cf_options.write_buffer_size *
+                                   mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held. This function performs a
+// linear scan of a vector (files_grabbed_for_purge_) in search of a
+// certain element. We expect FindObsoleteFiles with full scan to occur once
+// every 10 hours by default, and the size of the vector is small.
+// Therefore, the cost is affordable even if the mutex is held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain list of files in
+// directories, e.g. env_->GetChildren while holding db mutex.
+// In the future, if we want to reduce the cost of search, we may try to keep
+// the vector sorted.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+  for (auto fn : files_grabbed_for_purge_) {
+    if (file_number == fn) {
+      return false;
+    }
+  }
+  for (const auto& purge_file_info : purge_queue_) {
+    if (purge_file_info.number == file_number) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+  files_grabbed_for_purge_.emplace_back(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+  InstrumentedMutexLock l(&mutex_);
+  // snapshot_checker_ should only be set once. If we need to set it multiple
+  // times, we need to make sure the old one is not deleted while it is still
+  // in use by a compaction job.
+  assert(!snapshot_checker_);
+  snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+    JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+    SequenceNumber* earliest_write_conflict_snapshot,
+    SnapshotChecker** snapshot_checker_ptr) {
+  mutex_.AssertHeld();
+  assert(job_context != nullptr);
+  assert(snapshot_seqs != nullptr);
+  assert(earliest_write_conflict_snapshot != nullptr);
+  assert(snapshot_checker_ptr != nullptr);
+
+  *snapshot_checker_ptr = snapshot_checker_.get();
+  if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+    *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+  }
+  if (*snapshot_checker_ptr != nullptr) {
+    // If snapshot_checker is used, that means the flush/compaction may
+    // contain values not visible to snapshot taken after
+    // flush/compaction job starts. Take a snapshot and it will appear
+    // in snapshot_seqs and force compaction iterator to consider such
+    // snapshots.
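ShouldPurge() above trades asymptotic efficiency for simplicity: two linear scans under the mutex, justified by the rarity of full scans and the small vectors involved. A self-contained sketch of the same membership test; keeping the vectors sorted and using std::binary_search would be the optimization the comment hints at:

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative stand-ins for files_grabbed_for_purge_ / purge_queue_.
bool ShouldPurgeSketch(const std::vector<uint64_t>& grabbed,
                       const std::vector<uint64_t>& queued,
                       uint64_t file_number) {
  auto contains = [file_number](const std::vector<uint64_t>& v) {
    return std::find(v.begin(), v.end(), file_number) != v.end();
  };
  // A file may be purged only if no other job has grabbed it and it is not
  // already sitting in the purge queue.
  return !contains(grabbed) && !contains(queued);
}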
+ const Snapshot* job_snapshot = + GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/); + job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot)); + } + *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_impl_debug.cc b/thirdparty/rocksdb/db/db_impl_debug.cc index a4b378020a..982227149d 100644 --- a/thirdparty/rocksdb/db/db_impl_debug.cc +++ b/thirdparty/rocksdb/db/db_impl_debug.cc @@ -10,6 +10,7 @@ #ifndef NDEBUG #include "db/db_impl.h" +#include "db/error_handler.h" #include "monitoring/thread_status_updater.h" namespace rocksdb { @@ -19,10 +20,22 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() { return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -void DBImpl::TEST_HandleWALFull() { +void DBImpl::TEST_SwitchWAL() { WriteContext write_context; InstrumentedMutexLock l(&mutex_); - HandleWALFull(&write_context); + SwitchWAL(&write_context); +} + +bool DBImpl::TEST_WALBufferIsEmpty(bool lock) { + if (lock) { + log_write_mutex_.Lock(); + } + log::Writer* cur_log_writer = logs_.back().writer; + auto res = cur_log_writer->TEST_BufferIsEmpty(); + if (lock) { + log_write_mutex_.Unlock(); + } + return res; } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( @@ -60,6 +73,10 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { return versions_->manifest_file_number(); } +uint64_t DBImpl::TEST_Current_Next_FileNo() { + return versions_->current_next_file_number(); +} + Status DBImpl::TEST_CompactRange(int level, const Slice* begin, const Slice* end, ColumnFamilyHandle* column_family, @@ -76,7 +93,7 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin, cfd->ioptions()->compaction_style == kCompactionStyleFIFO) ? level : level + 1; - return RunManualCompaction(cfd, level, output_level, 0, begin, end, true, + return RunManualCompaction(cfd, level, output_level, 0, 0, begin, end, true, disallow_trivial_move); } @@ -89,9 +106,11 @@ Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) { return SwitchMemtable(cfd, &write_context); } -Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { +Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall, + ColumnFamilyHandle* cfh) { FlushOptions fo; fo.wait = wait; + fo.allow_write_stall = allow_write_stall; ColumnFamilyData* cfd; if (cfh == nullptr) { cfd = default_cf_handle_->cfd(); @@ -99,7 +118,7 @@ Status DBImpl::TEST_FlushMemTable(bool wait, ColumnFamilyHandle* cfh) { auto cfhi = reinterpret_cast(cfh); cfd = cfhi->cfd(); } - return FlushMemTable(cfd, fo); + return FlushMemTable(cfd, fo, FlushReason::kTest); } Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { @@ -110,10 +129,10 @@ Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); cfd = cfh->cfd(); } - return WaitForFlushMemTable(cfd); + return WaitForFlushMemTable(cfd, nullptr, false); } -Status DBImpl::TEST_WaitForCompact() { +Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { // Wait until the compaction completes // TODO: a bug here. 
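TEST_WALBufferIsEmpty(bool lock) in the debug hunk above takes log_write_mutex_ only when the caller does not already hold it. A sketch of that conditional-locking shape using std::unique_lock with std::defer_lock, which also keeps the unlock automatic on every return path (names are illustrative):

#include <mutex>

bool QueryUnderOptionalLock(std::mutex& mu, bool take_lock,
                            bool (*query)()) {
  std::unique_lock<std::mutex> guard(mu, std::defer_lock);
  if (take_lock) {
    guard.lock();  // otherwise assume the caller already holds mu
  }
  return query();  // guard's destructor releases mu only if we locked it
}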
This function actually does not necessarily @@ -122,20 +141,17 @@ Status DBImpl::TEST_WaitForCompact() { InstrumentedMutexLock l(&mutex_); while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_) && - bg_error_.ok()) { + bg_flush_scheduled_ || + (wait_unscheduled && unscheduled_compactions_)) && + (error_handler_.GetBGError() == Status::OK())) { bg_cv_.Wait(); } - return bg_error_; + return error_handler_.GetBGError(); } -void DBImpl::TEST_LockMutex() { - mutex_.Lock(); -} +void DBImpl::TEST_LockMutex() { mutex_.Lock(); } -void DBImpl::TEST_UnlockMutex() { - mutex_.Unlock(); -} +void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } void* DBImpl::TEST_BeginWrite() { auto w = new WriteThread::Writer(); @@ -179,11 +195,21 @@ Status DBImpl::TEST_GetAllImmutableCFOptions( } uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { - return FindMinLogContainingOutstandingPrep(); + return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep(); +} + +size_t DBImpl::TEST_PreparedSectionCompletedSize() { + return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize(); +} + +size_t DBImpl::TEST_LogsWithPrepSize() { + return logs_with_prep_tracker_.TEST_LogsWithPrepSize(); } uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() { - return FindMinPrepLogReferencedByMemTable(); + autovector empty_list; + return FindMinPrepLogReferencedByMemTable(versions_.get(), nullptr, + empty_list); } Status DBImpl::TEST_GetLatestMutableCFOptions( @@ -205,5 +231,38 @@ int DBImpl::TEST_BGFlushesAllowed() const { return GetBGJobLimits().max_flushes; } +SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const { + if (last_seq_same_as_publish_seq_) { + return versions_->LastSequence(); + } else { + return versions_->LastAllocatedSequence(); + } +} + +size_t DBImpl::TEST_GetWalPreallocateBlockSize( + uint64_t write_buffer_size) const { + InstrumentedMutexLock l(&mutex_); + return GetWalPreallocateBlockSize(write_buffer_size); +} + +void DBImpl::TEST_WaitForDumpStatsRun(std::function callback) const { + if (thread_dump_stats_ != nullptr) { + thread_dump_stats_->TEST_WaitForRun(callback); + } +} + +void DBImpl::TEST_WaitForPersistStatsRun(std::function callback) const { + if (thread_persist_stats_ != nullptr) { + thread_persist_stats_->TEST_WaitForRun(callback); + } +} + +bool DBImpl::TEST_IsPersistentStatsEnabled() const { + return thread_persist_stats_ && thread_persist_stats_->IsRunning(); +} + +size_t DBImpl::TEST_EstiamteStatsHistorySize() const { + return EstiamteStatsHistorySize(); +} } // namespace rocksdb #endif // NDEBUG diff --git a/thirdparty/rocksdb/db/db_impl_experimental.cc b/thirdparty/rocksdb/db/db_impl_experimental.cc index 0d010758e6..47a880199e 100644 --- a/thirdparty/rocksdb/db/db_impl_experimental.cc +++ b/thirdparty/rocksdb/db/db_impl_experimental.cc @@ -30,10 +30,10 @@ Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, auto cfd = cfh->cfd(); InternalKey start_key, end_key; if (begin != nullptr) { - start_key.SetMaxPossibleForUserKey(*begin); + start_key.SetMinPossibleForUserKey(*begin); } if (end != nullptr) { - end_key.SetMinPossibleForUserKey(*end); + end_key.SetMaxPossibleForUserKey(*end); } { InstrumentedMutexLock l(&mutex_); @@ -131,15 +131,16 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { edit.DeleteFile(0, f->fd.GetNumber()); edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + 
f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersionAndScheduleWorkWrapper( - cfd, &job_context, *cfd->GetLatestMutableCFOptions()); + InstallSuperVersionAndScheduleWork(cfd, + &job_context.superversion_contexts[0], + *cfd->GetLatestMutableCFOptions()); } } // lock released here LogFlush(immutable_db_options_.info_log); diff --git a/thirdparty/rocksdb/db/db_impl_files.cc b/thirdparty/rocksdb/db/db_impl_files.cc index e44e423189..90cc6a14ba 100644 --- a/thirdparty/rocksdb/db/db_impl_files.cc +++ b/thirdparty/rocksdb/db/db_impl_files.cc @@ -12,120 +12,29 @@ #define __STDC_FORMAT_MACROS #endif #include +#include +#include #include "db/event_helpers.h" +#include "db/memtable_list.h" #include "util/file_util.h" #include "util/sst_file_manager_impl.h" - namespace rocksdb { -uint64_t DBImpl::FindMinPrepLogReferencedByMemTable() { - if (!allow_2pc()) { - return 0; - } - - uint64_t min_log = 0; - - // we must look through the memtables for two phase transactions - // that have been committed but not yet flushed - for (auto loop_cfd : *versions_->GetColumnFamilySet()) { - if (loop_cfd->IsDropped()) { - continue; - } - - auto log = loop_cfd->imm()->GetMinLogContainingPrepSection(); - - if (log > 0 && (min_log == 0 || log < min_log)) { - min_log = log; - } - - log = loop_cfd->mem()->GetMinLogContainingPrepSection(); - - if (log > 0 && (min_log == 0 || log < min_log)) { - min_log = log; - } - } - - return min_log; -} - -void DBImpl::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { - assert(log != 0); - std::lock_guard lock(prep_heap_mutex_); - auto it = prepared_section_completed_.find(log); - assert(it != prepared_section_completed_.end()); - it->second += 1; -} - -void DBImpl::MarkLogAsContainingPrepSection(uint64_t log) { - assert(log != 0); - std::lock_guard lock(prep_heap_mutex_); - min_log_with_prep_.push(log); - auto it = prepared_section_completed_.find(log); - if (it == prepared_section_completed_.end()) { - prepared_section_completed_[log] = 0; - } -} - -uint64_t DBImpl::FindMinLogContainingOutstandingPrep() { - - if (!allow_2pc()) { - return 0; - } - - std::lock_guard lock(prep_heap_mutex_); - uint64_t min_log = 0; - - // first we look in the prepared heap where we keep - // track of transactions that have been prepared (written to WAL) - // but not yet committed. - while (!min_log_with_prep_.empty()) { - min_log = min_log_with_prep_.top(); - - auto it = prepared_section_completed_.find(min_log); - - // value was marked as 'deleted' from heap - if (it != prepared_section_completed_.end() && it->second > 0) { - it->second -= 1; - min_log_with_prep_.pop(); - - // back to squere one... - min_log = 0; - continue; - } else { - // found a valid value - break; - } - } - - return min_log; -} uint64_t DBImpl::MinLogNumberToKeep() { - uint64_t log_number = versions_->MinLogNumber(); - if (allow_2pc()) { - // if are 2pc we must consider logs containing prepared - // sections of outstanding transactions. - // - // We must check min logs with outstanding prep before we check - // logs referneces by memtables because a log referenced by the - // first data structure could transition to the second under us. - // - // TODO(horuff): iterating over all column families under db mutex. 
- // should find more optimial solution - auto min_log_in_prep_heap = FindMinLogContainingOutstandingPrep(); - - if (min_log_in_prep_heap != 0 && min_log_in_prep_heap < log_number) { - log_number = min_log_in_prep_heap; - } - - auto min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable(); + return versions_->min_log_number_to_keep_2pc(); + } else { + return versions_->MinLogNumberWithUnflushedData(); + } +} - if (min_log_refed_by_mem != 0 && min_log_refed_by_mem < log_number) { - log_number = min_log_refed_by_mem; - } +uint64_t DBImpl::MinObsoleteSstNumberToKeep() { + mutex_.AssertHeld(); + if (!pending_outputs_.empty()) { + return *pending_outputs_.begin(); } - return log_number; + return std::numeric_limits::max(); } // * Returns the list of live files in 'sst_live' @@ -148,7 +57,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool doing_the_full_scan = false; - // logic for figurint out if we're doing the full scan + // logic for figuring out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; } else if (force || @@ -168,7 +77,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // threads // Since job_context->min_pending_output is set, until file scan finishes, // mutex_ cannot be released. Otherwise, we might see no min_pending_output - // here but later find newer generated unfinalized files while scannint. + // here but later find newer generated unfinalized files while scanning. if (!pending_outputs_.empty()) { job_context->min_pending_output = *pending_outputs_.begin(); } else { @@ -182,27 +91,68 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, &job_context->manifest_delete_files, job_context->min_pending_output); + // Mark the elements in job_context->sst_delete_files as grabbedForPurge + // so that other threads calling FindObsoleteFiles with full_scan=true + // will not add these files to candidate list for purge. + for (const auto& sst_to_del : job_context->sst_delete_files) { + MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber()); + } + // store the current filenum, lognum, etc job_context->manifest_file_number = versions_->manifest_file_number(); job_context->pending_manifest_file_number = versions_->pending_manifest_file_number(); job_context->log_number = MinLogNumberToKeep(); - job_context->prev_log_number = versions_->prev_log_number(); versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { + InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(), + dbname_); + std::set paths; for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size(); path_id++) { + paths.insert(immutable_db_options_.db_paths[path_id].path); + } + + // Note that if cf_paths is not specified in the ColumnFamilyOptions + // of a particular column family, we use db_paths as the cf_paths + // setting. Hence, there can be multiple duplicates of files from db_paths + // in the following code. The duplicate are removed while identifying + // unique files in PurgeObsoleteFiles. + for (auto cfd : *versions_->GetColumnFamilySet()) { + for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size(); + path_id++) { + auto& path = cfd->ioptions()->cf_paths[path_id].path; + + if (paths.find(path) == paths.end()) { + paths.insert(path); + } + } + } + + for (auto& path : paths) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. 
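The removed FindMinLogContainingOutstandingPrep() above implements lazy deletion on a min-heap: MarkLogAsHavingPrepSectionFlushed() only bumps a completion counter, and stale heap tops are discarded on the next query. A compact sketch of that structure (LazyMinHeapSketch is illustrative; 0 is the "empty" sentinel, as in the original):

#include <cstdint>
#include <functional>
#include <queue>
#include <unordered_map>
#include <vector>

class LazyMinHeapSketch {
 public:
  void Insert(uint64_t v) { heap_.push(v); }
  void MarkErased(uint64_t v) { ++erased_[v]; }  // defer the actual pop

  // Returns 0 when logically empty, mirroring the sentinel used above.
  uint64_t Min() {
    while (!heap_.empty()) {
      uint64_t top = heap_.top();
      auto it = erased_.find(top);
      if (it != erased_.end() && it->second > 0) {
        --it->second;  // this entry was logically deleted; skip it
        heap_.pop();
        continue;
      }
      return top;
    }
    return 0;
  }

 private:
  std::priority_queue<uint64_t, std::vector<uint64_t>, std::greater<uint64_t>>
      heap_;
  std::unordered_map<uint64_t, uint64_t> erased_;
};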
std::vector files; - env_->GetChildren(immutable_db_options_.db_paths[path_id].path, - &files); // Ignore errors - for (std::string file : files) { + env_->GetChildren(path, &files); // Ignore errors + for (const std::string& file : files) { + uint64_t number; + FileType type; + // 1. If we cannot parse the file name, we skip; + // 2. If the file with file_number equals number has already been + // grabbed for purge by another compaction job, or it has already been + // schedule for purge, we also skip it if we + // are doing full scan in order to avoid double deletion of the same + // file under race conditions. See + // https://github.com/facebook/rocksdb/issues/3573 + if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) || + !ShouldPurge(number)) { + continue; + } + // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes - job_context->full_scan_candidate_files.emplace_back( - "/" + file, static_cast(path_id)); + job_context->full_scan_candidate_files.emplace_back("/" + file, path); } } @@ -211,8 +161,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, std::vector log_files; env_->GetChildren(immutable_db_options_.wal_dir, &log_files); // Ignore errors - for (std::string log_file : log_files) { - job_context->full_scan_candidate_files.emplace_back(log_file, 0); + for (const std::string& log_file : log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.wal_dir); } } // Add info log files in db_log_dir @@ -221,8 +172,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, std::vector info_log_files; // Ignore errors env_->GetChildren(immutable_db_options_.db_log_dir, &info_log_files); - for (std::string log_file : info_log_files) { - job_context->full_scan_candidate_files.emplace_back(log_file, 0); + for (std::string& log_file : info_log_files) { + job_context->full_scan_candidate_files.emplace_back( + log_file, immutable_db_options_.db_log_dir); } } } @@ -236,11 +188,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, while (alive_log_files_.begin()->number < min_log_number) { auto& earliest = *alive_log_files_.begin(); if (immutable_db_options_.recycle_log_file_num > - log_recycle_files.size()) { + log_recycle_files_.size()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "adding log %" PRIu64 " to recycle list\n", earliest.number); - log_recycle_files.push_back(earliest.number); + log_recycle_files_.push_back(earliest.number); } else { job_context->log_delete_files.push_back(earliest.number); } @@ -250,11 +202,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } job_context->size_log_to_delete += earliest.size; total_log_size_ -= earliest.size; - if (concurrent_prepare_) { + if (two_write_queues_) { log_write_mutex_.Lock(); } alive_log_files_.pop_front(); - if (concurrent_prepare_) { + if (two_write_queues_) { log_write_mutex_.Unlock(); } // Current log should always stay alive since it can't have @@ -281,8 +233,11 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // We're just cleaning up for DB::Write(). 
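The recycling hunk above caps how many retired WALs are kept for reuse: up to recycle_log_file_num files go to log_recycle_files_, and the rest are scheduled for deletion. A sketch of that decision with illustrative stand-ins:

#include <cstddef>
#include <cstdint>
#include <vector>

// Retire one WAL: keep it for reuse while the recycle quota has room,
// otherwise queue it for deletion.
void RetireLog(uint64_t log_number, size_t recycle_quota,
               std::vector<uint64_t>* recycle_list,
               std::vector<uint64_t>* delete_list) {
  if (recycle_list->size() < recycle_quota) {
    recycle_list->push_back(log_number);  // will be renamed and reused
  } else {
    delete_list->push_back(log_number);
  }
}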
assert(job_context->logs_to_free.empty()); job_context->logs_to_free = logs_to_free_; - job_context->log_recycle_files.assign(log_recycle_files.begin(), - log_recycle_files.end()); + job_context->log_recycle_files.assign(log_recycle_files_.begin(), + log_recycle_files_.end()); + if (job_context->HaveSomethingToDelete()) { + ++pending_purge_obsolete_files_; + } logs_to_free_.clear(); } @@ -294,21 +249,24 @@ bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, } else if (first.file_name < second.file_name) { return false; } else { - return (first.path_id > second.path_id); + return (first.file_path > second.file_path); } } }; // namespace // Delete obsolete files and log status and information of file deletion -void DBImpl::DeleteObsoleteFileImpl(Status file_deletion_status, int job_id, - const std::string& fname, FileType type, - uint64_t number, uint32_t path_id) { - if (type == kTableFile) { +void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname, + const std::string& path_to_sync, + FileType type, uint64_t number) { + Status file_deletion_status; + if (type == kTableFile || type == kLogFile) { file_deletion_status = - DeleteSSTFile(&immutable_db_options_, fname, path_id); + DeleteDBFile(&immutable_db_options_, fname, path_to_sync); } else { file_deletion_status = env_->DeleteFile(fname); } + TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion", + &file_deletion_status); if (file_deletion_status.ok()) { ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id, @@ -335,19 +293,16 @@ void DBImpl::DeleteObsoleteFileImpl(Status file_deletion_status, int job_id, } // Diffs the files listed in filenames and those that do not -// belong to live files are posibly removed. Also, removes all the +// belong to live files are possibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. -void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { +void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { + TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin"); // we'd better have sth to delete assert(state.HaveSomethingToDelete()); - // this checks if FindObsoleteFiles() was run before. If not, don't do - // PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also - // run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true - if (state.manifest_file_number == 0) { - return; - } + // FindObsoleteFiles() should've populated this so nonzero + assert(state.manifest_file_number != 0); // Now, convert live list to an unordered map, WITHOUT mutex held; // set is slow. @@ -364,23 +319,24 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
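CompareCandidateFile above is a hand-written strict weak ordering: descending by file name, then by file path, so duplicate entries become neighbours for the dedup pass that follows. The same ordering expressed with std::tie (CandidateFileSketch is a stand-in for JobContext::CandidateFileInfo):

#include <string>
#include <tuple>

struct CandidateFileSketch {
  std::string file_name;
  std::string file_path;
};

// Sort by name first, then by path, in descending order, so identical names
// land next to each other and the dedup pass can compare neighbours only.
bool CandidateGreater(const CandidateFileSketch& a,
                      const CandidateFileSketch& b) {
  return std::tie(a.file_name, a.file_path) >
         std::tie(b.file_name, b.file_path);
}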
 const char* kDumbDbName = "";
-  for (auto file : state.sst_delete_files) {
+  for (auto& file : state.sst_delete_files) {
     candidate_files.emplace_back(
-        MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
-        file->fd.GetPathId());
-    if (file->table_reader_handle) {
-      table_cache_->Release(file->table_reader_handle);
+        MakeTableFileName(kDumbDbName, file.metadata->fd.GetNumber()),
+        file.path);
+    if (file.metadata->table_reader_handle) {
+      table_cache_->Release(file.metadata->table_reader_handle);
     }
-    delete file;
+    file.DeleteMetadata();
   }
 
   for (auto file_num : state.log_delete_files) {
     if (file_num > 0) {
-      candidate_files.emplace_back(LogFileName(kDumbDbName, file_num), 0);
+      candidate_files.emplace_back(LogFileName(kDumbDbName, file_num),
+                                   immutable_db_options_.wal_dir);
     }
   }
   for (const auto& filename : state.manifest_delete_files) {
-    candidate_files.emplace_back(filename, 0);
+    candidate_files.emplace_back(filename, dbname_);
   }
 
   // dedup state.candidate_files so we don't try to delete the same
@@ -403,9 +359,32 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) {
   std::vector<std::string> old_info_log_files;
   InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
                                 dbname_);
+
+  // File numbers of the most recent two OPTIONS files in candidate_files
+  // (found in the previous FindObsoleteFiles(full_scan=true))
+  // At this point, there must not be any duplicate file numbers in
+  // candidate_files.
+  uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+  uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
   for (const auto& candidate_file : candidate_files) {
-    std::string to_delete = candidate_file.file_name;
-    uint32_t path_id = candidate_file.path_id;
+    const std::string& fname = candidate_file.file_name;
+    uint64_t number;
+    FileType type;
+    if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+        type != kOptionsFile) {
+      continue;
+    }
+    if (number > optsfile_num1) {
+      optsfile_num2 = optsfile_num1;
+      optsfile_num1 = number;
+    } else if (number > optsfile_num2) {
+      optsfile_num2 = number;
+    }
+  }
+
+  std::unordered_set<uint64_t> files_to_del;
+  for (const auto& candidate_file : candidate_files) {
+    const std::string& to_delete = candidate_file.file_name;
     uint64_t number;
     FileType type;
     // Ignore file if we cannot recognize it.
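The OPTIONS retention logic above is a single pass that tracks the largest and second-largest file numbers (optsfile_num1/optsfile_num2); every OPTIONS file at least as new as the runner-up is later kept. The pass in isolation, as a sketch:

#include <cstdint>
#include <limits>
#include <utility>
#include <vector>

// Single pass that yields the largest and second-largest values, seeded with
// the smallest possible number so fewer than two inputs degrade gracefully.
std::pair<uint64_t, uint64_t> TopTwo(const std::vector<uint64_t>& nums) {
  uint64_t first = std::numeric_limits<uint64_t>::min();
  uint64_t second = std::numeric_limits<uint64_t>::min();
  for (uint64_t n : nums) {
    if (n > first) {
      second = first;
      first = n;
    } else if (n > second) {
      second = n;
    }
  }
  return {first, second};  // keep anything with number >= second
}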
@@ -431,6 +410,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { // DontDeletePendingOutputs fail keep = (sst_live_map.find(number) != sst_live_map.end()) || number >= state.min_pending_output; + if (!keep) { + files_to_del.insert(number); + } break; case kTempFile: // Any temp files that are currently being written to must @@ -451,11 +433,19 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { old_info_log_files.push_back(to_delete); } break; + case kOptionsFile: + keep = (number >= optsfile_num2); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", + reinterpret_cast(&number)); + TEST_SYNC_POINT_CALLBACK( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", + reinterpret_cast(&keep)); + break; case kCurrentFile: case kDBLockFile: case kIdentityFile: case kMetaDatabase: - case kOptionsFile: case kBlobFile: keep = true; break; @@ -466,13 +456,21 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { } std::string fname; + std::string dir_to_sync; if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(immutable_db_options_.db_paths, number, path_id); + fname = MakeTableFileName(candidate_file.file_path, number); + dir_to_sync = candidate_file.file_path; } else { - fname = ((type == kLogFile) ? immutable_db_options_.wal_dir : dbname_) + - "/" + to_delete; + dir_to_sync = + (type == kLogFile) ? immutable_db_options_.wal_dir : dbname_; + fname = dir_to_sync + + ((!dir_to_sync.empty() && dir_to_sync.back() == '/') || + (!to_delete.empty() && to_delete.front() == '/') + ? "" + : "/") + + to_delete; } #ifndef ROCKSDB_LITE @@ -486,13 +484,25 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) { Status file_deletion_status; if (schedule_only) { InstrumentedMutexLock guard_lock(&mutex_); - SchedulePendingPurge(fname, type, number, path_id, state.job_id); + SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id); } else { - DeleteObsoleteFileImpl(file_deletion_status, state.job_id, fname, type, - number, path_id); + DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number); } } + { + // After purging obsolete files, remove them from files_grabbed_for_purge_. + // Use a temporary vector to perform bulk deletion via swap. + InstrumentedMutexLock guard_lock(&mutex_); + std::vector tmp; + for (auto fn : files_grabbed_for_purge_) { + if (files_to_del.count(fn) == 0) { + tmp.emplace_back(fn); + } + } + files_grabbed_for_purge_.swap(tmp); + } + // Delete old info log files. 
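The files_grabbed_for_purge_ cleanup above uses a copy-survivors-and-swap idiom instead of repeated erase() calls, which would be quadratic on a vector. A generic sketch of that bulk removal:

#include <cstdint>
#include <unordered_set>
#include <vector>

// Remove every element of `victims` from `v` in one pass: copy the survivors
// into a temporary and swap it in, O(n) regardless of how many are dropped.
void BulkErase(std::vector<uint64_t>* v,
               const std::unordered_set<uint64_t>& victims) {
  std::vector<uint64_t> tmp;
  tmp.reserve(v->size());
  for (uint64_t fn : *v) {
    if (victims.count(fn) == 0) {
      tmp.push_back(fn);
    }
  }
  v->swap(tmp);
}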
 size_t old_info_log_file_count = old_info_log_files.size();
   if (old_info_log_file_count != 0 &&
@@ -531,6 +541,13 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state, bool schedule_only) {
     wal_manager_.PurgeObsoleteWALFiles();
 #endif  // ROCKSDB_LITE
   LogFlush(immutable_db_options_.info_log);
+  InstrumentedMutexLock l(&mutex_);
+  --pending_purge_obsolete_files_;
+  assert(pending_purge_obsolete_files_ >= 0);
+  if (pending_purge_obsolete_files_ == 0) {
+    bg_cv_.SignalAll();
+  }
+  TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
 }
 
 void DBImpl::DeleteObsoleteFiles() {
@@ -545,4 +562,95 @@ void DBImpl::DeleteObsoleteFiles() {
   job_context.Clean();
   mutex_.Lock();
 }
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+    VersionSet* vset, const ColumnFamilyData* cfd_to_flush,
+    const autovector<MemTable*>& memtables_to_flush) {
+  uint64_t min_log = 0;
+
+  // we must look through the memtables for two phase transactions
+  // that have been committed but not yet flushed
+  for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+    if (loop_cfd->IsDropped() || loop_cfd == cfd_to_flush) {
+      continue;
+    }
+
+    auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+        memtables_to_flush);
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+
+    log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+    if (log > 0 && (min_log == 0 || log < min_log)) {
+      min_log = log;
+    }
+  }
+
+  return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep(
+    VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+    autovector<VersionEdit*> edit_list,
+    const autovector<MemTable*>& memtables_to_flush,
+    LogsWithPrepTracker* prep_tracker) {
+  assert(vset != nullptr);
+  assert(prep_tracker != nullptr);
+  // Calculate updated min_log_number_to_keep
+  // Since the function should only be called in 2pc mode, log number in
+  // the version edit should be sufficient.
+
+  // Precompute the min log number containing unflushed data for the column
+  // family being flushed (`cfd_to_flush`).
+  uint64_t cf_min_log_number_to_keep = 0;
+  for (auto& e : edit_list) {
+    if (e->has_log_number()) {
+      cf_min_log_number_to_keep =
+          std::max(cf_min_log_number_to_keep, e->log_number());
+    }
+  }
+  if (cf_min_log_number_to_keep == 0) {
+    // No version edit contains information on log number. The log number
+    // for this column family should stay the same as it is.
+    cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+  }
+
+  // Get min log number containing unflushed data for other column families.
+  uint64_t min_log_number_to_keep =
+      vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+  if (cf_min_log_number_to_keep != 0) {
+    min_log_number_to_keep =
+        std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+  }
+
+  // if we are in 2pc mode, we must consider logs containing prepared
+  // sections of outstanding transactions.
+  //
+  // We must check min logs with outstanding prep before we check
+  // logs referenced by memtables because a log referenced by the
+  // first data structure could transition to the second under us.
+  //
+  // TODO: iterating over all column families under db mutex.
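PrecomputeMinLogNumberToKeep above repeatedly folds optional lower bounds into a running minimum, with 0 meaning "no constraint from this source". That folding rule in isolation (CombineMinLog is illustrative):

#include <algorithm>
#include <cstdint>
#include <initializer_list>

// Fold several lower bounds into one minimum, where 0 means "this source
// imposes no constraint" (the sentinel convention used throughout above).
uint64_t CombineMinLog(uint64_t base, std::initializer_list<uint64_t> bounds) {
  uint64_t min_log = base;
  for (uint64_t b : bounds) {
    if (b != 0) {
      min_log = std::min(min_log, b);
    }
  }
  return min_log;
}

For example, CombineMinLog(min_unflushed, {min_prep_heap, min_prep_in_mem}) would mirror the two prep-tracker checks that follow the comment above.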
+ // should find more optimal solution + auto min_log_in_prep_heap = + prep_tracker->FindMinLogContainingOutstandingPrep(); + + if (min_log_in_prep_heap != 0 && + min_log_in_prep_heap < min_log_number_to_keep) { + min_log_number_to_keep = min_log_in_prep_heap; + } + + uint64_t min_log_refed_by_mem = FindMinPrepLogReferencedByMemTable( + vset, &cfd_to_flush, memtables_to_flush); + + if (min_log_refed_by_mem != 0 && + min_log_refed_by_mem < min_log_number_to_keep) { + min_log_number_to_keep = min_log_refed_by_mem; + } + return min_log_number_to_keep; +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_impl_open.cc b/thirdparty/rocksdb/db/db_impl_open.cc index bc94b6095f..f5008857bc 100644 --- a/thirdparty/rocksdb/db/db_impl_open.cc +++ b/thirdparty/rocksdb/db/db_impl_open.cc @@ -14,6 +14,7 @@ #include #include "db/builder.h" +#include "db/error_handler.h" #include "options/options_helper.h" #include "rocksdb/wal_filter.h" #include "table/block_based_table_factory.h" @@ -22,8 +23,7 @@ #include "util/sync_point.h" namespace rocksdb { -Options SanitizeOptions(const std::string& dbname, - const Options& src) { +Options SanitizeOptions(const std::string& dbname, const Options& src) { auto db_options = SanitizeOptions(dbname, DBOptions(src)); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = @@ -41,6 +41,8 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { max_max_open_files = 0x400000; } ClipToRange(&result.max_open_files, 20, max_max_open_files); + TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles", + &result.max_open_files); } if (result.info_log == nullptr) { @@ -55,10 +57,9 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.write_buffer_manager.reset( new WriteBufferManager(result.db_write_buffer_size)); } - auto bg_job_limits = DBImpl::GetBGJobLimits(result.max_background_flushes, - result.max_background_compactions, - result.max_background_jobs, - true /* parallelize_compactions */); + auto bg_job_limits = DBImpl::GetBGJobLimits( + result.max_background_flushes, result.max_background_compactions, + result.max_background_jobs, true /* parallelize_compactions */); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, Env::Priority::LOW); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, @@ -106,14 +107,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.db_paths.emplace_back(dbname, std::numeric_limits::max()); } - if (result.use_direct_io_for_flush_and_compaction && - result.compaction_readahead_size == 0) { + if (result.use_direct_reads && result.compaction_readahead_size == 0) { TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr); result.compaction_readahead_size = 1024 * 1024 * 2; } - if (result.compaction_readahead_size > 0 || - result.use_direct_io_for_flush_and_compaction) { + if (result.compaction_readahead_size > 0 || result.use_direct_reads) { result.new_table_reader_for_compaction_inputs = true; } @@ -124,6 +123,24 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.avoid_flush_during_recovery = false; } +#ifndef ROCKSDB_LITE + // When the DB is stopped, it's possible that there are some .trash files that + // were not deleted yet, when we open the DB we will find these .trash files + // and schedule them to be deleted (or delete immediately if SstFileManager + // was not used) + auto sfm = static_cast(result.sst_file_manager.get()); + for (size_t i 
= 0; i < result.db_paths.size(); i++) { + DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path); + } + + // Create a default SstFileManager for purposes of tracking compaction size + // and facilitating recovery from out of space errors. + if (result.sst_file_manager.get() == nullptr) { + std::shared_ptr sst_file_manager( + NewSstFileManager(result.env, result.info_log)); + result.sst_file_manager = sst_file_manager; + } +#endif return result; } @@ -152,28 +169,23 @@ static Status ValidateOptions( if (s.ok() && db_options.allow_concurrent_memtable_write) { s = CheckConcurrentWritesSupported(cfd.options); } + if (s.ok()) { + s = CheckCFPathsSupported(db_options, cfd.options); + } if (!s.ok()) { return s; } - if (db_options.db_paths.size() > 1) { - if ((cfd.options.compaction_style != kCompactionStyleUniversal) && - (cfd.options.compaction_style != kCompactionStyleLevel)) { - return Status::NotSupported( - "More than one DB paths are only supported in " - "universal and level compaction styles. "); - } - } - if (cfd.options.compaction_options_fifo.ttl > 0) { + + if (cfd.options.ttl > 0) { if (db_options.max_open_files != -1) { return Status::NotSupported( - "FIFO Compaction with TTL is only supported when files are always " + "TTL is only supported when files are always " "kept open (set max_open_files = -1). "); } if (cfd.options.table_factory->Name() != BlockBasedTableFactory().Name()) { return Status::NotSupported( - "FIFO Compaction with TTL is only supported in " - "Block-Based Table format. "); + "TTL is only supported in Block-Based Table format. "); } } } @@ -204,7 +216,7 @@ static Status ValidateOptions( return Status::OK(); } -} // namespace +} // namespace Status DBImpl::NewDB() { VersionEdit new_db; new_db.SetLogNumber(0); @@ -216,7 +228,7 @@ Status DBImpl::NewDB() { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); { - unique_ptr file; + std::unique_ptr file; EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); s = NewWritableFile(env_, manifest, &file, env_options); if (!s.ok()) { @@ -224,8 +236,9 @@ Status DBImpl::NewDB() { } file->SetPreallocationBlockSize( immutable_db_options_.manifest_preallocation_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(file), manifest, env_options, env_, nullptr /* stats */, + immutable_db_options_.listeners)); log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); @@ -243,9 +256,8 @@ Status DBImpl::NewDB() { return s; } -Status DBImpl::Directories::CreateAndNewDirectory( - Env* env, const std::string& dirname, - std::unique_ptr* directory) const { +Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) { // We call CreateDirIfMissing() as the directory may already exist (if we // are reopening a DB), when this happens we don't want creating the // directory to cause an error. 
However, we need to check if creating the @@ -263,12 +275,12 @@ Status DBImpl::Directories::CreateAndNewDirectory( Status DBImpl::Directories::SetDirectories( Env* env, const std::string& dbname, const std::string& wal_dir, const std::vector& data_paths) { - Status s = CreateAndNewDirectory(env, dbname, &db_dir_); + Status s = DBImpl::CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { return s; } if (!wal_dir.empty() && dbname != wal_dir) { - s = CreateAndNewDirectory(env, wal_dir, &wal_dir_); + s = DBImpl::CreateAndNewDirectory(env, wal_dir, &wal_dir_); if (!s.ok()) { return s; } @@ -281,7 +293,7 @@ Status DBImpl::Directories::SetDirectories( data_dirs_.emplace_back(nullptr); } else { std::unique_ptr path_directory; - s = CreateAndNewDirectory(env, db_path, &path_directory); + s = DBImpl::CreateAndNewDirectory(env, db_path, &path_directory); if (!s.ok()) { return s; } @@ -326,8 +338,8 @@ Status DBImpl::Recover( } } else if (s.ok()) { if (immutable_db_options_.error_if_exists) { - return Status::InvalidArgument( - dbname_, "exists (error_if_exists is true)"); + return Status::InvalidArgument(dbname_, + "exists (error_if_exists is true)"); } } else { // Unexpected error reading file @@ -345,12 +357,53 @@ Status DBImpl::Recover( assert(s.IsIOError()); return s; } + // Verify compatibility of env_options_ and filesystem + { + std::unique_ptr idfile; + EnvOptions customized_env(env_options_); + customized_env.use_direct_reads |= + immutable_db_options_.use_direct_io_for_flush_and_compaction; + s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + customized_env); + if (!s.ok()) { + std::string error_str = s.ToString(); + // Check if unsupported Direct I/O is the root cause + customized_env.use_direct_reads = false; + s = env_->NewRandomAccessFile(IdentityFileName(dbname_), &idfile, + customized_env); + if (s.ok()) { + return Status::InvalidArgument( + "Direct I/O is not supported by the specified DB."); + } else { + return Status::InvalidArgument( + "Found options incompatible with filesystem", error_str.c_str()); + } + } + } } Status s = versions_->Recover(column_families, read_only); if (immutable_db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } + if (s.ok() && !read_only) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + s = cfd->AddDirectories(); + if (!s.ok()) { + return s; + } + } + } + + // Initial max_total_in_memory_state_ before recovery logs. Log recovery + // may check this value to decide whether to flush. + max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { SequenceNumber next_sequence(kMaxSequenceNumber); default_cf_handle_ = new ColumnFamilyHandleImpl( @@ -368,7 +421,10 @@ Status DBImpl::Recover( // produced by an older version of rocksdb. 
std::vector filenames; s = env_->GetChildren(immutable_db_options_.wal_dir, &filenames); - if (!s.ok()) { + if (s.IsNotFound()) { + return Status::InvalidArgument("wal_dir not found", + immutable_db_options_.wal_dir); + } else if (!s.ok()) { return s; } @@ -423,12 +479,26 @@ Status DBImpl::Recover( } } - // Initial value - max_total_in_memory_state_ = 0; - for (auto cfd : *versions_->GetColumnFamilySet()) { - auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); - max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * - mutable_cf_options->max_write_buffer_number; + if (read_only) { + // If we are opening as read-only, we need to update options_file_number_ + // to reflect the most recent OPTIONS file. It does not matter for regular + // read-write db instance because options_file_number_ will later be + // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. + std::vector file_names; + if (s.ok()) { + s = env_->GetChildren(GetName(), &file_names); + } + if (s.ok()) { + uint64_t number = 0; + uint64_t options_file_number = 0; + FileType type; + for (const auto& fname : file_names) { + if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { + options_file_number = std::max(number, options_file_number); + } + } + versions_->options_file_number_ = options_file_number; + } } return s; @@ -442,7 +512,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, Logger* info_log; const char* fname; Status* status; // nullptr if immutable_db_options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) override { + void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); @@ -479,10 +549,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, std::map cf_name_id_map; std::map cf_lognumber_map; for (auto cfd : *versions_->GetColumnFamilySet()) { - cf_name_id_map.insert( - std::make_pair(cfd->GetName(), cfd->GetID())); + cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); cf_lognumber_map.insert( - std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); } immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map, @@ -493,17 +562,25 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, bool stop_replay_by_wal_filter = false; bool stop_replay_for_corruption = false; bool flushed = false; + uint64_t corrupted_log_number = kMaxSequenceNumber; for (auto log_number : log_numbers) { + if (log_number < versions_->min_log_number_to_keep_2pc()) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Skipping log #%" PRIu64 + " since it is older than min log to keep #%" PRIu64, + log_number, versions_->min_log_number_to_keep_2pc()); + continue; + } // The previous incarnation may not have written any MANIFEST // records after allocating this log number. So we manually // update the file number allocation counter in VersionSet. 
- versions_->MarkFileNumberUsedDuringRecovery(log_number); + versions_->MarkFileNumberUsed(log_number); // Open the log file std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); ROCKS_LOG_INFO(immutable_db_options_.info_log, "Recovering log #%" PRIu64 " mode %d", log_number, - immutable_db_options_.wal_recovery_mode); + static_cast(immutable_db_options_.wal_recovery_mode)); auto logFileDropped = [this, &fname]() { uint64_t bytes; if (env_->GetFileSize(fname, &bytes).ok()) { @@ -517,9 +594,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } - unique_ptr file_reader; + std::unique_ptr file_reader; { - unique_ptr file; + std::unique_ptr file; status = env_->NewSequentialFile(fname, &file, env_->OptimizeForLogRead(env_options_)); if (!status.ok()) { @@ -532,7 +609,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, continue; } } - file_reader.reset(new SequentialFileReader(std::move(file))); + file_reader.reset(new SequentialFileReader(std::move(file), fname)); } // Create the log reader. @@ -552,8 +629,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // to be skipped instead of propagating bad information (like overly // large sequence numbers). log::Reader reader(immutable_db_options_.info_log, std::move(file_reader), - &reporter, true /*checksum*/, 0 /*initial_offset*/, - log_number); + &reporter, true /*checksum*/, log_number); // Determine if we should tolerate incomplete records at the tail end of the // Read all the records and add to a memtable @@ -647,7 +723,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, " mode %d log filter %s returned " "more records (%d) than original (%d) which is not allowed. " "Aborting recovery.", - log_number, immutable_db_options_.wal_recovery_mode, + log_number, + static_cast(immutable_db_options_.wal_recovery_mode), immutable_db_options_.wal_filter->Name(), new_count, original_count); status = Status::NotSupported( @@ -674,7 +751,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, status = WriteBatchInternal::InsertInto( &batch, column_family_memtables_.get(), &flush_scheduler_, true, log_number, this, false /* concurrent_memtable_writes */, - next_sequence, &has_valid_writes); + next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); MaybeIgnoreError(&status); if (!status.ok()) { // We are treating this as a failure while reading since we read valid @@ -711,6 +788,12 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } if (!status.ok()) { + if (status.IsNotSupported()) { + // We should not treat NotSupported as corruption. It is rather a clear + // sign that we are processing a WAL that is produced by an incompatible + // version of the code. 
+ return status; + } if (immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kSkipAnyCorruptedRecords) { // We should ignore all errors unconditionally @@ -720,6 +803,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers, // We should ignore the error but not continue replaying status = Status::OK(); stop_replay_for_corruption = true; + corrupted_log_number = log_number; ROCKS_LOG_INFO(immutable_db_options_.info_log, "Point in time recovered to log #%" PRIu64 " seq #%" PRIu64, @@ -737,10 +821,30 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers, auto last_sequence = *next_sequence - 1; if ((*next_sequence != kMaxSequenceNumber) && (versions_->LastSequence() <= last_sequence)) { - versions_->SetLastToBeWrittenSequence(last_sequence); + versions_->SetLastAllocatedSequence(last_sequence); + versions_->SetLastPublishedSequence(last_sequence); versions_->SetLastSequence(last_sequence); } } + // Compare the corrupted log number to each column family's current log number. + // Abort Open() if any column family's log number is greater than + // the corrupted log number, which means CF contains data beyond the point of + // corruption. This could happen during PIT recovery when the WAL is corrupted and + // some (but not all) CFs are flushed. + if (stop_replay_for_corruption == true && + (immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kPointInTimeRecovery || + immutable_db_options_.wal_recovery_mode == + WALRecoveryMode::kTolerateCorruptedTailRecords)) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->GetLogNumber() > corrupted_log_number) { + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Column family inconsistency: SST file contains data" + " beyond the point of corruption."); + return Status::Corruption("SST file is ahead of WALs"); + } + } + } // True if there's any data in the WALs; if not, we can skip re-processing // them later @@ -796,9 +900,9 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers, // not actually used.
That is because VersionSet assumes // VersionSet::next_file_number_ to always be strictly greater than any // log number - versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); - status = versions_->LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); + versions_->MarkFileNumberUsed(max_log_number + 1); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + edit, &mutex_); if (!status.ok()) { // Recovery failed break; @@ -806,18 +910,8 @@ } } - if (data_seen && !flushed) { - // Mark these as alive so they'll be considered for deletion later by - // FindObsoleteFiles() - if (concurrent_prepare_) { - log_write_mutex_.Lock(); - } - for (auto log_number : log_numbers) { - alive_log_files_.push_back(LogFileNumberSize(log_number)); - } - if (concurrent_prepare_) { - log_write_mutex_.Unlock(); - } + if (status.ok() && data_seen && !flushed) { + status = RestoreAliveLogFiles(log_numbers); } event_logger_.Log() << "job" << job_id << "event" @@ -826,6 +920,60 @@ return status; } +Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers) { + if (log_numbers.empty()) { + return Status::OK(); + } + Status s; + mutex_.AssertHeld(); + assert(immutable_db_options_.avoid_flush_during_recovery); + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + // Mark these as alive so they'll be considered for deletion later by + // FindObsoleteFiles() + total_log_size_ = 0; + log_empty_ = false; + for (auto log_number : log_numbers) { + LogFileNumberSize log(log_number); + std::string fname = LogFileName(immutable_db_options_.wal_dir, log_number); + // This gets the apparent size of the logs, not including preallocated space. + s = env_->GetFileSize(fname, &log.size); + if (!s.ok()) { + break; + } + total_log_size_ += log.size; + alive_log_files_.push_back(log); + // We preallocate space for logs, but then after a crash and restart, that + // preallocated space is not needed anymore. It is likely only the last + // log has such preallocated space, so we only truncate for the last log. + if (log_number == log_numbers.back()) { + std::unique_ptr<WritableFile> last_log; + Status truncate_status = env_->ReopenWritableFile( + fname, &last_log, + env_->OptimizeForLogWrite( + env_options_, + BuildDBOptions(immutable_db_options_, mutable_db_options_))); + if (truncate_status.ok()) { + truncate_status = last_log->Truncate(log.size); + } + if (truncate_status.ok()) { + truncate_status = last_log->Close(); + } + // Not a critical error if we fail to truncate.
+ if (!truncate_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to truncate log #%" PRIu64 ": %s", log_number, + truncate_status.ToString().c_str()); + } + } + } + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + return s; +} + Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, MemTable* mem, VersionEdit* edit) { mutex_.AssertHeld(); @@ -857,26 +1005,35 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, const uint64_t current_time = static_cast(_current_time); { + auto write_hint = cfd->CalculateSSTWriteHint(0); mutex_.Unlock(); SequenceNumber earliest_write_conflict_snapshot; std::vector snapshot_seqs = snapshots_.GetAll(&earliest_write_conflict_snapshot); - - EnvOptions optimized_env_options = - env_->OptimizeForCompactionTableWrite(env_options_, immutable_db_options_); + auto snapshot_checker = snapshot_checker_.get(); + if (use_custom_gc_ && snapshot_checker == nullptr) { + snapshot_checker = DisableGCSnapshotChecker::Instance(); + } + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } s = BuildTable( dbname_, env_, *cfd->ioptions(), mutable_cf_options, - optimized_env_options, cfd->table_cache(), iter.get(), - std::unique_ptr(mem->NewRangeTombstoneIterator(ro)), - &meta, cfd->internal_comparator(), + env_options_for_compaction_, cfd->table_cache(), iter.get(), + std::move(range_del_iters), &meta, cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - snapshot_seqs, earliest_write_conflict_snapshot, + snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), + mutable_cf_options.sample_for_compression, cfd->ioptions()->compression_opts, paranoid_file_checks, cfd->internal_stats(), TableFileCreationReason::kRecovery, &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, - -1 /* level */, current_time); + -1 /* level */, current_time, write_hint); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" @@ -894,17 +1051,17 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, if (s.ok() && meta.fd.GetFileSize() > 0) { edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno, + meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.marked_for_compaction); } - InternalStats::CompactionStats stats(1); + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); stats.micros = env_->NowMicros() - start_micros; stats.bytes_written = meta.fd.GetFileSize(); stats.num_output_files = 1; - cfd->internal_stats()->AddCompactionStats(level, stats); - cfd->internal_stats()->AddCFStats( - InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); + cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats); + cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -929,6 +1086,16 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { + const 
bool kSeqPerBatch = true; + const bool kBatchPerTxn = true; + return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); +} + +Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + const bool seq_per_batch, const bool batch_per_txn) { Status s = SanitizeOptionsByTable(db_options, column_families); if (!s.ok()) { return s; @@ -948,15 +1115,30 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, std::max(max_write_buffer_size, cf.options.write_buffer_size); } - DBImpl* impl = new DBImpl(db_options, dbname); + DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->immutable_db_options_.db_paths) { - s = impl->env_->CreateDirIfMissing(db_path.path); + std::vector paths; + for (auto& db_path : impl->immutable_db_options_.db_paths) { + paths.emplace_back(db_path.path); + } + for (auto& cf : column_families) { + for (auto& cf_path : cf.options.cf_paths) { + paths.emplace_back(cf_path.path); + } + } + for (auto& path : paths) { + s = impl->env_->CreateDirIfMissing(path); if (!s.ok()) { break; } } + + // For recovery from NoSpace() error, we can only handle + // the case where the database is stored in a single path + if (paths.size() <= 1) { + impl->error_handler_.EnableAutoRecovery(); + } } if (!s.ok()) { @@ -970,33 +1152,38 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, return s; } impl->mutex_.Lock(); + auto write_hint = impl->CalculateWALWriteHint(); // Handles create_if_missing, error_if_exists s = impl->Recover(column_families); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); - unique_ptr lfile; + std::unique_ptr lfile; EnvOptions soptions(db_options); EnvOptions opt_env_options = impl->immutable_db_options_.env->OptimizeForLogWrite( soptions, BuildDBOptions(impl->immutable_db_options_, impl->mutable_db_options_)); - s = NewWritableFile( - impl->immutable_db_options_.env, - LogFileName(impl->immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_options); + std::string log_fname = + LogFileName(impl->immutable_db_options_.wal_dir, new_log_number); + s = NewWritableFile(impl->immutable_db_options_.env, log_fname, &lfile, + opt_env_options); if (s.ok()) { + lfile->SetWriteLifeTimeHint(write_hint); lfile->SetPreallocationBlockSize( impl->GetWalPreallocateBlockSize(max_write_buffer_size)); { InstrumentedMutexLock wl(&impl->log_write_mutex_); impl->logfile_number_ = new_log_number; - unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_options)); + const auto& listeners = impl->immutable_db_options_.listeners; + std::unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), log_fname, opt_env_options, + impl->env_, nullptr /* stats */, listeners)); impl->logs_.emplace_back( new_log_number, new log::Writer( std::move(file_writer), new_log_number, - impl->immutable_db_options_.recycle_log_file_num > 0)); + impl->immutable_db_options_.recycle_log_file_num > 0, + impl->immutable_db_options_.manual_wal_flush)); } // set column family handles @@ -1027,16 +1214,18 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } if (s.ok()) { + SuperVersionContext sv_context(/* create_superversion */ true); for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - delete 
impl->InstallSuperVersionAndScheduleWork( - cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + impl->InstallSuperVersionAndScheduleWork( + cfd, &sv_context, *cfd->GetLatestMutableCFOptions()); } - if (impl->concurrent_prepare_) { + sv_context.Clean(); + if (impl->two_write_queues_) { impl->log_write_mutex_.Lock(); } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); - if (impl->concurrent_prepare_) { + if (impl->two_write_queues_) { impl->log_write_mutex_.Unlock(); } impl->DeleteObsoleteFiles(); @@ -1065,7 +1254,8 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " - "its options.merge_operator is non-null", cfd->GetName().c_str()); + "although its options.merge_operator is non-null", + cfd->GetName().c_str()); } if (!s.ok()) { break; } @@ -1091,31 +1281,57 @@ impl->immutable_db_options_.sst_file_manager.get()); if (s.ok() && sfm) { // Notify SstFileManager about all sst files that already exist in - // db_paths[0] when the DB is opened. - auto& db_path = impl->immutable_db_options_.db_paths[0]; - std::vector<std::string> existing_files; - impl->immutable_db_options_.env->GetChildren(db_path.path, &existing_files); - for (auto& file_name : existing_files) { - uint64_t file_number; - FileType file_type; - std::string file_path = db_path.path + "/" + file_name; - if (ParseFileName(file_name, &file_number, &file_type) && - file_type == kTableFile) { - sfm->OnAddFile(file_path); + // db_paths[0] and cf_paths[0] when the DB is opened. + std::vector<std::string> paths; + paths.emplace_back(impl->immutable_db_options_.db_paths[0].path); + for (auto& cf : column_families) { + if (!cf.options.cf_paths.empty()) { + paths.emplace_back(cf.options.cf_paths[0].path); } } + // Remove duplicate paths. + std::sort(paths.begin(), paths.end()); + paths.erase(std::unique(paths.begin(), paths.end()), paths.end()); + for (auto& path : paths) { + std::vector<std::string> existing_files; + impl->immutable_db_options_.env->GetChildren(path, &existing_files); + for (auto& file_name : existing_files) { + uint64_t file_number; + FileType file_type; + std::string file_path = path + "/" + file_name; + if (ParseFileName(file_name, &file_number, &file_type) && + file_type == kTableFile) { + sfm->OnAddFile(file_path); + } + } + } + + // Reserve some disk buffer space. This is a heuristic: when we run out + // of disk space, this ensures that there is at least write_buffer_size + // amount of free space before we resume DB writes. In low disk space + // conditions, we want to avoid a lot of small L0 files due to frequent + // WAL write failures and resultant forced flushes. + sfm->ReserveDiskBuffer(max_write_buffer_size, + impl->immutable_db_options_.db_paths[0].path); } #endif // !ROCKSDB_LITE if (s.ok()) { - ROCKS_LOG_INFO(impl->immutable_db_options_.info_log, "DB pointer %p", impl); + ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", + impl); LogFlush(impl->immutable_db_options_.info_log); + assert(impl->TEST_WALBufferIsEmpty()); + // If the assert above fails then we need to FlushWAL before returning + // control back to the user.
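The SstFileManager plumbing above means every opened DB now has a manager available: either one supplied by the user or the default instance created in SanitizeOptions(). A minimal sketch of supplying one explicitly, so that trash deletion is rate-limited and the disk-buffer heuristic above has accurate numbers; the trash_dir and rate_bytes_per_sec parameters are assumptions based on the public NewSstFileManager() declaration, and the 1 MB/s cap is an arbitrary illustration:

#include <memory>

#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

// Builds Options whose SST files are tracked by an explicit SstFileManager.
rocksdb::Options MakeOptionsWithSstFileManager(rocksdb::Env* env) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Rate-limit background deletions to ~1 MB/s; deleted files are moved to
  // .trash and removed gradually, which the open path above also scans for.
  options.sst_file_manager.reset(rocksdb::NewSstFileManager(
      env, options.info_log, /*trash_dir=*/"",
      /*rate_bytes_per_sec=*/1 << 20));
  return options;
}

A caller would typically pass rocksdb::Env::Default() here and hand the resulting Options to DB::Open().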
if (!persist_options_status.ok()) { s = Status::IOError( "DB::Open() failed --- Unable to persist Options file", persist_options_status.ToString()); } } + if (s.ok()) { + impl->StartTimedTasks(); + } if (!s.ok()) { for (auto* h : *handles) { delete h; diff --git a/thirdparty/rocksdb/db/db_impl_readonly.cc b/thirdparty/rocksdb/db/db_impl_readonly.cc index d69eecb988..5d7515c28e 100644 --- a/thirdparty/rocksdb/db/db_impl_readonly.cc +++ b/thirdparty/rocksdb/db/db_impl_readonly.cc @@ -9,7 +9,6 @@ #include "db/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { @@ -24,30 +23,45 @@ DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, LogFlush(immutable_db_options_.info_log); } -DBImplReadOnly::~DBImplReadOnly() { -} +DBImplReadOnly::~DBImplReadOnly() {} // Implementations of the DB interface Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { assert(pinnable_val != nullptr); + // TODO: stopwatch DB_GET needed?, perf timer needed? + PERF_TIMER_GUARD(get_snapshot_time); Status s; SequenceNumber snapshot = versions_->LastSequence(); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; - RangeDelAggregator range_del_agg(cfd->internal_comparator(), snapshot); + SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, - &range_del_agg, read_options)) { + &max_covering_tombstone_seq, read_options)) { pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, pinnable_val, &s, - &merge_context, &range_del_agg); + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); } + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordInHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); return s; } @@ -57,17 +71,20 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; + ReadCallback* read_callback = nullptr; // No read callback provided. auto db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), - (read_options.snapshot != nullptr - ? 
reinterpret_cast(read_options.snapshot) - ->number_ - : latest_snapshot), + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + read_seq, super_version->mutable_cf_options.max_sequential_skip_in_iterations, - super_version->version_number); + super_version->version_number, read_callback); auto internal_iter = NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); + db_iter->GetRangeDelAggregator(), read_seq); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } @@ -76,27 +93,29 @@ Status DBImplReadOnly::NewIterators( const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { + ReadCallback* read_callback = nullptr; // No read callback provided. if (iterators == nullptr) { return Status::InvalidArgument("iterators not allowed to be nullptr"); } iterators->clear(); iterators->reserve(column_families.size()); SequenceNumber latest_snapshot = versions_->LastSequence(); + SequenceNumber read_seq = + read_options.snapshot != nullptr + ? reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot; for (auto cfh : column_families) { auto* cfd = reinterpret_cast(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); auto* db_iter = NewArenaWrappedDbIterator( - env_, read_options, *cfd->ioptions(), - (read_options.snapshot != nullptr - ? reinterpret_cast(read_options.snapshot) - ->number_ - : latest_snapshot), + env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - sv->version_number); + sv->version_number, read_callback); auto* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena(), - db_iter->GetRangeDelAggregator()); + db_iter->GetRangeDelAggregator(), read_seq); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -105,7 +124,7 @@ Status DBImplReadOnly::NewIterators( } Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool error_if_log_file_exist) { + DB** dbptr, bool /*error_if_log_file_exist*/) { *dbptr = nullptr; // Try to first open DB as fully compacted DB @@ -140,6 +159,7 @@ Status DB::OpenForReadOnly( *dbptr = nullptr; handles->clear(); + SuperVersionContext sv_context(/* create_superversion */ true); DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname); impl->mutex_.Lock(); Status s = impl->Recover(column_families, true /* read only */, @@ -158,10 +178,12 @@ Status DB::OpenForReadOnly( } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); } } impl->mutex_.Unlock(); + sv_context.Clean(); if (s.ok()) { *dbptr = impl; for (auto* h : *handles) { @@ -178,20 +200,21 @@ Status DB::OpenForReadOnly( return s; } -#else // !ROCKSDB_LITE +#else // !ROCKSDB_LITE -Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, - DB** dbptr, bool error_if_log_file_exist) { +Status DB::OpenForReadOnly(const Options& /*options*/, + const std::string& /*dbname*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } Status DB::OpenForReadOnly( - const DBOptions& db_options, const std::string& dbname, - const std::vector& column_families, - std::vector* handles, DB** dbptr, - bool error_if_log_file_exist) { + const DBOptions& /*db_options*/, 
const std::string& /*dbname*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/, + bool /*error_if_log_file_exist*/) { return Status::NotSupported("Not supported in ROCKSDB_LITE."); } #endif // !ROCKSDB_LITE -} // namespace rocksdb +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_impl_readonly.h b/thirdparty/rocksdb/db/db_impl_readonly.h index 9bdc95cc87..23816210dc 100644 --- a/thirdparty/rocksdb/db/db_impl_readonly.h +++ b/thirdparty/rocksdb/db/db_impl_readonly.h @@ -7,9 +7,9 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" -#include #include +#include +#include "db/db_impl.h" namespace rocksdb { @@ -36,46 +36,49 @@ class DBImplReadOnly : public DBImpl { std::vector* iterators) override; using DBImpl::Put; - virtual Status Put(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Merge; - virtual Status Merge(const WriteOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) override { + virtual Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/, const Slice& /*value*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::Delete; - virtual Status Delete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::SingleDelete; - virtual Status SingleDelete(const WriteOptions& options, - ColumnFamilyHandle* column_family, - const Slice& key) override { + virtual Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status Write(const WriteOptions& options, - WriteBatch* updates) override { + virtual Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactRange; - virtual Status CompactRange(const CompactRangeOptions& options, - ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) override { + virtual Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) override { return Status::NotSupported("Not supported operation in read only mode."); } using DBImpl::CompactFiles; virtual Status CompactFiles( - const CompactionOptions& compact_options, - ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id = -1) override { + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { return Status::NotSupported("Not supported 
operation in read only mode."); } @@ -83,18 +86,19 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status EnableFileDeletions(bool force) override { + virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status GetLiveFiles(std::vector&, + virtual Status GetLiveFiles(std::vector& ret, uint64_t* manifest_file_size, - bool flush_memtable = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); } using DBImpl::Flush; - virtual Status Flush(const FlushOptions& options, - ColumnFamilyHandle* column_family) override { + virtual Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -105,9 +109,9 @@ class DBImplReadOnly : public DBImpl { using DB::IngestExternalFile; virtual Status IngestExternalFile( - ColumnFamilyHandle* column_family, - const std::vector& external_files, - const IngestExternalFileOptions& ingestion_options) override { + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported operation in read only mode."); } @@ -118,6 +122,6 @@ class DBImplReadOnly : public DBImpl { DBImplReadOnly(const DBImplReadOnly&); void operator=(const DBImplReadOnly&); }; -} +} // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/db_impl_secondary.cc b/thirdparty/rocksdb/db/db_impl_secondary.cc new file mode 100644 index 0000000000..acc952524b --- /dev/null +++ b/thirdparty/rocksdb/db/db_impl_secondary.cc @@ -0,0 +1,356 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_impl_secondary.h" +#include "db/db_iter.h" +#include "db/merge_context.h" +#include "monitoring/perf_context_imp.h" +#include "util/auto_roll_logger.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +DBImplSecondary::DBImplSecondary(const DBOptions& db_options, + const std::string& dbname) + : DBImpl(db_options, dbname) { + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "Opening the db in secondary mode"); + LogFlush(immutable_db_options_.info_log); +} + +DBImplSecondary::~DBImplSecondary() {} + +Status DBImplSecondary::Recover( + const std::vector& column_families, + bool /*readonly*/, bool /*error_if_log_file_exist*/, + bool /*error_if_data_exists_in_logs*/) { + mutex_.AssertHeld(); + + Status s; + s = static_cast(versions_.get()) + ->Recover(column_families, &manifest_reader_, &manifest_reporter_, + &manifest_reader_status_); + if (!s.ok()) { + return s; + } + if (immutable_db_options_.paranoid_checks && s.ok()) { + s = CheckConsistency(); + } + // Initial max_total_in_memory_state_ before recovery logs. 
+ max_total_in_memory_state_ = 0; + for (auto cfd : *versions_->GetColumnFamilySet()) { + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + } + if (s.ok()) { + default_cf_handle_ = new ColumnFamilyHandleImpl( + versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_); + default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats(); + single_column_family_mode_ = + versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1; + } + + // TODO: attempt to recover from WAL files. + return s; +} + +// Implementation of the DB interface +Status DBImplSecondary::Get(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value); +} + +Status DBImplSecondary::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, env_); + StopWatch sw(env_, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + auto cfh = static_cast(column_family); + ColumnFamilyData* cfd = cfh->cfd(); + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Get(column_family, key); + } + } + // Acquire SuperVersion + SuperVersion* super_version = GetAndRefSuperVersion(cfd); + SequenceNumber snapshot = versions_->LastSequence(); + MergeContext merge_context; + SequenceNumber max_covering_tombstone_seq = 0; + Status s; + LookupKey lkey(key, snapshot); + PERF_TIMER_STOP(get_snapshot_time); + + bool done = false; + if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } else if ((s.ok() || s.IsMergeInProgress()) && + super_version->imm->Get( + lkey, pinnable_val->GetSelf(), &s, &merge_context, + &max_covering_tombstone_seq, read_options)) { + done = true; + pinnable_val->PinSelf(); + RecordTick(stats_, MEMTABLE_HIT); + } + if (!done && !s.ok() && !s.IsMergeInProgress()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + if (!done) { + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, pinnable_val, &s, + &merge_context, &max_covering_tombstone_seq); + RecordTick(stats_, MEMTABLE_MISS); + } + { + PERF_TIMER_GUARD(get_post_process_time); + ReturnAndCleanupSuperVersion(cfd, super_version); + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = pinnable_val->size(); + RecordTick(stats_, BYTES_READ, size); + RecordTimeToHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } + return s; +} + +Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family) { + if (read_options.managed) { + return NewErrorIterator( + Status::NotSupported("Managed iterator is not supported anymore.")); + } + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } + Iterator* result = nullptr; + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + ReadCallback* read_callback = nullptr; // No read callback provided. 
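The read path above mirrors the primary's get logic; the user-facing flow for this new secondary mode is DB::OpenAsSecondary() followed by periodic TryCatchUpWithPrimary() calls, both added in this patch. A minimal usage sketch, assuming only the standard DB::Get() beyond what the patch shows; the paths are placeholders:

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // OpenAsSecondary() rejects anything else
  rocksdb::DB* db = nullptr;
  // The second path is private to the secondary (info log, etc.) and must
  // not collide with the primary's directory.
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary_db", "/path/to/secondary_dir", &db);
  assert(s.ok());
  // Replay MANIFEST updates written by the primary since we last caught up.
  s = db->TryCatchUpWithPrimary();
  assert(s.ok());
  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
  if (s.ok()) {
    std::cout << "some_key => " << value << std::endl;
  }
  delete db;
  return 0;
}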
+ if (read_options.tailing) { + return NewErrorIterator(Status::NotSupported( + "tailing iterator not supported in secondary mode")); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. + return NewErrorIterator( + Status::NotSupported("snapshot not supported in secondary mode")); + } else { + auto snapshot = versions_->LastSequence(); + result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + } + return result; +} + +ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, + SequenceNumber snapshot, ReadCallback* read_callback) { + assert(nullptr != cfd); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(&mutex_); + auto db_iter = NewArenaWrappedDbIterator( + env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options, + snapshot, + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number, read_callback); + auto internal_iter = + NewInternalIterator(read_options, cfd, super_version, db_iter->GetArena(), + db_iter->GetRangeDelAggregator(), snapshot); + db_iter->SetIterUnderDBIter(internal_iter); + return db_iter; +} + +Status DBImplSecondary::NewIterators( + const ReadOptions& read_options, + const std::vector& column_families, + std::vector* iterators) { + if (read_options.managed) { + return Status::NotSupported("Managed iterator is not supported anymore."); + } + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } + ReadCallback* read_callback = nullptr; // No read callback provided. + if (iterators == nullptr) { + return Status::InvalidArgument("iterators not allowed to be nullptr"); + } + iterators->clear(); + iterators->reserve(column_families.size()); + if (read_options.tailing) { + return Status::NotSupported( + "tailing iterator not supported in secondary mode"); + } else if (read_options.snapshot != nullptr) { + // TODO (yanqin) support snapshot. 
+ return Status::NotSupported("snapshot not supported in secondary mode"); + } else { + SequenceNumber read_seq = versions_->LastSequence(); + for (auto cfh : column_families) { + ColumnFamilyData* cfd = static_cast(cfh)->cfd(); + iterators->push_back( + NewIteratorImpl(read_options, cfd, read_seq, read_callback)); + } + } + return Status::OK(); +} + +Status DBImplSecondary::TryCatchUpWithPrimary() { + assert(versions_.get() != nullptr); + assert(manifest_reader_.get() != nullptr); + Status s; + std::unordered_set cfds_changed; + InstrumentedMutexLock lock_guard(&mutex_); + s = static_cast(versions_.get()) + ->ReadAndApply(&mutex_, &manifest_reader_, &cfds_changed); + if (s.ok()) { + SuperVersionContext sv_context(true /* create_superversion */); + for (auto cfd : cfds_changed) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &mutex_); + } + sv_context.Clean(); + } + return s; +} + +Status DB::OpenAsSecondary(const Options& options, const std::string& dbname, + const std::string& secondary_path, DB** dbptr) { + *dbptr = nullptr; + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options); + std::vector handles; + + Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path, + column_families, &handles, dbptr); + if (s.ok()) { + assert(handles.size() == 1); + delete handles[0]; + } + return s; +} + +Status DB::OpenAsSecondary( + const DBOptions& db_options, const std::string& dbname, + const std::string& secondary_path, + const std::vector& column_families, + std::vector* handles, DB** dbptr) { + *dbptr = nullptr; + if (db_options.max_open_files != -1) { + // TODO (yanqin) maybe support max_open_files != -1 by creating hard links + // on SST files so that db secondary can still have access to old SSTs + // while primary instance may delete original. 
+ return Status::InvalidArgument("require max_open_files to be -1"); + } + + DBOptions tmp_opts(db_options); + if (nullptr == tmp_opts.info_log) { + Env* env = tmp_opts.env; + assert(env != nullptr); + std::string secondary_abs_path; + env->GetAbsolutePath(secondary_path, &secondary_abs_path); + std::string fname = InfoLogFileName(secondary_path, secondary_abs_path, + tmp_opts.db_log_dir); + + env->CreateDirIfMissing(secondary_path); + if (tmp_opts.log_file_time_to_roll > 0 || tmp_opts.max_log_file_size > 0) { + AutoRollLogger* result = new AutoRollLogger( + env, secondary_path, tmp_opts.db_log_dir, tmp_opts.max_log_file_size, + tmp_opts.log_file_time_to_roll, tmp_opts.info_log_level); + Status s = result->GetStatus(); + if (!s.ok()) { + delete result; + } else { + tmp_opts.info_log.reset(result); + } + } + if (nullptr == tmp_opts.info_log) { + env->RenameFile( + fname, OldInfoLogFileName(secondary_path, env->NowMicros(), + secondary_abs_path, tmp_opts.db_log_dir)); + Status s = env->NewLogger(fname, &(tmp_opts.info_log)); + if (tmp_opts.info_log != nullptr) { + tmp_opts.info_log->SetInfoLogLevel(tmp_opts.info_log_level); + } + } + } + + assert(tmp_opts.info_log != nullptr); + + handles->clear(); + DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname); + impl->versions_.reset(new ReactiveVersionSet( + dbname, &impl->immutable_db_options_, impl->env_options_, + impl->table_cache_.get(), impl->write_buffer_manager_, + &impl->write_controller_)); + impl->column_family_memtables_.reset( + new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet())); + impl->mutex_.Lock(); + Status s = impl->Recover(column_families, true, false, false); + if (s.ok()) { + for (auto cf : column_families) { + auto cfd = + impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name); + if (nullptr == cfd) { + s = Status::InvalidArgument("Column family not found: ", cf.name); + break; + } + handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + } + } + SuperVersionContext sv_context(true /* create_superversion */); + if (s.ok()) { + for (auto cfd : *impl->versions_->GetColumnFamilySet()) { + sv_context.NewSuperVersion(); + cfd->InstallSuperVersion(&sv_context, &impl->mutex_); + } + } + impl->mutex_.Unlock(); + sv_context.Clean(); + if (s.ok()) { + *dbptr = impl; + for (auto h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } + } else { + for (auto h : *handles) { + delete h; + } + handles->clear(); + delete impl; + } + return s; +} +#else // !ROCKSDB_LITE + +Status DB::OpenAsSecondary(const Options& /*options*/, + const std::string& /*name*/, + const std::string& /*secondary_path*/, + DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenAsSecondary( + const DBOptions& /*db_options*/, const std::string& /*dbname*/, + const std::string& /*secondary_path*/, + const std::vector& /*column_families*/, + std::vector* /*handles*/, DB** /*dbptr*/) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_impl_secondary.h b/thirdparty/rocksdb/db/db_impl_secondary.h new file mode 100644 index 0000000000..1b6746f7e4 --- /dev/null +++ b/thirdparty/rocksdb/db/db_impl_secondary.h @@ -0,0 +1,151 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "db/db_impl.h" + +namespace rocksdb { + +class DBImplSecondary : public DBImpl { + public: + DBImplSecondary(const DBOptions& options, const std::string& dbname); + ~DBImplSecondary() override; + + Status Recover(const std::vector& column_families, + bool read_only, bool error_if_log_file_exist, + bool error_if_data_exists_in_logs) override; + + // Implementations of the DB interface + using DB::Get; + Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value); + + using DBImpl::NewIterator; + Iterator* NewIterator(const ReadOptions&, + ColumnFamilyHandle* column_family) override; + + ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SequenceNumber snapshot, + ReadCallback* read_callback); + + Status NewIterators(const ReadOptions& options, + const std::vector& column_families, + std::vector* iterators) override; + + using DBImpl::Put; + Status Put(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Merge; + Status Merge(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/, + const Slice& /*value*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Delete; + Status Delete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SingleDelete; + Status SingleDelete(const WriteOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice& /*key*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status Write(const WriteOptions& /*options*/, + WriteBatch* /*updates*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactRange; + Status CompactRange(const CompactRangeOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::CompactFiles; + Status CompactFiles( + const CompactionOptions& /*compact_options*/, + ColumnFamilyHandle* /*column_family*/, + const std::vector& /*input_file_names*/, + const int /*output_level*/, const int /*output_path_id*/ = -1, + std::vector* const /*output_file_names*/ = nullptr, + CompactionJobInfo* /*compaction_job_info*/ = nullptr) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status DisableFileDeletions() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status EnableFileDeletions(bool /*force*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + Status GetLiveFiles(std::vector&, + uint64_t* /*manifest_file_size*/, + bool 
/*flush_memtable*/ = true) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::Flush; + Status Flush(const FlushOptions& /*options*/, + ColumnFamilyHandle* /*column_family*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DBImpl::SyncWAL; + Status SyncWAL() override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::IngestExternalFile; + Status IngestExternalFile( + ColumnFamilyHandle* /*column_family*/, + const std::vector<std::string>& /*external_files*/, + const IngestExternalFileOptions& /*ingestion_options*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + // Try to catch up with the primary by reading as much as possible from the + // log files until there is nothing more to read or an error is encountered. If + // the amount of information in the log files to process is huge, this + // method can take a long time due to all the I/O and CPU costs. + Status TryCatchUpWithPrimary() override; + + private: + friend class DB; + + // No copying allowed + DBImplSecondary(const DBImplSecondary&); + void operator=(const DBImplSecondary&); + + using DBImpl::Recover; + + std::unique_ptr<log::FragmentBufferedReader> manifest_reader_; + std::unique_ptr<log::Reader::Reporter> manifest_reporter_; + std::unique_ptr<Status> manifest_reader_status_; +}; +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/db_impl_write.cc b/thirdparty/rocksdb/db/db_impl_write.cc index 8a11948f7e..21a9378d21 100644 --- a/thirdparty/rocksdb/db/db_impl_write.cc +++ b/thirdparty/rocksdb/db/db_impl_write.cc @@ -12,6 +12,7 @@ #define __STDC_FORMAT_MACROS #endif #include <inttypes.h> +#include "db/error_handler.h" #include "db/event_helpers.h" #include "monitoring/perf_context_imp.h" #include "options/options_helper.h" @@ -45,6 +46,11 @@ Status DBImpl::SingleDelete(const WriteOptions& write_options, return DB::SingleDelete(write_options, column_family, key); } +void DBImpl::SetRecoverableStatePreReleaseCallback( + PreReleaseCallback* callback) { + recoverable_state_pre_release_callback_.reset(callback); +} + Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { return WriteImpl(write_options, my_batch, nullptr, nullptr); } @@ -57,17 +63,40 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options, } #endif // ROCKSDB_LITE +// The main write queue. This is the only write queue that updates LastSequence. +// When using one write queue, the same sequence also indicates the last +// published sequence.
Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - bool disable_memtable, uint64_t* seq_used) { + bool disable_memtable, uint64_t* seq_used, + size_t batch_cnt, + PreReleaseCallback* pre_release_callback) { + assert(!seq_per_batch_ || batch_cnt != 0); if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } - if (concurrent_prepare_ && immutable_db_options_.enable_pipelined_write) { + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_) { + tracer_->Write(my_batch); + } + } + if (write_options.sync && write_options.disableWAL) { + return Status::InvalidArgument("Sync writes has to enable WAL."); + } + if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) { return Status::NotSupported( "pipelined_writes is not compatible with concurrent prepares"); } + if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) { + // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt + return Status::NotSupported( + "pipelined_writes is not compatible with seq_per_batch"); + } + // Otherwise IsLatestPersistentState optimization does not make sense + assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) || + disable_memtable); Status status; if (write_options.low_pri) { @@ -77,9 +106,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } } - if (concurrent_prepare_ && disable_memtable) { + if (two_write_queues_ && disable_memtable) { return WriteImplWALOnly(write_options, my_batch, callback, log_used, - log_ref, seq_used); + log_ref, seq_used, batch_cnt, pre_release_callback); } if (immutable_db_options_.enable_pipelined_write) { @@ -89,7 +118,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - disable_memtable); + disable_memtable, batch_cnt, pre_release_callback); if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); @@ -100,19 +129,24 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, write_thread_.JoinBatchGroup(&w); if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) { // we are a non-leader in a parallel group - PERF_TIMER_GUARD(write_memtable_time); if (w.ShouldWriteToMemtable()) { + PERF_TIMER_STOP(write_pre_and_post_process_time); + PERF_TIMER_GUARD(write_memtable_time); + ColumnFamilyMemTablesImpl column_family_memtables( versions_->GetColumnFamilySet()); w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*concurrent_memtable_writes*/); + true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt); + + PERF_TIMER_START(write_pre_and_post_process_time); } if (write_thread_.CompleteParallelMemTableWriter(&w)) { // we're responsible for exit batch group + // TODO(myabandeh): propagate status to write_group auto last_sequence = w.write_group->last_sequence; versions_->SetLastSequence(last_sequence); MemTableInsertStatusCheck(w.status); @@ -144,19 +178,25 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, WriteThread::WriteGroup write_group; bool in_parallel_group = false; uint64_t last_sequence = kMaxSequenceNumber; - if (!concurrent_prepare_) { + if (!two_write_queues_) { last_sequence = versions_->LastSequence(); } mutex_.Lock(); - bool need_log_sync = !write_options.disableWAL && 
write_options.sync; + bool need_log_sync = write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; - if (!concurrent_prepare_ || !disable_memtable) { + if (!two_write_queues_ || !disable_memtable) { // With concurrent writes we do preprocess only in the write thread that // also does write to memtable to avoid sync issue on shared data structure // with the other thread + + // PreprocessWrite does its own perf timing. + PERF_TIMER_STOP(write_pre_and_post_process_time); + status = PreprocessWrite(write_options, &need_log_sync, &write_context); + + PERF_TIMER_START(write_pre_and_post_process_time); } log::Writer* log_writer = logs_.back().writer; @@ -167,6 +207,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // and protects against concurrent loggers and concurrent writes // into memtables + TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters"); last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(&w, &write_group); @@ -184,10 +225,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // more than once to a particular key. bool parallel = immutable_db_options_.allow_concurrent_memtable_write && write_group.size > 1; - int total_count = 0; - uint64_t total_byte_size = 0; + size_t total_count = 0; + size_t valid_batches = 0; + size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { + valid_batches += writer->batch_cnt; if (writer->ShouldWriteToMemtable()) { total_count += WriteBatchInternal::Count(writer->batch); parallel = parallel && !writer->batch->HasMerge(); @@ -197,8 +240,15 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); } } - - const bool concurrent_update = concurrent_prepare_; + // Note about seq_per_batch_: either disableWAL is set for the entire write + // group or not. In either case we inc seq for each write batch with no + // failed callback. This means that there could be a batch with + // disable_memtable in between; although we do not write this batch to + // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc + // the seq per valid written key to mem. + size_t seq_inc = seq_per_batch_ ? valid_batches : total_count; + + const bool concurrent_update = two_write_queues_; // Update stats while we are an exclusive group leader, so we know // that nobody else can be writing to these particular stats.
// We're optimistic, updating the stats before we successfully @@ -218,7 +268,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } - MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); if (write_options.disableWAL) { has_unpersisted_data_.store(true, std::memory_order_relaxed); @@ -226,7 +276,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_STOP(write_pre_and_post_process_time); - if (!concurrent_prepare_) { + if (!two_write_queues_) { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); status = WriteToWAL(write_group, log_writer, log_used, need_log_sync, @@ -235,38 +285,60 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } else { if (status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); - // LastToBeWrittenSequence is increased inside WriteToWAL under + // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, - total_count); + seq_inc); } else { // Otherwise we inc seq number for memtable writes - last_sequence = versions_->FetchAddLastToBeWrittenSequence(total_count); + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); } } assert(last_sequence != kMaxSequenceNumber); const SequenceNumber current_sequence = last_sequence + 1; - last_sequence += total_count; + last_sequence += seq_inc; + + // PreReleaseCallback is called after WAL write and before memtable write + if (status.ok()) { + SequenceNumber next_sequence = current_sequence; + // Note: the logic for advancing seq here must be consistent with the + // logic in WriteBatchInternal::InsertInto(write_group...) as well as + // with WriteBatchInternal::InsertInto(write_batch...) that is called on + // the merged batch during recovery from the WAL. 
+ for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + writer->sequence = next_sequence; + if (writer->pre_release_callback) { + Status ws = writer->pre_release_callback->Callback( + writer->sequence, disable_memtable, writer->log_used); + if (!ws.ok()) { + status = ws; + break; + } + } + if (seq_per_batch_) { + assert(writer->batch_cnt); + next_sequence += writer->batch_cnt; + } else if (writer->ShouldWriteToMemtable()) { + next_sequence += WriteBatchInternal::Count(writer->batch); + } + } + } if (status.ok()) { PERF_TIMER_GUARD(write_memtable_time); if (!parallel) { + // w.sequence will be set inside InsertInto w.status = WriteBatchInternal::InsertInto( write_group, current_sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*recovery_log_number*/, this); + 0 /*recovery_log_number*/, this, parallel, seq_per_batch_, + batch_per_txn_); } else { - SequenceNumber next_sequence = current_sequence; - for (auto* writer : write_group) { - if (writer->ShouldWriteToMemtable()) { - writer->sequence = next_sequence; - next_sequence += WriteBatchInternal::Count(writer->batch); - } - } write_group.last_sequence = last_sequence; - write_group.running.store(static_cast(write_group.size), - std::memory_order_relaxed); write_thread_.LaunchParallelMemTableWriters(&write_group); in_parallel_group = true; @@ -279,27 +351,28 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, w.status = WriteBatchInternal::InsertInto( &w, w.sequence, &column_family_memtables, &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, - this, true /*concurrent_memtable_writes*/); - } - if (seq_used != nullptr) { - *seq_used = w.sequence; + this, true /*concurrent_memtable_writes*/, seq_per_batch_, + w.batch_cnt, batch_per_txn_); } } + if (seq_used != nullptr) { + *seq_used = w.sequence; + } } } PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteCallbackStatusCheck(status); + WriteStatusCheck(status); } if (need_log_sync) { mutex_.Lock(); MarkLogsSynced(logfile_number_, need_log_dir_sync, status); mutex_.Unlock(); - // Requesting sync with concurrent_prepare_ is expected to be very rare. We - // hance provide a simple implementation that is not necessarily efficient. - if (concurrent_prepare_) { + // Requesting sync with two_write_queues_ is expected to be very rare. We + // hence provide a simple implementation that is not necessarily efficient. + if (two_write_queues_) { if (manual_wal_flush_) { status = FlushWAL(true); } else { @@ -316,10 +389,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (should_exit_batch_group) { if (status.ok()) { + // Note: if we are to resume after non-OK statuses we need to revisit how + // we react to non-OK statuses here. versions_->SetLastSequence(last_sequence); } MemTableInsertStatusCheck(w.status); - write_thread_.ExitAsBatchGroupLeader(write_group, w.status); + write_thread_.ExitAsBatchGroupLeader(write_group, status); } if (status.ok()) { @@ -348,7 +423,10 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, mutex_.Lock(); bool need_log_sync = !write_options.disableWAL && write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; + // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time); w.status = PreprocessWrite(write_options, &need_log_sync, &write_context); + PERF_TIMER_START(write_pre_and_post_process_time); log::Writer* log_writer = logs_.back().writer; mutex_.Unlock(); @@ -385,10 +463,11 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); stats->AddDBStats(InternalStats::BYTES_WRITTEN, total_byte_size); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); PERF_TIMER_STOP(write_pre_and_post_process_time); - if (w.ShouldWriteToWAL()) { + if (w.status.ok() && !write_options.disableWAL) { PERF_TIMER_GUARD(write_wal_time); stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1); RecordTick(stats_, WRITE_DONE_BY_SELF, 1); @@ -402,7 +481,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (!w.CallbackFailed()) { - WriteCallbackStatusCheck(w.status); + WriteStatusCheck(w.status); } if (need_log_sync) { @@ -417,7 +496,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, WriteThread::WriteGroup memtable_write_group; if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) { PERF_TIMER_GUARD(write_memtable_time); - assert(w.status.ok()); + assert(w.ShouldWriteToMemtable()); write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group); if (memtable_write_group.size > 1 && immutable_db_options_.allow_concurrent_memtable_write) { @@ -426,7 +505,8 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, memtable_write_group.status = WriteBatchInternal::InsertInto( memtable_write_group, w.sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, - 0 /*log_number*/, this); + 0 /*log_number*/, this, false /*concurrent_memtable_writes*/, + seq_per_batch_, batch_per_txn_); versions_->SetLastSequence(memtable_write_group.last_sequence); write_thread_.ExitAsMemTableWriter(&w, memtable_write_group); } @@ -454,17 +534,19 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, return w.FinalStatus(); } +// The 2nd write queue. If enabled it will be used only for WAL-only writes. +// This is the only queue that updates LastPublishedSequence which is only +// applicable in a two-queue setting. 
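How this second queue is enabled from the outside, as a hedged sketch (it assumes a RocksDB build where the DBOptions field carries the same two_write_queues name that this patch introduces internally):

#include <string>
#include "rocksdb/db.h"

rocksdb::Status OpenWithTwoWriteQueues(const std::string& path,
                                       rocksdb::DB** db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  // WAL-only writes (e.g. two-phase-commit prepares) are then routed
  // through the queue served by WriteImplWALOnly, which follows.
  options.two_write_queues = true;
  return rocksdb::DB::Open(options, path, db);
}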
Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used, uint64_t log_ref, - uint64_t* seq_used) { + uint64_t* seq_used, size_t batch_cnt, + PreReleaseCallback* pre_release_callback) { Status status; PERF_TIMER_GUARD(write_pre_and_post_process_time); WriteThread::Writer w(write_options, my_batch, callback, log_ref, - true /* disable_memtable */); - if (write_options.disableWAL) { - return status; - } + true /* disable_memtable */, batch_cnt, + pre_release_callback); RecordTick(stats_, WRITE_WITH_WAL); StopWatch write_sw(env_, immutable_db_options_.statistics.get(), DB_WRITE); @@ -481,14 +563,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); - WriteContext write_context; WriteThread::WriteGroup write_group; uint64_t last_sequence; nonmem_write_thread_.EnterAsBatchGroupLeader(&w, &write_group); // Note: no need to update last_batch_group_size_ here since the batch writes // to WAL only - uint64_t total_byte_size = 0; + size_t total_byte_size = 0; for (auto* writer : write_group) { if (writer->CheckCallback(this)) { total_byte_size = WriteBatchInternal::AppendedByteSize( @@ -513,24 +594,44 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, concurrent_update); RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other); } - MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); + RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size); PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_wal_time); - // LastToBeWrittenSequence is increased inside WriteToWAL under + // LastAllocatedSequence is increased inside WriteToWAL under // wal_write_mutex_ to ensure ordered events in WAL - status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, - 0 /*total_count*/); + size_t seq_inc = 0 /* total_count */; + if (seq_per_batch_) { + size_t total_batch_cnt = 0; + for (auto* writer : write_group) { + assert(writer->batch_cnt); + total_batch_cnt += writer->batch_cnt; + } + seq_inc = total_batch_cnt; + } + if (!write_options.disableWAL) { + status = + ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc); + } else { + // Otherwise we inc seq number to do solely the seq allocation + last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); + } auto curr_seq = last_sequence + 1; for (auto* writer : write_group) { - if (writer->CheckCallback(this)) { - writer->sequence = curr_seq; - curr_seq += WriteBatchInternal::Count(writer->batch); + if (writer->CallbackFailed()) { + continue; } + writer->sequence = curr_seq; + if (seq_per_batch_) { + assert(writer->batch_cnt); + curr_seq += writer->batch_cnt; + } + // else seq advances only by memtable writes } if (status.ok() && write_options.sync) { - // Requesting sync with concurrent_prepare_ is expected to be very rare. We + assert(!write_options.disableWAL); + // Requesting sync with two_write_queues_ is expected to be very rare. We // hence provide a simple implementation that is not necessarily efficient.
if (manual_wal_flush_) { status = FlushWAL(true); @@ -541,9 +642,23 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, PERF_TIMER_START(write_pre_and_post_process_time); if (!w.CallbackFailed()) { - WriteCallbackStatusCheck(status); + WriteStatusCheck(status); } - nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, w.status); + if (status.ok()) { + for (auto* writer : write_group) { + if (!writer->CallbackFailed() && writer->pre_release_callback) { + assert(writer->sequence != kMaxSequenceNumber); + const bool DISABLE_MEMTABLE = true; + Status ws = writer->pre_release_callback->Callback( + writer->sequence, DISABLE_MEMTABLE, writer->log_used); + if (!ws.ok()) { + status = ws; + break; + } + } + } + } + nonmem_write_thread_.ExitAsBatchGroupLeader(write_group, status); if (status.ok()) { status = w.FinalStatus(); } @@ -553,22 +668,13 @@ Status DBImpl::WriteImplWALOnly(const WriteOptions& write_options, return status; } -void DBImpl::WriteCallbackStatusCheck(const Status& status) { +void DBImpl::WriteStatusCheck(const Status& status) { // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. if (immutable_db_options_.paranoid_checks && !status.ok() && !status.IsBusy() && !status.IsIncomplete()) { mutex_.Lock(); - if (bg_error_.ok()) { - Status new_bg_error = status; - // may temporarily unlock and lock the mutex. - EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kWriteCallback, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; // stop compaction & fail any further writes - } - } + error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback); mutex_.Unlock(); } } @@ -581,15 +687,8 @@ void DBImpl::MemTableInsertStatusCheck(const Status& status) { // ignore_missing_column_families. if (!status.ok()) { mutex_.Lock(); - assert(bg_error_.ok()); - Status new_bg_error = status; - // may temporarily unlock and lock the mutex. 
- EventHelpers::NotifyOnBackgroundError(immutable_db_options_.listeners, - BackgroundErrorReason::kMemTable, - &new_bg_error, &mutex_); - if (!new_bg_error.ok()) { - bg_error_ = new_bg_error; // stop compaction & fail any further writes - } + assert(!error_handler_.IsBGWorkStopped()); + error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable); mutex_.Unlock(); } } @@ -601,11 +700,17 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, assert(write_context != nullptr && need_log_sync != nullptr); Status status; + if (error_handler_.IsDBStopped()) { + status = error_handler_.GetBGError(); + } + + PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time); + assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); if (UNLIKELY(status.ok() && !single_column_family_mode_ && total_log_size_ > GetMaxTotalWalSize())) { - status = HandleWALFull(write_context); + status = SwitchWAL(write_context); } if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) { @@ -617,22 +722,23 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, status = HandleWriteBufferFull(write_context); } - if (UNLIKELY(status.ok() && !bg_error_.ok())) { - return bg_error_; - } - if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { status = ScheduleFlushes(write_context); } + PERF_TIMER_STOP(write_scheduling_flushes_compactions_time); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + if (UNLIKELY(status.ok() && (write_controller_.IsStopped() || write_controller_.NeedsDelay()))) { + PERF_TIMER_STOP(write_pre_and_post_process_time); PERF_TIMER_GUARD(write_delay_time); // We don't know the size of the current batch so we always use the size // of the previous one. It might create a fairness issue that expiration // might happen for smaller writes but larger writes can go through. // Can optimize it if it is an issue. status = DelayWrite(last_batch_group_size_, write_options); + PERF_TIMER_START(write_pre_and_post_process_time); } if (status.ok() && *need_log_sync) { @@ -663,18 +769,24 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, } WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, - WriteBatch* tmp_batch, size_t* write_with_wal) { + WriteBatch* tmp_batch, size_t* write_with_wal, + WriteBatch** to_be_cached_state) { assert(write_with_wal != nullptr); assert(tmp_batch != nullptr); + assert(*to_be_cached_state == nullptr); WriteBatch* merged_batch = nullptr; *write_with_wal = 0; auto* leader = write_group.leader; + assert(!leader->disable_wal); // Same holds for all in the batch group if (write_group.size == 1 && !leader->CallbackFailed() && leader->batch->GetWalTerminationPoint().is_cleared()) { // we simply write the first WriteBatch to WAL if the group only // contains one batch, that batch should be written to the WAL, // and the batch does not want to be truncated merged_batch = leader->batch; + if (WriteBatchInternal::IsLatestPersistentState(merged_batch)) { + *to_be_cached_state = merged_batch; + } *write_with_wal = 1; } else { // WAL needs all of the batches flattened into a single batch.
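The GetWalTerminationPoint().is_cleared() test above is what keeps partially-logged batches off the single-batch fast path. A sketch of how such a batch arises through the public WriteBatch API (the keys are illustrative):

#include "rocksdb/write_batch.h"

rocksdb::WriteBatch MakePartiallyLoggedBatch() {
  rocksdb::WriteBatch batch;
  batch.Put("durable_key", "written to both WAL and memtable");
  // Entries appended after this marker are applied to the memtable but are
  // not persisted to the WAL, so the batch carries a non-cleared
  // termination point and cannot be handed to the WAL as-is by MergeBatch.
  batch.MarkWalTerminationPoint();
  batch.Put("volatile_key", "memtable only");
  return batch;
}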
@@ -682,9 +794,13 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, // interface merged_batch = tmp_batch; for (auto writer : write_group) { - if (writer->ShouldWriteToWAL()) { + if (!writer->CallbackFailed()) { WriteBatchInternal::Append(merged_batch, writer->batch, /*WAL_only*/ true); + if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) { + // We only need to cache the last of such write batch + *to_be_cached_state = writer->batch; + } (*write_with_wal)++; } } @@ -692,7 +808,7 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group, return merged_batch; } -// When concurrent_prepare_ is disabled, this function is called from the only +// When two_write_queues_ is disabled, this function is called from the only // write thread. Otherwise this must be called holding log_write_mutex_. Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer, uint64_t* log_used, @@ -700,7 +816,21 @@ Status DBImpl::WriteToWAL(const WriteBatch& merged_batch, assert(log_size != nullptr); Slice log_entry = WriteBatchInternal::Contents(&merged_batch); *log_size = log_entry.size(); + // When two_write_queues_ is set, WriteToWAL has to be protected from concurrent calls + // from the two queues anyway and log_write_mutex_ is already held. Otherwise + // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord + // from possible concurrent calls via the FlushWAL by the application. + const bool needs_locking = manual_wal_flush_ && !two_write_queues_; + // Due to performance concerns of missed branch prediction, penalize the new + // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case + // when we do not need any locking. + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Lock(); + } Status status = log_writer->AddRecord(log_entry); + if (UNLIKELY(needs_locking)) { + log_write_mutex_.Unlock(); + } if (log_used != nullptr) { *log_used = logfile_number_; } @@ -718,9 +848,12 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, SequenceNumber sequence) { Status status; + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group size_t write_with_wal = 0; - WriteBatch* merged_batch = - MergeBatch(write_group, &tmp_batch_, &write_with_wal); + WriteBatch* to_be_cached_state = nullptr; + WriteBatch* merged_batch = MergeBatch(write_group, &tmp_batch_, + &write_with_wal, &to_be_cached_state); if (merged_batch == write_group.leader->batch) { write_group.leader->log_used = logfile_number_; } else if (write_with_wal > 1) { @@ -733,6 +866,10 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, uint64_t log_size; status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } if (status.ok() && need_log_sync) { StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); @@ -777,13 +914,16 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group, Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, uint64_t* log_used, SequenceNumber* last_sequence, - int total_count) { + size_t seq_inc) { Status status; + assert(!write_group.leader->disable_wal); + // Same holds for all in the batch group WriteBatch tmp_batch; size_t write_with_wal = 0; + WriteBatch* to_be_cached_state = nullptr; WriteBatch* merged_batch = - MergeBatch(write_group, &tmp_batch, &write_with_wal); + MergeBatch(write_group,
&tmp_batch, &write_with_wal, &to_be_cached_state); // We need to lock log_write_mutex_ since logs_ and alive_log_files might be // pushed back concurrently @@ -795,13 +935,17 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, writer->log_used = logfile_number_; } } - *last_sequence = versions_->FetchAddLastToBeWrittenSequence(total_count); + *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc); auto sequence = *last_sequence + 1; WriteBatchInternal::SetSequence(merged_batch, sequence); log::Writer* log_writer = logs_.back().writer; uint64_t log_size; status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size); + if (to_be_cached_state) { + cached_recoverable_state_ = *to_be_cached_state; + cached_recoverable_state_empty_ = false; + } log_write_mutex_.Unlock(); if (status.ok()) { @@ -816,7 +960,76 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group, return status; } -Status DBImpl::HandleWALFull(WriteContext* write_context) { +Status DBImpl::WriteRecoverableState() { + mutex_.AssertHeld(); + if (!cached_recoverable_state_empty_) { + bool dont_care_bool; + SequenceNumber next_seq; + if (two_write_queues_) { + log_write_mutex_.Lock(); + } + SequenceNumber seq; + if (two_write_queues_) { + seq = versions_->FetchAddLastAllocatedSequence(0); + } else { + seq = versions_->LastSequence(); + } + WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1); + auto status = WriteBatchInternal::InsertInto( + &cached_recoverable_state_, column_family_memtables_.get(), + &flush_scheduler_, true, 0 /*recovery_log_number*/, this, + false /* concurrent_memtable_writes */, &next_seq, &dont_care_bool, + seq_per_batch_); + auto last_seq = next_seq - 1; + if (two_write_queues_) { + versions_->FetchAddLastAllocatedSequence(last_seq - seq); + versions_->SetLastPublishedSequence(last_seq); + } + versions_->SetLastSequence(last_seq); + if (two_write_queues_) { + log_write_mutex_.Unlock(); + } + if (status.ok() && recoverable_state_pre_release_callback_) { + const bool DISABLE_MEMTABLE = true; + for (uint64_t sub_batch_seq = seq + 1; + sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) { + uint64_t const no_log_num = 0; + status = recoverable_state_pre_release_callback_->Callback( + sub_batch_seq, !DISABLE_MEMTABLE, no_log_num); + } + } + if (status.ok()) { + cached_recoverable_state_.Clear(); + cached_recoverable_state_empty_ = true; + } + return status; + } + return Status::OK(); +} + +void DBImpl::SelectColumnFamiliesForAtomicFlush( + autovector* cfds) { + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + cfds->push_back(cfd); + } + } +} + +// Assign sequence number for atomic flush. 
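SelectColumnFamiliesForAtomicFlush above and the assignment helper just below are the write-path half of atomic flush; the user-facing half is sketched here under the assumption that the build exposes DBOptions::atomic_flush and the multi-column-family Flush overload (names are illustrative):

#include <vector>
#include "rocksdb/db.h"

rocksdb::Status FlushAtomically(
    rocksdb::DB* db, const std::vector<rocksdb::ColumnFamilyHandle*>& cfs) {
  // With DBOptions::atomic_flush set at open time, the memtables of all
  // listed column families are flushed together as one atomic unit.
  rocksdb::FlushOptions fo;
  fo.wait = true;  // block until the flush completes
  return db->Flush(fo, cfs);
}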
+void DBImpl::AssignAtomicFlushSeq(const autovector& cfds) { + assert(immutable_db_options_.atomic_flush); + auto seq = versions_->LastSequence(); + for (auto cfd : cfds) { + cfd->imm()->AssignAtomicFlushSeq(seq); + } +} + +Status DBImpl::SwitchWAL(WriteContext* write_context) { mutex_.AssertHeld(); assert(write_context != nullptr); Status status; @@ -826,52 +1039,78 @@ Status DBImpl::HandleWALFull(WriteContext* write_context) { } auto oldest_alive_log = alive_log_files_.begin()->number; - auto oldest_log_with_uncommited_prep = FindMinLogContainingOutstandingPrep(); - - if (allow_2pc() && - oldest_log_with_uncommited_prep > 0 && - oldest_log_with_uncommited_prep <= oldest_alive_log) { - if (unable_to_flush_oldest_log_) { + bool flush_wont_release_oldest_log = false; + if (allow_2pc()) { + auto oldest_log_with_uncommitted_prep = + logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep(); + + assert(oldest_log_with_uncommitted_prep == 0 || + oldest_log_with_uncommitted_prep >= oldest_alive_log); + if (oldest_log_with_uncommitted_prep > 0 && + oldest_log_with_uncommitted_prep == oldest_alive_log) { + if (unable_to_release_oldest_log_) { // we already attempted to flush all column families dependent on - // the oldest alive log but the log still contained uncommited transactions. - // the oldest alive log STILL contains uncommited transaction so there - // is still nothing that we can do. + // the oldest alive log but the log still contained uncommitted + // transactions so there is still nothing that we can do. return status; - } else { - ROCKS_LOG_WARN( - immutable_db_options_.info_log, - "Unable to release oldest log due to uncommited transaction"); - unable_to_flush_oldest_log_ = true; + } else { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Unable to release oldest log due to uncommitted transaction"); + unable_to_release_oldest_log_ = true; + flush_wont_release_oldest_log = true; + } } - } else { + } + if (!flush_wont_release_oldest_log) { // we only mark this log as getting flushed if we have successfully // flushed all data in this log. If this log contains outstanding prepared - transactions then we cannot flush this log until those transactions are commited. - unable_to_flush_oldest_log_ = false; + // transactions then we cannot flush this log until those transactions are + // committed. + unable_to_release_oldest_log_ = false; alive_log_files_.begin()->getting_flushed = true; } - ROCKS_LOG_INFO(immutable_db_options_.info_log, - "Flushing all column families with data in WAL number %" PRIu64 - ". Total log size is %" PRIu64 - " while max_total_wal_size is %" PRIu64, - oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Flushing all column families with data in WAL number %" PRIu64 + ". 
Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, + oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - if (cfd->OldestLogToKeep() <= oldest_alive_log) { - status = SwitchMemtable(cfd, write_context); - if (!status.ok()) { - break; + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (cfd->OldestLogToKeep() <= oldest_alive_log) { + cfds.push_back(cfd); } + } + } + for (const auto cfd : cfds) { + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->Unref(); + if (!status.ok()) { + break; + } + } + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + for (auto cfd : cfds) { cfd->imm()->FlushRequested(); - SchedulePendingFlush(cfd); } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager); + MaybeScheduleFlushOrCompaction(); } - MaybeScheduleFlushOrCompaction(); return status; } @@ -888,35 +1127,59 @@ Status DBImpl::HandleWriteBufferFull(WriteContext* write_context) { ROCKS_LOG_INFO( immutable_db_options_.info_log, "Flushing column family with largest mem table size. Write buffer is " - "using %" PRIu64 " bytes out of a total of %" PRIu64 ".", + "using %" ROCKSDB_PRIszt " bytes out of a total of %" ROCKSDB_PRIszt ".", write_buffer_manager_->memory_usage(), write_buffer_manager_->buffer_size()); // no need to refcount because drop is happening in write thread, so can't // happen while we're in the write thread - ColumnFamilyData* cfd_picked = nullptr; - SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + } else { + ColumnFamilyData* cfd_picked = nullptr; + SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber; - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + if (!cfd->mem()->IsEmpty()) { + // We only consider active mem table, hoping immutable memtable is + // already in the process of flushing. + uint64_t seq = cfd->mem()->GetCreationSeq(); + if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { + cfd_picked = cfd; + seq_num_for_cf_picked = seq; + } + } + } + if (cfd_picked != nullptr) { + cfds.push_back(cfd_picked); + } + } + + for (const auto cfd : cfds) { + if (cfd->mem()->IsEmpty()) { continue; } - if (!cfd->mem()->IsEmpty()) { - // We only consider active mem table, hoping immutable memtable is - // already in the process of flushing. 
- uint64_t seq = cfd->mem()->GetCreationSeq(); - if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) { - cfd_picked = cfd; - seq_num_for_cf_picked = seq; - } + cfd->Ref(); + status = SwitchMemtable(cfd, write_context); + cfd->Unref(); + if (!status.ok()) { + break; } } - if (cfd_picked != nullptr) { - status = SwitchMemtable(cfd_picked, write_context); - if (status.ok()) { - cfd_picked->imm()->FlushRequested(); - SchedulePendingFlush(cfd_picked); - MaybeScheduleFlushOrCompaction(); + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); } + for (const auto cfd : cfds) { + cfd->imm()->FlushRequested(); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); } return status; } @@ -939,10 +1202,14 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, uint64_t delay = write_controller_.GetDelay(env_, num_bytes); if (delay > 0) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); + // Notify write_thread_ about the stall so it can set up a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); + TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone"); mutex_.Unlock(); // We will delay the write until we have slept for delay ms or // we don't need a delay anymore @@ -959,15 +1226,25 @@ env_->SleepForMicroseconds(kDelayInterval); } mutex_.Lock(); + write_thread_.EndWriteStall(); } - while (bg_error_.ok() && write_controller_.IsStopped()) { + // Don't wait if there's a background error, even if it's a soft error. We + // might wait here indefinitely as the background compaction may never + // finish successfully, resulting in the stall condition lasting + // indefinitely + while (error_handler_.GetBGError().ok() && write_controller_.IsStopped()) { if (write_options.no_slowdown) { - return Status::Incomplete(); + return Status::Incomplete("Write stall"); } delayed = true; + + // Notify write_thread_ about the stall so it can set up a barrier and + // fail any pending writers with no_slowdown + write_thread_.BeginWriteStall(); TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); bg_cv_.Wait(); + write_thread_.EndWriteStall(); } } assert(!delayed || !write_options.no_slowdown); @@ -977,7 +1254,19 @@ RecordTick(stats_, STALL_MICROS, time_delayed); } - return bg_error_; + // If DB is not in read-only mode and write_controller is not stopping + // writes, we can ignore any background errors and allow the write to + // proceed + Status s; + if (write_controller_.IsStopped()) { + // If writes are still stopped, it means we bailed due to a background + // error + s = Status::Incomplete(error_handler_.GetBGError().ToString()); + } + if (error_handler_.IsDBStopped()) { + s = error_handler_.GetBGError(); + } + return s; } Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, @@ -1001,6 +1290,7 @@ // is that in case the write is heavy, low pri writes may never have // a chance to run. Now we guarantee we are still slowly making // progress.
+ PERF_TIMER_GUARD(write_delay_time); write_controller_.low_pri_rate_limiter()->Request( my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */, RateLimiter::OpType::kWrite); @@ -1010,21 +1300,46 @@ } Status DBImpl::ScheduleFlushes(WriteContext* context) { - ColumnFamilyData* cfd; - while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { - auto status = SwitchMemtable(cfd, context); + autovector cfds; + if (immutable_db_options_.atomic_flush) { + SelectColumnFamiliesForAtomicFlush(&cfds); + for (auto cfd : cfds) { + cfd->Ref(); + } + flush_scheduler_.Clear(); + } else { + ColumnFamilyData* tmp_cfd; + while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) { + cfds.push_back(tmp_cfd); + } + } + Status status; + for (auto& cfd : cfds) { + if (!cfd->mem()->IsEmpty()) { + status = SwitchMemtable(cfd, context); + } if (cfd->Unref()) { delete cfd; + cfd = nullptr; } if (!status.ok()) { - return status; + break; } } - return Status::OK(); + if (status.ok()) { + if (immutable_db_options_.atomic_flush) { + AssignAtomicFlushSeq(cfds); + } + FlushRequest flush_req; + GenerateFlushRequest(cfds, &flush_req); + SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull); + MaybeScheduleFlushOrCompaction(); + } + return status; } #ifndef ROCKSDB_LITE -void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, +void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, const MemTableInfo& mem_table_info) { if (immutable_db_options_.listeners.size() == 0U) { return; @@ -1044,41 +1359,51 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* cfd, Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); WriteThread::Writer nonmem_w; - if (concurrent_prepare_) { + if (two_write_queues_) { // SwitchMemtable is a rare event. To simplify the reasoning, we make sure // that there is no concurrent thread writing to WAL. nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); } - unique_ptr lfile; + std::unique_ptr lfile; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; + // Recoverable state is persisted in WAL. After memtable switch, WAL might + // be deleted, so we write the state to memtable to be persisted as well. + Status s = WriteRecoverableState(); + if (!s.ok()) { + return s; + } + // In case pipelined write is enabled, wait for all pending memtable // writers. if (immutable_db_options_.enable_pipelined_write) { + // Memtable writers may call DB::Get in case max_successive_merges > 0, + // which may lock mutex. Unlocking mutex here to avoid deadlock. + mutex_.Unlock(); write_thread_.WaitForMemTableWriters(); + mutex_.Lock(); } // Attempt to switch to a new memtable and trigger flush of old. // Do this without holding the dbmutex lock. assert(versions_->prev_log_number() == 0); - if (concurrent_prepare_) { + if (two_write_queues_) { log_write_mutex_.Lock(); } bool creating_new_log = !log_empty_; - if (concurrent_prepare_) { + if (two_write_queues_) { log_write_mutex_.Unlock(); } uint64_t recycle_log_number = 0; if (creating_new_log && immutable_db_options_.recycle_log_file_num && - !log_recycle_files.empty()) { - recycle_log_number = log_recycle_files.front(); - log_recycle_files.pop_front(); + !log_recycle_files_.empty()) { + recycle_log_number = log_recycle_files_.front(); + log_recycle_files_.pop_front(); } uint64_t new_log_number = creating_new_log ? 
versions_->NewFileNumber() : logfile_number_; - SuperVersion* new_superversion = nullptr; const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); // Set memtable_info for memtable sealed callback @@ -1096,10 +1421,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { DBOptions db_options = BuildDBOptions(immutable_db_options_, mutable_db_options_); const auto preallocate_block_size = - GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size); + auto write_hint = CalculateWALWriteHint(); mutex_.Unlock(); - Status s; { + std::string log_fname = + LogFileName(immutable_db_options_.wal_dir, new_log_number); if (creating_new_log) { EnvOptions opt_env_opt = env_->OptimizeForLogWrite(env_options_, db_options); @@ -1107,14 +1434,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); - s = env_->ReuseWritableFile( - LogFileName(immutable_db_options_.wal_dir, new_log_number), - LogFileName(immutable_db_options_.wal_dir, recycle_log_number), - &lfile, opt_env_opt); + std::string old_log_fname = + LogFileName(immutable_db_options_.wal_dir, recycle_log_number); + s = env_->ReuseWritableFile(log_fname, old_log_fname, &lfile, + opt_env_opt); } else { - s = NewWritableFile( - env_, LogFileName(immutable_db_options_.wal_dir, new_log_number), - &lfile, opt_env_opt); + s = NewWritableFile(env_, log_fname, &lfile, opt_env_opt); } if (s.ok()) { // Our final size should be less than write_buffer_size @@ -1123,8 +1448,10 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // use preallocate_block_size instead // of calling GetWalPreallocateBlockSize() lfile->SetPreallocationBlockSize(preallocate_block_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(lfile), opt_env_opt)); + lfile->SetWriteLifeTimeHint(write_hint); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(lfile), log_fname, opt_env_opt, env_, nullptr /* stats */, + immutable_db_options_.listeners)); new_log = new log::Writer( std::move(file_writer), new_log_number, immutable_db_options_.recycle_log_file_num > 0, manual_wal_flush_); @@ -1134,46 +1461,65 @@ if (s.ok()) { SequenceNumber seq = versions_->LastSequence(); new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); - new_superversion = new SuperVersion(); + context->superversion_context.NewSuperVersion(); } - -#ifndef ROCKSDB_LITE - // PLEASE NOTE: We assume that there are no failable operations - // after lock is acquired below since we are already notifying - // client about mem table becoming immutable. - NotifyOnMemTableSealed(cfd, memtable_info); -#endif //ROCKSDB_LITE } ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] New memtable created with log file: #%" PRIu64 ". Immutable memtables: %d.\n", cfd->GetName().c_str(), new_log_number, num_imm_unflushed); mutex_.Lock(); - if (!s.ok()) { - // how do we fail if we're not creating new log? 
- assert(creating_new_log); - assert(!new_mem); - assert(!new_log); - if (concurrent_prepare_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); - } - return s; - } - if (creating_new_log) { + if (s.ok() && creating_new_log) { log_write_mutex_.Lock(); - logfile_number_ = new_log_number; assert(new_log != nullptr); - log_empty_ = true; - log_dir_synced_ = false; if (!logs_.empty()) { // Alway flush the buffer of the last log before switching to a new one log::Writer* cur_log_writer = logs_.back().writer; - cur_log_writer->WriteBuffer(); + s = cur_log_writer->WriteBuffer(); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64 + " WAL file\n", + cfd->GetName().c_str(), cur_log_writer->get_log_number(), + new_log_number); + } + } + if (s.ok()) { + logfile_number_ = new_log_number; + log_empty_ = true; + log_dir_synced_ = false; + logs_.emplace_back(logfile_number_, new_log); + alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); } - logs_.emplace_back(logfile_number_, new_log); - alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); log_write_mutex_.Unlock(); } + + if (!s.ok()) { + // how do we fail if we're not creating new log? + assert(creating_new_log); + if (new_mem) { + delete new_mem; + } + if (new_log) { + delete new_log; + } + SuperVersion* new_superversion = + context->superversion_context.new_superversion.release(); + if (new_superversion != nullptr) { + delete new_superversion; + } + // We may have lost data from the WritableFileBuffer in-memory buffer for + // the current log, so treat it as a fatal error and set bg_error + error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable); + // Read back bg_error in order to get the right severity + s = error_handler_.GetBGError(); + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + return s; + } + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { // all this is just optimization to delete logs that // are no longer needed -- if CF is empty, that means it @@ -1192,9 +1538,16 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_); new_mem->Ref(); cfd->SetMemtable(new_mem); - context->superversions_to_free_.push_back(InstallSuperVersionAndScheduleWork( - cfd, new_superversion, mutable_cf_options)); - if (concurrent_prepare_) { + InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, + mutable_cf_options); +#ifndef ROCKSDB_LITE + mutex_.Unlock(); + // Notify client that memtable is sealed, now that we have successfully + // installed a new memtable + NotifyOnMemTableSealed(cfd, memtable_info); + mutex_.Lock(); +#endif // ROCKSDB_LITE + if (two_write_queues_) { nonmem_write_thread_.ExitUnbatched(&nonmem_w); } return s; @@ -1202,11 +1555,13 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const { mutex_.AssertHeld(); - size_t bsize = write_buffer_size / 10 + write_buffer_size; + size_t bsize = + static_cast(write_buffer_size / 10 + write_buffer_size); // Some users might set very high write_buffer_size and rely on // max_total_wal_size or other parameters to control the WAL size. 
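Read as one unit across the hunks, the sizing rule amounts to the following standalone restatement (a sketch; the clamps mirror the member fields used by GetWalPreallocateBlockSize, whose body continues below):

#include <algorithm>
#include <cstddef>
#include <cstdint>

size_t WalPreallocateBlockSize(uint64_t write_buffer_size,
                               uint64_t max_total_wal_size,
                               size_t db_write_buffer_size) {
  // Start from 1.1x the write buffer size...
  size_t bsize =
      static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
  // ...then cap it by the WAL-wide and DB-wide limits when they are set.
  if (max_total_wal_size > 0) {
    bsize = std::min(bsize, static_cast<size_t>(max_total_wal_size));
  }
  if (db_write_buffer_size > 0) {
    bsize = std::min(bsize, db_write_buffer_size);
  }
  return bsize;
}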
if (mutable_db_options_.max_total_wal_size > 0) { - bsize = std::min(bsize, mutable_db_options_.max_total_wal_size); + bsize = std::min( + bsize, static_cast(mutable_db_options_.max_total_wal_size)); } if (immutable_db_options_.db_write_buffer_size > 0) { bsize = std::min(bsize, immutable_db_options_.db_write_buffer_size); @@ -1228,7 +1583,10 @@ Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family, // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, // and we allocate 11 extra bytes for key length, as well as value length. WriteBatch batch(key.size() + value.size() + 24); - batch.Put(column_family, key, value); + Status s = batch.Put(column_family, key, value); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } @@ -1257,7 +1615,10 @@ Status DB::DeleteRange(const WriteOptions& opt, Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { WriteBatch batch; - batch.Merge(column_family, key, value); + Status s = batch.Merge(column_family, key, value); + if (!s.ok()) { + return s; + } return Write(opt, &batch); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_info_dumper.cc b/thirdparty/rocksdb/db/db_info_dumper.cc index 1668a1638f..31050d20a2 100644 --- a/thirdparty/rocksdb/db/db_info_dumper.cc +++ b/thirdparty/rocksdb/db/db_info_dumper.cc @@ -42,7 +42,7 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, "Error when reading %s dir\n", dbname.c_str()); } std::sort(files.begin(), files.end()); - for (std::string file : files) { + for (const std::string& file : files) { if (!ParseFileName(file, &number, &type)) { continue; } @@ -85,7 +85,7 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, continue; } std::sort(files.begin(), files.end()); - for (std::string file : files) { + for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { if (type == kTableFile && ++file_num < 10) { file_info.append(file).append(" "); @@ -109,7 +109,7 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, return; } wal_info.clear(); - for (std::string file : files) { + for (const std::string& file : files) { if (ParseFileName(file, &number, &type)) { if (type == kLogFile) { env->GetFileSize(options.wal_dir + "/" + file, &file_size); diff --git a/thirdparty/rocksdb/db/db_io_failure_test.cc b/thirdparty/rocksdb/db/db_io_failure_test.cc index 9f4dcc5d05..ba8f197596 100644 --- a/thirdparty/rocksdb/db/db_io_failure_test.cc +++ b/thirdparty/rocksdb/db/db_io_failure_test.cc @@ -88,7 +88,6 @@ TEST_F(DBIOFailureTest, DropWritesFlush) { env_->drop_writes_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } -#endif // ROCKSDB_LITE // Check that CompactRange() returns failure if there is not enough space left // on device @@ -116,6 +115,7 @@ TEST_F(DBIOFailureTest, NoSpaceCompactRange) { env_->no_space_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } +#endif // ROCKSDB_LITE TEST_F(DBIOFailureTest, NonWritableFileSystem) { do { diff --git a/thirdparty/rocksdb/db/db_iter.cc b/thirdparty/rocksdb/db/db_iter.cc index e4a6c92a7d..541a5fbed9 100644 --- a/thirdparty/rocksdb/db/db_iter.cc +++ b/thirdparty/rocksdb/db/db_iter.cc @@ -9,6 +9,7 @@ #include "db/db_iter.h" #include +#include #include #include "db/dbformat.h" @@ -26,6 +27,8 @@ #include "util/logging.h" #include "util/mutexlock.h" #include "util/string_util.h" +#include "util/trace_replay.h" +#include "util/user_comparator_wrapper.h" namespace rocksdb { @@ 
-47,12 +50,17 @@ static void DumpInternalIter(Iterator* iter) { // combines multiple entries for the same userkey found in the DB // representation into a single entry while accounting for sequence // numbers, deletion markers, overwrites, etc. -class DBIter: public Iterator { +class DBIter final: public Iterator { public: // The following is grossly complicated. TODO: clean it up // Which direction is the iterator currently moving? - // (1) When moving forward, the internal iterator is positioned at - // the exact entry that yields this->key(), this->value() + // (1) When moving forward: + // (1a) if current_entry_is_merged_ = false, the internal iterator is + // positioned at the exact entry that yields this->key(), this->value() + // (1b) if current_entry_is_merged_ = true, the internal iterator is + // positioned immediately after the last entry that contributed to the + // current this->value(). That entry may or may not have key equal to + // this->key(). // (2) When moving backwards, the internal iterator is positioned // just before all entries whose user key == this->key(). enum Direction { @@ -75,6 +83,7 @@ class DBIter: public Iterator { prev_count_ = 0; prev_found_count_ = 0; bytes_read_ = 0; + skip_count_ = 0; } void BumpGlobalStatistics(Statistics* global_statistics) { @@ -83,6 +92,7 @@ class DBIter: public Iterator { RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_); PERF_COUNTER_ADD(iter_read_bytes, bytes_read_); ResetCounters(); } @@ -97,32 +107,43 @@ class DBIter: public Iterator { uint64_t prev_found_count_; // Map to Tickers::ITER_BYTES_READ uint64_t bytes_read_; + // Map to Tickers::NUMBER_ITER_SKIP + uint64_t skip_count_; }; DBIter(Env* _env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, const Comparator* cmp, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const Comparator* cmp, InternalIterator* iter, SequenceNumber s, bool arena_mode, - uint64_t max_sequential_skip_in_iterations, bool allow_blob) - : arena_mode_(arena_mode), - env_(_env), + uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob) + : env_(_env), logger_(cf_options.info_log), user_comparator_(cmp), merge_operator_(cf_options.merge_operator), iter_(iter), + read_callback_(read_callback), sequence_(s), + statistics_(cf_options.statistics), + num_internal_keys_skipped_(0), + iterate_lower_bound_(read_options.iterate_lower_bound), + iterate_upper_bound_(read_options.iterate_upper_bound), direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(cf_options.statistics), - iterate_upper_bound_(read_options.iterate_upper_bound), prefix_same_as_start_(read_options.prefix_same_as_start), pin_thru_lifetime_(read_options.pin_data), total_order_seek_(read_options.total_order_seek), - range_del_agg_(cf_options.internal_comparator, s, - true /* collapse_deletions */), - allow_blob_(allow_blob) { - RecordTick(statistics_, NO_ITERATORS); - prefix_extractor_ = cf_options.prefix_extractor; + allow_blob_(allow_blob), + is_blob_(false), + arena_mode_(arena_mode), + range_del_agg_(&cf_options.internal_comparator, s), + db_impl_(db_impl), + cfd_(cfd), + start_seqnum_(read_options.iter_start_seqnum) { + RecordTick(statistics_, NO_ITERATOR_CREATED); + 
prefix_extractor_ = mutable_cf_options.prefix_extractor.get(); max_skip_ = max_sequential_skip_in_iterations; max_skippable_internal_keys_ = read_options.max_skippable_internal_keys; if (pin_thru_lifetime_) { @@ -132,12 +153,13 @@ class DBIter: public Iterator { iter_->SetPinnedItersMgr(&pinned_iters_mgr_); } } - virtual ~DBIter() { + ~DBIter() override { // Release pinned data if any if (pinned_iters_mgr_.PinningEnabled()) { pinned_iters_mgr_.ReleasePinnedData(); } - RecordTick(statistics_, NO_ITERATORS, -1); + RecordTick(statistics_, NO_ITERATOR_DELETED); + ResetInternalKeysSkippedCounter(); local_stats_.BumpGlobalStatistics(statistics_); if (!arena_mode_) { delete iter_; @@ -150,16 +172,20 @@ class DBIter: public Iterator { iter_ = iter; iter_->SetPinnedItersMgr(&pinned_iters_mgr_); } - virtual RangeDelAggregator* GetRangeDelAggregator() { + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; } - virtual bool Valid() const override { return valid_; } - virtual Slice key() const override { + bool Valid() const override { return valid_; } + Slice key() const override { assert(valid_); - return saved_key_.GetUserKey(); + if(start_seqnum_ > 0) { + return saved_key_.GetInternalKey(); + } else { + return saved_key_.GetUserKey(); + } } - virtual Slice value() const override { + Slice value() const override { assert(valid_); if (current_entry_is_merged_) { // If pinned_value_ is set then the result of merge operator is one of @@ -171,10 +197,11 @@ class DBIter: public Iterator { return iter_->value(); } } - virtual Status status() const override { + Status status() const override { if (status_.ok()) { return iter_->status(); } else { + assert(!valid_); return status_; } } @@ -183,8 +210,7 @@ class DBIter: public Iterator { return is_blob_; } - virtual Status GetProperty(std::string prop_name, - std::string* prop) override { + Status GetProperty(std::string prop_name, std::string* prop) override { if (prop == nullptr) { return Status::InvalidArgument("prop is nullptr"); } @@ -198,34 +224,52 @@ class DBIter: public Iterator { *prop = "Iterator is not valid."; } return Status::OK(); + } else if (prop_name == "rocksdb.iterator.internal-key") { + *prop = saved_key_.GetUserKey().ToString(); + return Status::OK(); } - return Status::InvalidArgument("Undentified property."); + return Status::InvalidArgument("Unidentified property."); } - virtual void Next() override; - virtual void Prev() override; - virtual void Seek(const Slice& target) override; - virtual void SeekForPrev(const Slice& target) override; - virtual void SeekToFirst() override; - virtual void SeekToLast() override; + void Next() override; + void Prev() override; + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; Env* env() { return env_; } - void set_sequence(uint64_t s) { sequence_ = s; } + void set_sequence(uint64_t s) { + sequence_ = s; + if (read_callback_) { + read_callback_->Refresh(s); + } + } void set_valid(bool v) { valid_ = v; } private: - void ReverseToForward(); - void ReverseToBackward(); - void PrevInternal(); - void FindParseableKey(ParsedInternalKey* ikey, Direction direction); + // For all methods in this block: + // PRE: iter_->Valid() && status_.ok() + // Return false if there was an error, and status() is non-ok, valid_ = false; + // in this case callers would usually stop what they were doing and return. 
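In user code this contract surfaces as the usual check of Iterator::status() once Valid() goes false, e.g. (a sketch; the scan itself is illustrative), before the private helpers that implement it are declared:

#include <memory>
#include "rocksdb/db.h"

rocksdb::Status ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  // Valid() == false means either end-of-data or an error; the helpers
  // below set status_ and clear valid_, so status() must be checked.
  return it->status();
}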
+ bool ReverseToForward(); + bool ReverseToBackward(); bool FindValueForCurrentKey(); bool FindValueForCurrentKeyUsingSeek(); - void FindPrevUserKey(); - void FindNextUserKey(); - inline void FindNextUserEntry(bool skipping, bool prefix_check); - void FindNextUserEntryInternal(bool skipping, bool prefix_check); + bool FindUserKeyBeforeSavedKey(); + inline bool FindNextUserEntry(bool skipping, bool prefix_check); + bool FindNextUserEntryInternal(bool skipping, bool prefix_check); bool ParseKey(ParsedInternalKey* key); - void MergeValuesNewToOld(); + bool MergeValuesNewToOld(); + + void PrevInternal(); bool TooManyInternalKeysSkipped(bool increment = true); + inline bool IsVisible(SequenceNumber sequence); + + // CanReseekToSkip() returns whether the iterator can use the optimization + // where it reseeks by sequence number to get the next key when there are too + // many versions. This is disabled for write unprepared because seeking to + // sequence number does not guarantee that it is visible. + inline bool CanReseekToSkip(); // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData() // is called @@ -252,45 +296,64 @@ class DBIter: public Iterator { } inline void ResetInternalKeysSkippedCounter() { + local_stats_.skip_count_ += num_internal_keys_skipped_; + if (valid_) { + local_stats_.skip_count_--; + } num_internal_keys_skipped_ = 0; } const SliceTransform* prefix_extractor_; - bool arena_mode_; Env* const env_; Logger* logger_; - const Comparator* const user_comparator_; + UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; InternalIterator* iter_; + ReadCallback* read_callback_; + // Max visible sequence number. It is normally the snapshot seq unless we have + // uncommitted data in db as in WriteUnCommitted. SequenceNumber sequence_; - Status status_; IterKey saved_key_; + // Reusable internal key data structure. This is only used inside one function + // and should not be used across functions. Reusing this object avoids the + // overhead of constructing it each time the function is called. + ParsedInternalKey ikey_; std::string saved_value_; Slice pinned_value_; - Direction direction_; - bool valid_; - bool current_entry_is_merged_; // for prefix seek mode to support prev() Statistics* statistics_; uint64_t max_skip_; uint64_t max_skippable_internal_keys_; uint64_t num_internal_keys_skipped_; + const Slice* iterate_lower_bound_; const Slice* iterate_upper_bound_; + IterKey prefix_start_buf_; + + Status status_; Slice prefix_start_key_; + Direction direction_; + bool valid_; + bool current_entry_is_merged_; const bool prefix_same_as_start_; // Means that we will pin all data blocks we read as long the Iterator // is not deleted, will be true if ReadOptions::pin_data is true const bool pin_thru_lifetime_; const bool total_order_seek_; + bool allow_blob_; + bool is_blob_; + bool arena_mode_; // List of operands for merge operator.
MergeContext merge_context_; - RangeDelAggregator range_del_agg_; + ReadRangeDelAggregator range_del_agg_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; - bool allow_blob_; - bool is_blob_; + DBImpl* db_impl_; + ColumnFamilyData* cfd_; + // for diff snapshots we want the lower bound on the seqnum; + // if this value > 0 iterator will return internal keys + SequenceNumber start_seqnum_; // No copying allowed DBIter(const DBIter&); @@ -300,6 +363,7 @@ class DBIter: public Iterator { inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { if (!ParseInternalKey(iter_->key(), ikey)) { status_ = Status::Corruption("corrupted internal key in DBIter"); + valid_ = false; ROCKS_LOG_ERROR(logger_, "corrupted internal key in DBIter: %s", iter_->key().ToString(true).c_str()); return false; @@ -310,12 +374,17 @@ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { void DBIter::Next() { assert(valid_); + assert(status_.ok()); + PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, env_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + bool ok = true; if (direction_ == kReverse) { - ReverseToForward(); + if (!ReverseToForward()) { + ok = false; + } } else if (iter_->Valid() && !current_entry_is_merged_) { // If the current value is not a merge, the iter position is the // current key, which is already returned. We can safely issue a @@ -329,13 +398,12 @@ void DBIter::Next() { if (statistics_ != nullptr) { local_stats_.next_count_++; } - // Now we point to the next internal position, for both of merge and - // not merge cases. - if (!iter_->Valid()) { + if (ok && iter_->Valid()) { + FindNextUserEntry(true /* skipping the current user key */, + prefix_same_as_start_); + } else { valid_ = false; - return; } - FindNextUserEntry(true /* skipping the current user key */, prefix_same_as_start_); if (statistics_ != nullptr && valid_) { local_stats_.next_found_count_++; local_stats_.bytes_read_ += (key().size() + value().size()); @@ -356,15 +424,16 @@ void DBIter::Next() { // keys against the prefix of the seeked key. Set to false when // performing a seek without a key (e.g. SeekToFirst). Set to // prefix_same_as_start_ for other iterations. -inline void DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { +inline bool DBIter::FindNextUserEntry(bool skipping, bool prefix_check) { PERF_TIMER_GUARD(find_next_user_entry_time); - FindNextUserEntryInternal(skipping, prefix_check); + return FindNextUserEntryInternal(skipping, prefix_check); } // Actual implementation of DBIter::FindNextUserEntry() -void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { +bool DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // Loop until we hit an acceptable entry to yield assert(iter_->Valid()); + assert(status_.ok()); assert(direction_ == kForward); current_entry_is_merged_ = false; @@ -383,85 +452,109 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { is_blob_ = false; do { - ParsedInternalKey ikey; - - if (!ParseKey(&ikey)) { - // Skip corrupted keys. 
- iter_->Next(); - continue; + if (!ParseKey(&ikey_)) { + return false; } if (iterate_upper_bound_ != nullptr && - user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) { + user_comparator_.Compare(ikey_.user_key, *iterate_upper_bound_) >= 0) { break; } if (prefix_extractor_ && prefix_check && - prefix_extractor_->Transform(ikey.user_key) - .compare(prefix_start_key_) != 0) { + prefix_extractor_->Transform(ikey_.user_key) + .compare(prefix_start_key_) != 0) { break; } if (TooManyInternalKeysSkipped()) { - return; + return false; } - if (ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <= - 0) { + if (IsVisible(ikey_.sequence)) { + if (skipping && user_comparator_.Compare(ikey_.user_key, + saved_key_.GetUserKey()) <= 0) { num_skipped++; // skip this entry PERF_COUNTER_ADD(internal_key_skipped_count, 1); } else { num_skipped = 0; - switch (ikey.type) { + switch (ikey_.type) { case kTypeDeletion: case kTypeSingleDeletion: // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. - saved_key_.SetUserKey( - ikey.user_key, - !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - skipping = true; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + // if the iterator specified start_seqnum we + // 1) return internal key, including the type + // 2) return ikey only if ikey.seqnum >= start_seqnum_ + // note that if deletion seqnum is < start_seqnum_ we + // just skip it like in a normal iterator. + if (start_seqnum_ > 0 && ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || !iter_->IsKeyPinned() /* copy */); + skipping = true; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } break; case kTypeValue: case kTypeBlobIndex: - saved_key_.SetUserKey( - ikey.user_key, - !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - if (range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode:: - kForwardTraversal)) { - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - } else if (ikey.type == kTypeBlobIndex) { - if (!allow_blob_) { - ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); - status_ = Status::NotSupported( - "Encounter unexpected blob index. Please open DB with " - "rocksdb::blob_db::BlobDB instead."); - valid_ = false; + if (start_seqnum_ > 0) { + // we are taking an incremental snapshot here + // incremental snapshots aren't supported on DB with range deletes + assert(!( + (ikey_.type == kTypeBlobIndex) && (start_seqnum_ > 0) + )); + if (ikey_.sequence >= start_seqnum_) { + saved_key_.SetInternalKey(ikey_); + valid_ = true; + return true; + } else { + // this key and all previous versions shouldn't be included, + // skipping + saved_key_.SetUserKey(ikey_.user_key, + !pin_thru_lifetime_ || !iter_->IsKeyPinned() /* copy */); + skipping = true; + } + } else { + saved_key_.SetUserKey( + ikey_.user_key, + !pin_thru_lifetime_ || !iter_->IsKeyPinned() /* copy */); + if (range_del_agg_.ShouldDelete( + ikey_, RangeDelPositioningMode::kForwardTraversal)) { + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion.
+ skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + } else if (ikey_.type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + valid_ = false; + return false; + } + is_blob_ = true; valid_ = true; + return true; + } else { + valid_ = true; + return true; } - return; - } else { - valid_ = true; - return; } break; case kTypeMerge: saved_key_.SetUserKey( - ikey.user_key, - !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); + ikey_.user_key, + !pin_thru_lifetime_ || !iter_->IsKeyPinned() /* copy */); if (range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode:: - kForwardTraversal)) { + ikey_, RangeDelPositioningMode::kForwardTraversal)) { // Arrange to skip all upcoming entries for this key since // they are hidden by this deletion. skipping = true; @@ -472,8 +565,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // value current_entry_is_merged_ = true; valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; + return MergeValuesNewToOld(); // Go to a different state machine } break; default: @@ -482,17 +574,18 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { } } } else { - // This key was inserted after our snapshot was taken. PERF_COUNTER_ADD(internal_recent_skipped_count, 1); - // Here saved_key_ may contain some old key, or the default empty key, or - // key assigned by some random other method. We don't care. - if (user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <= - 0) { + // This key was inserted after our snapshot was taken. + // If this happens too many times in a row for the same user key, we want + // to seek to the target sequence number. + int cmp = + user_comparator_.Compare(ikey_.user_key, saved_key_.GetUserKey()); + if (cmp == 0 || (skipping && cmp <= 0)) { num_skipped++; } else { saved_key_.SetUserKey( - ikey.user_key, + ikey_.user_key, !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); skipping = false; num_skipped = 0; @@ -501,7 +594,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // If we have sequentially iterated via numerous equal keys, then it's // better to seek so that we can avoid too many key comparisons. 
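+      // Sketch of the idea behind the reseek below: rather than stepping over
+      // each hidden version with iter_->Next(), build an internal-key target
+      // that sorts past all of them and jump there with a single iter_->Seek();
+      // one seek replaces up to max_skip_ sequential steps.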
- if (num_skipped > max_skip_) { + if (num_skipped > max_skip_ && CanReseekToSkip()) { num_skipped = 0; std::string last_key; if (skipping) { @@ -528,7 +621,9 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { iter_->Next(); } } while (iter_->Valid()); + valid_ = false; + return iter_->status().ok(); } // Merge values of the same user key starting from the current iter_ position @@ -537,12 +632,12 @@ void DBIter::FindNextUserEntryInternal(bool skipping, bool prefix_check) { // saved_key_ stores the user key // POST: saved_value_ has the merged value for the user key // iter_ points to the next entry (or invalid) -void DBIter::MergeValuesNewToOld() { +bool DBIter::MergeValuesNewToOld() { if (!merge_operator_) { ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null."); status_ = Status::InvalidArgument("merge_operator_ must be set."); valid_ = false; - return; + return false; } // Temporarily pin the blocks that hold merge operands @@ -551,22 +646,22 @@ void DBIter::MergeValuesNewToOld() { // Start the merge process by pushing the first operand merge_context_.PushOperand(iter_->value(), iter_->IsValuePinned() /* operand_pinned */); + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; Status s; for (iter_->Next(); iter_->Valid(); iter_->Next()) { + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand"); if (!ParseKey(&ikey)) { - // skip corrupted key - continue; + return false; } - if (!user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { // hit the next user key, stop right here break; } else if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type || range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode:: - kForwardTraversal)) { + ikey, RangeDelPositioningMode::kForwardTraversal)) { // hit a delete with the same user key, stop right here // iter_ is positioned after delete iter_->Next(); @@ -574,17 +669,22 @@ void DBIter::MergeValuesNewToOld() { } else if (kTypeValue == ikey.type) { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! - // ignore corruption if there is any. const Slice val = iter_->value(); s = MergeHelper::TimedFullMerge( merge_operator_, ikey.user_key, &val, merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, &pinned_value_, true); if (!s.ok()) { + valid_ = false; status_ = s; + return false; } // iter_ is positioned after put iter_->Next(); - return; + if (!iter_->status().ok()) { + valid_ = false; + return false; + } + return true; } else if (kTypeMerge == ikey.type) { // hit a merge, add the value as an operand and run associative merge. // when complete, add result to operands and continue. @@ -602,12 +702,17 @@ void DBIter::MergeValuesNewToOld() { Status::NotSupported("Blob DB does not support merge operator."); } valid_ = false; - return; + return false; } else { assert(false); } } + if (!iter_->status().ok()) { + valid_ = false; + return false; + } + // we either exhausted all internal keys under this user key, or hit // a deletion marker. 
// feed null as the existing value to the merge operator, such that @@ -617,18 +722,31 @@ void DBIter::MergeValuesNewToOld() { &saved_value_, logger_, statistics_, env_, &pinned_value_, true); if (!s.ok()) { + valid_ = false; status_ = s; + return false; } + + assert(status_.ok()); + return true; } void DBIter::Prev() { assert(valid_); + assert(status_.ok()); + + PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, env_); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + bool ok = true; if (direction_ == kForward) { - ReverseToBackward(); + if (!ReverseToBackward()) { + ok = false; + } + } + if (ok) { + PrevInternal(); } - PrevInternal(); if (statistics_ != nullptr) { local_stats_.prev_count_++; if (valid_) { @@ -638,113 +756,131 @@ void DBIter::Prev() { } } -void DBIter::ReverseToForward() { - if (prefix_extractor_ != nullptr && !total_order_seek_) { +bool DBIter::ReverseToForward() { + assert(iter_->status().ok()); + + // When moving backwards, iter_ is positioned on _previous_ key, which may + // not exist or may have different prefix than the current key(). + // If that's the case, seek iter_ to current key. + if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_->Valid()) { IterKey last_key; last_key.SetInternalKey(ParsedInternalKey( saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); iter_->Seek(last_key.GetInternalKey()); } - FindNextUserKey(); + direction_ = kForward; - if (!iter_->Valid()) { - iter_->SeekToFirst(); - range_del_agg_.InvalidateTombstoneMapPositions(); + // Skip keys less than the current key() (a.k.a. saved_key_). + while (iter_->Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) { + return true; + } + iter_->Next(); + } + + if (!iter_->status().ok()) { + valid_ = false; + return false; } + + return true; } -void DBIter::ReverseToBackward() { - if (prefix_extractor_ != nullptr && !total_order_seek_) { +// Move iter_ to the key before saved_key_. +bool DBIter::ReverseToBackward() { + assert(iter_->status().ok()); + + // When current_entry_is_merged_ is true, iter_ may be positioned on the next + // key, which may not exist or may have prefix different from current. + // If that's the case, seek to saved_key_. + if (current_entry_is_merged_ && + ((prefix_extractor_ != nullptr && !total_order_seek_) || + !iter_->Valid())) { IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey(saved_key_.GetUserKey(), 0, - kValueTypeForSeekForPrev)); - iter_->SeekForPrev(last_key.GetInternalKey()); - } - if (current_entry_is_merged_) { - // Not placed in the same key. Need to call Prev() until finding the - // previous key. - if (!iter_->Valid()) { - iter_->SeekToLast(); - range_del_agg_.InvalidateTombstoneMapPositions(); - } - ParsedInternalKey ikey; - FindParseableKey(&ikey, kReverse); - while (iter_->Valid() && - user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) > - 0) { - assert(ikey.sequence != kMaxSequenceNumber); - if (ikey.sequence > sequence_) { - PERF_COUNTER_ADD(internal_recent_skipped_count, 1); - } else { - PERF_COUNTER_ADD(internal_key_skipped_count, 1); + // Using kMaxSequenceNumber and kValueTypeForSeek + // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller + // than saved_key_. 
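+    // Internal keys order by (user key ascending, sequence descending, type
+    // descending), so for example:
+    //
+    //   ("a", 7, kTypeValue) < ("b", kMaxSequenceNumber, kValueTypeForSeek)
+    //                        < ("b", 5, kTypeValue)
+    //
+    // i.e. the constructed last_key sorts before every real entry of
+    // saved_key_, which is what makes a strictly-smaller landing position
+    // possible.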
+ last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + if (prefix_extractor_ != nullptr && !total_order_seek_) { + iter_->SeekForPrev(last_key.GetInternalKey()); + } else { + // Some iterators may not support SeekForPrev(), so we avoid using it + // when prefix seek mode is disabled. This is somewhat expensive + // (an extra Prev(), as well as an extra change of direction of iter_), + // so we may need to reconsider it later. + iter_->Seek(last_key.GetInternalKey()); + if (!iter_->Valid() && iter_->status().ok()) { + iter_->SeekToLast(); } - iter_->Prev(); - FindParseableKey(&ikey, kReverse); } } -#ifndef NDEBUG - if (iter_->Valid()) { - ParsedInternalKey ikey; - assert(ParseKey(&ikey)); - assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetUserKey()) <= - 0); - } -#endif - FindPrevUserKey(); direction_ = kReverse; + return FindUserKeyBeforeSavedKey(); } void DBIter::PrevInternal() { - if (!iter_->Valid()) { - valid_ = false; - return; - } - - ParsedInternalKey ikey; - while (iter_->Valid()) { saved_key_.SetUserKey( ExtractUserKey(iter_->key()), !iter_->IsKeyPinned() || !pin_thru_lifetime_ /* copy */); - if (FindValueForCurrentKey()) { - if (!iter_->Valid()) { - return; - } - FindParseableKey(&ikey, kReverse); - if (user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { - FindPrevUserKey(); - } - if (valid_ && prefix_extractor_ && prefix_same_as_start_ && - prefix_extractor_->Transform(saved_key_.GetUserKey()) - .compare(prefix_start_key_) != 0) { - valid_ = false; - } + if (prefix_extractor_ && prefix_same_as_start_ && + prefix_extractor_->Transform(saved_key_.GetUserKey()) + .compare(prefix_start_key_) != 0) { + // Current key does not have the same prefix as start + valid_ = false; return; } - if (TooManyInternalKeysSkipped(false)) { + if (iterate_lower_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_lower_bound_) < 0) { + // We've iterated earlier than the user-specified lower bound. + valid_ = false; return; } - if (!iter_->Valid()) { - break; + if (!FindValueForCurrentKey()) { // assigns valid_ + return; } - FindParseableKey(&ikey, kReverse); - if (user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { - FindPrevUserKey(); + + // Whether or not we found a value for current key, we need iter_ to end up + // on a smaller key. + if (!FindUserKeyBeforeSavedKey()) { + return; + } + + if (valid_) { + // Found the value. + return; + } + + if (TooManyInternalKeysSkipped(false)) { + return; } } + // We haven't found any key - iterator is not valid - // Or the prefix is different than start prefix - assert(!iter_->Valid()); valid_ = false; } -// This function checks, if the entry with biggest sequence_number <= sequence_ -// is non kTypeDeletion or kTypeSingleDeletion. If it's not, we save value in -// saved_value_ +// Used for backwards iteration. +// Looks at the entries with user key saved_key_ and finds the most up-to-date +// value for it, or executes a merge, or determines that the value was deleted. +// Sets valid_ to true if the value is found and is ready to be presented to +// the user through value(). +// Sets valid_ to false if the value was deleted, and we should try another key. +// Returns false if an error occurred, and !status().ok() and !valid_. +// +// PRE: iter_ is positioned on the last entry with user key equal to saved_key_. 
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on +// the entry just before them, or on the entry just after them. bool DBIter::FindValueForCurrentKey() { assert(iter_->Valid()); merge_context_.Clear(); @@ -754,21 +890,28 @@ bool DBIter::FindValueForCurrentKey() { ValueType last_not_merge_type = kTypeDeletion; ValueType last_key_entry_type = kTypeDeletion; - ParsedInternalKey ikey; - FindParseableKey(&ikey, kReverse); - // Temporarily pin blocks that hold (merge operands / the value) ReleaseTempPinnedData(); TempPinData(); size_t num_skipped = 0; - while (iter_->Valid() && ikey.sequence <= sequence_ && - user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { + while (iter_->Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; + } + + if (!IsVisible(ikey.sequence) || + !user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + break; + } if (TooManyInternalKeysSkipped()) { return false; } - // We iterate too much: let's use Seek() to avoid too much key comparisons - if (num_skipped >= max_skip_) { + // This user key has lots of entries. + // We're going from old to new, and it's taking too long. Let's do a Seek() + // and go from new to old. This helps when a key was overwritten many times. + if (num_skipped >= max_skip_ && CanReseekToSkip()) { return FindValueForCurrentKeyUsingSeek(); } @@ -777,8 +920,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeValue: case kTypeBlobIndex: if (range_del_agg_.ShouldDelete( - ikey, - RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal)) { last_key_entry_type = kTypeRangeDeletion; PERF_COUNTER_ADD(internal_delete_skipped_count, 1); } else { @@ -796,8 +938,7 @@ bool DBIter::FindValueForCurrentKey() { break; case kTypeMerge: if (range_del_agg_.ShouldDelete( - ikey, - RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal)) { merge_context_.Clear(); last_key_entry_type = kTypeRangeDeletion; last_not_merge_type = last_key_entry_type; @@ -814,10 +955,13 @@ bool DBIter::FindValueForCurrentKey() { } PERF_COUNTER_ADD(internal_key_skipped_count, 1); - assert(user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())); iter_->Prev(); ++num_skipped; - FindParseableKey(&ikey, kReverse); + } + + if (!iter_->status().ok()) { + valid_ = false; + return false; } Status s; @@ -827,7 +971,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeSingleDeletion: case kTypeRangeDeletion: valid_ = false; - return false; + return true; case kTypeMerge: current_entry_is_merged_ = true; if (last_not_merge_type == kTypeDeletion || @@ -848,7 +992,7 @@ bool DBIter::FindValueForCurrentKey() { Status::NotSupported("Blob DB does not support merge operator."); } valid_ = false; - return true; + return false; } else { assert(last_not_merge_type == kTypeValue); s = MergeHelper::TimedFullMerge( @@ -858,7 +1002,7 @@ bool DBIter::FindValueForCurrentKey() { } break; case kTypeValue: - // do nothing - we've already has value in saved_value_ + // do nothing - we already have the value in pinned_value_ break; case kTypeBlobIndex: if (!allow_blob_) { @@ -867,7 +1011,7 @@ bool DBIter::FindValueForCurrentKey() { "Encounter unexpected blob index.
Please open DB with " "rocksdb::blob_db::BlobDB instead."); valid_ = false; - return true; + return false; } is_blob_ = true; break; @@ -875,15 +1019,19 @@ assert(false); break; } - valid_ = true; if (!s.ok()) { + valid_ = false; status_ = s; + return false; } + valid_ = true; return true; } // This function is used in FindValueForCurrentKey. // We use Seek() function instead of Prev() to find necessary value +// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld(). +// Would be nice to reuse some code. bool DBIter::FindValueForCurrentKeyUsingSeek() { // FindValueForCurrentKey will enable pinning before calling // FindValueForCurrentKeyUsingSeek() @@ -894,15 +1042,38 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { iter_->Seek(last_key); RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); - // assume there is at least one parseable key for this user key + // In case a read_callback is present, the value we seek to may not be visible. + // Find the next value that's visible. ParsedInternalKey ikey; - FindParseableKey(&ikey, kForward); + while (true) { + if (!iter_->Valid()) { + valid_ = false; + return iter_->status().ok(); + } + + if (!ParseKey(&ikey)) { + return false; + } + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + // No visible values for this key, even though FindValueForCurrentKey() + // has seen some. This is possible if we're using a tailing iterator, and + // the entries were discarded in a compaction. + valid_ = false; + return true; + } + + if (IsVisible(ikey.sequence)) { + break; + } + + iter_->Next(); + } if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { + ikey, RangeDelPositioningMode::kBackwardTraversal)) { valid_ = false; - return false; + return true; } if (ikey.type == kTypeBlobIndex && !allow_blob_) { ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); @@ -910,7 +1081,7 @@ "Encounter unexpected blob index. Please open DB with " "rocksdb::blob_db::BlobDB instead."); valid_ = false; - return true; + return false; } if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex) { assert(iter_->IsValuePinned()); @@ -921,111 +1092,146 @@ // kTypeMerge.
We need to collect all kTypeMerge values and save them // in operands + assert(ikey.type == kTypeMerge); current_entry_is_merged_ = true; merge_context_.Clear(); - while ( - iter_->Valid() && - user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey()) && - ikey.type == kTypeMerge && - !range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { - merge_context_.PushOperand(iter_->value(), - iter_->IsValuePinned() /* operand_pinned */); - PERF_COUNTER_ADD(internal_merge_count, 1); + merge_context_.PushOperand(iter_->value(), + iter_->IsValuePinned() /* operand_pinned */); + while (true) { iter_->Next(); - FindParseableKey(&ikey, kForward); - } - Status s; - if (!iter_->Valid() || - !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey()) || - ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || - range_del_agg_.ShouldDelete( - ikey, RangeDelAggregator::RangePositioningMode::kBackwardTraversal)) { - s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - nullptr, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); - // Make iter_ valid and point to saved_key_ - if (!iter_->Valid() || - !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { - iter_->Seek(last_key); - RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + if (!iter_->Valid()) { + if (!iter_->status().ok()) { + valid_ = false; + return false; + } + break; } - valid_ = true; - if (!s.ok()) { - status_ = s; + if (!ParseKey(&ikey)) { + return false; + } + if (!user_comparator_.Equal(ikey.user_key, saved_key_.GetUserKey())) { + break; + } + + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion || + range_del_agg_.ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal)) { + break; + } else if (ikey.type == kTypeValue) { + const Slice val = iter_->value(); + Status s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), &val, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, + env_, &pinned_value_, true); + if (!s.ok()) { + valid_ = false; + status_ = s; + return false; + } + valid_ = true; + return true; + } else if (ikey.type == kTypeMerge) { + merge_context_.PushOperand(iter_->value(), + iter_->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + } else if (ikey.type == kTypeBlobIndex) { + if (!allow_blob_) { + ROCKS_LOG_ERROR(logger_, "Encounter unexpected blob index."); + status_ = Status::NotSupported( + "Encounter unexpected blob index. 
Please open DB with " + "rocksdb::blob_db::BlobDB instead."); + } else { + status_ = + Status::NotSupported("Blob DB does not support merge operator."); + } + valid_ = false; + return false; + } else { + assert(false); } - return true; } - const Slice& val = iter_->value(); - s = MergeHelper::TimedFullMerge(merge_operator_, saved_key_.GetUserKey(), - &val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, env_, - &pinned_value_, true); - valid_ = true; + Status s = MergeHelper::TimedFullMerge( + merge_operator_, saved_key_.GetUserKey(), nullptr, + merge_context_.GetOperands(), &saved_value_, logger_, statistics_, env_, + &pinned_value_, true); if (!s.ok()) { + valid_ = false; status_ = s; + return false; } - return true; -} -// Used in Next to change directions -// Go to next user key -// Don't use Seek(), -// because next user key will be very close -void DBIter::FindNextUserKey() { - if (!iter_->Valid()) { - return; - } - ParsedInternalKey ikey; - FindParseableKey(&ikey, kForward); - while (iter_->Valid() && - !user_comparator_->Equal(ikey.user_key, saved_key_.GetUserKey())) { - iter_->Next(); - FindParseableKey(&ikey, kForward); + // Make sure we leave iter_ in a good state. If it's valid and we don't care + // about prefixes, that's already good enough. Otherwise it needs to be + // seeked to the current key. + if ((prefix_extractor_ != nullptr && !total_order_seek_) || !iter_->Valid()) { + if (prefix_extractor_ != nullptr && !total_order_seek_) { + iter_->SeekForPrev(last_key); + } else { + iter_->Seek(last_key); + if (!iter_->Valid() && iter_->status().ok()) { + iter_->SeekToLast(); + } + } + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); } + + valid_ = true; + return true; } -// Go to previous user_key -void DBIter::FindPrevUserKey() { - if (!iter_->Valid()) { - return; - } +// Move backwards until a key smaller than saved_key_ is found. +// Changes valid_ only if return value is false. bool DBIter::FindUserKeyBeforeSavedKey() { assert(status_.ok()); size_t num_skipped = 0; - ParsedInternalKey ikey; - FindParseableKey(&ikey, kReverse); - int cmp; - while (iter_->Valid() && - ((cmp = user_comparator_->Compare(ikey.user_key, - saved_key_.GetUserKey())) == 0 || - (cmp > 0 && ikey.sequence > sequence_))) { - if (TooManyInternalKeysSkipped()) { - return; + while (iter_->Valid()) { + ParsedInternalKey ikey; + if (!ParseKey(&ikey)) { + return false; } - if (cmp == 0) { - if (num_skipped >= max_skip_) { - num_skipped = 0; - IterKey last_key; - last_key.SetInternalKey(ParsedInternalKey( - saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); - iter_->Seek(last_key.GetInternalKey()); - RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); - } else { - ++num_skipped; - } + if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) < 0) { + return true; } + + if (TooManyInternalKeysSkipped()) { + return false; + } + + assert(ikey.sequence != kMaxSequenceNumber); - if (ikey.sequence > sequence_) { + if (!IsVisible(ikey.sequence)) { PERF_COUNTER_ADD(internal_recent_skipped_count, 1); } else { PERF_COUNTER_ADD(internal_key_skipped_count, 1); } + + if (num_skipped >= max_skip_ && CanReseekToSkip()) { + num_skipped = 0; + IterKey last_key; + last_key.SetInternalKey(ParsedInternalKey( + saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); + // It would be more efficient to use SeekForPrev() here, but some + // iterators may not support it.
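+      // (With Seek(), iter_ lands on the first real entry of saved_key_'s user
+      // key, i.e. the smallest internal key at or after the target; the loop's
+      // subsequent Prev() then crosses to the previous user key, which is what
+      // a single SeekForPrev() would have achieved directly.)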
+ iter_->Seek(last_key.GetInternalKey()); + RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION); + if (!iter_->Valid()) { + break; + } + } else { + ++num_skipped; + } + iter_->Prev(); - FindParseableKey(&ikey, kReverse); } + + if (!iter_->status().ok()) { + valid_ = false; + return false; + } + + return true; } bool DBIter::TooManyInternalKeysSkipped(bool increment) { @@ -1040,28 +1246,46 @@ bool DBIter::TooManyInternalKeysSkipped(bool increment) { return false; } -// Skip all unparseable keys -void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { - while (iter_->Valid() && !ParseKey(ikey)) { - if (direction == kReverse) { - iter_->Prev(); - } else { - iter_->Next(); - } +bool DBIter::IsVisible(SequenceNumber sequence) { + if (read_callback_ == nullptr) { + return sequence <= sequence_; + } else { + return read_callback_->IsVisible(sequence); } } +bool DBIter::CanReseekToSkip() { + return read_callback_ == nullptr || read_callback_->CanReseekToSkip(); +} + void DBIter::Seek(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); + status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); + + SequenceNumber seq = sequence_; saved_key_.Clear(); - saved_key_.SetInternalKey(target, sequence_); + saved_key_.SetInternalKey(target, seq); + +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeek(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + + if (iterate_lower_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), *iterate_lower_bound_) < + 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_lower_bound_, seq); + } { PERF_TIMER_GUARD(seek_internal_seek_time); iter_->Seek(saved_key_.GetInternalKey()); - range_del_agg_.InvalidateTombstoneMapPositions(); + range_del_agg_.InvalidateRangeDelMapPositions(); } RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { @@ -1076,6 +1300,7 @@ void DBIter::Seek(const Slice& target) { } if (statistics_ != nullptr) { if (valid_) { + // Decrement since we don't want to count this key as skipped RecordTick(statistics_, NUMBER_DB_SEEK_FOUND); RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size()); @@ -1092,7 +1317,9 @@ void DBIter::Seek(const Slice& target) { } void DBIter::SeekForPrev(const Slice& target) { + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); StopWatch sw(env_, statistics_, DB_SEEK); + status_ = Status::OK(); ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); saved_key_.Clear(); @@ -1100,12 +1327,25 @@ void DBIter::SeekForPrev(const Slice& target) { saved_key_.SetInternalKey(target, 0 /* sequence_number */, kValueTypeForSeekForPrev); + if (iterate_upper_bound_ != nullptr && + user_comparator_.Compare(saved_key_.GetUserKey(), + *iterate_upper_bound_) >= 0) { + saved_key_.Clear(); + saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber); + } + { PERF_TIMER_GUARD(seek_internal_seek_time); iter_->SeekForPrev(saved_key_.GetInternalKey()); - range_del_agg_.InvalidateTombstoneMapPositions(); + range_del_agg_.InvalidateRangeDelMapPositions(); } +#ifndef ROCKSDB_LITE + if (db_impl_ != nullptr && cfd_ != nullptr) { + db_impl_->TraceIteratorSeekForPrev(cfd_->GetID(), target); + } +#endif // ROCKSDB_LITE + RecordTick(statistics_, NUMBER_DB_SEEK); if (iter_->Valid()) { if (prefix_extractor_ && prefix_same_as_start_) { @@ -1134,11 +1374,17 @@ void 
DBIter::SeekForPrev(const Slice& target) { } void DBIter::SeekToFirst() { + if (iterate_lower_bound_ != nullptr) { + Seek(*iterate_lower_bound_); + return; + } + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. - if (prefix_extractor_ != nullptr) { + if (prefix_extractor_ != nullptr && !total_order_seek_) { max_skip_ = std::numeric_limits<uint64_t>::max(); } + status_ = Status::OK(); direction_ = kForward; ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); @@ -1147,7 +1393,7 @@ void DBIter::SeekToFirst() { { PERF_TIMER_GUARD(seek_internal_seek_time); iter_->SeekToFirst(); - range_del_agg_.InvalidateTombstoneMapPositions(); + range_del_agg_.InvalidateRangeDelMapPositions(); } RecordTick(statistics_, NUMBER_DB_SEEK); @@ -1174,11 +1420,23 @@ } void DBIter::SeekToLast() { + if (iterate_upper_bound_ != nullptr) { + // Seek to last key strictly less than ReadOptions.iterate_upper_bound. + SeekForPrev(*iterate_upper_bound_); + if (Valid() && user_comparator_.Equal(*iterate_upper_bound_, key())) { + ReleaseTempPinnedData(); + PrevInternal(); + } + return; + } + + PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, env_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. - if (prefix_extractor_ != nullptr) { + if (prefix_extractor_ != nullptr && !total_order_seek_) { max_skip_ = std::numeric_limits<uint64_t>::max(); } + status_ = Status::OK(); direction_ = kReverse; ReleaseTempPinnedData(); ResetInternalKeysSkippedCounter(); @@ -1187,22 +1445,9 @@ { PERF_TIMER_GUARD(seek_internal_seek_time); iter_->SeekToLast(); - range_del_agg_.InvalidateTombstoneMapPositions(); - } - // When the iterate_upper_bound is set to a value, - // it will seek to the last key before the - // ReadOptions.iterate_upper_bound - if (iter_->Valid() && iterate_upper_bound_ != nullptr) { - SeekForPrev(*iterate_upper_bound_); - range_del_agg_.InvalidateTombstoneMapPositions(); - if (!Valid()) { - return; - } else if (user_comparator_->Equal(*iterate_upper_bound_, key())) { - Prev(); - } - } else { - PrevInternal(); + range_del_agg_.InvalidateRangeDelMapPositions(); } + PrevInternal(); if (statistics_ != nullptr) { RecordTick(statistics_, NUMBER_DB_SEEK); if (valid_) { @@ -1220,20 +1465,23 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const Comparator* user_key_comparator, InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, - bool allow_blob) { + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob) { DBIter* db_iter = new DBIter( - env, read_options, cf_options, user_key_comparator, internal_iter, - sequence, false, max_sequential_skip_in_iterations, allow_blob); + env, read_options, cf_options, mutable_cf_options, user_key_comparator, + internal_iter, sequence, false, max_sequential_skip_in_iterations, + read_callback, db_impl, cfd, allow_blob); return db_iter; } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } -RangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { +ReadRangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } @@ -1270,21 +1518,30 @@ inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, void ArenaWrappedDBIter::Init(Env* env,
const ReadOptions& read_options, const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration, - uint64_t version_number, bool allow_blob) { + uint64_t version_number, + ReadCallback* read_callback, DBImpl* db_impl, + ColumnFamilyData* cfd, bool allow_blob, + bool allow_refresh) { auto mem = arena_.AllocateAligned(sizeof(DBIter)); - db_iter_ = new (mem) - DBIter(env, read_options, cf_options, cf_options.user_comparator, nullptr, - sequence, true, max_sequential_skip_in_iteration, allow_blob); + db_iter_ = new (mem) DBIter(env, read_options, cf_options, mutable_cf_options, + cf_options.user_comparator, nullptr, sequence, + true, max_sequential_skip_in_iteration, + read_callback, db_impl, cfd, allow_blob); sv_number_ = version_number; + allow_refresh_ = allow_refresh; } Status ArenaWrappedDBIter::Refresh() { - if (cfd_ == nullptr || db_impl_ == nullptr) { + if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { return Status::NotSupported("Creating renew iterator is not allowed."); } assert(db_iter_ != nullptr); + // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the + // correct behavior. Will be corrected automatically when we take a snapshot + // here for the case of WritePreparedTxnDB. SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); if (sv_number_ != cur_sv_number) { @@ -1294,12 +1551,17 @@ Status ArenaWrappedDBIter::Refresh() { new (&arena_) Arena(); SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_->mutex()); - Init(env, read_options_, *(cfd_->ioptions()), latest_seq, - sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, allow_blob_); + if (read_callback_) { + read_callback_->Refresh(latest_seq); + } + Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, + latest_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, + cur_sv_number, read_callback_, db_impl_, cfd_, allow_blob_, + allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator()); + read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(), + latest_seq); SetIterUnderDBIter(internal_iter); } else { db_iter_->set_sequence(latest_seq); @@ -1310,14 +1572,18 @@ Status ArenaWrappedDBIter::Refresh() { ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, const SequenceNumber& sequence, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - DBImpl* db_impl, ColumnFamilyData* cfd, bool allow_blob) { + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); - iter->Init(env, read_options, cf_options, sequence, - max_sequential_skip_in_iterations, version_number, allow_blob); - if (db_impl != nullptr && cfd != nullptr) { - iter->StoreRefreshInfo(read_options, db_impl, cfd, allow_blob); + iter->Init(env, read_options, cf_options, mutable_cf_options, sequence, + max_sequential_skip_in_iterations, version_number, read_callback, + db_impl, cfd, allow_blob, allow_refresh); + if (db_impl != nullptr && cfd != nullptr && allow_refresh) { + iter->StoreRefreshInfo(read_options, 
db_impl, cfd, read_callback, + allow_blob); } return iter; diff --git a/thirdparty/rocksdb/db/db_iter.h b/thirdparty/rocksdb/db/db_iter.h index 26fcd44cbd..a640f0296e 100644 --- a/thirdparty/rocksdb/db/db_iter.h +++ b/thirdparty/rocksdb/db/db_iter.h @@ -23,18 +23,18 @@ namespace rocksdb { class Arena; class DBIter; -class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. -extern Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, - const Comparator* user_key_comparator, - InternalIterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - bool allow_blob = false); +extern Iterator* NewDBIterator( + Env* env, const ReadOptions& read_options, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -48,7 +48,7 @@ class ArenaWrappedDBIter : public Iterator { // Get the arena to be used to allocate memory for DBIter to be wrapped, // as well as child iterators in it. virtual Arena* GetArena() { return &arena_; } - virtual RangeDelAggregator* GetRangeDelAggregator(); + virtual ReadRangeDelAggregator* GetRangeDelAggregator(); // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. @@ -70,15 +70,19 @@ class ArenaWrappedDBIter : public Iterator { void Init(Env* env, const ReadOptions& read_options, const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - bool allow_blob); + ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd, + bool allow_blob, bool allow_refresh); void StoreRefreshInfo(const ReadOptions& read_options, DBImpl* db_impl, - ColumnFamilyData* cfd, bool allow_blob) { + ColumnFamilyData* cfd, ReadCallback* read_callback, + bool allow_blob) { read_options_ = read_options; db_impl_ = db_impl; cfd_ = cfd; + read_callback_ = read_callback; allow_blob_ = allow_blob; } @@ -89,7 +93,9 @@ class ArenaWrappedDBIter : public Iterator { ColumnFamilyData* cfd_ = nullptr; DBImpl* db_impl_ = nullptr; ReadOptions read_options_; + ReadCallback* read_callback_; bool allow_blob_ = false; + bool allow_refresh_ = true; }; // Generate the arena wrapped iterator class. @@ -97,9 +103,10 @@ class ArenaWrappedDBIter : public Iterator { // be supported. 
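+// Illustrative call shape, with hypothetical argument names, based on the
+// declaration below:
+//
+//   ArenaWrappedDBIter* it = NewArenaWrappedDbIterator(
+//       env, read_options, cf_options, mutable_cf_options, snapshot_seq,
+//       max_sequential_skip, sv_number, read_callback /* may be nullptr */);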
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ReadOptions& read_options, - const ImmutableCFOptions& cf_options, const SequenceNumber& sequence, + const ImmutableCFOptions& cf_options, + const MutableCFOptions& mutable_cf_options, const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, uint64_t version_number, - DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr, - bool allow_blob = false); - + ReadCallback* read_callback, DBImpl* db_impl = nullptr, + ColumnFamilyData* cfd = nullptr, bool allow_blob = false, + bool allow_refresh = true); } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_iter_stress_test.cc b/thirdparty/rocksdb/db/db_iter_stress_test.cc new file mode 100644 index 0000000000..a0f1dfeab4 --- /dev/null +++ b/thirdparty/rocksdb/db/db_iter_stress_test.cc @@ -0,0 +1,654 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/testharness.h" +#include "utilities/merge_operators.h" + +#ifdef GFLAGS + +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_bool(verbose, false, + "Print huge, detailed trace. Intended for debugging failures."); + +#else + +void ParseCommandLineFlags(int*, char***, bool) {} +bool FLAGS_verbose = false; + +#endif + +namespace rocksdb { + +class DBIteratorStressTest : public testing::Test { + public: + Env* env_; + + DBIteratorStressTest() : env_(Env::Default()) {} +}; + +namespace { + +struct Entry { + std::string key; + ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge + uint64_t sequence; + std::string ikey; // internal key, made from `key`, `sequence` and `type` + std::string value; + // If false, we'll pretend that this entry doesn't exist. + bool visible = true; + + bool operator<(const Entry& e) const { + if (key != e.key) return key < e.key; + return std::tie(sequence, type) > std::tie(e.sequence, e.type); + } +}; + +struct Data { + std::vector<Entry> entries; + + // Indices in `entries` with `visible` = false. + std::vector<size_t> hidden; + // Keys of entries whose `visible` changed since the last seek of iterators. + std::set<std::string> recently_touched_keys; +}; + +struct StressTestIterator : public InternalIterator { + Data* data; + Random64* rnd; + InternalKeyComparator cmp; + + // Each operation will return error with this probability... + double error_probability = 0; + // ... and add/remove entries with this probability. + double mutation_probability = 0; + // The probability of adding vs removing entries will be chosen so that the + // amount of removed entries stays somewhat close to this number. + double target_hidden_fraction = 0; + // If true, print all mutations to stdout for debugging.
+ bool trace = false; + + int iter = -1; + Status status_; + + StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp) + : data(_data), rnd(_rnd), cmp(_cmp) {} + + bool Valid() const override { + if (iter >= 0 && iter < (int)data->entries.size()) { + assert(status_.ok()); + return true; + } + return false; + } + + Status status() const override { return status_; } + + bool MaybeFail() { + if (rnd->Next() >= + std::numeric_limits<uint64_t>::max() * error_probability) { + return false; + } + if (rnd->Next() % 2) { + status_ = Status::Incomplete("test"); + } else { + status_ = Status::IOError("test"); + } + if (trace) { + std::cout << "injecting " << status_.ToString() << std::endl; + } + iter = -1; + return true; + } + + void MaybeMutate() { + if (rnd->Next() >= + std::numeric_limits<uint64_t>::max() * mutation_probability) { + return; + } + do { + // If too many entries are hidden, hide less, otherwise hide more. + double hide_probability = + data->hidden.size() > data->entries.size() * target_hidden_fraction + ? 1. / 3 + : 2. / 3; + if (data->hidden.empty()) { + hide_probability = 1; + } + bool do_hide = + rnd->Next() < std::numeric_limits<uint64_t>::max() * hide_probability; + if (do_hide) { + // Hide a random entry. + size_t idx = rnd->Next() % data->entries.size(); + Entry& e = data->entries[idx]; + if (e.visible) { + if (trace) { + std::cout << "hiding idx " << idx << std::endl; + } + e.visible = false; + data->hidden.push_back(idx); + data->recently_touched_keys.insert(e.key); + } else { + // Already hidden. Let's go unhide something instead, just because + // it's easy and it doesn't really matter what we do. + do_hide = false; + } + } + if (!do_hide) { + // Unhide a random entry. + size_t hi = rnd->Next() % data->hidden.size(); + size_t idx = data->hidden[hi]; + if (trace) { + std::cout << "unhiding idx " << idx << std::endl; + } + Entry& e = data->entries[idx]; + assert(!e.visible); + e.visible = true; + data->hidden[hi] = data->hidden.back(); + data->hidden.pop_back(); + data->recently_touched_keys.insert(e.key); + } + } while (rnd->Next() % 3 != 0); // do 3 mutations on average + } + + void SkipForward() { + while (iter < (int)data->entries.size() && !data->entries[iter].visible) { + ++iter; + } + } + void SkipBackward() { + while (iter >= 0 && !data->entries[iter].visible) { + --iter; + } + } + + void SeekToFirst() override { + if (MaybeFail()) return; + MaybeMutate(); + + status_ = Status::OK(); + iter = 0; + SkipForward(); + } + void SeekToLast() override { + if (MaybeFail()) return; + MaybeMutate(); + + status_ = Status::OK(); + iter = (int)data->entries.size() - 1; + SkipBackward(); + } + + void Seek(const Slice& target) override { + if (MaybeFail()) return; + MaybeMutate(); + + status_ = Status::OK(); + // Binary search. + auto it = std::partition_point( + data->entries.begin(), data->entries.end(), + [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; }); + iter = (int)(it - data->entries.begin()); + SkipForward(); + } + void SeekForPrev(const Slice& target) override { + if (MaybeFail()) return; + MaybeMutate(); + + status_ = Status::OK(); + // Binary search.
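+    // (std::partition_point returns the first element for which the predicate
+    // is false: here the first entry whose internal key compares greater than
+    // target, so the --iter below lands on the last entry <= target.)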
+ auto it = std::partition_point( + data->entries.begin(), data->entries.end(), + [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; }); + iter = (int)(it - data->entries.begin()); + --iter; + SkipBackward(); + } + + void Next() override { + assert(Valid()); + if (MaybeFail()) return; + MaybeMutate(); + ++iter; + SkipForward(); + } + void Prev() override { + assert(Valid()); + if (MaybeFail()) return; + MaybeMutate(); + --iter; + SkipBackward(); + } + + Slice key() const override { + assert(Valid()); + return data->entries[iter].ikey; + } + Slice value() const override { + assert(Valid()); + return data->entries[iter].value; + } + + bool IsKeyPinned() const override { return true; } + bool IsValuePinned() const override { return true; } +}; + +// A small reimplementation of DBIter, supporting only some of the features, +// and doing everything in O(log n). +// Skips all keys that are in recently_touched_keys. +struct ReferenceIterator { + Data* data; + uint64_t sequence; // ignore entries with sequence number below this + + bool valid = false; + std::string key; + std::string value; + + ReferenceIterator(Data* _data, uint64_t _sequence) + : data(_data), sequence(_sequence) {} + + bool Valid() const { return valid; } + + // Finds the first entry with key + // greater/less/greater-or-equal/less-or-equal than `key`, depending on + // arguments: if `skip`, inequality is strict; if `forward`, it's + // greater/greater-or-equal, otherwise less/less-or-equal. + // Sets `key` to the result. + // If no such key exists, returns false. Doesn't check `visible`. + bool FindNextKey(bool skip, bool forward) { + valid = false; + auto it = std::partition_point(data->entries.begin(), data->entries.end(), + [&](const Entry& e) { + if (forward != skip) { + return e.key < key; + } else { + return e.key <= key; + } + }); + if (forward) { + if (it != data->entries.end()) { + key = it->key; + return true; + } + } else { + if (it != data->entries.begin()) { + --it; + key = it->key; + return true; + } + } + return false; + } + + bool FindValueForCurrentKey() { + if (data->recently_touched_keys.count(key)) { + return false; + } + + // Find the first entry for the key. The caller promises that it exists. + auto it = std::partition_point(data->entries.begin(), data->entries.end(), + [&](const Entry& e) { + if (e.key != key) { + return e.key < key; + } + return e.sequence > sequence; + }); + + // Find the first visible entry. + for (;; ++it) { + if (it == data->entries.end()) { + return false; + } + Entry& e = *it; + if (e.key != key) { + return false; + } + assert(e.sequence <= sequence); + if (!e.visible) continue; + if (e.type == kTypeDeletion) { + return false; + } + if (e.type == kTypeValue) { + value = e.value; + valid = true; + return true; + } + assert(e.type == kTypeMerge); + break; + } + + // Collect merge operands. + std::vector<Slice> operands; + for (; it != data->entries.end(); ++it) { + Entry& e = *it; + if (e.key != key) { + break; + } + assert(e.sequence <= sequence); + if (!e.visible) continue; + if (e.type == kTypeDeletion) { + break; + } + operands.push_back(e.value); + if (e.type == kTypeValue) { + break; + } + } + + // Do a merge. + value = operands.back().ToString(); + for (int i = (int)operands.size() - 2; i >= 0; --i) { + value.append(","); + value.append(operands[i].data(), operands[i].size()); + } + + valid = true; + return true; + } + + // Start at `key` and move until we encounter a valid value. + // `forward` defines the direction of movement.
+ // If `skip` is true, we're looking for key not equal to `key`. + void DoTheThing(bool skip, bool forward) { + while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) { + skip = true; + } + } + + void Seek(const Slice& target) { + key = target.ToString(); + DoTheThing(false, true); + } + void SeekForPrev(const Slice& target) { + key = target.ToString(); + DoTheThing(false, false); + } + void SeekToFirst() { Seek(""); } + void SeekToLast() { + key = data->entries.back().key; + DoTheThing(false, false); + } + void Next() { + assert(Valid()); + DoTheThing(true, true); + } + void Prev() { + assert(Valid()); + DoTheThing(true, false); + } +}; + +} // namespace + +// Use an internal iterator that sometimes returns errors and sometimes +// adds/removes entries on the fly. Do random operations on a DBIter and +// check results. +// TODO: can be improved for more coverage: +// * Override IsKeyPinned() and IsValuePinned() to actually use +// PinnedIteratorManager and check that there's no use-after-free. +// * Try different combinations of prefix_extractor, total_order_seek, +// prefix_same_as_start, iterate_lower_bound, iterate_upper_bound. +TEST_F(DBIteratorStressTest, StressTest) { + // We use a deterministic RNG, and everything happens in a single thread. + Random64 rnd(826909345792864532ll); + + auto gen_key = [&](int max_key) { + assert(max_key > 0); + int len = 0; + int a = max_key; + while (a) { + a /= 10; + ++len; + } + std::string s = ToString(rnd.Next() % static_cast<uint64_t>(max_key)); + s.insert(0, len - (int)s.size(), '0'); + return s; + }; + + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + ReadOptions ropt; + + size_t num_matching = 0; + size_t num_at_end = 0; + size_t num_not_ok = 0; + size_t num_recently_removed = 0; + + // Number of iterations for each combination of parameters + // (there are ~250 of those). + // Tweak this to change the test run time. + // As of the time of writing, the test takes ~4 seconds for a value of 5000. + const int num_iterations = 5000; + // Enable this to print all the operations for debugging. + bool trace = FLAGS_verbose; + + for (int num_entries : {5, 10, 100}) { + for (double key_space : {0.1, 1.0, 3.0}) { + for (ValueType prevalent_entry_type : + {kTypeValue, kTypeDeletion, kTypeMerge}) { + for (double error_probability : {0.01, 0.1}) { + for (double mutation_probability : {0.01, 0.5}) { + for (double target_hidden_fraction : {0.1, 0.5}) { + std::string trace_str = + "entries: " + ToString(num_entries) + + ", key_space: " + ToString(key_space) + + ", error_probability: " + ToString(error_probability) + + ", mutation_probability: " + ToString(mutation_probability) + + ", target_hidden_fraction: " + + ToString(target_hidden_fraction); + SCOPED_TRACE(trace_str); + if (trace) { + std::cout << trace_str << std::endl; + } + + // Generate data.
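+            // (Each Entry below gets a random key in [0, max_key), a sequence
+            // number equal to its index, and usually the prevalent type for
+            // this configuration; std::sort then arranges them in internal-key
+            // order via Entry::operator<, i.e. key ascending, sequence
+            // descending.)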
+              // Generate data.
+              Data data;
+              int max_key = (int)(num_entries * key_space) + 1;
+              for (int i = 0; i < num_entries; ++i) {
+                Entry e;
+                e.key = gen_key(max_key);
+                if (rnd.Next() % 10 != 0) {
+                  e.type = prevalent_entry_type;
+                } else {
+                  const ValueType types[] = {kTypeValue, kTypeDeletion,
+                                             kTypeMerge};
+                  e.type =
+                      types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+                }
+                e.sequence = i;
+                e.value = "v" + ToString(i);
+                ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+                AppendInternalKey(&e.ikey, internal_key);
+
+                data.entries.push_back(e);
+              }
+              std::sort(data.entries.begin(), data.entries.end());
+              if (trace) {
+                std::cout << "entries:";
+                for (size_t i = 0; i < data.entries.size(); ++i) {
+                  Entry& e = data.entries[i];
+                  std::cout
+                      << "\n  idx " << i << ": \"" << e.key << "\": \""
+                      << e.value << "\" seq: " << e.sequence << " type: "
+                      << (e.type == kTypeValue
+                              ? "val"
+                              : e.type == kTypeDeletion ? "del" : "merge");
+                }
+                std::cout << std::endl;
+              }
+
+              std::unique_ptr<Iterator> db_iter;
+              std::unique_ptr<ReferenceIterator> ref_iter;
+              for (int iteration = 0; iteration < num_iterations; ++iteration) {
+                SCOPED_TRACE(iteration);
+                // Create a new iterator every ~30 operations.
+                if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+                  uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+                  ref_iter.reset(new ReferenceIterator(&data, sequence));
+                  if (trace) {
+                    std::cout << "new iterator, seq: " << sequence << std::endl;
+                  }
+
+                  auto internal_iter =
+                      new StressTestIterator(&data, &rnd, BytewiseComparator());
+                  internal_iter->error_probability = error_probability;
+                  internal_iter->mutation_probability = mutation_probability;
+                  internal_iter->target_hidden_fraction =
+                      target_hidden_fraction;
+                  internal_iter->trace = trace;
+                  db_iter.reset(NewDBIterator(
+                      env_, ropt, ImmutableCFOptions(options),
+                      MutableCFOptions(options), BytewiseComparator(),
+                      internal_iter, sequence,
+                      options.max_sequential_skip_in_iterations,
+                      nullptr /*read_callback*/));
+                }
+
+                // Do a random operation. It's important to do it on ref_it
+                // later than on db_iter to make sure ref_it sees the correct
+                // recently_touched_keys.
+                std::string old_key;
+                bool forward = rnd.Next() % 2 > 0;
+                // Do Next()/Prev() ~90% of the time.
+                bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+                if (trace) {
+                  std::cout << iteration << ": ";
+                }
+
+                if (!seek) {
+                  assert(db_iter->Valid());
+                  old_key = ref_iter->key;
+                  if (trace) {
+                    std::cout << (forward ? "Next" : "Prev") << std::endl;
+                  }
+
+                  if (forward) {
+                    db_iter->Next();
+                    ref_iter->Next();
+                  } else {
+                    db_iter->Prev();
+                    ref_iter->Prev();
+                  }
+                } else {
+                  data.recently_touched_keys.clear();
+                  // Do SeekToFirst less often than Seek.
+                  if (rnd.Next() % 4 == 0) {
+                    if (trace) {
+                      std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+                                << std::endl;
+                    }
+
+                    if (forward) {
+                      old_key = "";
+                      db_iter->SeekToFirst();
+                      ref_iter->SeekToFirst();
+                    } else {
+                      old_key = data.entries.back().key;
+                      db_iter->SeekToLast();
+                      ref_iter->SeekToLast();
+                    }
+                  } else {
+                    old_key = gen_key(max_key);
+                    if (trace) {
+                      std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+                                << old_key << '"' << std::endl;
+                    }
+                    if (forward) {
+                      db_iter->Seek(old_key);
+                      ref_iter->Seek(old_key);
+                    } else {
+                      db_iter->SeekForPrev(old_key);
+                      ref_iter->SeekForPrev(old_key);
+                    }
+                  }
+                }
+
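Each fresh DBIter here wraps a StressTestIterator whose error_probability and mutation_probability gate random failures and on-the-fly key mutations. A standalone sketch of that probability-gated fault-injection idea (using std::mt19937_64 instead of the test's Random64; all names in this sketch are illustrative):

    #include <random>

    struct FaultInjector {
      double error_probability;
      std::mt19937_64 rng{826909345792864532ull};  // deterministic, like the test
      // Decide whether this operation should fail, with the configured odds.
      bool MaybeFail() {
        std::uniform_real_distribution<double> coin(0.0, 1.0);
        return coin(rng) < error_probability;
      }
    };

    int main() {
      FaultInjector inj{0.1};
      int failures = 0;
      for (int i = 0; i < 10000; ++i) failures += inj.MaybeFail() ? 1 : 0;
      // With p = 0.1 we expect roughly 1000 injected failures.
      return failures > 0 ? 0 : 1;
    }
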
+                // Check the result.
+                if (db_iter->Valid()) {
+                  ASSERT_TRUE(db_iter->status().ok());
+                  if (data.recently_touched_keys.count(
+                          db_iter->key().ToString())) {
+                    // Ended on a key that may have been mutated during the
+                    // operation. Reference iterator skips such keys, so we
+                    // can't check the exact result.
+
+                    // Check that the key moved in the right direction.
+                    if (forward) {
+                      if (seek)
+                        ASSERT_GE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_GT(db_iter->key().ToString(), old_key);
+                    } else {
+                      if (seek)
+                        ASSERT_LE(db_iter->key().ToString(), old_key);
+                      else
+                        ASSERT_LT(db_iter->key().ToString(), old_key);
+                    }
+
+                    if (ref_iter->Valid()) {
+                      // Check that DBIter didn't miss any non-mutated key.
+                      if (forward) {
+                        ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+                      } else {
+                        ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+                      }
+                    }
+                    // Tell the next iteration of the loop to reseek the
+                    // iterators.
+                    ref_iter->valid = false;
+
+                    ++num_recently_removed;
+                  } else {
+                    ASSERT_TRUE(ref_iter->Valid());
+                    ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+                    ASSERT_EQ(ref_iter->value, db_iter->value());
+                    ++num_matching;
+                  }
+                } else if (db_iter->status().ok()) {
+                  ASSERT_FALSE(ref_iter->Valid());
+                  ++num_at_end;
+                } else {
+                  // Non-ok status. Nothing to check here.
+                  // Tell the next iteration of the loop to reseek the
+                  // iterators.
+                  ref_iter->valid = false;
+                  ++num_not_ok;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Check that all cases were hit many times.
+  EXPECT_GT(num_matching, 10000);
+  EXPECT_GT(num_at_end, 10000);
+  EXPECT_GT(num_not_ok, 10000);
+  EXPECT_GT(num_recently_removed, 10000);
+
+  std::cout << "stats:\n  exact matches: " << num_matching
+            << "\n  end reached: " << num_at_end
+            << "\n  non-ok status: " << num_not_ok
+            << "\n  mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+} // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  ParseCommandLineFlags(&argc, &argv, true);
+  return RUN_ALL_TESTS();
+}
diff --git a/thirdparty/rocksdb/db/db_iter_test.cc b/thirdparty/rocksdb/db/db_iter_test.cc
index 6db3b4a9bb..29fbd32086 100644
--- a/thirdparty/rocksdb/db/db_iter_test.cc
+++ b/thirdparty/rocksdb/db/db_iter_test.cc
@@ -36,7 +36,9 @@ class TestIterator : public InternalIterator {
         valid_(false),
         sequence_number_(0),
         iter_(0),
-        cmp(comparator) {}
+        cmp(comparator) {
+    data_.reserve(16);
+  }
 
   void AddPut(std::string argkey, std::string argvalue) {
     Add(argkey, kTypeValue, argvalue);
@@ -84,26 +86,61 @@ class TestIterator : public InternalIterator {
     });
   }
 
-  virtual bool Valid() const override {
+  // Removes the key from the set of keys over which this iterator iterates.
+  // Not to be confused with AddDeletion().
+  // If the iterator is currently positioned on this key, the deletion will
+  // apply next time the iterator moves.
+  // Used for simulating ForwardIterator updating to a new version that doesn't
+  // have some of the keys (e.g. after compaction with a filter).
+  void Vanish(std::string _key) {
+    if (valid_ && data_[iter_].first == _key) {
+      delete_current_ = true;
+      return;
+    }
+    for (auto it = data_.begin(); it != data_.end(); ++it) {
+      ParsedInternalKey ikey;
+      bool ok __attribute__((__unused__)) = ParseInternalKey(it->first, &ikey);
+      assert(ok);
+      if (ikey.user_key != _key) {
+        continue;
+      }
+      if (valid_ && data_.begin() + iter_ > it) {
+        --iter_;
+      }
+      data_.erase(it);
+      return;
+    }
+    assert(false);
+  }
+
+  // Number of operations done on this iterator since construction.
+ size_t steps() const { return steps_; } + + bool Valid() const override { assert(initialized_); return valid_; } - virtual void SeekToFirst() override { + void SeekToFirst() override { assert(initialized_); + ++steps_; + DeleteCurrentIfNeeded(); valid_ = (data_.size() > 0); iter_ = 0; } - virtual void SeekToLast() override { + void SeekToLast() override { assert(initialized_); + ++steps_; + DeleteCurrentIfNeeded(); valid_ = (data_.size() > 0); iter_ = data_.size() - 1; } - virtual void Seek(const Slice& target) override { + void Seek(const Slice& target) override { assert(initialized_); SeekToFirst(); + ++steps_; if (!valid_) { return; } @@ -117,22 +154,33 @@ class TestIterator : public InternalIterator { } } - virtual void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& target) override { assert(initialized_); + DeleteCurrentIfNeeded(); SeekForPrevImpl(target, &cmp); } - virtual void Next() override { + void Next() override { assert(initialized_); - if (data_.empty() || (iter_ == data_.size() - 1)) { - valid_ = false; + assert(valid_); + assert(iter_ < data_.size()); + + ++steps_; + if (delete_current_) { + DeleteCurrentIfNeeded(); } else { ++iter_; } + valid_ = iter_ < data_.size(); } - virtual void Prev() override { + void Prev() override { assert(initialized_); + assert(valid_); + assert(iter_ < data_.size()); + + ++steps_; + DeleteCurrentIfNeeded(); if (iter_ == 0) { valid_ = false; } else { @@ -140,32 +188,42 @@ class TestIterator : public InternalIterator { } } - virtual Slice key() const override { + Slice key() const override { assert(initialized_); return data_[iter_].first; } - virtual Slice value() const override { + Slice value() const override { assert(initialized_); return data_[iter_].second; } - virtual Status status() const override { + Status status() const override { assert(initialized_); return Status::OK(); } - virtual bool IsKeyPinned() const override { return true; } - virtual bool IsValuePinned() const override { return true; } + bool IsKeyPinned() const override { return true; } + bool IsValuePinned() const override { return true; } private: bool initialized_; bool valid_; size_t sequence_number_; size_t iter_; + size_t steps_ = 0; InternalKeyComparator cmp; std::vector> data_; + bool delete_current_ = false; + + void DeleteCurrentIfNeeded() { + if (!delete_current_) { + return; + } + data_.erase(data_.begin() + iter_); + delete_current_ = false; + } }; class DBIteratorTest : public testing::Test { @@ -178,7 +236,7 @@ class DBIteratorTest : public testing::Test { TEST_F(DBIteratorTest, DBIteratorPrevNext) { Options options; ImmutableCFOptions cf_options = ImmutableCFOptions(options); - + MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddDeletion("a"); @@ -191,9 +249,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -223,9 +282,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, 
BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -249,9 +309,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -281,9 +342,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -316,9 +378,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -345,9 +408,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 7, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -356,7 +420,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_key_skipped_count), 7); + ASSERT_EQ(static_cast(get_perf_context()->internal_key_skipped_count), 1); ASSERT_EQ(db_iter->key().ToString(), "b"); SetPerfLevel(kDisable); @@ -382,9 +446,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -407,9 +472,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, 
options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -429,9 +495,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -464,9 +531,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { ReadOptions ro; ro.iterate_upper_bound = &prefix; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 7, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -475,7 +543,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(static_cast(get_perf_context()->internal_delete_skipped_count), 1); + ASSERT_EQ(static_cast(get_perf_context()->internal_delete_skipped_count), 0); ASSERT_EQ(db_iter->key().ToString(), "b"); SetPerfLevel(kDisable); @@ -493,9 +561,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -535,9 +604,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -566,9 +636,10 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); ReadOptions ro; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -589,15 +660,17 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { TEST_F(DBIteratorTest, DBIteratorEmpty) { Options options; ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions 
mutable_cf_options = MutableCFOptions(options); ReadOptions ro; { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -606,9 +679,10 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -629,8 +703,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -659,6 +734,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); { for (size_t i = 0; i < 200; ++i) { @@ -672,8 +748,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -707,8 +784,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -735,8 +813,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, 202, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 202, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -767,8 +846,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); 
std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, i, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, i, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -782,9 +862,10 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 200, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 200, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -817,8 +898,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -851,8 +933,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -882,6 +965,7 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { Options options; ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); ReadOptions ro; // Basic test case ... Make sure explicityly passing the default value works. 
@@ -898,9 +982,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 0; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -944,9 +1029,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -988,9 +1074,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1026,9 +1113,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1061,9 +1149,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1091,9 +1180,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1128,9 +1218,10 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { internal_iter->Finish(); ro.max_skippable_internal_keys = 2; - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, 
options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1166,8 +1257,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, 2 * i + 1, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1219,8 +1311,9 @@ TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) { options.max_sequential_skip_in_iterations = 1000; ro.max_skippable_internal_keys = i; std::unique_ptr db_iter(NewDBIterator( - env_, ro, cf_options, BytewiseComparator(), internal_iter, 2 * i + 1, - options.max_sequential_skip_in_iterations)); + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2 * i + 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -1257,8 +1350,9 @@ TEST_F(DBIteratorTest, DBIterator1) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1284,8 +1378,9 @@ TEST_F(DBIteratorTest, DBIterator2) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 0, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1308,8 +1403,9 @@ TEST_F(DBIteratorTest, DBIterator3) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 2, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1332,8 +1428,9 @@ TEST_F(DBIteratorTest, DBIterator4) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 4, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1351,6 +1448,7 @@ TEST_F(DBIteratorTest, DBIterator5) { Options options; 
options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -1363,9 +1461,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1385,9 +1484,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1407,9 +1507,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1429,9 +1530,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 3, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1451,9 +1553,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1473,9 +1576,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + 
internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1495,9 +1599,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1515,9 +1620,10 @@ TEST_F(DBIteratorTest, DBIterator5) { internal_iter->AddMerge("a", "merge_2"); internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 10, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1532,6 +1638,7 @@ TEST_F(DBIteratorTest, DBIterator6) { Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -1544,9 +1651,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1566,9 +1674,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 1, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1588,9 +1697,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1610,9 +1720,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", 
"merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 3, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1628,9 +1739,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1650,9 +1762,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1672,9 +1785,10 @@ TEST_F(DBIteratorTest, DBIterator6) { internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1689,6 +1803,7 @@ TEST_F(DBIteratorTest, DBIterator7) { Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); ImmutableCFOptions cf_options = ImmutableCFOptions(options); + MutableCFOptions mutable_cf_options = MutableCFOptions(options); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); @@ -1713,9 +1828,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 0, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1747,9 +1863,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 2, options.max_sequential_skip_in_iterations, + nullptr 
/*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1787,9 +1904,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 4, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1827,9 +1945,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 5, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1872,9 +1991,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 6, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1918,9 +2038,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 7, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1958,9 +2079,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 9, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 9, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2004,9 +2126,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 13, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 13, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2051,9 +2174,10 @@ TEST_F(DBIteratorTest, DBIterator7) { internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, cf_options, BytewiseComparator(), internal_iter, - 14, options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, 
ro, cf_options, mutable_cf_options, BytewiseComparator(), + internal_iter, 14, options.max_sequential_skip_in_iterations, + nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2082,8 +2206,9 @@ TEST_F(DBIteratorTest, DBIterator8) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2112,8 +2237,9 @@ TEST_F(DBIteratorTest, DBIterator9) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -2178,8 +2304,9 @@ TEST_F(DBIteratorTest, DBIterator10) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -2216,9 +2343,10 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { internal_iter->AddPut("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0 /* force seek */)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, 0 /* force seek */, + nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2245,8 +2373,9 @@ TEST_F(DBIteratorTest, DBIterator11) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 1, options.max_sequential_skip_in_iterations)); + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -2270,9 +2399,9 @@ TEST_F(DBIteratorTest, DBIterator12) { internal_iter->AddSingleDeletion("b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, 0, nullptr /*read_callback*/)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -2307,9 +2436,9 @@ TEST_F(DBIteratorTest, DBIterator13) { internal_iter->AddPut(key, "8"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, 
ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 2, 3)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, 3, nullptr /*read_callback*/)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), key); @@ -2335,9 +2464,9 @@ TEST_F(DBIteratorTest, DBIterator14) { internal_iter->AddPut("c", "9"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ro, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 4, 1)); + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 4, 1, nullptr /*read_callback*/)); db_iter->Seek("b"); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -2347,6 +2476,80 @@ TEST_F(DBIteratorTest, DBIterator14) { ASSERT_EQ(db_iter->value().ToString(), "4"); } +TEST_F(DBIteratorTest, DBIteratorTestDifferentialSnapshots) { + { // test that KVs earlier that iter_start_seqnum are filtered out + ReadOptions ro; + ro.iter_start_seqnum=5; + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + for (size_t i = 0; i < 10; ++i) { + internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a"); + internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b"); + internal_iter->AddPut(std::to_string(i), std::to_string(i) + "c"); + } + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 13, + options.max_sequential_skip_in_iterations, nullptr)); + // Expecting InternalKeys in [5,8] range with correct type + int seqnums[4] = {5,8,11,13}; + std::string user_keys[4] = {"1","2","3","4"}; + std::string values[4] = {"1c", "2c", "3c", "4b"}; + int i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + FullKey fkey; + ParseFullKey(db_iter->key(), &fkey); + ASSERT_EQ(user_keys[i], fkey.user_key.ToString()); + ASSERT_EQ(EntryType::kEntryPut, fkey.type); + ASSERT_EQ(seqnums[i], fkey.sequence); + ASSERT_EQ(values[i], db_iter->value().ToString()); + i++; + } + ASSERT_EQ(i, 4); + } + + { // Test that deletes are returned correctly as internal KVs + ReadOptions ro; + ro.iter_start_seqnum=5; + Options options; + options.statistics = rocksdb::CreateDBStatistics(); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + for (size_t i = 0; i < 10; ++i) { + internal_iter->AddPut(std::to_string(i), std::to_string(i) + "a"); + internal_iter->AddPut(std::to_string(i), std::to_string(i) + "b"); + internal_iter->AddDeletion(std::to_string(i)); + } + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 13, + options.max_sequential_skip_in_iterations, nullptr)); + // Expecting InternalKeys in [5,8] range with correct type + int seqnums[4] = {5,8,11,13}; + EntryType key_types[4] = {EntryType::kEntryDelete,EntryType::kEntryDelete, + EntryType::kEntryDelete,EntryType::kEntryPut}; + std::string user_keys[4] = {"1","2","3","4"}; + std::string values[4] = {"", "", "", "4b"}; + int i = 0; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + FullKey fkey; + ParseFullKey(db_iter->key(), &fkey); + ASSERT_EQ(user_keys[i], 
fkey.user_key.ToString()); + ASSERT_EQ(key_types[i], fkey.type); + ASSERT_EQ(seqnums[i], fkey.sequence); + ASSERT_EQ(values[i], db_iter->value().ToString()); + i++; + } + ASSERT_EQ(i, 4); + } +} + class DBIterWithMergeIterTest : public testing::Test { public: DBIterWithMergeIterTest() @@ -2373,10 +2576,11 @@ class DBIterWithMergeIterTest : public testing::Test { InternalIterator* merge_iter = NewMergingIterator(&icomp_, &child_iters[0], 2u); - db_iter_.reset(NewDBIterator(env_, ro_, ImmutableCFOptions(options_), - BytewiseComparator(), merge_iter, - 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */)); + db_iter_.reset(NewDBIterator( + env_, ro_, ImmutableCFOptions(options_), MutableCFOptions(options_), + BytewiseComparator(), merge_iter, + 8 /* read data earlier than seqId 8 */, + 3 /* max iterators before reseek */, nullptr /*read_callback*/)); } Env* env_; @@ -2458,8 +2662,8 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) { // MergeIterator::Prev() realized the mem table iterator is at its end // and before an SeekToLast() is called. rocksdb::SyncPoint::GetInstance()->SetCallBack( - "MergeIterator::Prev:BeforeSeekToLast", - [&](void* arg) { internal_iter2_->Add("z", kTypeValue, "7", 12u); }); + "MergeIterator::Prev:BeforePrev", + [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); db_iter_->Prev(); @@ -2494,7 +2698,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) { // mem table after MergeIterator::Prev() realized the mem tableiterator is at // its end and before an SeekToLast() is called. rocksdb::SyncPoint::GetInstance()->SetCallBack( - "MergeIterator::Prev:BeforeSeekToLast", [&](void* arg) { + "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); internal_iter2_->Add("z", kTypeValue, "7", 11u); }); @@ -2532,7 +2736,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) { // mem table after MergeIterator::Prev() realized the mem table iterator is at // its end and before an SeekToLast() is called. 
rocksdb::SyncPoint::GetInstance()->SetCallBack( - "MergeIterator::Prev:BeforeSeekToLast", [&](void* arg) { + "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 16u, true); internal_iter2_->Add("z", kTypeValue, "7", 15u, true); internal_iter2_->Add("z", kTypeValue, "7", 14u, true); @@ -2796,6 +3000,173 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } + + +TEST_F(DBIteratorTest, SeekPrefixTombstones) { + ReadOptions ro; + Options options; + options.prefix_extractor.reset(NewNoopTransform()); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddDeletion("b"); + internal_iter->AddDeletion("c"); + internal_iter->AddDeletion("d"); + internal_iter->AddDeletion("e"); + internal_iter->AddDeletion("f"); + internal_iter->AddDeletion("g"); + internal_iter->Finish(); + + ro.prefix_same_as_start = true; + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + + int skipped_keys = 0; + + get_perf_context()->Reset(); + db_iter->SeekForPrev("z"); + skipped_keys = + static_cast(get_perf_context()->internal_key_skipped_count); + ASSERT_EQ(skipped_keys, 0); + + get_perf_context()->Reset(); + db_iter->Seek("a"); + skipped_keys = + static_cast(get_perf_context()->internal_key_skipped_count); + ASSERT_EQ(skipped_keys, 0); +} + +TEST_F(DBIteratorTest, SeekToFirstLowerBound) { + const int kNumKeys = 3; + for (int i = 0; i < kNumKeys + 2; ++i) { + // + 2 for two special cases: lower bound before and lower bound after the + // internal iterator's keys + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + for (int j = 1; j <= kNumKeys; ++j) { + internal_iter->AddPut(std::to_string(j), "val"); + } + internal_iter->Finish(); + + ReadOptions ro; + auto lower_bound_str = std::to_string(i); + Slice lower_bound(lower_bound_str); + ro.iterate_lower_bound = &lower_bound; + Options options; + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10 /* sequence */, + options.max_sequential_skip_in_iterations, + nullptr /* read_callback */)); + + db_iter->SeekToFirst(); + if (i == kNumKeys + 1) { + // lower bound was beyond the last key + ASSERT_FALSE(db_iter->Valid()); + } else { + ASSERT_TRUE(db_iter->Valid()); + int expected; + if (i == 0) { + // lower bound was before the first key + expected = 1; + } else { + // lower bound was at the ith key + expected = i; + } + ASSERT_EQ(std::to_string(expected), db_iter->key().ToString()); + } + } +} + +TEST_F(DBIteratorTest, PrevLowerBound) { + const int kNumKeys = 3; + const int kLowerBound = 2; + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + for (int j = 1; j <= kNumKeys; ++j) { + internal_iter->AddPut(std::to_string(j), "val"); + } + internal_iter->Finish(); + + ReadOptions ro; + auto lower_bound_str = std::to_string(kLowerBound); + Slice lower_bound(lower_bound_str); + ro.iterate_lower_bound = &lower_bound; + Options options; + std::unique_ptr db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10 /* sequence */, + options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + + db_iter->SeekToLast(); + for (int i = kNumKeys; i >= 
kLowerBound; --i) { + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(std::to_string(i), db_iter->key().ToString()); + db_iter->Prev(); + } + ASSERT_FALSE(db_iter->Valid()); +} + +TEST_F(DBIteratorTest, SeekLessLowerBound) { + const int kNumKeys = 3; + const int kLowerBound = 2; + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + for (int j = 1; j <= kNumKeys; ++j) { + internal_iter->AddPut(std::to_string(j), "val"); + } + internal_iter->Finish(); + + ReadOptions ro; + auto lower_bound_str = std::to_string(kLowerBound); + Slice lower_bound(lower_bound_str); + ro.iterate_lower_bound = &lower_bound; + Options options; + std::unique_ptr<Iterator> db_iter(NewDBIterator( + env_, ro, ImmutableCFOptions(options), MutableCFOptions(options), + BytewiseComparator(), internal_iter, 10 /* sequence */, + options.max_sequential_skip_in_iterations, nullptr /* read_callback */)); + + auto before_lower_bound_str = std::to_string(kLowerBound - 1); + Slice before_lower_bound(before_lower_bound_str); + + db_iter->Seek(before_lower_bound); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(lower_bound_str, db_iter->key().ToString()); +} + +TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) { + Options options; + options.prefix_extractor.reset(NewCappedPrefixTransform(0)); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "A"); + internal_iter->AddPut("b", "B"); + for (int i = 0; i < 100; ++i) { + internal_iter->AddPut("c" + ToString(i), ""); + } + internal_iter->Finish(); + + std::unique_ptr<Iterator> db_iter(NewDBIterator( + env_, ReadOptions(), ImmutableCFOptions(options), + MutableCFOptions(options), BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations, nullptr /*read_callback*/)); + + db_iter->SeekForPrev("a"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ("a", db_iter->key().ToString()); + + internal_iter->Vanish("a"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ("b", db_iter->key().ToString()); + + // A (sort of) bug used to cause DBIter to pointlessly drag the internal + // iterator all the way to the end. But this doesn't really matter at the time + // of writing because the only iterator that can see disappearing keys is + // ForwardIterator, which doesn't support SeekForPrev(). + EXPECT_LT(internal_iter->steps(), 20); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_iterator_test.cc b/thirdparty/rocksdb/db/db_iterator_test.cc index d3bd164a2c..6a5188c775 100644 --- a/thirdparty/rocksdb/db/db_iterator_test.cc +++ b/thirdparty/rocksdb/db/db_iterator_test.cc @@ -9,6 +9,7 @@ #include <functional> +#include "db/db_iter.h" #include "db/db_test_util.h" #include "port/port.h" #include "port/stack_trace.h" @@ -17,20 +18,57 @@ namespace rocksdb { -class DBIteratorTest : public DBTestBase { +// A dumb ReadCallback which says every key is committed. +class DummyReadCallback : public ReadCallback { + public: + DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {} + bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; } + void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; } +}; + +// Test param: +// bool: whether to pass read_callback to NewIterator().
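+// When the param is true, the NewIterator() helper below builds the iterator
+// through NewIteratorImpl() with a DummyReadCallback attached, so every TEST_P
+// case runs both with and without a read callback.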
+class DBIteratorTest : public DBTestBase, + public testing::WithParamInterface<bool> { public: DBIteratorTest() : DBTestBase("/db_iterator_test") {} + + Iterator* NewIterator(const ReadOptions& read_options, + ColumnFamilyHandle* column_family = nullptr) { + if (column_family == nullptr) { + column_family = db_->DefaultColumnFamily(); + } + auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd(); + SequenceNumber seq = read_options.snapshot != nullptr + ? read_options.snapshot->GetSequenceNumber() + : db_->GetLatestSequenceNumber(); + bool use_read_callback = GetParam(); + DummyReadCallback* read_callback = nullptr; + if (use_read_callback) { + read_callback = new DummyReadCallback(); + read_callback->SetSnapshot(seq); + InstrumentedMutexLock lock(&mutex_); + read_callbacks_.push_back( + std::unique_ptr<DummyReadCallback>(read_callback)); + } + return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback); + } + + private: + InstrumentedMutex mutex_; + std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_; }; class FlushBlockEveryKeyPolicy : public FlushBlockPolicy { public: - virtual bool Update(const Slice& key, const Slice& value) override { + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { if (!start_) { start_ = true; return false; } return true; } + private: bool start_ = false; }; @@ -44,34 +82,41 @@ class FlushBlockEveryKeyPolicyFactory : public FlushBlockPolicyFactory { } FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& table_options, - const BlockBuilder& data_block_builder) const override { + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { return new FlushBlockEveryKeyPolicy; } }; -TEST_F(DBIteratorTest, IteratorProperty) { +TEST_P(DBIteratorTest, IteratorProperty) { // The test needs to be changed if kPersistedTier is supported in iterator. Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); Put(1, "1", "2"); + Delete(1, "2"); ReadOptions ropt; ropt.pin_data = false; { - unique_ptr<Iterator> iter(db_->NewIterator(ropt, handles_[1])); + std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1])); iter->SeekToFirst(); std::string prop_value; ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value)); ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); ASSERT_EQ("0", prop_value); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); + ASSERT_EQ("1", prop_value); iter->Next(); ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); ASSERT_EQ("Iterator is not valid.", prop_value); + + // Get internal key at which the iteration stopped (tombstone in this case). + ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); + ASSERT_EQ("2", prop_value); } Close(); } -TEST_F(DBIteratorTest, PersistedTierOnIterator) { +TEST_P(DBIteratorTest, PersistedTierOnIterator) { // The test needs to be changed if kPersistedTier is supported in iterator. Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); @@ -87,7 +132,7 @@ TEST_F(DBIteratorTest, PersistedTierOnIterator) { Close(); } -TEST_F(DBIteratorTest, NonBlockingIteration) { +TEST_P(DBIteratorTest, NonBlockingIteration) { do { ReadOptions non_blocking_opts, regular_opts; Options options = CurrentOptions(); @@ -99,7 +144,7 @@ TEST_F(DBIteratorTest, NonBlockingIteration) { // scan using non-blocking iterator. We should find it because // it is in memtable.
- Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); + Iterator* iter = NewIterator(non_blocking_opts, handles_[1]); int count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); @@ -116,7 +161,7 @@ TEST_F(DBIteratorTest, NonBlockingIteration) { // kvs. Neither does it do any IOs to storage. uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); + iter = NewIterator(non_blocking_opts, handles_[1]); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { count++; @@ -133,7 +178,7 @@ TEST_F(DBIteratorTest, NonBlockingIteration) { // verify that we can find it via a non-blocking scan numopen = TestGetTickerCount(options, NO_FILE_OPENS); cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); + iter = NewIterator(non_blocking_opts, handles_[1]); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); @@ -146,79 +191,10 @@ TEST_F(DBIteratorTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. - // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads)); } -#ifndef ROCKSDB_LITE -TEST_F(DBIteratorTest, ManagedNonBlockingIteration) { - do { - ReadOptions non_blocking_opts, regular_opts; - Options options = CurrentOptions(); - options.statistics = rocksdb::CreateDBStatistics(); - non_blocking_opts.read_tier = kBlockCacheTier; - non_blocking_opts.managed = true; - CreateAndReopenWithCF({"pikachu"}, options); - // write one kv to the database. - ASSERT_OK(Put(1, "a", "b")); - - // scan using non-blocking iterator. We should find it because - // it is in memtable. - Iterator* iter = db_->NewIterator(non_blocking_opts, handles_[1]); - int count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - delete iter; - - // flush memtable to storage. Now, the key should not be in the - // memtable neither in the block cache. - ASSERT_OK(Flush(1)); - - // verify that a non-blocking iterator does not find any - // kvs. Neither does it do any IOs to storage. 
- int64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS); - int64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - count++; - } - ASSERT_EQ(count, 0); - ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // read in the specified block via a regular get - ASSERT_EQ(Get(1, "a"), "b"); - - // verify that we can find it via a non-blocking scan - numopen = TestGetTickerCount(options, NO_FILE_OPENS); - cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); - iter = db_->NewIterator(non_blocking_opts, handles_[1]); - count = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_OK(iter->status()); - count++; - } - ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); - ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); - delete iter; - - // This test verifies block cache behaviors, which is not used by plain - // table format. - // Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | - kSkipMmapReads)); -} -#endif // ROCKSDB_LITE - -TEST_F(DBIteratorTest, IterSeekBeforePrev) { +TEST_P(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); dbfull()->Flush(FlushOptions()); @@ -226,7 +202,7 @@ TEST_F(DBIteratorTest, IterSeekBeforePrev) { ASSERT_OK(Put("1", "h")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("2", "j")); - auto iter = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); iter->Seek(Slice("c")); iter->Prev(); iter->Seek(Slice("a")); @@ -234,7 +210,7 @@ TEST_F(DBIteratorTest, IterSeekBeforePrev) { delete iter; } -TEST_F(DBIteratorTest, IterSeekForPrevBeforeNext) { +TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); dbfull()->Flush(FlushOptions()); @@ -242,7 +218,7 @@ TEST_F(DBIteratorTest, IterSeekForPrevBeforeNext) { ASSERT_OK(Put("1", "h")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("2", "j")); - auto iter = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); iter->SeekForPrev(Slice("0")); iter->Next(); iter->SeekForPrev(Slice("1")); @@ -256,7 +232,7 @@ std::string MakeLongKey(size_t length, char c) { } } // namespace -TEST_F(DBIteratorTest, IterLongKeys) { +TEST_P(DBIteratorTest, IterLongKeys) { ASSERT_OK(Put(MakeLongKey(20, 0), "0")); ASSERT_OK(Put(MakeLongKey(32, 2), "2")); ASSERT_OK(Put("a", "b")); @@ -264,7 +240,7 @@ TEST_F(DBIteratorTest, IterLongKeys) { ASSERT_OK(Put(MakeLongKey(50, 1), "1")); ASSERT_OK(Put(MakeLongKey(127, 3), "3")); ASSERT_OK(Put(MakeLongKey(64, 4), "4")); - auto iter = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); // Create a key that needs to be skipped for Seq too new iter->Seek(MakeLongKey(20, 0)); @@ -286,7 +262,7 @@ TEST_F(DBIteratorTest, IterLongKeys) { ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); delete iter; - iter = db_->NewIterator(ReadOptions()); + iter = NewIterator(ReadOptions()); iter->Seek(MakeLongKey(50, 1)); ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1"); iter->Next(); @@ -296,13 +272,13 @@ TEST_F(DBIteratorTest, IterLongKeys) { delete iter; } -TEST_F(DBIteratorTest, IterNextWithNewerSeq) { 
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) { ASSERT_OK(Put("0", "0")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); // Create a key that needs to be skipped for Seq too new for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; @@ -322,13 +298,13 @@ TEST_F(DBIteratorTest, IterNextWithNewerSeq) { delete iter; } -TEST_F(DBIteratorTest, IterPrevWithNewerSeq) { +TEST_P(DBIteratorTest, IterPrevWithNewerSeq) { ASSERT_OK(Put("0", "0")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("d", "e")); - auto iter = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); // Create a key that needs to be skipped for Seq too new for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1; @@ -353,14 +329,14 @@ TEST_F(DBIteratorTest, IterPrevWithNewerSeq) { delete iter; } -TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { +TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) { ASSERT_OK(Put("0", "0")); dbfull()->Flush(FlushOptions()); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Put("e", "f")); - auto iter = db_->NewIterator(ReadOptions()); - auto iter2 = db_->NewIterator(ReadOptions()); + auto iter = NewIterator(ReadOptions()); + auto iter2 = NewIterator(ReadOptions()); iter->Seek(Slice("c")); iter2->SeekForPrev(Slice("d")); ASSERT_EQ(IterStatus(iter), "c->d"); @@ -382,10 +358,10 @@ TEST_F(DBIteratorTest, IterPrevWithNewerSeq2) { delete iter2; } -TEST_F(DBIteratorTest, IterEmpty) { +TEST_P(DBIteratorTest, IterEmpty) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "(invalid)"); @@ -403,11 +379,11 @@ TEST_F(DBIteratorTest, IterEmpty) { } while (ChangeCompactOptions()); } -TEST_F(DBIteratorTest, IterSingle) { +TEST_P(DBIteratorTest, IterSingle) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -454,13 +430,13 @@ TEST_F(DBIteratorTest, IterSingle) { } while (ChangeCompactOptions()); } -TEST_F(DBIteratorTest, IterMulti) { +TEST_P(DBIteratorTest, IterMulti) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); ASSERT_OK(Put(1, "b", "vb")); ASSERT_OK(Put(1, "c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -553,7 +529,7 @@ TEST_F(DBIteratorTest, IterMulti) { // Check that we can skip over a run of user keys // by using reseek rather than sequential scan -TEST_F(DBIteratorTest, IterReseek) { +TEST_P(DBIteratorTest, IterReseek) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); @@ -570,7 +546,7 @@ TEST_F(DBIteratorTest, IterReseek) { ASSERT_OK(Put(1, "a", "one")); ASSERT_OK(Put(1, "a", "two")); ASSERT_OK(Put(1, "b", "bone")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); 
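+  // Two live versions of "a" exist here; SeekToFirst() below must surface the
+  // newest one ("a"->"two") without charging a reseek to the ticker.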
iter->SeekToFirst(); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "a->two"); @@ -582,7 +558,7 @@ TEST_F(DBIteratorTest, IterReseek) { // insert a total of three keys with same userkey and verify // that reseek is still not invoked. ASSERT_OK(Put(1, "a", "three")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->three"); iter->Next(); @@ -593,7 +569,7 @@ TEST_F(DBIteratorTest, IterReseek) { // insert a total of four keys with same userkey and verify // that reseek is invoked. ASSERT_OK(Put(1, "a", "four")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->four"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); @@ -610,7 +586,7 @@ TEST_F(DBIteratorTest, IterReseek) { // Insert another version of b and assert that reseek is not invoked ASSERT_OK(Put(1, "b", "btwo")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->btwo"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), @@ -625,7 +601,7 @@ TEST_F(DBIteratorTest, IterReseek) { // of b and 4 versions of a. ASSERT_OK(Put(1, "b", "bthree")); ASSERT_OK(Put(1, "b", "bfour")); - iter = db_->NewIterator(ReadOptions(), handles_[1]); + iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->bfour"); ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), @@ -639,7 +615,7 @@ TEST_F(DBIteratorTest, IterReseek) { delete iter; } -TEST_F(DBIteratorTest, IterSmallAndLargeMix) { +TEST_P(DBIteratorTest, IterSmallAndLargeMix) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); @@ -648,7 +624,7 @@ TEST_F(DBIteratorTest, IterSmallAndLargeMix) { ASSERT_OK(Put(1, "d", std::string(100000, 'd'))); ASSERT_OK(Put(1, "e", std::string(100000, 'e'))); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->va"); @@ -680,7 +656,7 @@ TEST_F(DBIteratorTest, IterSmallAndLargeMix) { } while (ChangeCompactOptions()); } -TEST_F(DBIteratorTest, IterMultiWithDelete) { +TEST_P(DBIteratorTest, IterMultiWithDelete) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "ka", "va")); @@ -689,7 +665,7 @@ TEST_F(DBIteratorTest, IterMultiWithDelete) { ASSERT_OK(Delete(1, "kb")); ASSERT_EQ("NOT_FOUND", Get(1, "kb")); - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); iter->Seek("kc"); ASSERT_EQ(IterStatus(iter), "kc->vc"); if (!CurrentOptions().merge_operator) { @@ -706,7 +682,7 @@ TEST_F(DBIteratorTest, IterMultiWithDelete) { } while (ChangeOptions()); } -TEST_F(DBIteratorTest, IterPrevMaxSkip) { +TEST_P(DBIteratorTest, IterPrevMaxSkip) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); for (int i = 0; i < 2; i++) { @@ -736,7 +712,7 @@ TEST_F(DBIteratorTest, IterPrevMaxSkip) { } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast)); } -TEST_F(DBIteratorTest, IterWithSnapshot) { +TEST_P(DBIteratorTest, IterWithSnapshot) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { @@ -750,7 +726,7 @@ 
TEST_F(DBIteratorTest, IterWithSnapshot) { const Snapshot* snapshot = db_->GetSnapshot(); ReadOptions options; options.snapshot = snapshot; - Iterator* iter = db_->NewIterator(options, handles_[1]); + Iterator* iter = NewIterator(options, handles_[1]); ASSERT_OK(Put(1, "key0", "val0")); // Put more values after the snapshot @@ -799,17 +775,16 @@ TEST_F(DBIteratorTest, IterWithSnapshot) { } db_->ReleaseSnapshot(snapshot); delete iter; - // skip as HashCuckooRep does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo)); + } while (ChangeOptions()); } -TEST_F(DBIteratorTest, IteratorPinsRef) { +TEST_P(DBIteratorTest, IteratorPinsRef) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "foo", "hello"); // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); + Iterator* iter = NewIterator(ReadOptions(), handles_[1]); // Write to force compactions Put(1, "foo", "newvalue1"); @@ -829,7 +804,9 @@ TEST_F(DBIteratorTest, IteratorPinsRef) { } while (ChangeCompactOptions()); } -TEST_F(DBIteratorTest, DBIteratorBoundTest) { +// SetOptions not defined in ROCKSDB LITE +#ifndef ROCKSDB_LITE +TEST_P(DBIteratorTest, DBIteratorBoundTest) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; @@ -846,7 +823,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { ReadOptions ro; ro.iterate_upper_bound = nullptr; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->Seek("foo"); @@ -883,7 +860,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { Slice prefix("foo2"); ro.iterate_upper_bound = &prefix; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->Seek("foo"); @@ -905,7 +882,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { Slice prefix("foo"); ro.iterate_upper_bound = &prefix; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->SeekToLast(); ASSERT_TRUE(iter->Valid()); @@ -913,9 +890,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { } // prefix is the first letter of the key - options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - - DestroyAndReopen(options); + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}})); ASSERT_OK(Put("a", "0")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foo1", "bar1")); @@ -929,7 +904,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { Slice upper_bound("g"); ro.iterate_upper_bound = &upper_bound; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->Seek("foo"); @@ -962,7 +937,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { ReadOptions ro; ro.iterate_upper_bound = nullptr; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->Seek("b"); ASSERT_TRUE(iter->Valid()); @@ -982,7 +957,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundTest) { Slice prefix("c"); ro.iterate_upper_bound = &prefix; - iter.reset(db_->NewIterator(ro)); + iter.reset(NewIterator(ro)); get_perf_context()->Reset(); @@ -1003,7 +978,62 @@ } } -TEST_F(DBIteratorTest, DBIteratorBoundOptimizationTest) { +TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("z", "0")); +
ASSERT_OK(Flush()); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Put("foo3", "bar3")); + ASSERT_OK(Put("foo4", "bar4")); + + { + std::string up_str = "foo5"; + Slice up(up_str); + ReadOptions ro; + ro.iterate_upper_bound = &up; + std::unique_ptr<Iterator> iter(NewIterator(ro)); + + iter->Seek("foo1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + uint64_t prev_block_cache_hit = + TestGetTickerCount(options, BLOCK_CACHE_HIT); + uint64_t prev_block_cache_miss = + TestGetTickerCount(options, BLOCK_CACHE_MISS); + + ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0); + + iter->Seek("foo4"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo4")), 0); + ASSERT_EQ(prev_block_cache_hit, + TestGetTickerCount(options, BLOCK_CACHE_HIT)); + ASSERT_EQ(prev_block_cache_miss, + TestGetTickerCount(options, BLOCK_CACHE_MISS)); + + iter->Seek("foo2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo2")), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo3")), 0); + ASSERT_EQ(prev_block_cache_hit, + TestGetTickerCount(options, BLOCK_CACHE_HIT)); + ASSERT_EQ(prev_block_cache_miss, + TestGetTickerCount(options, BLOCK_CACHE_MISS)); + } +} +#endif + +TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { int upper_bound_hits = 0; Options options = CurrentOptions(); rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -1031,7 +1061,7 @@ TEST_F(DBIteratorTest, DBIteratorBoundOptimizationTest) { ReadOptions ro; ro.iterate_upper_bound = &ub; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); iter->Seek("foo"); ASSERT_TRUE(iter->Valid()); @@ -1049,7 +1079,7 @@ } // TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily // return the biggest key which is smaller than the seek key.
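+// For example, after Seek("2") lands on "2", Prev() is expected to return "1",
+// the largest key smaller than the seek target; the merge-based test below
+// exercises exactly that pattern.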
-TEST_F(DBIteratorTest, PrevAfterAndNextAfterMerge) { +TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) { Options options; options.create_if_missing = true; options.merge_operator = MergeOperators::CreatePutOperator(); @@ -1062,7 +1092,7 @@ TEST_F(DBIteratorTest, PrevAfterAndNextAfterMerge) { db_->Merge(wopts, "2", "data2"); db_->Merge(wopts, "3", "data3"); - std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions())); + std::unique_ptr<Iterator> it(NewIterator(ReadOptions())); it->Seek("2"); ASSERT_TRUE(it->Valid()); @@ -1081,7 +1111,8 @@ TEST_F(DBIteratorTest, PrevAfterAndNextAfterMerge) { ASSERT_EQ("2", it->key().ToString()); } -TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { +class DBIteratorTestForPinnedData : public DBIteratorTest { + public: enum TestConfig { NORMAL, CLOSE_AND_OPEN, @@ -1089,19 +1120,19 @@ TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { FLUSH_EVERY_1000, MAX }; + DBIteratorTestForPinnedData() : DBIteratorTest() {} + void PinnedDataIteratorRandomized(TestConfig run_config) { + // Generate Random data + Random rnd(301); + + int puts = 100000; + int key_pool = static_cast<int>(puts * 0.7); + int key_size = 100; + int val_size = 1000; + int seeks_percentage = 20; // 20% of keys will be used to test seek() + int delete_percentage = 20; // 20% of keys will be deleted + int merge_percentage = 20; // 20% of keys will be added using Merge() - // Generate Random data - Random rnd(301); - - int puts = 100000; - int key_pool = static_cast<int>(puts * 0.7); - int key_size = 100; - int val_size = 1000; - int seeks_percentage = 20; // 20% of keys will be used to test seek() - int delete_percentage = 20; // 20% of keys will be deleted - int merge_percentage = 20; // 20% of keys will be added using Merge() - - for (int run_config = 0; run_config < TestConfig::MAX; run_config++) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; table_options.use_delta_encoding = false; @@ -1157,7 +1188,7 @@ TEST_F(DBIteratorTest, PinnedDataIteratorRandomized) { ReadOptions ro; ro.pin_data = true; - auto iter = db_->NewIterator(ro); + auto iter = NewIterator(ro); { // Test Seek to random keys @@ -1246,11 +1277,28 @@ } delete iter; - } +} +}; + +TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) { + PinnedDataIteratorRandomized(TestConfig::NORMAL); +} + +TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCloseAndOpen) { + PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN); +} + +TEST_P(DBIteratorTestForPinnedData, + PinnedDataIteratorRandomizedCompactBeforeRead) { + PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ); +} + +TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) { + PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000); } #ifndef ROCKSDB_LITE -TEST_F(DBIteratorTest, PinnedDataIteratorMultipleFiles) { +TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; table_options.use_delta_encoding = false; @@ -1299,7 +1347,7 @@ TEST_F(DBIteratorTest, PinnedDataIteratorMultipleFiles) { ReadOptions ro; ro.pin_data = true; - auto iter = db_->NewIterator(ro); + auto iter = NewIterator(ro); std::vector<std::pair<Slice, std::string>> results; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1321,7 +1369,7 @@ } #endif -TEST_F(DBIteratorTest, PinnedDataIteratorMergeOperator) { +TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) { Options options =
CurrentOptions(); BlockBasedTableOptions table_options; table_options.use_delta_encoding = false; @@ -1354,7 +1402,7 @@ TEST_F(DBIteratorTest, PinnedDataIteratorMergeOperator) { ReadOptions ro; ro.pin_data = true; - auto iter = db_->NewIterator(ro); + auto iter = NewIterator(ro); std::vector<std::pair<Slice, std::string>> results; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1381,7 +1429,7 @@ TEST_F(DBIteratorTest, PinnedDataIteratorMergeOperator) { delete iter; } -TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { +TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; table_options.use_delta_encoding = false; @@ -1401,7 +1449,7 @@ TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { ReadOptions ro; ro.pin_data = true; - auto iter = db_->NewIterator(ro); + auto iter = NewIterator(ro); // Delete 50% of the keys and update the other 50% for (auto& kv : true_data) { @@ -1431,7 +1479,27 @@ TEST_F(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { delete iter; } -TEST_F(DBIteratorTest, IterSeekForPrevCrossingFiles) { +class SliceTransformLimitedDomainGeneric : public SliceTransform { + const char* Name() const override { + return "SliceTransformLimitedDomainGeneric"; + } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 1); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 1; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == 1; + } +}; + +TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) { Options options = CurrentOptions(); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.disable_auto_compactions = true; @@ -1460,7 +1528,62 @@ MoveFilesToLevel(1); { ReadOptions ro; - Iterator* iter = db_->NewIterator(ro); + Iterator* iter = NewIterator(ro); + + iter->SeekForPrev("a4"); + ASSERT_EQ(iter->key().ToString(), "a3"); + ASSERT_EQ(iter->value().ToString(), "va3"); + + iter->SeekForPrev("c2"); + ASSERT_EQ(iter->key().ToString(), "b3"); + iter->SeekForPrev("d3"); + ASSERT_EQ(iter->key().ToString(), "d2"); + iter->SeekForPrev("b5"); + ASSERT_EQ(iter->key().ToString(), "b4"); + delete iter; + } + + { + ReadOptions ro; + ro.prefix_same_as_start = true; + Iterator* iter = NewIterator(ro); + iter->SeekForPrev("c2"); + ASSERT_TRUE(!iter->Valid()); + delete iter; + } +} + +TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) { + Options options = CurrentOptions(); + options.prefix_extractor = + std::make_shared<SliceTransformLimitedDomainGeneric>(); + options.disable_auto_compactions = true; + // Enable prefix bloom for SST files + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("a1", "va1")); + ASSERT_OK(Put("a2", "va2")); + ASSERT_OK(Put("a3", "va3")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b1", "vb1")); + ASSERT_OK(Put("b2", "vb2")); + ASSERT_OK(Put("b3", "vb3")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("b4", "vb4")); + ASSERT_OK(Put("d1", "vd1")); + ASSERT_OK(Put("d2", "vd2")); + ASSERT_OK(Put("d4", "vd4")); + ASSERT_OK(Flush()); + + MoveFilesToLevel(1); + { + ReadOptions ro; + Iterator* iter = NewIterator(ro); iter->SeekForPrev("a4"); ASSERT_EQ(iter->key().ToString(), "a3"); @@ -1478,14 +1601,14 @@ TEST_F(DBIteratorTest,
IterSeekForPrevCrossingFiles) { { ReadOptions ro; ro.prefix_same_as_start = true; - Iterator* iter = db_->NewIterator(ro); + Iterator* iter = NewIterator(ro); iter->SeekForPrev("c2"); ASSERT_TRUE(!iter->Valid()); delete iter; } } -TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocks) { +TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; table_options.block_size = 1; // every block will contain one entry @@ -1527,7 +1650,7 @@ TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocks) { { ReadOptions ro; ro.fill_cache = false; - Iterator* iter = db_->NewIterator(ro); + Iterator* iter = NewIterator(ro); iter->SeekToLast(); ASSERT_EQ(iter->key().ToString(), "key5"); @@ -1553,7 +1676,7 @@ } } -TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { +TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreateStringAppendTESTOperator(); options.disable_auto_compactions = true; @@ -1629,7 +1752,7 @@ TEST_F(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { { ReadOptions ro; ro.fill_cache = false; - Iterator* iter = db_->NewIterator(ro); + Iterator* iter = NewIterator(ro); auto data_iter = true_data.rbegin(); for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { @@ -1645,7 +1768,7 @@ { ReadOptions ro; ro.fill_cache = false; - Iterator* iter = db_->NewIterator(ro); + Iterator* iter = NewIterator(ro); auto data_iter = true_data.rbegin(); int entries_right = 0; @@ -1700,7 +1823,7 @@ } } -TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { +TEST_P(DBIteratorTest, IteratorWithLocalStatistics) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(options); @@ -1721,7 +1844,7 @@ TEST_F(DBIteratorTest, IteratorWithLocalStatistics) { std::function<void()> reader_func_next = [&]() { SetPerfLevel(kEnableCount); get_perf_context()->Reset(); - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = NewIterator(ReadOptions()); iter->SeekToFirst(); // Seek will bump ITER_BYTES_READ @@ -1748,7 +1871,7 @@ std::function<void()> reader_func_prev = [&]() { SetPerfLevel(kEnableCount); - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = NewIterator(ReadOptions()); iter->SeekToLast(); // Seek will bump ITER_BYTES_READ @@ -1794,7 +1917,7 @@ } -TEST_F(DBIteratorTest, ReadAhead) { +TEST_P(DBIteratorTest, ReadAhead) { Options options; env_->count_random_reads_ = true; options.env = env_; @@ -1831,26 +1954,30 @@ env_->random_read_bytes_counter_ = 0; options.statistics->setTickerCount(NO_FILE_OPENS, 0); ReadOptions read_options; - auto* iter = db_->NewIterator(read_options); + auto* iter = NewIterator(read_options); iter->SeekToFirst(); int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS); size_t bytes_read = env_->random_read_bytes_counter_; delete iter; + int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES); env_->random_read_bytes_counter_ = 0; options.statistics->setTickerCount(NO_FILE_OPENS, 0); read_options.readahead_size = 1024 * 10; - iter = db_->NewIterator(read_options); + iter = NewIterator(read_options); iter->SeekToFirst(); int64_t
num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS); size_t bytes_read_readahead = env_->random_read_bytes_counter_; delete iter; + int64_t num_file_closes_readahead = + TestGetTickerCount(options, NO_FILE_CLOSES); ASSERT_EQ(num_file_opens + 3, num_file_opens_readahead); + ASSERT_EQ(num_file_closes + 3, num_file_closes_readahead); ASSERT_GT(bytes_read_readahead, bytes_read); ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3); // Verify correctness. - iter = db_->NewIterator(read_options); + iter = NewIterator(read_options); int count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(value, iter->value()); @@ -1867,7 +1994,7 @@ // Insert a key, create a snapshot iterator, overwrite key lots of times, // seek to a smaller key. Expect DBIter to fall back to a seek instead of // going through all the overwrites linearly. -TEST_F(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { +TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; @@ -1882,7 +2009,7 @@ TEST_F(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { // Create iterator. ReadOptions ro; - std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + std::unique_ptr<Iterator> iter(NewIterator(ro)); // Insert a lot. for (int i = 0; i < 100; ++i) { @@ -1920,10 +2047,10 @@ NUMBER_OF_RESEEKS_IN_ITERATION)); } -TEST_F(DBIteratorTest, Refresh) { +TEST_P(DBIteratorTest, Refresh) { ASSERT_OK(Put("x", "y")); - std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions())); + std::unique_ptr<Iterator> iter(NewIterator(ReadOptions())); iter->Seek(Slice("a")); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key().compare(Slice("x")), 0); @@ -1979,6 +2106,516 @@ iter.reset(); } +TEST_P(DBIteratorTest, RefreshWithSnapshot) { + ASSERT_OK(Put("x", "y")); + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions options; + options.snapshot = snapshot; + Iterator* iter = NewIterator(options); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + ASSERT_OK(Put("c", "d")); + + iter->Seek(Slice("a")); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("x")), 0); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + Status s; + s = iter->Refresh(); + ASSERT_TRUE(s.IsNotSupported()); + db_->ReleaseSnapshot(snapshot); + delete iter; +} + +TEST_P(DBIteratorTest, CreationFailure) { + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) { + *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status"); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Iterator* iter = NewIterator(ReadOptions()); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; +} + +TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + DestroyAndReopen(options); + + // write a bunch of kvs to the database.
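+  // "y".."z" sort above the upper bound "x" used below; after Seek("foo"),
+  // changing direction with Prev() must still land on "bar", the next visible
+  // key under the bound.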
+ ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("y", "1")); + ASSERT_OK(Put("y1", "1")); + ASSERT_OK(Put("y2", "1")); + ASSERT_OK(Put("y3", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Put("foo", "1")); + + std::string upper_bound = "x"; + Slice ub_slice(upper_bound); + ReadOptions ro; + ro.iterate_upper_bound = &ub_slice; + ro.max_skippable_internal_keys = 1000; + + Iterator* iter = NewIterator(ro); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bar", iter->key().ToString()); + + delete iter; +} + +TEST_P(DBIteratorTest, TableFilter) { + ASSERT_OK(Put("a", "1")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Put("c", "3")); + dbfull()->Flush(FlushOptions()); + ASSERT_OK(Put("d", "4")); + ASSERT_OK(Put("e", "5")); + ASSERT_OK(Put("f", "6")); + dbfull()->Flush(FlushOptions()); + + // Ensure the table_filter callback is called once for each table. + { + std::set unseen{1, 2, 3}; + ReadOptions opts; + opts.table_filter = [&](const TableProperties& props) { + auto it = unseen.find(props.num_entries); + if (it == unseen.end()) { + ADD_FAILURE() << "saw table properties with an unexpected " + << props.num_entries << " entries"; + } else { + unseen.erase(it); + } + return true; + }; + auto iter = NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "b->2"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "c->3"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(unseen.empty()); + delete iter; + } + + // Ensure returning false in the table_filter hides the keys from that table + // during iteration. + { + ReadOptions opts; + opts.table_filter = [](const TableProperties& props) { + return props.num_entries != 2; + }; + auto iter = NewIterator(opts); + iter->SeekToFirst(); + ASSERT_EQ(IterStatus(iter), "a->1"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "d->4"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "e->5"); + iter->Next(); + ASSERT_EQ(IterStatus(iter), "f->6"); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + delete iter; + } +} + +TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) { + Options options = CurrentOptions(); + options.max_sequential_skip_in_iterations = 3; + DestroyAndReopen(options); + + // write a bunch of kvs to the database. 
+ ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("y", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("z", "1")); + ASSERT_OK(Put("bar", "1")); + ASSERT_OK(Put("foo", "1")); + ASSERT_OK(Put("foo", "2")); + + ASSERT_OK(Put("foo", "3")); + ASSERT_OK(Put("foo", "4")); + ASSERT_OK(Put("foo", "5")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put("foo", "6")); + + std::string upper_bound = "x"; + Slice ub_slice(upper_bound); + ReadOptions ro; + ro.snapshot = snapshot; + ro.iterate_upper_bound = &ub_slice; + + Iterator* iter = NewIterator(ro); + iter->SeekForPrev("goo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + iter->Prev(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_P(DBIteratorTest, SkipStatistics) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + int skip_count = 0; + + // write a bunch of kvs to the database. + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Put("b", "1")); + ASSERT_OK(Put("c", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("d", "1")); + ASSERT_OK(Put("e", "1")); + ASSERT_OK(Put("f", "1")); + ASSERT_OK(Put("a", "2")); + ASSERT_OK(Put("b", "2")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("d")); + ASSERT_OK(Delete("e")); + ASSERT_OK(Delete("f")); + + Iterator* iter = NewIterator(ReadOptions()); + int count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ReadOptions()); + count = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 3); + delete iter; + skip_count += 8; // Same as above, but in reverse order + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + ASSERT_OK(Put("aa", "1")); + ASSERT_OK(Put("ab", "1")); + ASSERT_OK(Put("ac", "1")); + ASSERT_OK(Put("ad", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(Delete("ab")); + ASSERT_OK(Delete("ac")); + ASSERT_OK(Delete("ad")); + + ReadOptions ro; + Slice prefix("b"); + ro.iterate_upper_bound = &prefix; + + iter = NewIterator(ro); + count = 0; + for(iter->Seek("aa"); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 1); + delete iter; + skip_count += 6; // 3 deletes + 3 original keys + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); + + iter = NewIterator(ro); + count = 0; + for(iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_OK(iter->status()); + count++; + } + ASSERT_EQ(count, 2); + delete iter; + // 3 deletes + 3 original keys + lower sequence of "a" + skip_count += 7; + ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP)); +} + +TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + ReadOptions ropts; + ropts.max_skippable_internal_keys = 2; + + Put("1", "val_1"); + // Add more tombstones than max_skippable_internal_keys so that Next() fails. 
+ Delete("2"); + Delete("3"); + Delete("4"); + Delete("5"); + Put("6", "val_6"); + + std::unique_ptr iter(NewIterator(ropts)); + iter->SeekToFirst(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "1"); + ASSERT_EQ(iter->value().ToString(), "val_1"); + + // This should fail as incomplete due to too many non-visible internal keys on + // the way to the next valid user key. + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsIncomplete()); + + // Get the internal key at which Next() failed. + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value)); + ASSERT_EQ("4", prop_value); + + // Create a new iterator to seek to the internal key. + std::unique_ptr iter2(NewIterator(ropts)); + iter2->Seek(prop_value); + ASSERT_TRUE(iter2->Valid()); + ASSERT_OK(iter2->status()); + + ASSERT_EQ(iter2->key().ToString(), "6"); + ASSERT_EQ(iter2->value().ToString(), "val_6"); +} + +// Reproduces a former bug where iterator would skip some records when DBIter +// re-seeks subiterator with Incomplete status. +TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) { + Options options = CurrentOptions(); + BlockBasedTableOptions table_options; + // Make sure the sst file has more than one block. + table_options.flush_block_policy_factory = + std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + // Two records in sst file, each in its own block. + Put("b", ""); + Put("d", ""); + Flush(); + + // Create a nonblocking iterator before writing to memtable. + ReadOptions ropt; + ropt.read_tier = kBlockCacheTier; + std::unique_ptr iter(NewIterator(ropt)); + + // Overwrite a key in memtable many times to hit + // max_sequential_skip_in_iterations (which is 8 by default). + for (int i = 0; i < 20; ++i) { + Put("c", ""); + } + + // Load the second block in sst file into the block cache. + { + std::unique_ptr iter2(NewIterator(ReadOptions())); + iter2->Seek("d"); + } + + // Finally seek the nonblocking iterator. + iter->Seek("a"); + // With the bug, the status used to be OK, and the iterator used to point to + // "d". 
+ EXPECT_TRUE(iter->status().IsIncomplete()); +} + +TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) { + Put("a", ""); + Put("b", ""); + Flush(); + + ReadOptions ropt; + Slice ub = "b"; + ropt.iterate_upper_bound = &ub; + + std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt)); + it->SeekForPrev("a"); + ASSERT_TRUE(it->Valid()); + ASSERT_OK(it->status()); + ASSERT_EQ("a", it->key().ToString()); + it->Next(); + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + it->SeekForPrev("a"); + ASSERT_OK(it->status()); + + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("a", it->key().ToString()); +} + +INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest, + testing::Values(true, false)); + +// Tests how DBIter works with ReadCallback +class DBIteratorWithReadCallbackTest : public DBIteratorTest {}; + +TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { + class TestReadCallback : public ReadCallback { + public: + explicit TestReadCallback(SequenceNumber _max_visible_seq) + : ReadCallback(_max_visible_seq) {} + + bool IsVisibleFullCheck(SequenceNumber seq) override { + return seq <= max_visible_seq_; + } + }; + + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put("foo", "v2")); + ASSERT_OK(Put("foo", "v3")); + ASSERT_OK(Put("a", "va")); + ASSERT_OK(Put("z", "vz")); + SequenceNumber seq1 = db_->GetLatestSequenceNumber(); + TestReadCallback callback1(seq1); + ASSERT_OK(Put("foo", "v4")); + ASSERT_OK(Put("foo", "v5")); + ASSERT_OK(Put("bar", "v7")); + + SequenceNumber seq2 = db_->GetLatestSequenceNumber(); + auto* cfd = + reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily()) + ->cfd(); + // The iterator is supposed to see data before seq1. + Iterator* iter = + dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1); + + // Seek + // The latest value of "foo" before seq1 is "v3" + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + // "bar" is not visible to the iterator. It will move on to the next key + // "foo". + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + + // Next + // Seek to "a" + iter->Seek("a"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("va", iter->value()); + // "bar" is not visible to the iterator. It will move on to the next key + // "foo". + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + + // Prev + // Seek to "z" + iter->Seek("z"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("vz", iter->value()); + // The previous key is "foo", which is visible to the iterator. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + // "bar" is not visible to the iterator. It will move on to the next key "a". + iter->Prev(); // skipping "bar" + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("a", iter->key()); + ASSERT_EQ("va", iter->value()); + + // SeekForPrev + // The previous key is "foo", which is visible to the iterator. + iter->SeekForPrev("y"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v3", iter->value()); + // "bar" is not visible to the iterator. It will move on to the next key "a".
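+  // Unlike the forward Seek("bar") above, SeekForPrev("bar") walks backwards
+  // past the invisible "bar" versions.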
+ iter->SeekForPrev("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("a", iter->key()); + ASSERT_EQ("va", iter->value()); + + delete iter; + + // Prev beyond max_sequential_skip_in_iterations + uint64_t num_versions = + CurrentOptions().max_sequential_skip_in_iterations + 10; + for (uint64_t i = 0; i < num_versions; i++) { + ASSERT_OK(Put("bar", ToString(i))); + } + SequenceNumber seq3 = db_->GetLatestSequenceNumber(); + TestReadCallback callback2(seq3); + ASSERT_OK(Put("bar", "v8")); + SequenceNumber seq4 = db_->GetLatestSequenceNumber(); + + // The iterator is suppose to see data before seq3. + iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2); + // Seek to "z", which is visible. + iter->Seek("z"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("vz", iter->value()); + // Previous key is "foo" and the last value "v5" is visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("foo", iter->key()); + ASSERT_EQ("v5", iter->value()); + // Since the number of values of "bar" is more than + // max_sequential_skip_in_iterations, Prev() will ultimately fallback to + // seek in forward direction. Here we test the fallback seek is correct. + // The last visible value should be (num_versions - 1), as "v8" is not + // visible. + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ("bar", iter->key()); + ASSERT_EQ(ToString(num_versions - 1), iter->value()); + + delete iter; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_log_iter_test.cc b/thirdparty/rocksdb/db/db_log_iter_test.cc index e7f94c4c42..45642bc7ae 100644 --- a/thirdparty/rocksdb/db/db_log_iter_test.cc +++ b/thirdparty/rocksdb/db/db_log_iter_test.cc @@ -23,7 +23,7 @@ class DBTestXactLogIterator : public DBTestBase { std::unique_ptr OpenTransactionLogIter( const SequenceNumber seq) { - unique_ptr iter; + std::unique_ptr iter; Status status = dbfull()->GetUpdatesSince(seq, &iter); EXPECT_OK(status); EXPECT_TRUE(iter->Valid()); @@ -249,22 +249,20 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { auto res = OpenTransactionLogIter(0)->GetBatch(); struct Handler : public WriteBatch::Handler { std::string seen; - virtual Status PutCF(uint32_t cf, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " + ToString(value.size()) + ")"; return Status::OK(); } - virtual Status MergeCF(uint32_t cf, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " + ToString(value.size()) + ")"; return Status::OK(); } - virtual void LogData(const Slice& blob) override { + void LogData(const Slice& blob) override { seen += "LogData(" + blob.ToString() + ")"; } - virtual Status DeleteCF(uint32_t cf, const Slice& key) override { + Status DeleteCF(uint32_t cf, const Slice& key) override { seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")"; return Status::OK(); } @@ -289,6 +287,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else + (void) argc; + (void) argv; return 0; #endif } diff --git a/thirdparty/rocksdb/db/db_memtable_test.cc b/thirdparty/rocksdb/db/db_memtable_test.cc index 63d274f6ab..294d0f581b 100644 --- 
a/thirdparty/rocksdb/db/db_memtable_test.cc +++ b/thirdparty/rocksdb/db/db_memtable_test.cc @@ -8,6 +8,7 @@ #include "db/db_test_util.h" #include "db/memtable.h" +#include "db/range_del_aggregator.h" #include "port/stack_trace.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice_transform.h" @@ -24,37 +25,32 @@ class MockMemTableRep : public MemTableRep { explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep) : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {} - virtual KeyHandle Allocate(const size_t len, char** buf) override { + KeyHandle Allocate(const size_t len, char** buf) override { return rep_->Allocate(len, buf); } - virtual void Insert(KeyHandle handle) override { - return rep_->Insert(handle); - } + void Insert(KeyHandle handle) override { rep_->Insert(handle); } - virtual void InsertWithHint(KeyHandle handle, void** hint) override { + void InsertWithHint(KeyHandle handle, void** hint) override { num_insert_with_hint_++; - ASSERT_NE(nullptr, hint); + EXPECT_NE(nullptr, hint); last_hint_in_ = *hint; rep_->InsertWithHint(handle, hint); last_hint_out_ = *hint; } - virtual bool Contains(const char* key) const override { - return rep_->Contains(key); - } + bool Contains(const char* key) const override { return rep_->Contains(key); } - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override { + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { rep_->Get(k, callback_args, callback_func); } - virtual size_t ApproximateMemoryUsage() override { + size_t ApproximateMemoryUsage() override { return rep_->ApproximateMemoryUsage(); } - virtual Iterator* GetIterator(Arena* arena) override { + Iterator* GetIterator(Arena* arena) override { return rep_->GetIterator(arena); } @@ -71,10 +67,10 @@ class MockMemTableRep : public MemTableRep { class MockMemTableRepFactory : public MemTableRepFactory { public: - virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp, - Allocator* allocator, - const SliceTransform* transform, - Logger* logger) override { + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp, + Allocator* allocator, + const SliceTransform* transform, + Logger* logger) override { SkipListFactory factory; MemTableRep* skiplist_rep = factory.CreateMemTableRep(cmp, allocator, transform, logger); @@ -82,16 +78,16 @@ class MockMemTableRepFactory : public MemTableRepFactory { return mock_rep_; } - virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp, - Allocator* allocator, - const SliceTransform* transform, - Logger* logger, - uint32_t column_family_id) override { + MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp, + Allocator* allocator, + const SliceTransform* transform, + Logger* logger, + uint32_t column_family_id) override { last_column_family_id_ = column_family_id; return CreateMemTableRep(cmp, allocator, transform, logger); } - virtual const char* Name() const override { return "MockMemTableRepFactory"; } + const char* Name() const override { return "MockMemTableRepFactory"; } MockMemTableRep* rep() { return mock_rep_; } @@ -107,9 +103,9 @@ class MockMemTableRepFactory : public MemTableRepFactory { class TestPrefixExtractor : public SliceTransform { public: - virtual const char* Name() const override { return "TestPrefixExtractor"; } + const char* Name() const override { return "TestPrefixExtractor"; } - virtual Slice Transform(const Slice& 
key) const override { + Slice Transform(const Slice& key) const override { const char* p = separator(key); if (p == nullptr) { return Slice(); @@ -117,11 +113,11 @@ class TestPrefixExtractor : public SliceTransform { return Slice(key.data(), p - key.data() + 1); } - virtual bool InDomain(const Slice& key) const override { + bool InDomain(const Slice& key) const override { return separator(key) != nullptr; } - virtual bool InRange(const Slice& key) const override { return false; } + bool InRange(const Slice& /*key*/) const override { return false; } private: const char* separator(const Slice& key) const { @@ -129,6 +125,85 @@ } }; +// Test that ::Add properly returns false when inserting duplicate keys +TEST_F(DBMemTableTest, DuplicateSeq) { + SequenceNumber seq = 123; + std::string value; + Status s; + MergeContext merge_context; + Options options; + InternalKeyComparator ikey_cmp(options.comparator); + ReadRangeDelAggregator range_del_agg(&ikey_cmp, + kMaxSequenceNumber /* upper_bound */); + + // Create a MemTable + InternalKeyComparator cmp(BytewiseComparator()); + auto factory = std::make_shared<MockMemTableRepFactory>(); + options.memtable_factory = factory; + ImmutableCFOptions ioptions(options); + WriteBufferManager wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + + // Write some keys and make sure it returns false on duplicates + bool res; + res = mem->Add(seq, kTypeValue, "key", "value2"); + ASSERT_TRUE(res); + res = mem->Add(seq, kTypeValue, "key", "value2"); + ASSERT_FALSE(res); + // Changing the type should still count as a duplicate key + res = mem->Add(seq, kTypeMerge, "key", "value2"); + ASSERT_FALSE(res); + // Changing the seq number will make the key fresh + res = mem->Add(seq + 1, kTypeMerge, "key", "value2"); + ASSERT_TRUE(res); + // Test with different types for duplicate keys + res = mem->Add(seq, kTypeDeletion, "key", ""); + ASSERT_FALSE(res); + res = mem->Add(seq, kTypeSingleDeletion, "key", ""); + ASSERT_FALSE(res); + + // Test the duplicate keys under stress + for (int i = 0; i < 10000; i++) { + bool insert_dup = i % 10 == 1; + if (!insert_dup) { + seq++; + } + res = mem->Add(seq, kTypeValue, "foo", "value" + ToString(seq)); + if (insert_dup) { + ASSERT_FALSE(res); + } else { + ASSERT_TRUE(res); + } + } + delete mem; + + // Test with InsertWithHint + options.memtable_insert_with_hint_prefix_extractor.reset( + new TestPrefixExtractor()); // which uses _ to extract the prefix + ioptions = ImmutableCFOptions(options); + mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + // Insert a duplicate key with _ in it + res = mem->Add(seq, kTypeValue, "key_1", "value"); + ASSERT_TRUE(res); + res = mem->Add(seq, kTypeValue, "key_1", "value"); + ASSERT_FALSE(res); + delete mem; + + // Test when InsertConcurrently will be invoked + options.allow_concurrent_memtable_write = true; + ioptions = ImmutableCFOptions(options); + mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + MemTablePostProcessInfo post_process_info; + res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); + ASSERT_TRUE(res); + res = mem->Add(seq, kTypeValue, "key", "value", true, &post_process_info); + ASSERT_FALSE(res); + delete mem; +} + TEST_F(DBMemTableTest, InsertWithHint) { Options options;
options.allow_concurrent_memtable_write = false; diff --git a/thirdparty/rocksdb/db/db_merge_operator_test.cc b/thirdparty/rocksdb/db/db_merge_operator_test.cc index de28619106..2b5e4a445e 100644 --- a/thirdparty/rocksdb/db/db_merge_operator_test.cc +++ b/thirdparty/rocksdb/db/db_merge_operator_test.cc @@ -8,16 +8,128 @@ #include "db/db_test_util.h" #include "db/forward_iterator.h" #include "port/stack_trace.h" +#include "rocksdb/merge_operator.h" #include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend2.h" namespace rocksdb { +class TestReadCallback : public ReadCallback { + public: + TestReadCallback(SnapshotChecker* snapshot_checker, + SequenceNumber snapshot_seq) + : ReadCallback(snapshot_seq), + snapshot_checker_(snapshot_checker), + snapshot_seq_(snapshot_seq) {} + + bool IsVisibleFullCheck(SequenceNumber seq) override { + return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) == + SnapshotCheckerResult::kInSnapshot; + } + + private: + SnapshotChecker* snapshot_checker_; + SequenceNumber snapshot_seq_; +}; + // Test merge operator functionality. class DBMergeOperatorTest : public DBTestBase { public: DBMergeOperatorTest() : DBTestBase("/db_merge_operator_test") {} + + std::string GetWithReadCallback(SnapshotChecker* snapshot_checker, + const Slice& key, + const Snapshot* snapshot = nullptr) { + SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber() + : snapshot->GetSequenceNumber(); + TestReadCallback read_callback(snapshot_checker, seq); + ReadOptions read_opt; + read_opt.snapshot = snapshot; + PinnableSlice value; + Status s = + dbfull()->GetImpl(read_opt, db_->DefaultColumnFamily(), key, &value, + nullptr /*value_found*/, &read_callback); + if (!s.ok()) { + return s.ToString(); + } + return value.ToString(); + } }; +TEST_F(DBMergeOperatorTest, LimitMergeOperands) { + class LimitedStringAppendMergeOp : public StringAppendTESTOperator { + public: + LimitedStringAppendMergeOp(int limit, char delim) + : StringAppendTESTOperator(delim), limit_(limit) {} + + const char* Name() const override { + return "DBMergeOperatorTest::LimitedStringAppendMergeOp"; + } + + bool ShouldMerge(const std::vector<Slice>& operands) const override { + if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) { + return true; + } + return false; + } + + private: + size_t limit_ = 0; + }; + + Options options; + options.create_if_missing = true; + // Use only the latest two merge operands. + options.merge_operator = + std::make_shared<LimitedStringAppendMergeOp>(2, ','); + options.env = env_; + Reopen(options); + // All K1 values are in memtable. + ASSERT_OK(Merge("k1", "a")); + ASSERT_OK(Merge("k1", "b")); + ASSERT_OK(Merge("k1", "c")); + ASSERT_OK(Merge("k1", "d")); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).ok()); + // Make sure that only the latest two merge operands are used. If this was + // not the case the value would be "a,b,c,d". + ASSERT_EQ(value, "c,d"); + + // All K2 values are flushed to L0 into a single file. + ASSERT_OK(Merge("k2", "a")); + ASSERT_OK(Merge("k2", "b")); + ASSERT_OK(Merge("k2", "c")); + ASSERT_OK(Merge("k2", "d")); + ASSERT_OK(Flush()); + ASSERT_TRUE(db_->Get(ReadOptions(), "k2", &value).ok()); + ASSERT_EQ(value, "c,d"); + + // All K3 values are flushed and are in different files.
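+ // (Operands are gathered newest-to-oldest during the read; once ShouldMerge() + // above sees the two newest operands it returns true, so the lookup can stop + // without having to visit the older files at all.)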
+ ASSERT_OK(Merge("k3", "ab")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "bc")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "cd")); + ASSERT_OK(Flush()); + ASSERT_OK(Merge("k3", "de")); + ASSERT_TRUE(db_->Get(ReadOptions(), "k3", &value).ok()); + ASSERT_EQ(value, "cd,de"); + + // All K4 values are in different levels + ASSERT_OK(Merge("k4", "ab")); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + ASSERT_OK(Merge("k4", "bc")); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Merge("k4", "cd")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(Merge("k4", "de")); + ASSERT_TRUE(db_->Get(ReadOptions(), "k4", &value).ok()); + ASSERT_EQ(value, "cd,de"); +} + TEST_F(DBMergeOperatorTest, MergeErrorOnRead) { Options options; options.create_if_missing = true; @@ -57,16 +169,33 @@ TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { ASSERT_OK(Merge("k1", "v1")); ASSERT_OK(Merge("k1", "corrupted")); ASSERT_OK(Put("k2", "v2")); - VerifyDBFromMap({{"k1", ""}, {"k2", "v2"}}, nullptr, false, - {{"k1", Status::Corruption()}}); + auto* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k2"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}}); DestroyAndReopen(options); ASSERT_OK(Merge("k1", "v1")); ASSERT_OK(Put("k2", "v2")); ASSERT_OK(Merge("k2", "corrupted")); - VerifyDBFromMap({{"k1", "v1"}, {"k2", ""}}, nullptr, false, - {{"k2", Status::Corruption()}}); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k1"); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + delete iter; VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}}); } @@ -158,7 +287,7 @@ TEST_P(MergeOperatorPinningTest, Randomized) { Random rnd(301); std::map true_data; - const int kTotalMerges = 10000; + const int kTotalMerges = 5000; // Every key gets ~10 operands const int kKeyRange = kTotalMerges / 10; const int kOperandSize = 20; @@ -205,8 +334,7 @@ TEST_P(MergeOperatorPinningTest, Randomized) { VerifyDBFromMap(true_data); - // Skip HashCuckoo since it does not support merge operators - } while (ChangeOptions(kSkipMergePut | kSkipHashCuckoo)); + } while (ChangeOptions(kSkipMergePut)); } class MergeOperatorHook : public MergeOperator { @@ -214,15 +342,15 @@ class MergeOperatorHook : public MergeOperator { explicit MergeOperatorHook(std::shared_ptr _merge_op) : merge_op_(_merge_op) {} - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { before_merge_(); bool res = merge_op_->FullMergeV2(merge_in, merge_out); after_merge_(); return res; } - virtual const char* Name() const override { return merge_op_->Name(); } + const char* Name() const override { return merge_op_->Name(); } std::shared_ptr merge_op_; std::function before_merge_ = []() {}; @@ -356,8 +484,159 @@ TEST_P(MergeOperatorPinningTest, TailingIterator) { writer_thread.join(); reader_thread.join(); } + +TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { + Options options = CurrentOptions(); + options.merge_operator = 
MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + // Overview of the test: + // * There are two merge operands for the same key: one in an sst file, + // another in a memtable. + // * Seek a tailing iterator to this key. + // * As part of the seek, the iterator will: + // (a) first visit the operand in the memtable and tell ForwardIterator + // to pin this operand, then + // (b) move on to the operand in the sst file, then pass both operands + // to merge operator. + // * The memtable may get flushed and unreferenced by another thread between + // (a) and (b). The test simulates it by flushing the memtable inside a + // SyncPoint callback located between (a) and (b). + // * In this case it's ForwardIterator's responsibility to keep the memtable + // pinned until (b) is complete. There used to be a bug causing + // ForwardIterator to not pin it in some circumstances. This test + // reproduces it. + + db_->Merge(WriteOptions(), "key", "sst"); + db_->Flush(FlushOptions()); // Switch to SuperVersion A + db_->Merge(WriteOptions(), "key", "memtable"); + + // Pin SuperVersion A + std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions())); + + bool pushed_first_operand = false; + bool stepped_to_next_operand = false; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) { + EXPECT_FALSE(pushed_first_operand); + pushed_first_operand = true; + db_->Flush(FlushOptions()); // Switch to SuperVersion B + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) { + EXPECT_FALSE(stepped_to_next_operand); + stepped_to_next_operand = true; + someone_else.reset(); // Unpin SuperVersion A + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + ro.tailing = true; + std::unique_ptr<Iterator> iter(db_->NewIterator(ro)); + iter->Seek("key"); + + ASSERT_TRUE(iter->status().ok()); + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString()); + EXPECT_TRUE(pushed_first_operand); + EXPECT_TRUE(stepped_to_next_operand); +} #endif // ROCKSDB_LITE +TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + DestroyAndReopen(options); + + class TestSnapshotChecker : public SnapshotChecker { + public: + SnapshotCheckerResult CheckInSnapshot( + SequenceNumber seq, SequenceNumber snapshot_seq) const override { + return IsInSnapshot(seq, snapshot_seq) + ? SnapshotCheckerResult::kInSnapshot + : SnapshotCheckerResult::kNotInSnapshot; + } + + bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const { + switch (snapshot_seq) { + case 0: + return seq == 0; + case 1: + return seq <= 1; + case 2: + // seq = 2 not visible to snapshot with seq = 2 + return seq <= 1; + case 3: + return seq <= 3; + case 4: + // seq = 4 not visible to snapshot with seq = 4 + return seq <= 3; + default: + // seq >= 5 is uncommitted + return seq <= 4; + }; + } + }; + TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker(); + dbfull()->SetSnapshotChecker(snapshot_checker); + + std::string value; + ASSERT_OK(Merge("foo", "v1")); + ASSERT_EQ(1, db_->GetLatestSequenceNumber()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + ASSERT_OK(Merge("foo", "v2")); + ASSERT_EQ(2, db_->GetLatestSequenceNumber()); + // v2 is not visible to latest snapshot, which has seq = 2.
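+ // (TestSnapshotChecker above deliberately reports seq 2 and seq 4 as not yet + // committed, mimicking a transactional setup in which a snapshot can exclude + // entries at its own sequence number.)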
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + // Take a snapshot with seq = 2. + const Snapshot* snapshot1 = db_->GetSnapshot(); + ASSERT_EQ(2, snapshot1->GetSequenceNumber()); + // v2 is not visible to snapshot1, which has seq = 2 + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + + // Verify flush doesn't alter the result. + ASSERT_OK(Flush()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo")); + + ASSERT_OK(Merge("foo", "v3")); + ASSERT_EQ(3, db_->GetLatestSequenceNumber()); + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + ASSERT_OK(Merge("foo", "v4")); + ASSERT_EQ(4, db_->GetLatestSequenceNumber()); + // v4 is not visible to latest snapshot, which has seq = 4. + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + const Snapshot* snapshot2 = db_->GetSnapshot(); + ASSERT_EQ(4, snapshot2->GetSequenceNumber()); + // v4 is not visible to snapshot2, which has seq = 4. + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + + // Verify flush doesn't alter the result. + ASSERT_OK(Flush()); + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo")); + + ASSERT_OK(Merge("foo", "v5")); + ASSERT_EQ(5, db_->GetLatestSequenceNumber()); + // v5 is uncommitted + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + // Full manual compaction. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Verify compaction doesn't alter the result. + ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1)); + ASSERT_EQ("v1,v2,v3", + GetWithReadCallback(snapshot_checker, "foo", snapshot2)); + ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo")); + + db_->ReleaseSnapshot(snapshot1); + db_->ReleaseSnapshot(snapshot2); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_options_test.cc b/thirdparty/rocksdb/db/db_options_test.cc index 243748f9fa..a7ecf12744 100644 --- a/thirdparty/rocksdb/db/db_options_test.cc +++ b/thirdparty/rocksdb/db/db_options_test.cc @@ -18,12 +18,15 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" +#include "rocksdb/stats_history.h" #include "util/random.h" #include "util/sync_point.h" #include "util/testutil.h" namespace rocksdb { +const int kMicrosInSec = 1000000; + class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("/db_options_test") {} @@ -117,6 +120,150 @@ TEST_F(DBOptionsTest, GetLatestCFOptions) { GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1]))); } +TEST_F(DBOptionsTest, SetBytesPerSync) { + const size_t kValueSize = 1024 * 1024; // 1MB + Options options; + options.create_if_missing = true; + options.bytes_per_sync = 1024 * 1024; + options.use_direct_reads = false; + options.write_buffer_size = 400 * kValueSize; + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env_; + Reopen(options); + int counter = 0; + int low_bytes_per_sync = 0; + int i = 0; + const std::string kValue(kValueSize, 'v'); + ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync); + rocksdb::SyncPoint::GetInstance()->SetCallBack( "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { + counter++; + }); + + WriteOptions write_opts; + // should sync approximately 40MB/1MB ~= 40 times. + for (i = 0; i < 40; i++) { + Put(Key(i), kValue, write_opts); + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + low_bytes_per_sync = counter; + ASSERT_GT(low_bytes_per_sync, 35); + ASSERT_LT(low_bytes_per_sync, 45); + + counter = 0; + // 8388608 = 8 * 1024 * 1024 + ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}})); + ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync); + // should sync approximately 40MB*2/8MB ~= 10 times. + // Data will be 40*2MB because of the previous Puts too. + for (i = 0; i < 40; i++) { + Put(Key(i), kValue, write_opts); + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_GT(counter, 5); + ASSERT_LT(counter, 15); + + // Redundant assert. But leaving it here just to get the point across that + // low_bytes_per_sync > counter. + ASSERT_GT(low_bytes_per_sync, counter); +} + +TEST_F(DBOptionsTest, SetWalBytesPerSync) { + const size_t kValueSize = 1024 * 1024 * 3; + Options options; + options.create_if_missing = true; + options.wal_bytes_per_sync = 512; + options.write_buffer_size = 100 * kValueSize; + options.disable_auto_compactions = true; + options.compression = kNoCompression; + options.env = env_; + Reopen(options); + ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync); + int counter = 0; + int low_bytes_per_sync = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { + counter++; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + const std::string kValue(kValueSize, 'v'); + int i = 0; + for (; i < 10; i++) { + Put(Key(i), kValue); + } + // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file since + // it's empty and will not get the new wal_bytes_per_sync value.
+ low_bytes_per_sync = counter; + // 5242880 = 1024 * 1024 * 5 + ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}})); + ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync); + counter = 0; + i = 0; + for (; i < 10; i++) { + Put(Key(i), kValue); + } + ASSERT_GT(counter, 0); + ASSERT_GT(low_bytes_per_sync, 0); + ASSERT_GT(low_bytes_per_sync, counter); +} + +TEST_F(DBOptionsTest, WritableFileMaxBufferSize) { + Options options; + options.create_if_missing = true; + options.writable_file_max_buffer_size = 1024 * 1024; + options.level0_file_num_compaction_trigger = 3; + options.max_manifest_file_size = 1; + options.env = env_; + int buffer_size = 1024 * 1024; + Reopen(options); + ASSERT_EQ(buffer_size, + dbfull()->GetDBOptions().writable_file_max_buffer_size); + + std::atomic<int> match_cnt(0); + std::atomic<int> unmatch_cnt(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::WritableFileWriter:0", [&](void* arg) { + int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg)); + if (value == buffer_size) { + match_cnt++; + } else { + unmatch_cnt++; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + int i = 0; + for (; i < 3; i++) { + ASSERT_OK(Put("foo", ToString(i))); + ASSERT_OK(Put("bar", ToString(i))); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(unmatch_cnt, 0); + ASSERT_GE(match_cnt, 11); + + ASSERT_OK( + dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}})); + buffer_size = 512 * 1024; + match_cnt = 0; + unmatch_cnt = 0; // SetDBOptions() will create a WritableFileWriter + + ASSERT_EQ(buffer_size, + dbfull()->GetDBOptions().writable_file_max_buffer_size); + i = 0; + for (; i < 3; i++) { + ASSERT_OK(Put("foo", ToString(i))); + ASSERT_OK(Put("bar", ToString(i))); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(unmatch_cnt, 0); + ASSERT_GE(match_cnt, 11); +} + TEST_F(DBOptionsTest, SetOptionsAndReopen) { Random rnd(1044); auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd); @@ -364,10 +511,290 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { for (int i = 0; i < 20; i++) { int num = rand() % 5000 + 1; - ASSERT_OK(dbfull()->SetDBOptions( - {{"stats_dump_period_sec", std::to_string(num)}})); + ASSERT_OK( + dbfull()->SetDBOptions({{"stats_dump_period_sec", ToString(num)}})); ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec); } + Close(); +} + +TEST_F(DBOptionsTest, RunStatsDumpPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_dump_period_sec = 5; + std::unique_ptr<rocksdb::MockTimeEnv> mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + int counter = 0; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast<uint64_t*>(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast<uint64_t*>(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DumpStats:1", [&](void* /*arg*/) { + counter++; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_dump_period_sec); + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test canceling the job
through SetOptions + ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "0"}})); + int old_val = counter; + for (int i = 6; i < 20; ++i) { + dbfull()->TEST_WaitForDumpStatsRun([&] { mock_env->set_current_time(i); }); + } + ASSERT_EQ(counter, old_val); + Close(); +} + +// Test persistent stats background thread scheduling and cancelling +TEST_F(DBOptionsTest, StatsPersistScheduling) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + std::unique_ptr<rocksdb::MockTimeEnv> mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast<uint64_t*>(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast<uint64_t*>(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + + // Test canceling the job through SetOptions + ASSERT_TRUE(dbfull()->TEST_IsPersistentStatsEnabled()); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + ASSERT_FALSE(dbfull()->TEST_IsPersistentStatsEnabled()); + Close(); +} + +// Test enabling persistent stats for the first time +TEST_F(DBOptionsTest, PersistentStatsFreshInstall) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 0; + std::unique_ptr<rocksdb::MockTimeEnv> mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast<uint64_t*>(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast<uint64_t*>(arg) = mock_env->RealNowMicros() + 1000; + } + }); +#endif // OS_MACOSX && !NDEBUG + int counter = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PersistStats:Entry", [&](void* /*arg*/) { counter++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + Reopen(options); + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "5"}})); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + ASSERT_GE(counter, 1); + Close(); +} + +TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.env = env_; + Reopen(options); + ASSERT_EQ(5, dbfull()->GetDBOptions().stats_persist_period_sec); + + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}})); + ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); + ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}})); +
ASSERT_EQ(12345, dbfull()->GetDBOptions().stats_persist_period_sec); +} + +TEST_F(DBOptionsTest, GetStatsHistory) { + Options options; + options.create_if_missing = true; + options.stats_persist_period_sec = 5; + options.statistics = rocksdb::CreateDBStatistics(); + std::unique_ptr<rocksdb::MockTimeEnv> mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast<uint64_t*>(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast<uint64_t*>(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + int mock_time = 1; + // Wait for stats persist to finish + dbfull()->TEST_WaitForPersistStatsRun([&] { mock_env->set_current_time(5); }); + std::unique_ptr<StatsHistoryIterator> stats_iter; + db_->GetStatsHistory(0, 6 * kMicrosInSec, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + // disabled stats snapshots + ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "0"}})); + size_t stats_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + ASSERT_GT(stats_count, 0); + // Wait a bit and verify no more stats are found + for (mock_time = 6; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_new = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + stats_count_new += stats_iter->GetStatsMap().size(); + } + ASSERT_EQ(stats_count_new, stats_count); + Close(); +} + +TEST_F(DBOptionsTest, InMemoryStatsHistoryPurging) { + Options options; + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.stats_persist_period_sec = 1; + std::unique_ptr<rocksdb::MockTimeEnv> mock_env; + mock_env.reset(new rocksdb::MockTimeEnv(env_)); + mock_env->set_current_time(0); // in seconds + options.env = mock_env.get(); +#if defined(OS_MACOSX) && !defined(NDEBUG) + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + uint64_t time_us = *reinterpret_cast<uint64_t*>(arg); + if (time_us < mock_env->RealNowMicros()) { + *reinterpret_cast<uint64_t*>(arg) = mock_env->RealNowMicros() + 1000; + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + CreateColumnFamilies({"pikachu"}, options); + ASSERT_OK(Put("foo", "bar")); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + // some random operation to populate statistics + ASSERT_OK(Delete("foo")); + ASSERT_OK(Put("sol", "sol")); + ASSERT_OK(Put("epic", "epic")); + ASSERT_OK(Put("ltd", "ltd")); + ASSERT_EQ("sol", Get("sol")); + ASSERT_EQ("epic", Get("epic")); + ASSERT_EQ("ltd", Get("ltd")); + Iterator* iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid();
iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + ASSERT_OK(Delete("sol")); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + int mock_time = 1; + // Wait for stats persist to finish + for (; mock_time < 5; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + + // second round of ops + ASSERT_OK(Put("saigon", "saigon")); + ASSERT_OK(Put("noodle talk", "noodle talk")); + ASSERT_OK(Put("ping bistro", "ping bistro")); + iterator = db_->NewIterator(ReadOptions()); + for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { + ASSERT_TRUE(iterator->key() == iterator->value()); + } + delete iterator; + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + for (; mock_time < 10; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + std::unique_ptr stats_iter; + db_->GetStatsHistory(0, 10 * kMicrosInSec, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count = 0; + int slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count += stats_map.size(); + } + size_t stats_history_size = dbfull()->TEST_EstiamteStatsHistorySize(); + ASSERT_GE(slice_count, 9); + ASSERT_GE(stats_history_size, 12000); + // capping memory cost at 12000 bytes since one slice is around 10000~12000 + ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "12000"}})); + ASSERT_EQ(12000, dbfull()->GetDBOptions().stats_history_buffer_size); + // Wait for stats persist to finish + for (; mock_time < 20; ++mock_time) { + dbfull()->TEST_WaitForPersistStatsRun( + [&] { mock_env->set_current_time(mock_time); }); + } + db_->GetStatsHistory(0, 20 * kMicrosInSec, &stats_iter); + ASSERT_TRUE(stats_iter != nullptr); + size_t stats_count_reopen = 0; + slice_count = 0; + for (; stats_iter->Valid(); stats_iter->Next()) { + slice_count++; + auto stats_map = stats_iter->GetStatsMap(); + stats_count_reopen += stats_map.size(); + } + size_t stats_history_size_reopen = dbfull()->TEST_EstiamteStatsHistorySize(); + // only one slice can fit under the new stats_history_buffer_size + ASSERT_LT(slice_count, 2); + ASSERT_TRUE(stats_history_size_reopen < 12000 && + stats_history_size_reopen > 0); + ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); + Close(); } static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { @@ -375,8 +802,13 @@ static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) { JobContext job_context(0); dbfull->FindObsoleteFiles(&job_context, false); ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty()); - job_context.Clean(); dbfull->TEST_UnlockMutex(); + if (job_context.HaveSomethingToDelete()) { + // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles + // afterwards; otherwise the test may hang on shutdown + dbfull->PurgeObsoleteFiles(job_context); + } + job_context.Clean(); } TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) { @@ -441,6 +873,140 @@ TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) { ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate); } +TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { + Options options; + options.compaction_style = kCompactionStyleFIFO; + options.write_buffer_size = 10 << 10; // 10KB + options.arena_block_size = 4096; + 
options.compression = kNoCompression; + options.create_if_missing = true; + options.compaction_options_fifo.allow_compaction = false; + env_->time_elapse_only_sleep_ = false; + options.env = env_; + + // Test dynamically changing ttl. + env_->addon_time_.store(0); + options.ttl = 1 * 60 * 60; // 1 hour + ASSERT_OK(TryReopen(options)); + + Random rnd(301); + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Add 61 seconds to the time. + env_->addon_time_.fetch_add(61); + + // No files should be compacted as ttl is set to 1 hour. + ASSERT_EQ(dbfull()->GetOptions().ttl, 3600); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set ttl to 1 minute. So all files should get deleted. + ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}})); + ASSERT_EQ(dbfull()->GetOptions().ttl, 60); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Test dynamically changing compaction_options_fifo.max_table_files_size + env_->addon_time_.store(0); + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.ttl = 0; + DestroyAndReopen(options); + + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // No files should be compacted as max_table_files_size is set to 500 KB. + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 500 << 10); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set max_table_files_size to 12 KB. So only 1 file should remain now. + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=12288;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 12 << 10); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + + // Test dynamically changing compaction_options_fifo.allow_compaction + options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB + options.ttl = 0; + options.compaction_options_fifo.allow_compaction = false; + options.level0_file_num_compaction_trigger = 6; + DestroyAndReopen(options); + + for (int i = 0; i < 10; i++) { + // Generate and flush a file about 10KB. + for (int j = 0; j < 10; j++) { + ASSERT_OK(Put(ToString(i * 20 + j), RandomString(&rnd, 980))); + } + Flush(); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // No files should be compacted as max_table_files_size is set to 500 KB and + // allow_compaction is false + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + + // Set allow_compaction to true. So number of files should be between 1 and 5.
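+ // (With the trigger at 6, each FIFO compaction folds at least six of the ten + // L0 files into a single output, so anywhere from 1 to 5 files can remain + // depending on how the files get grouped.)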
+ ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{allow_compaction=true;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_GE(NumTableFilesAtLevel(0), 1); + ASSERT_LE(NumTableFilesAtLevel(0), 5); +} + +TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { + SpecialEnv env(env_); + Options options; + options.env = &env; + + options.compaction_readahead_size = 0; + options.new_table_reader_for_compaction_inputs = true; + options.level0_file_num_compaction_trigger = 2; + const std::string kValue(1024, 'v'); + Reopen(options); + + ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size); + ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); + ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); + for (int i = 0; i < 1024; i++) { + Put(Key(i), kValue); + } + Flush(); + for (int i = 0; i < 1024 * 2; i++) { + Put(Key(i), kValue); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(256, env_->compaction_readahead_size_); + Close(); +} #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_properties_test.cc b/thirdparty/rocksdb/db/db_properties_test.cc index 0da64b1365..1a988f5ea4 100644 --- a/thirdparty/rocksdb/db/db_properties_test.cc +++ b/thirdparty/rocksdb/db/db_properties_test.cc @@ -14,6 +14,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" #include "rocksdb/perf_level.h" @@ -68,27 +69,27 @@ TEST_F(DBPropertiesTest, Empty) { ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("1", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("3", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->EnableFileDeletions(false)); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); + ASSERT_EQ("0", num); ASSERT_OK(db_->EnableFileDeletions()); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("0", num); + ASSERT_EQ("1", num); } while (ChangeOptions()); } @@ -169,6 +170,9 @@ void ResetTableProperties(TableProperties* tp) { tp->raw_value_size = 0; tp->num_data_blocks = 0; tp->num_entries = 0; + tp->num_deletions = 0; + tp->num_merge_operands = 0; + tp->num_range_deletions = 0; } void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { @@ -176,16 +180,19 @@ void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { std::replace(tp_string.begin(), tp_string.end(), ';', ' '); std::replace(tp_string.begin(), tp_string.end(), '=', ' '); ResetTableProperties(tp); - sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 + "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 + " # merge operands %" SCNu64 " # range deletions %" SCNu64 + " raw key size %" SCNu64 " raw average key size %lf " " raw value size %" SCNu64 " raw average value size %lf " - " data block size %" SCNu64 " index block size %" SCNu64 - " filter 
block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, + " data block size %" SCNu64 " index block size (user-key? %" SCNu64 + ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, &tp->index_size, &tp->filter_size); } @@ -214,30 +221,50 @@ void VerifyTableProperties(const TableProperties& base_tp, VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, num_data_blocks_bias); + ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); + ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions); + ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions); + + // Merge operands may become Puts, so we only have an upper bound on the + // exact number of merge operands. + ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands); } -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize) { - const int kKeyCount = kTableCount * kKeysPerTable; +void GetExpectedTableProperties( + TableProperties* expected_tp, const int kKeySize, const int kValueSize, + const int kPutsPerTable, const int kDeletionsPerTable, + const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable, + const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize, + const bool index_key_is_user_key, const bool value_delta_encoding) { + const int kKeysPerTable = + kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable; + const int kPutCount = kTableCount * kPutsPerTable; + const int kDeletionCount = kTableCount * kDeletionsPerTable; + const int kMergeCount = kTableCount * kMergeOperandsPerTable; + const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable; + const int kKeyCount = kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount; const int kAvgSuccessorSize = kKeySize / 5; const int kEncodingSavePerKey = kKeySize / 4; expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); - expected_tp->raw_value_size = kKeyCount * kValueSize; + expected_tp->raw_value_size = + (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize; expected_tp->num_entries = kKeyCount; + expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount; + expected_tp->num_merge_operands = kMergeCount; + expected_tp->num_range_deletions = kRangeDeletionCount; expected_tp->num_data_blocks = - kTableCount * - (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kTableCount * (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / kBlockSize; expected_tp->data_size = kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); expected_tp->index_size = - expected_tp->num_data_blocks * (kAvgSuccessorSize + 8); + expected_tp->num_data_blocks * + (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) - + // discount 1 byte as value size is not encoded in value delta encoding + (value_delta_encoding ?
1 : 0)); expected_tp->filter_size = kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); } @@ -250,8 +277,11 @@ TEST_F(DBPropertiesTest, ValidatePropertyInfo) { ASSERT_TRUE(ppt_name_and_info.first.empty() || !isdigit(ppt_name_and_info.first.back())); - ASSERT_TRUE((ppt_name_and_info.second.handle_string == nullptr) != - (ppt_name_and_info.second.handle_int == nullptr)); + int count = 0; + count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1; + count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1; + count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1; + ASSERT_TRUE(count == 1); } } @@ -286,7 +316,10 @@ TEST_F(DBPropertiesTest, ValidateSampleNumber) { TEST_F(DBPropertiesTest, AggregatedTableProperties) { for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { - const int kKeysPerTable = 100; + const int kDeletionsPerTable = 5; + const int kMergeOperandsPerTable = 15; + const int kRangeDeletionsPerTable = 5; + const int kPutsPerTable = 100; const int kKeySize = 80; const int kValueSize = 200; const int kBloomBitsPerKey = 20; @@ -295,6 +328,8 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { options.level0_file_num_compaction_trigger = 8; options.compression = kNoCompression; options.create_if_missing = true; + options.preserve_deletes = true; + options.merge_operator.reset(new TestPutOperator()); BlockBasedTableOptions table_options; table_options.filter_policy.reset( @@ -304,24 +339,44 @@ TEST_F(DBPropertiesTest, AggregatedTableProperties) { DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted + // away. + ManagedSnapshot snapshot(db_); + Random rnd(5632); for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { + for (int i = 0; i < kPutsPerTable; ++i) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kDeletionsPerTable; i++) { + db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + } + for (int i = 0; i < kMergeOperandsPerTable; i++) { + db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); } std::string property; db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); - - TableProperties expected_tp; - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size); - TableProperties output_tp; ParseTablePropertiesString(property, &output_tp); + bool index_key_is_user_key = output_tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0; + + TableProperties expected_tp; + GetExpectedTableProperties( + &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable, + kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount, + kBloomBitsPerKey, table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); VerifyTableProperties(expected_tp, output_tp); } @@ -337,7 +392,15 @@ TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) { options.target_file_size_base = 98 << 10; options.max_write_buffer_number = 2; options.statistics = rocksdb::CreateDBStatistics(); - options.max_open_files = 100; + 
options.max_open_files = 11; // Make sure no preloading of table readers + + // RocksDB sanitizes max_open_files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast<int*>(arg); + *max_open_files = 11; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); BlockBasedTableOptions table_options; table_options.no_block_cache = true; @@ -370,6 +433,13 @@ for (int key = 0; key < key_index; key++) { Get(Key(key)); } + + // Test for getting immutable_db_options_.statistics + ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(), + "rocksdb.options-statistics", &prop)); + ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss")); + ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros")); + ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(), "rocksdb.cf-file-histogram", &prop)); ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); @@ -378,12 +448,13 @@ // Reopen and issue iterating. See the latency tracked ReopenWithColumnFamilies({"default", "pikachu"}, options); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop)); ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); { - unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions())); + std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions())); for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { } } @@ -436,7 +507,10 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { const int kTableCount = 100; - const int kKeysPerTable = 10; + const int kDeletionsPerTable = 2; + const int kMergeOperandsPerTable = 2; + const int kRangeDeletionsPerTable = 2; + const int kPutsPerTable = 10; const int kKeySize = 50; const int kValueSize = 400; const int kMaxLevel = 7; @@ -452,6 +526,8 @@ options.max_bytes_for_level_multiplier = 2; // This ensures there is no compaction happening when we call GetProperty(). options.disable_auto_compactions = true; + options.preserve_deletes = true; + options.merge_operator.reset(new TestPutOperator()); BlockBasedTableOptions table_options; table_options.filter_policy.reset( @@ -461,15 +537,31 @@ DestroyAndReopen(options); + // Hold open a snapshot to prevent range tombstones from being compacted away.
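+ // (Without a live snapshot the per-iteration manual compaction below could + // drop the range tombstones as obsolete, and the num_range_deletions numbers + // this test sums up across levels would no longer match.)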
+ ManagedSnapshot snapshot(db_); + std::string level_tp_strings[kMaxLevel]; std::string tp_string; TableProperties level_tps[kMaxLevel]; TableProperties tp, sum_tp, expected_tp; for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { + for (int i = 0; i < kPutsPerTable; ++i) { db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); } + for (int i = 0; i < kDeletionsPerTable; i++) { + db_->Delete(WriteOptions(), RandomString(&rnd, kKeySize)); + } + for (int i = 0; i < kMergeOperandsPerTable; i++) { + db_->Merge(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + for (int i = 0; i < kRangeDeletionsPerTable; i++) { + std::string start = RandomString(&rnd, kKeySize); + std::string end = start; + end.resize(kValueSize); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end); + } db_->Flush(FlushOptions()); db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); ResetTableProperties(&sum_tp); @@ -485,9 +577,14 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { sum_tp.raw_value_size += level_tps[level].raw_value_size; sum_tp.num_data_blocks += level_tps[level].num_data_blocks; sum_tp.num_entries += level_tps[level].num_entries; + sum_tp.num_deletions += level_tps[level].num_deletions; + sum_tp.num_merge_operands += level_tps[level].num_merge_operands; + sum_tp.num_range_deletions += level_tps[level].num_range_deletions; } db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); ParseTablePropertiesString(tp_string, &tp); + bool index_key_is_user_key = tp.index_key_is_user_key > 0; + bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0; ASSERT_EQ(sum_tp.data_size, tp.data_size); ASSERT_EQ(sum_tp.index_size, tp.index_size); ASSERT_EQ(sum_tp.filter_size, tp.filter_size); @@ -495,13 +592,18 @@ TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions); + ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands); + ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions); if (table > 3) { - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, table, kBloomBitsPerKey, - table_options.block_size); + GetExpectedTableProperties( + &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable, + kMergeOperandsPerTable, kRangeDeletionsPerTable, table, + kBloomBitsPerKey, table_options.block_size, index_key_is_user_key, + value_is_delta_encoded); // Gives larger bias here as index block size, filter block size, // and data block size become much harder to estimate in this test. 
- VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); + VerifyTableProperties(expected_tp, tp, 0.5, 0.4, 0.4, 0.25); } } } @@ -985,13 +1087,14 @@ class CountingUserTblPropCollector : public TablePropertiesCollector { return Status::OK(); } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { ++count_; return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } @@ -1007,7 +1110,7 @@ class CountingUserTblPropCollectorFactory uint32_t expected_column_family_id) : expected_column_family_id_(expected_column_family_id), num_created_(0) {} - virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override { EXPECT_EQ(expected_column_family_id_, context.column_family_id); num_created_++; @@ -1027,8 +1130,9 @@ class CountingDeleteTabPropCollector : public TablePropertiesCollector { public: const char* Name() const override { return "CountingDeleteTabPropCollector"; } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType type, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { if (type == kEntryDelete) { num_deletes_++; } @@ -1054,8 +1158,8 @@ class CountingDeleteTabPropCollector : public TablePropertiesCollector { class CountingDeleteTabPropCollectorFactory : public TablePropertiesCollectorFactory { public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { return new CountingDeleteTabPropCollector(); } const char* Name() const override { @@ -1328,7 +1432,7 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { } options.compaction_style = kCompactionStyleFIFO; - options.compaction_options_fifo.ttl = 300; + options.ttl = 300; options.compaction_options_fifo.allow_compaction = false; DestroyAndReopen(options); @@ -1383,6 +1487,215 @@ TEST_F(DBPropertiesTest, EstimateOldestKeyTime) { Close(); } +TEST_F(DBPropertiesTest, SstFilesSize) { + struct TestListener : public EventListener { + void OnCompactionCompleted(DB* db, + const CompactionJobInfo& /*info*/) override { + assert(callback_triggered == false); + assert(size_before_compaction > 0); + callback_triggered = true; + uint64_t total_sst_size = 0; + uint64_t live_sst_size = 0; + bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize, + &total_sst_size); + ASSERT_TRUE(ok); + // total_sst_size includes files before and after compaction. + ASSERT_GT(total_sst_size, size_before_compaction); + ok = + db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size); + ASSERT_TRUE(ok); + // live_sst_size only includes files after compaction.
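+ // (The pre-compaction input files still count toward total_sst_size until + // they are actually deleted, which is why it exceeds size_before_compaction + // while live_sst_size, checked next, stays below it.)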
+ ASSERT_GT(live_sst_size, 0); + ASSERT_LT(live_sst_size, size_before_compaction); + } + + uint64_t size_before_compaction = 0; + bool callback_triggered = false; + }; + std::shared_ptr<TestListener> listener = std::make_shared<TestListener>(); + + Options options; + options.disable_auto_compactions = true; + options.listeners.push_back(listener); + Reopen(options); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put("key" + ToString(i), std::string(1000, 'v'))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 5; i++) { + ASSERT_OK(Delete("key" + ToString(i))); + } + ASSERT_OK(Flush()); + uint64_t sst_size; + bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size); + ASSERT_TRUE(ok); + ASSERT_GT(sst_size, 0); + listener->size_before_compaction = sst_size; + // Compact to clean all keys and trigger listener. + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_TRUE(listener->callback_triggered); +} + +TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) { + class TestListener : public EventListener { + public: + void OnTableFileCreated(const TableFileCreationInfo& info) override { + if (info.reason == TableFileCreationReason::kCompaction) { + // Verify the property indicates that SSTs created by a running + // compaction cannot be deleted. + uint64_t created_file_num; + FileType created_file_type; + std::string filename = + info.file_path.substr(info.file_path.rfind('/') + 1); + ASSERT_TRUE( + ParseFileName(filename, &created_file_num, &created_file_type)); + ASSERT_EQ(kTableFile, created_file_type); + + uint64_t keep_sst_lower_bound; + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep, + &keep_sst_lower_bound)); + + ASSERT_LE(keep_sst_lower_bound, created_file_num); + validated_ = true; + } + } + + void SetDB(DB* db) { db_ = db; } + + int GetNumCompactions() { return num_compactions_; } + + // True if we've verified the property for at least one output file + bool Validated() { return validated_; } + + private: + int num_compactions_ = 0; + bool validated_ = false; + DB* db_ = nullptr; + }; + + const int kNumL0Files = 4; + + std::shared_ptr<TestListener> listener = std::make_shared<TestListener>(); + + Options options = CurrentOptions(); + options.listeners.push_back(listener); + options.level0_file_num_compaction_trigger = kNumL0Files; + DestroyAndReopen(options); + listener->SetDB(db_); + + for (int i = 0; i < kNumL0Files; ++i) { + // Make sure they overlap in keyspace to prevent trivial move + Put("key1", "val"); + Put("key2", "val"); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(listener->Validated()); +} + +TEST_F(DBPropertiesTest, BlockCacheProperties) { + Options options; + uint64_t value; + + // Block cache properties are not available for tables other than + // block-based table. + options.table_factory.reset(NewPlainTableFactory()); + Reopen(options); + ASSERT_FALSE( + db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); + ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value)); + ASSERT_FALSE( + db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value)); + + options.table_factory.reset(NewCuckooTableFactory()); + Reopen(options); + ASSERT_FALSE( + db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value)); + ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value)); + ASSERT_FALSE( + db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value)); + + // Block cache properties are not available if block cache is not used.
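+ // (In these cases GetIntProperty() returns false rather than reporting 0, so + // callers can tell "no block cache configured" apart from "cache is empty".)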
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Test with empty block cache.
+ constexpr size_t kCapacity = 100;
+ auto block_cache = NewLRUCache(kCapacity, 0 /*num_shard_bits*/);
+ table_options.block_cache = block_cache;
+ table_options.no_block_cache = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert unpinned item to the cache and check size.
+ constexpr size_t kSize1 = 50;
+ block_cache->Insert("item1", nullptr /*value*/, kSize1, nullptr /*deleter*/);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned item to the cache and check size.
+ constexpr size_t kSize2 = 30;
+ Cache::Handle* item2 = nullptr;
+ block_cache->Insert("item2", nullptr /*value*/, kSize2, nullptr /*deleter*/,
+ &item2);
+ ASSERT_NE(nullptr, item2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1 + kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned item to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* item3 = nullptr;
+ block_cache->Insert("item3", nullptr /*value*/, kSize3, nullptr /*deleter*/,
+ &item3);
+ ASSERT_NE(nullptr, item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // Item 1 is evicted.
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ block_cache->Release(item2);
+ block_cache->Release(item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item2 will be evicted, while item3 remains in cache after release.
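The usage/pinned-usage arithmetic verified above can be reproduced against the Cache interface alone, without a DB. A condensed sketch under the same assumptions as the test (nullptr values and deleters are fine for accounting purposes):

#include <cassert>
#include "rocksdb/cache.h"

int main() {
  auto cache = rocksdb::NewLRUCache(100 /* capacity */, 0 /* num_shard_bits */);
  // No handle requested: the entry is charged but left unpinned.
  cache->Insert("item1", nullptr /* value */, 50 /* charge */, nullptr /* deleter */);
  assert(cache->GetUsage() == 50);
  assert(cache->GetPinnedUsage() == 0);
  // Keeping the returned handle pins the entry until Release().
  rocksdb::Cache::Handle* handle = nullptr;
  cache->Insert("item2", nullptr, 30, nullptr, &handle);
  assert(cache->GetUsage() == 80);
  assert(cache->GetPinnedUsage() == 30);
  cache->Release(handle);  // unpinned; evictable again under memory pressure
  assert(cache->GetPinnedUsage() == 0);
  return 0;
}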
+ ASSERT_EQ(kSize3, value); + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value)); + ASSERT_EQ(0, value); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/db_range_del_test.cc b/thirdparty/rocksdb/db/db_range_del_test.cc index 982cbb85ab..ebe9366df5 100644 --- a/thirdparty/rocksdb/db/db_range_del_test.cc +++ b/thirdparty/rocksdb/db/db_range_del_test.cc @@ -27,47 +27,53 @@ class DBRangeDelTest : public DBTestBase { // ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) { - if (!IsMemoryMappedAccessSupported()) { - return; + // TODO: figure out why MmapReads trips the iterator pinning assertion in + // RangeDelAggregator. Ideally it would be supported; otherwise it should at + // least be explicitly unsupported. + for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) { + option_config_ = config; + DestroyAndReopen(CurrentOptions()); + ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "dr1", "dr1") + .IsNotSupported()); } - Options opts = CurrentOptions(); - opts.table_factory.reset(new PlainTableFactory()); - opts.prefix_extractor.reset(NewNoopTransform()); - opts.allow_mmap_reads = true; - opts.max_sequential_skip_in_iterations = 999999; - Reopen(opts); - - ASSERT_TRUE( - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", "dr1") - .IsNotSupported()); } TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) { - ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr2")); - ASSERT_OK(db_->Flush(FlushOptions())); - ASSERT_EQ(1, NumTableFilesAtLevel(0)); + do { + DestroyAndReopen(CurrentOptions()); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "dr1", "dr2")); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + } while (ChangeOptions(kRangeDelSkipConfigs)); } TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) { - Options opts = CurrentOptions(); - opts.disable_auto_compactions = true; - opts.statistics = CreateDBStatistics(); - Reopen(opts); + do { + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.statistics = CreateDBStatistics(); + DestroyAndReopen(opts); - // snapshot protects range tombstone from dropping due to becoming obsolete. - const Snapshot* snapshot = db_->GetSnapshot(); - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"); - db_->Flush(FlushOptions()); + // snapshot protects range tombstone from dropping due to becoming obsolete. 
+ const Snapshot* snapshot = db_->GetSnapshot();
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z");
+ db_->Flush(FlushOptions());
 
- ASSERT_EQ(1, NumTableFilesAtLevel(0));
- ASSERT_EQ(0, NumTableFilesAtLevel(1));
- dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
- true /* disallow_trivial_move */);
- ASSERT_EQ(0, NumTableFilesAtLevel(0));
- ASSERT_EQ(1, NumTableFilesAtLevel(1));
- ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
- db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+ // compactions as the above assertions about the number of files in a level
+ // do not hold true.
+ } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+ kSkipFIFOCompaction));
 }
 
 TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
@@ -185,7 +191,7 @@ TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
 std::vector<std::vector<FileMetaData>> files;
 dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
- ASSERT_GT(files[0][0].smallest_seqno, 0);
+ ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
 
 db_->ReleaseSnapshot(snapshot);
 }
@@ -433,8 +439,8 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
 reinterpret_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())
 ->cfd(),
 1 /* input_level */, 2 /* output_level */, 0 /* output_path_id */,
- nullptr /* begin */, nullptr /* end */, true /* exclusive */,
- true /* disallow_trivial_move */));
+ 0 /* max_subcompactions */, nullptr /* begin */, nullptr /* end */,
+ true /* exclusive */, true /* disallow_trivial_move */));
 }
 #endif // ROCKSDB_LITE
@@ -484,6 +490,30 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
 ASSERT_EQ(expected, actual);
 }
 
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+ // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+ // Flush. The `CompactionIterator` previously had a bug where we forgot to
+ // check for covering range tombstones when processing the (1) Put, causing
+ // it to reappear after the flush.
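For readers unfamiliar with the API, the scenario this test encodes reduces to a few public calls. A standalone sketch of the invariant being protected (hypothetical /tmp path; the merge step is omitted so only public headers are needed) — the flushed Put must stay hidden behind the covering range tombstone:

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  assert(rocksdb::DB::Open(options, "/tmp/range_del_demo", &db).ok());
  assert(db->Put(rocksdb::WriteOptions(), "key", "v1").ok());
  // The end key is exclusive: ["key", "key_") covers "key" itself.
  assert(db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "key", "key_").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());
  std::string value;
  // The put written before the range tombstone must not reappear.
  assert(db->Get(rocksdb::ReadOptions(), "key", &value).IsNotFound());
  delete db;
  return 0;
}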
+ Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateUInt64AddOperator(); + Reopen(opts); + + std::string val; + PutFixed64(&val, 1); + ASSERT_OK(db_->Put(WriteOptions(), "key", val)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + ASSERT_OK(db_->Merge(WriteOptions(), "key", val)); + ASSERT_OK(db_->Flush(FlushOptions())); + + ReadOptions read_opts; + std::string expected, actual; + ASSERT_OK(db_->Get(read_opts, "key", &actual)); + PutFixed64(&expected, 1); + ASSERT_EQ(expected, actual); +} + // NumTableFilesAtLevel() is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { @@ -496,12 +526,12 @@ TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { Reopen(opts); db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1", - "dr1"); // obsolete after compaction + "dr10"); // obsolete after compaction db_->Put(WriteOptions(), "key", "val"); db_->Flush(FlushOptions()); const Snapshot* snapshot = db_->GetSnapshot(); db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2", - "dr2"); // protected by snapshot + "dr20"); // protected by snapshot db_->Put(WriteOptions(), "key", "val"); db_->Flush(FlushOptions()); @@ -590,48 +620,56 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { } TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) { - db_->Put(WriteOptions(), "key", "val"); - ASSERT_OK( - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + do { + DestroyAndReopen(CurrentOptions()); + db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - ReadOptions read_opts; - std::string value; - ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); } TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) { - Options opts = CurrentOptions(); - opts.max_write_buffer_number = 3; - opts.min_write_buffer_number_to_merge = 2; - // SpecialSkipListFactory lets us specify maximum number of elements the - // memtable can hold. It switches the active memtable to immutable (flush is - // prevented by the above options) upon inserting an element that would - // overflow the memtable. - opts.memtable_factory.reset(new SpecialSkipListFactory(1)); - Reopen(opts); - - db_->Put(WriteOptions(), "key", "val"); - ASSERT_OK( - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - db_->Put(WriteOptions(), "blah", "val"); + do { + Options opts = CurrentOptions(); + opts.max_write_buffer_number = 3; + opts.min_write_buffer_number_to_merge = 2; + // SpecialSkipListFactory lets us specify maximum number of elements the + // memtable can hold. It switches the active memtable to immutable (flush is + // prevented by the above options) upon inserting an element that would + // overflow the memtable. 
+ opts.memtable_factory.reset(new SpecialSkipListFactory(1)); + DestroyAndReopen(opts); + + db_->Put(WriteOptions(), "key", "val"); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + db_->Put(WriteOptions(), "blah", "val"); - ReadOptions read_opts; - std::string value; - ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + } while (ChangeOptions(kRangeDelSkipConfigs)); } TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) { - db_->Put(WriteOptions(), "key", "val"); - // snapshot prevents key from being deleted during flush - const Snapshot* snapshot = db_->GetSnapshot(); - ASSERT_OK( - db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); - ASSERT_OK(db_->Flush(FlushOptions())); + do { + DestroyAndReopen(CurrentOptions()); + db_->Put(WriteOptions(), "key", "val"); + // snapshot prevents key from being deleted during flush + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z")); + ASSERT_OK(db_->Flush(FlushOptions())); - ReadOptions read_opts; - std::string value; - ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); - db_->ReleaseSnapshot(snapshot); + ReadOptions read_opts; + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound()); + db_->ReleaseSnapshot(snapshot); + } while (ChangeOptions(kRangeDelSkipConfigs)); } TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) { @@ -895,11 +933,14 @@ TEST_F(DBRangeDelTest, MemtableBloomFilter) { } TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { - // make sure compaction treats files containing a split range deletion in the - // input level as an atomic unit. I.e., compacting any input-level file(s) - // containing a portion of the range deletion causes all other input-level - // files containing portions of that same range deletion to be included in the - // compaction. + // This test originally verified that compaction treated files containing a + // split range deletion in the input level as an atomic unit. I.e., + // compacting any input-level file(s) containing a portion of the range + // deletion causes all other input-level files containing portions of that + // same range deletion to be included in the compaction. Range deletion + // tombstones are now truncated to sstable boundaries which removed the need + // for that behavior (which could lead to excessively large + // compactions). 
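The do/while pattern introduced throughout these tests re-runs the body once per option configuration, with ChangeOptions() advancing to the next configuration and returning false once the (masked) configurations are exhausted. Outside the DBTestBase harness the same idea can be expressed with an explicit configuration list; a schematic equivalent (RunUnderConfigs and the two sample configurations are illustrative, not part of the harness):

#include <functional>
#include <vector>
#include "rocksdb/options.h"

void RunUnderConfigs(const std::function<void(const rocksdb::Options&)>& scenario) {
  // Each entry mutates a fresh Options object, mirroring one harness config.
  std::vector<std::function<void(rocksdb::Options*)>> configs = {
      [](rocksdb::Options* o) { o->compression = rocksdb::kNoCompression; },
      [](rocksdb::Options* o) { o->disable_auto_compactions = true; },
  };
  for (const auto& mutate : configs) {
    rocksdb::Options options;
    mutate(&options);
    scenario(options);  // open a scratch DB, run the assertions, destroy it
  }
}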
const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10; Options options = CurrentOptions(); options.compression = kNoCompression; @@ -946,22 +987,116 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { if (i == 0) { ASSERT_OK(db_->CompactFiles( CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); } else if (i == 1) { auto begin_str = Key(0), end_str = Key(1); Slice begin = begin_str, end = end_str; ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end)); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); } else if (i == 2) { ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(), {{"max_bytes_for_level_base", "10000"}})); dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); } - ASSERT_EQ(0, NumTableFilesAtLevel(1)); ASSERT_GT(NumTableFilesAtLevel(2), 0); db_->ReleaseSnapshot(snapshot); } } +TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { + // Test the handling of the range-tombstone end-key as the + // upper-bound for an sstable. + + const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level0_file_num_compaction_trigger = kNumFilesPerLevel; + options.memtable_factory.reset( + new SpecialSkipListFactory(2 /* num_entries_flush */)); + options.target_file_size_base = kValueBytes; + options.disable_auto_compactions = true; + + DestroyAndReopen(options); + + // Create an initial sstable at L2: + // [key000000#1,1, key000000#1,1] + ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // A snapshot protects the range tombstone from dropping due to + // becoming obsolete. + const Snapshot* snapshot = db_->GetSnapshot(); + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(2 * kNumFilesPerLevel)); + + // Create 2 additional sstables in L0. Note that the first sstable + // contains the range tombstone. + // [key000000#3,1, key000004#72057594037927935,15] + // [key000001#5,1, key000002#6,1] + Random rnd(301); + std::string value = RandomString(&rnd, kValueBytes); + for (int j = 0; j < kNumFilesPerLevel; ++j) { + // Give files overlapping key-ranges to prevent a trivial move when we + // compact from L0 to L1. + ASSERT_OK(Put(Key(j), value)); + ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value)); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(j + 1, NumTableFilesAtLevel(0)); + } + // Compact the 2 L0 sstables to L1, resulting in the following LSM. There + // are 2 sstables generated in L1 due to the target_file_size_base setting. + // L1: + // [key000000#3,1, key000002#72057594037927935,15] + // [key000002#6,1, key000004#72057594037927935,15] + // L2: + // [key000000#1,1, key000000#1,1] + MoveFilesToLevel(1); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + { + // Compact the second sstable in L1: + // L1: + // [key000000#3,1, key000002#72057594037927935,15] + // L2: + // [key000000#1,1, key000000#1,1] + // [key000002#6,1, key000004#72057594037927935,15] + // + // At the same time, verify the compaction does not cause the key at the + // endpoint (key000002#6,1) to disappear. 
+ ASSERT_EQ(value, Get(Key(2))); + auto begin_str = Key(3); + const rocksdb::Slice begin = begin_str; + dbfull()->TEST_CompactRange(1, &begin, nullptr); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_EQ(value, Get(Key(2))); + } + + { + // Compact the first sstable in L1. This should be copacetic, but + // was previously resulting in overlapping sstables in L2 due to + // mishandling of the range tombstone end-key when used as the + // largest key for an sstable. The resulting LSM structure should + // be: + // + // L2: + // [key000000#1,1, key000001#72057594037927935,15] + // [key000001#5,1, key000002#72057594037927935,15] + // [key000002#6,1, key000004#72057594037927935,15] + auto begin_str = Key(0); + const rocksdb::Slice begin = begin_str; + dbfull()->TEST_CompactRange(1, &begin, &begin); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(3, NumTableFilesAtLevel(2)); + } + + db_->ReleaseSnapshot(snapshot); +} + TEST_F(DBRangeDelTest, UnorderedTombstones) { // Regression test for #2752. Range delete tombstones between // different snapshot stripes are not stored in order, so the first @@ -996,6 +1131,395 @@ TEST_F(DBRangeDelTest, UnorderedTombstones) { ASSERT_TRUE(s.IsNotFound()); } +class MockMergeOperator : public MergeOperator { + // Mock non-associative operator. Non-associativity is expressed by lack of + // implementation for any `PartialMerge*` functions. + public: + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override { + assert(merge_out != nullptr); + merge_out->new_value = merge_in.operand_list.back().ToString(); + return true; + } + + const char* Name() const override { return "MockMergeOperator"; } +}; + +TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) { + // This test uses a non-associative merge operator since that is a convenient + // way to get compaction to write out files with overlapping user-keys at the + // endpoints. Note, however, overlapping endpoints can also occur with other + // value types (Put, etc.), assuming the right snapshots are present. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.target_file_size_base = kFileBytes; + Reopen(options); + + // Push dummy data to L3 so that our actual test files on L0-L2 + // will not be considered "bottommost" level, otherwise compaction + // may prevent us from creating overlapping user keys + // as on the bottommost layer MergeHelper + ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy")); + ASSERT_OK(db_->Flush(FlushOptions())); + MoveFilesToLevel(3); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = RandomString(&rnd, kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), "key", value)); + } + if (i == kNumFiles - 1) { + // Take snapshot to prevent covered merge operands from being dropped by + // compaction. + snapshot = db_->GetSnapshot(); + // The DeleteRange is the last write so all merge operands are covered. 
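Extracting the shape of MockMergeOperator above for reference: supplying only FullMergeV2 (and no PartialMerge/PartialMergeMulti overrides) is what makes an operator non-associative from the engine's point of view, forcing it to retain individual operands until a full merge is possible. A minimal sketch (the LastOperandWins name is illustrative):

#include "rocksdb/merge_operator.h"

class LastOperandWins : public rocksdb::MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& merge_in,
                   MergeOperationOutput* merge_out) const override {
    // The newest operand shadows everything older, including the base value.
    merge_out->new_value = merge_in.operand_list.back().ToString();
    return true;
  }
  const char* Name() const override { return "LastOperandWins"; }
};

// Wired up before open via: options.merge_operator.reset(new LastOperandWins());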
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "key", "key_")); + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); + + dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, + nullptr /* end */, nullptr /* column_family */, + true /* disallow_trivial_move */); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + // Now we have multiple files at L1 all containing a single user key, thus + // guaranteeing overlap in the file endpoints. + ASSERT_GT(NumTableFilesAtLevel(1), 1); + + // Verify no merge operands reappeared after the compaction. + ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); + + // Compact and verify again. It's worthwhile because now the files have + // tighter endpoints, so we can verify that doesn't mess anything up. + dbfull()->TEST_CompactRange(1 /* level */, nullptr /* begin */, + nullptr /* end */, nullptr /* column_family */, + true /* disallow_trivial_move */); + ASSERT_GT(NumTableFilesAtLevel(2), 1); + ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound()); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) { + // Verify a key newer than a range tombstone cannot be deleted by being + // compacted to the bottom level (and thus having its seqnum zeroed) before + // the range tombstone. This used to happen when range tombstones were + // untruncated on reads such that they extended past their file boundaries. + // + // Test summary: + // + // - L1 is bottommost. + // - A couple snapshots are strategically taken to prevent seqnums from being + // zeroed, range tombstone from being dropped, merge operands from being + // dropped, and merge operands from being combined. + // - Left half of files in L1 all have same user key, ensuring their file + // boundaries overlap. In the past this would cause range tombstones to be + // untruncated. + // - Right half of L1 files all have different keys, ensuring no overlap. + // - A range tombstone spans all L1 keys, so it is stored in every L1 file. + // - Keys in the right side of the key-range are overwritten. These are + // compacted down to L1 after releasing snapshots such that their seqnums + // will be zeroed. + // - A full range scan is performed. If the tombstone in the left L1 files + // were untruncated, it would now cover keys newer than it (but with zeroed + // seqnums) in the right L1 files. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + const int kNumFiles = 4; + const int kMaxKey = kNumFiles* kFileBytes / kValueBytes; + const int kKeysOverwritten = 10; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.num_levels = 2; + options.target_file_size_base = kFileBytes; + Reopen(options); + + Random rnd(301); + // - snapshots[0] prevents merge operands from being combined during + // compaction. + // - snapshots[1] prevents merge operands from being dropped due to the + // covering range tombstone. 
+ const Snapshot* snapshots[] = {nullptr, nullptr}; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = RandomString(&rnd, kValueBytes); + std::string key; + if (i < kNumFiles / 2) { + key = Key(0); + } else { + key = Key(1 + i * kFileBytes / kValueBytes + j); + } + ASSERT_OK(db_->Merge(WriteOptions(), key, value)); + } + if (i == 0) { + snapshots[0] = db_->GetSnapshot(); + } + if (i == kNumFiles - 1) { + snapshots[1] = db_->GetSnapshot(); + // The DeleteRange is the last write so all merge operands are covered. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(kMaxKey + 1))); + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); + + auto get_key_count = [this]() -> int { + auto* iter = db_->NewIterator(ReadOptions()); + iter->SeekToFirst(); + int keys_found = 0; + for (; iter->Valid(); iter->Next()) { + ++keys_found; + } + delete iter; + return keys_found; + }; + + // All keys should be covered + ASSERT_EQ(0, get_key_count()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */, + nullptr /* end_key */)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + // Roughly the left half of L1 files should have overlapping boundary keys, + // while the right half should not. + ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles); + + // Now overwrite a few keys that are in L1 files that definitely don't have + // overlapping boundary keys. + for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) { + auto value = RandomString(&rnd, kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value)); + } + ASSERT_OK(db_->Flush(FlushOptions())); + + // The overwritten keys are in L0 now, so clearly aren't covered by the range + // tombstone in L1. + ASSERT_EQ(kKeysOverwritten, get_key_count()); + + // Release snapshots so seqnums can be zeroed when L0->L1 happens. + db_->ReleaseSnapshot(snapshots[0]); + db_->ReleaseSnapshot(snapshots[1]); + + auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1); + auto end_key_storage = Key(kMaxKey); + Slice begin_key(begin_key_storage); + Slice end_key(end_key_storage); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles); + + ASSERT_EQ(kKeysOverwritten, get_key_count()); +} + +TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { + // Exposes a bug where we were using + // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands + // in the forward direction. Confusingly, this case happened during + // `DBIter::Prev`. It could cause assertion failure, or reappearing keys. + const int kFileBytes = 1 << 20; + const int kValueBytes = 1 << 10; + // Need multiple keys so we can get results when calling `Prev()` after + // `SeekToLast()`. 
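Several of these tests count keys by iterating backwards under a snapshot; the pattern in isolation (db assumed open, snapshot taken earlier with GetSnapshot() and released by the caller):

#include <memory>
#include "rocksdb/db.h"

int CountKeysBackwards(rocksdb::DB* db, const rocksdb::Snapshot* snap) {
  rocksdb::ReadOptions read_opts;
  read_opts.snapshot = snap;  // pin the read view; later writes stay invisible
  std::unique_ptr<rocksdb::Iterator> iter(db->NewIterator(read_opts));
  int keys_found = 0;
  for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
    ++keys_found;
  }
  return keys_found;
}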
+ const int kNumKeys = 3; + const int kNumFiles = 4; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.merge_operator.reset(new MockMergeOperator()); + options.target_file_size_base = kFileBytes; + Reopen(options); + + Random rnd(301); + const Snapshot* snapshot = nullptr; + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kFileBytes / kValueBytes; ++j) { + auto value = RandomString(&rnd, kValueBytes); + ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value)); + if (i == 0 && j == kNumKeys) { + // Take snapshot to prevent covered merge operands from being dropped or + // merged by compaction. + snapshot = db_->GetSnapshot(); + // Do a DeleteRange near the beginning so only the oldest merge operand + // for each key is covered. This ensures the sequence of events: + // + // - `DBIter::Prev()` is called + // - After several same versions of the same user key are encountered, + // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`. + // - Binary searches to the newest version of the key, which is in the + // leftmost file containing the user key. + // - Scans forwards to collect all merge operands. Eventually reaches + // the rightmost file containing the oldest merge operand, which + // should be covered by the `DeleteRange`. If `RangeDelAggregator` + // were not properly using `kForwardTraversal` here, that operand + // would reappear. + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(0), Key(kNumKeys + 1))); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + } + ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0)); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */, + nullptr /* end_key */)); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(1), 1); + + auto* iter = db_->NewIterator(ReadOptions()); + iter->SeekToLast(); + int keys_found = 0; + for (; iter->Valid(); iter->Prev()) { + ++keys_found; + } + delete iter; + ASSERT_EQ(kNumKeys, keys_found); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) { + const int kFileBytes = 1 << 20; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + options.target_file_size_base = kFileBytes; + Reopen(options); + + ASSERT_OK(Put(Key(0), "a")); + const Snapshot* snapshot = db_->GetSnapshot(); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(10))); + + db_->Flush(FlushOptions()); + + ReadOptions read_opts; + read_opts.snapshot = snapshot; + auto* iter = db_->NewIterator(read_opts); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(Key(0), iter->key()); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + delete iter; + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { + // Adapted from + // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398. + // Regression test for issue where range tombstone was written to more files + // than necessary when it began exactly at the begin key in the next + // compaction output file. 
+ const int kFileBytes = 1 << 20; + const int kValueBytes = 4 << 10; + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.disable_auto_compactions = true; + // Have a bit of slack in the size limits but we enforce them more strictly + // when manually flushing/compacting. + options.max_compaction_bytes = 2 * kFileBytes; + options.target_file_size_base = 2 * kFileBytes; + options.write_buffer_size = 2 * kFileBytes; + Reopen(options); + + Random rnd(301); + for (char first_char : {'a', 'b', 'c'}) { + for (int i = 0; i < kFileBytes / kValueBytes; ++i) { + std::string key(1, first_char); + key.append(Key(i)); + std::string value = RandomString(&rnd, kValueBytes); + ASSERT_OK(Put(key, value)); + } + db_->Flush(FlushOptions()); + MoveFilesToLevel(2); + } + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(3, NumTableFilesAtLevel(2)); + + // Populate the memtable lightly while spanning the whole key-space. The + // setting of `max_compaction_bytes` will cause the L0->L1 to output multiple + // files to prevent a large L1->L2 compaction later. + ASSERT_OK(Put("a", "val")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + "c" + Key(1), "d")); + // Our compaction output file cutting logic currently only considers point + // keys. So, in order for the range tombstone to have a chance at landing at + // the start of a new file, we need a point key at the range tombstone's + // start. + // TODO(ajkr): remove this `Put` after file cutting accounts for range + // tombstones (#3977). + ASSERT_OK(Put("c" + Key(1), "value")); + db_->Flush(FlushOptions()); + + // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone + // and the range tombstone is only placed in the second SST. + std::string begin_key_storage("c" + Key(1)); + Slice begin_key(begin_key_storage); + std::string end_key_storage("d"); + Slice end_key(end_key_storage); + dbfull()->TEST_CompactRange(0 /* level */, &begin_key /* begin */, + &end_key /* end */, nullptr /* column_family */, + true /* disallow_trivial_move */); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + + std::vector all_metadata; + std::vector l1_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (const auto& metadata : all_metadata) { + if (metadata.level == 1) { + l1_metadata.push_back(metadata); + } + } + std::sort(l1_metadata.begin(), l1_metadata.end(), + [&](const LiveFileMetaData& a, const LiveFileMetaData& b) { + return options.comparator->Compare(a.smallestkey, b.smallestkey) < + 0; + }); + ASSERT_EQ("a", l1_metadata[0].smallestkey); + ASSERT_EQ("a", l1_metadata[0].largestkey); + ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey); + ASSERT_EQ("d", l1_metadata[1].largestkey); + + TablePropertiesCollection all_table_props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props)); + int64_t num_range_deletions = 0; + for (const auto& name_and_table_props : all_table_props) { + const auto& name = name_and_table_props.first; + const auto& table_props = name_and_table_props.second; + // The range tombstone should only be output to the second L1 SST. 
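The per-SST verification below leans on GetPropertiesOfAllTables(); in isolation the lookup looks like this (db assumed open; num_range_deletions is the table property the test asserts on):

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/table_properties.h"

void DumpRangeDeletionCounts(rocksdb::DB* db) {
  rocksdb::TablePropertiesCollection props;
  if (!db->GetPropertiesOfAllTables(&props).ok()) {
    return;
  }
  for (const auto& name_and_props : props) {
    std::cout << name_and_props.first << ": "
              << name_and_props.second->num_range_deletions
              << " range tombstone(s)\n";
  }
}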
+ if (name.size() >= l1_metadata[1].name.size() &&
+ name.substr(name.size() - l1_metadata[1].name.size()).compare(l1_metadata[1].name) == 0) {
+ ASSERT_EQ(1, table_props->num_range_deletions);
+ ++num_range_deletions;
+ } else {
+ ASSERT_EQ(0, table_props->num_range_deletions);
+ }
+ }
+ ASSERT_EQ(1, num_range_deletions);
+}
+
 #endif // ROCKSDB_LITE
 
 } // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/db_secondary_test.cc b/thirdparty/rocksdb/db/db_secondary_test.cc
new file mode 100644
index 0000000000..478a7cec97
--- /dev/null
+++ b/thirdparty/rocksdb/db/db_secondary_test.cc
@@ -0,0 +1,480 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "util/fault_injection_test_env.h"
+#include "util/sync_point.h"
+
+namespace rocksdb {
+
+#ifndef ROCKSDB_LITE
+class DBSecondaryTest : public DBTestBase {
+ public:
+ DBSecondaryTest()
+ : DBTestBase("/db_secondary_test"),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTest() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ db_secondary_->DestroyColumnFamilyHandle(h);
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTest::OpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
+void DBSecondaryTest::CheckFileTypeCounts(const std::string& dir,
+ int expected_log, int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ env_->GetChildren(dir, &filenames);
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kLogFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
CreateAndReopenWithCF({"pikachu"}, options); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ASSERT_EQ(0, handles_secondary_.size()); + ASSERT_NE(nullptr, db_secondary_); + + ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value")); + ASSERT_OK(Flush(0 /*cf*/)); + ASSERT_OK(Flush(1 /*cf*/)); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value", value); +} + +TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) { + Options options; + options.env = env_; + Reopen(options); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0", + "VersionSet::ProcessManifestWrites:BeforeNewManifest"}, + {"VersionSet::ProcessManifestWrites:AfterNewManifest", + "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:" + "1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // Make sure db calls RecoverLogFiles so as to trigger a manifest write, + // which causes the db to switch to a new MANIFEST upon start. + port::Thread ro_db_thread([&]() { + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + CloseSecondary(); + }); + Reopen(options); + ro_db_thread.join(); +} + +TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, MissingTableFile) { + int table_files_not_exist = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", + [&](void* arg) { + Status s = *reinterpret_cast(arg); + if 
(s.IsPathNotFound()) { + ++table_files_not_exist; + } else if (!s.ok()) { + assert(false); // Should not reach here + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) { + ASSERT_OK(Put("foo", "foo_value" + std::to_string(i))); + ASSERT_OK(Put("bar", "bar_value" + std::to_string(i))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_NE(nullptr, db_secondary_full()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ASSERT_EQ(options.level0_file_num_compaction_trigger, table_files_not_exist); + ASSERT_OK(db_secondary_->Get(ropts, "foo", &value)); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + ASSERT_OK(db_secondary_->Get(ropts, "bar", &value)); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + value); + Iterator* iter = db_secondary_->NewIterator(ropts); + ASSERT_NE(nullptr, iter); + iter->Seek("bar"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bar", iter->key().ToString()); + ASSERT_EQ("bar_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + iter->Seek("foo"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo", iter->key().ToString()); + ASSERT_EQ("foo_value" + + std::to_string(options.level0_file_num_compaction_trigger - 1), + iter->value().ToString()); + size_t count = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ++count; + } + ASSERT_EQ(2, count); + delete iter; +} + +TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) { + Options options; + options.env = env_; + const std::string kCfName1 = "pikachu"; + CreateAndReopenWithCF({kCfName1}, options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondaryWithColumnFamilies({kCfName1}, options1); + ASSERT_EQ(2, handles_secondary_.size()); + + ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1")); + ASSERT_OK(Flush(1 /*cf*/)); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + ReadOptions ropts; + ropts.verify_checksums = true; + std::string value; + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); + + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + Close(); + CheckFileTypeCounts(dbname_, 1, 0, 1); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + value.clear(); + ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value)); + ASSERT_EQ("foo_val_1", value); +} + +TEST_F(DBSecondaryTest, SwitchManifest) { + Options options; + options.env = env_; + options.level0_file_num_compaction_trigger = 4; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + const int kNumFiles = options.level0_file_num_compaction_trigger - 1; + // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1, + // ..., 9. 
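Condensing the lifecycle these tests exercise: a secondary instance opens read-only against the primary's directory, uses its own scratch path for info logs, and observes new primary state only when TryCatchUpWithPrimary() replays MANIFEST/WAL updates. A minimal standalone sketch (both paths hypothetical):

#include <cassert>
#include <string>
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.max_open_files = -1;  // secondary mode requires tracking all files
  rocksdb::DB* secondary = nullptr;
  assert(rocksdb::DB::OpenAsSecondary(options, "/tmp/primary_db",
                                      "/tmp/secondary_scratch", &secondary)
             .ok());
  std::string value;
  // Sees only what the primary had persisted at open time...
  secondary->Get(rocksdb::ReadOptions(), "foo", &value);
  // ...until newer MANIFEST/WAL updates are explicitly replayed.
  assert(secondary->TryCatchUpWithPrimary().ok());
  delete secondary;
  return 0;
}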
+ const int kNumKeys = 10; + // Create two sst + for (int i = 0; i != kNumFiles; ++i) { + for (int j = 0; j != kNumKeys; ++j) { + ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i))); + } + ASSERT_OK(Flush()); + } + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + const auto& range_scan_db = [&]() { + ReadOptions tmp_ropts; + tmp_ropts.total_order_seek = true; + tmp_ropts.verify_checksums = true; + std::unique_ptr iter(db_secondary_->NewIterator(tmp_ropts)); + int cnt = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) { + ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString()); + ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), + iter->value().ToString()); + } + }; + + range_scan_db(); + + // While secondary instance still keeps old MANIFEST open, we close primary, + // restart primary, performs full compaction, close again, restart again so + // that next time secondary tries to catch up with primary, the secondary + // will skip the MANIFEST in middle. + Reopen(options); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + range_scan_db(); +} +#endif //! ROCKSDB_LITE + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/thirdparty/rocksdb/db/db_sst_test.cc b/thirdparty/rocksdb/db/db_sst_test.cc index e01754c44e..dcd5847eb2 100644 --- a/thirdparty/rocksdb/db/db_sst_test.cc +++ b/thirdparty/rocksdb/db/db_sst_test.cc @@ -20,6 +20,37 @@ class DBSSTTest : public DBTestBase { DBSSTTest() : DBTestBase("/db_sst_test") {} }; +#ifndef ROCKSDB_LITE +// A class which remembers the name of each flushed file. 
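Before the collector class itself, the listener hook it implements, reduced to its core (locking omitted here for brevity; a real listener is invoked from background threads and must synchronize, as the class below does):

#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/listener.h"

class PrintFlushListener : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* /*db*/,
                        const rocksdb::FlushJobInfo& info) override {
    std::cout << "flushed: " << info.file_path << "\n";
  }
};

// Registered before open via: options.listeners.emplace_back(new PrintFlushListener());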
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+#endif // ROCKSDB_LITE
+
 TEST_F(DBSSTTest, DontDeletePendingOutputs) {
 Options options;
 options.env = env_;
@@ -72,7 +103,7 @@ TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
 ASSERT_GT(num_files, 0);
 
 std::vector<std::string> filenames;
- GetSstFiles(dbname_, &filenames);
+ GetSstFiles(env_, dbname_, &filenames);
 int num_ldb_files = 0;
 for (size_t i = 0; i < filenames.size(); ++i) {
 if (i & 1) {
@@ -231,11 +262,11 @@ TEST_F(DBSSTTest, DBWithSstFileManager) {
 int files_deleted = 0;
 int files_moved = 0;
 rocksdb::SyncPoint::GetInstance()->SetCallBack(
- "SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; });
+ "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
 rocksdb::SyncPoint::GetInstance()->SetCallBack(
- "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; });
+ "SstFileManagerImpl::OnDeleteFile", [&](void* /*arg*/) { files_deleted++; });
 rocksdb::SyncPoint::GetInstance()->SetCallBack(
- "SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; });
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
 rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
 Options options = CurrentOptions();
@@ -325,26 +356,35 @@ TEST_F(DBSSTTest, RateLimitedDelete) {
 env_->time_elapse_only_sleep_ = true;
 Options options = CurrentOptions();
 options.disable_auto_compactions = true;
+ // Need to disable stats dumping and persisting which also use
+ // RepeatableThread, one of whose member variables is of type
+ // InstrumentedCondVar. The callback for
+ // InstrumentedCondVar::TimedWaitInternal can be triggered by stats dumping
+ // and persisting threads and cause time_spent_deleting measurement to become
+ // incorrect.
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
 options.env = env_;
- std::string trash_dir = test::TmpDir(env_) + "/trash";
 int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec
 Status s;
 options.sst_file_manager.reset(
- NewSstFileManager(env_, nullptr, trash_dir, 0, false, &s));
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
 ASSERT_OK(s);
 options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
 auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
- sfm->delete_scheduler()->TEST_SetMaxTrashDBRatio(1.1);
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
+ WriteOptions wo;
+ wo.disableWAL = true;
 
 ASSERT_OK(TryReopen(options));
 // Create 4 files in L0
 for (char v = 'a'; v <= 'd'; v++) {
- ASSERT_OK(Put("Key2", DummyString(1024, v)));
- ASSERT_OK(Put("Key3", DummyString(1024, v)));
- ASSERT_OK(Put("Key4", DummyString(1024, v)));
- ASSERT_OK(Put("Key1", DummyString(1024, v)));
- ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
 ASSERT_OK(Flush());
 }
 // We created 4 sst files in L0
@@ -355,6 +395,7 @@
 
 // Compaction will move the 4 files in L0 to trash and create 1 L1 file
 ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
 ASSERT_EQ("0,1", FilesPerLevel(0));
 
 uint64_t delete_start_time = env_->NowMicros();
@@ -377,16 +418,93 @@
 rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+ Destroy(last_options_);
+
+ std::vector<int> penalties;
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<int*>(arg))); });
+
+ env_->no_slowdown_ = true;
+ env_->time_elapse_only_sleep_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(2.1);
+
+ ASSERT_OK(TryReopen(options));
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(penalties.size(), 8);
+
+ rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+ Options options = CurrentOptions();
+
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ Destroy(last_options_);
+
+ // Add some trash files to the db directory so the DB can clean them up
+ env_->CreateDirIfMissing(dbname_);
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+ // Reopen the DB and verify that it deletes existing trash files
+ ASSERT_OK(TryReopen(options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+
 // Create a DB with 2 db_paths, and generate multiple files in the 2
 // db_paths using CompactRangeOptions, make sure that files that were
 // deleted from first db_path were deleted using DeleteScheduler and
 // files in the second path were not.
 TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
- int bg_delete_file = 0;
+ std::atomic<int> bg_delete_file(0);
 rocksdb::SyncPoint::GetInstance()->SetCallBack(
 "DeleteScheduler::DeleteTrashFile:DeleteFile",
- [&](void* arg) { bg_delete_file++; });
- rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+ [&](void* /*arg*/) { bg_delete_file++; });
+ // The deletion scheduler sometimes skips marking a file as trash according
+ // to a heuristic. In that case the deletion goes through the below SyncPoint.
+ rocksdb::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
 
 Options options = CurrentOptions();
 options.disable_auto_compactions = true;
@@ -394,20 +512,24 @@ TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
 options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
 options.env = env_;
 
- std::string trash_dir = test::TmpDir(env_) + "/trash";
 int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec
 Status s;
- options.sst_file_manager.reset(NewSstFileManager(
- env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s));
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+ /* max_trash_db_ratio= */ 1.1));
+ ASSERT_OK(s);
 auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
- sfm->delete_scheduler()->TEST_SetMaxTrashDBRatio(1.1);
 
 DestroyAndReopen(options);
+ rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.disableWAL = true;
 
 // Create 4 files in L0
 for (int i = 0; i < 4; i++) {
- ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A')));
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'A'), wo));
 ASSERT_OK(Flush());
 }
 // We created 4 sst files in L0
@@ -423,7 +545,7 @@ TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
 
 // Create 4 files in L0
 for (int i = 4; i < 8; i++) {
- ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B')));
+ ASSERT_OK(Put("Key" + ToString(i), DummyString(1024, 'B'), wo));
 ASSERT_OK(Flush());
 }
 ASSERT_EQ("4,1", FilesPerLevel(0));
@@ -438,13 +560,15 @@ TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
 
 sfm->WaitForEmptyTrash();
 ASSERT_EQ(bg_delete_file, 8);
 
+ // Compaction will delete both files and regenerate a file in L1 in second
+ // db path. The deleted files should still be cleaned up via delete scheduler.
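The pattern these deletion tests configure, in application form: handing the DB an SstFileManager with a delete rate turns immediate unlinks of obsolete files into renames to *.trash plus paced background deletion. A sketch (hypothetical path; the empty trash_dir argument matches the updated API above and keeps trash files next to the data files):

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.sst_file_manager.reset(rocksdb::NewSstFileManager(
      rocksdb::Env::Default(), nullptr /* info_log */, "" /* trash_dir */,
      1024 * 1024 /* rate_bytes_per_sec: delete at most ~1MB/s */));
  rocksdb::DB* db = nullptr;
  assert(rocksdb::DB::Open(options, "/tmp/sfm_demo", &db).ok());
  // Obsolete SSTs (and WALs) are now trashed and erased at the paced rate.
  delete db;
  return 0;
}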
   compact_options.bottommost_level_compaction =
       BottommostLevelCompaction::kForce;
   ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
   ASSERT_EQ("0,1", FilesPerLevel(0));
 
   sfm->WaitForEmptyTrash();
-  ASSERT_EQ(bg_delete_file, 8);
+  ASSERT_EQ(bg_delete_file, 10);
 
   rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
@@ -453,16 +577,15 @@ TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
   int bg_delete_file = 0;
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "DeleteScheduler::DeleteTrashFile:DeleteFile",
-      [&](void* arg) { bg_delete_file++; });
+      [&](void* /*arg*/) { bg_delete_file++; });
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
   Status s;
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
   options.env = env_;
-  std::string trash_dir = test::TmpDir(env_) + "/trash";
   options.sst_file_manager.reset(
-      NewSstFileManager(env_, nullptr, trash_dir, 0, false, &s));
+      NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
   ASSERT_OK(s);
   DestroyAndReopen(options);
 
@@ -477,14 +600,27 @@ TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
   // Close DB and destroy it using DeleteScheduler
   Close();
 
+  int num_sst_files = 0;
+  int num_wal_files = 0;
+  std::vector<std::string> db_files;
+  env_->GetChildren(dbname_, &db_files);
+  for (std::string f : db_files) {
+    if (f.substr(f.find_last_of(".") + 1) == "sst") {
+      num_sst_files++;
+    } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+      num_wal_files++;
+    }
+  }
+  ASSERT_GT(num_sst_files, 0);
+  ASSERT_GT(num_wal_files, 0);
+
   auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
 
   sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
-  sfm->delete_scheduler()->TEST_SetMaxTrashDBRatio(1.1);
+  sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
   ASSERT_OK(DestroyDB(dbname_, options));
   sfm->WaitForEmptyTrash();
-  // We have deleted the 4 sst files in the delete_scheduler
-  ASSERT_EQ(bg_delete_file, 4);
+  ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
 }
 
 TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
@@ -516,47 +652,163 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
   ASSERT_NOK(Flush());
 }
 
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.level0_file_num_compaction_trigger = 2;
+  options.statistics = CreateDBStatistics();
+  DestroyAndReopen(options);
+
+  int completed_compactions = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+        sfm->SetMaxAllowedSpaceUsage(0);
+        ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+      });
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+      [&](void* /*arg*/) { completed_compactions++; });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  auto files_in_db = GetAllSSTFiles(&total_file_size);
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
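+  // (Editor's note: the cancellation being tested works roughly as follows --
+  //  before running, a compaction first reserves its estimated output size
+  //  against the SstFileManagerImpl budget, along the lines of
+  //    if (!sfm->EnoughRoomForCompaction(...)) { /* cancel, bump ticker */ }
+  //  Exact signature per sst_file_manager_impl.h; shown only to explain why
+  //  the callback above asserts GetCompactionsReservedSize() == 0.)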
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  dbfull()->TEST_WaitForCompact(true);
+
+  // Because we set a callback in CancelledCompaction, we actually
+  // let the compaction run
+  ASSERT_GT(completed_compactions, 0);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(COMPACTION_CANCELLED), 0);
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+  std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+  auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+  Options options = CurrentOptions();
+  options.sst_file_manager = sst_file_manager;
+  options.statistics = CreateDBStatistics();
+
+  FlushedFileCollector* collector = new FlushedFileCollector();
+  options.listeners.emplace_back(collector);
+
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+
+  // Generate a file containing 10 keys.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+  uint64_t total_file_size = 0;
+  auto files_in_db = GetAllSSTFiles(&total_file_size);
+  // Set the maximum allowed space usage to the current total size
+  sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+  // Generate another file to trigger compaction.
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put(Key(i), RandomString(&rnd, 50)));
+  }
+  ASSERT_OK(Flush());
+
+  // OK, now trigger a manual compaction
+  dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+
+  // Wait for manual compaction to get scheduled and finish
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  // Make sure the stat is bumped
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            1);
+
+  // Now make sure CompactFiles also gets cancelled
+  auto l0_files = collector->GetFlushedFiles();
+  dbfull()->CompactFiles(rocksdb::CompactionOptions(), l0_files, 0);
+
+  // Wait for manual compaction to get scheduled and finish
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+                COMPACTION_CANCELLED),
+            2);
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+  // Now let the flush through and make sure GetCompactionsReservedSize
+  // returns to normal
+  sfm->SetMaxAllowedSpaceUsage(0);
+  int completed_compactions = 0;
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+  dbfull()->CompactFiles(rocksdb::CompactionOptions(), l0_files, 0);
+  dbfull()->TEST_WaitForCompact(true);
+
+  ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+  ASSERT_GT(completed_compactions, 0);
+
+  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
+}
+
 TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
   // This test will set a maximum allowed space for the DB, then it will
   // keep filling the DB until the limit is reached and bg_error_ is set.
   // When bg_error_ is set we will verify that the DB size is greater
   // than the limit.
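   // (Editor's note: the two MaxAllowedSpaceReached sync points registered
   // below fire from the flush path and the compaction path respectively; the
   // flush callback deliberately clears the background error so that the
   // compaction callback gets exercised as well.)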
-  std::vector<uint64_t> max_space_limits_mbs = {1, 2, 4, 8, 10};
-  decltype(max_space_limits_mbs)::value_type limit_mb_cb;
-  bool bg_error_set = false;
-  uint64_t total_sst_files_size = 0;
+  std::vector<uint64_t> max_space_limits_mbs = {1, 10};
+  std::atomic<bool> bg_error_set(false);
 
-  std::atomic<int> estimate_multiplier(1);
-  int reached_max_space_on_flush = 0;
-  int reached_max_space_on_compaction = 0;
+  std::atomic<int> reached_max_space_on_flush(0);
+  std::atomic<int> reached_max_space_on_compaction(0);
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
       [&](void* arg) {
         Status* bg_error = static_cast<Status*>(arg);
         bg_error_set = true;
-        GetAllSSTFiles(&total_sst_files_size);
         reached_max_space_on_flush++;
-        // low limit for size calculated using sst files
-        ASSERT_GE(total_sst_files_size, limit_mb_cb * 1024 * 1024);
         // clear error to ensure compaction callback is called
         *bg_error = Status::OK();
-        estimate_multiplier++;  // used in the main loop assert
+      });
+
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+        bool* enough_room = static_cast<bool*>(arg);
+        *enough_room = true;
       });
 
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
-      [&](void* arg) {
+      [&](void* /*arg*/) {
         bg_error_set = true;
-        GetAllSSTFiles(&total_sst_files_size);
         reached_max_space_on_compaction++;
       });
 
   for (auto limit_mb : max_space_limits_mbs) {
     bg_error_set = false;
-    total_sst_files_size = 0;
-    estimate_multiplier = 1;
-    limit_mb_cb = limit_mb;
     rocksdb::SyncPoint::GetInstance()->ClearTrace();
     rocksdb::SyncPoint::GetInstance()->EnableProcessing();
     std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
@@ -570,21 +822,17 @@ TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
 
     sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
 
-    int keys_written = 0;
-    uint64_t estimated_db_size = 0;
+    // It is easy to detect if the test is stuck in a loop. No need for
+    // complex termination logic.
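+    // (Editor's note: the unbounded-looking loop below does terminate -- once
+    //  the space limit is hit, the SstFileManager sets the DB's background
+    //  error and every subsequent Put() returns that non-OK status, which
+    //  breaks the loop.)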
     while (true) {
       auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50));
       if (!s.ok()) {
         break;
       }
-      keys_written++;
-      // Check the estimated db size vs the db limit just to make sure we
-      // dont run into an infinite loop
-      estimated_db_size = keys_written * 60;  // ~60 bytes per key
-      ASSERT_LT(estimated_db_size,
-                estimate_multiplier * limit_mb * 1024 * 1024 * 2);
     }
     ASSERT_TRUE(bg_error_set);
+    uint64_t total_sst_files_size = 0;
+    GetAllSSTFiles(&total_sst_files_size);
     ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
     rocksdb::SyncPoint::GetInstance()->DisableProcessing();
   }
diff --git a/thirdparty/rocksdb/db/db_statistics_test.cc b/thirdparty/rocksdb/db/db_statistics_test.cc
index 237a2c6814..31396a7bf4 100644
--- a/thirdparty/rocksdb/db/db_statistics_test.cc
+++ b/thirdparty/rocksdb/db/db_statistics_test.cc
@@ -46,7 +46,7 @@ TEST_F(DBStatisticsTest, CompressionStatsTest) {
     Options options = CurrentOptions();
     options.compression = type;
     options.statistics = rocksdb::CreateDBStatistics();
-    options.statistics->stats_level_ = StatsLevel::kExceptTimeForMutex;
+    options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
     DestroyAndReopen(options);
 
     int kNumKeysWritten = 100000;
@@ -105,7 +105,7 @@ TEST_F(DBStatisticsTest, MutexWaitStats) {
   Options options = CurrentOptions();
   options.create_if_missing = true;
   options.statistics = rocksdb::CreateDBStatistics();
-  options.statistics->stats_level_ = StatsLevel::kAll;
+  options.statistics->set_stats_level(StatsLevel::kAll);
   CreateAndReopenWithCF({"pikachu"}, options);
   const uint64_t kMutexWaitDelay = 100;
   ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
diff --git a/thirdparty/rocksdb/db/db_table_properties_test.cc b/thirdparty/rocksdb/db/db_table_properties_test.cc
index 265e9cb2e1..5a54fd81c0 100644
--- a/thirdparty/rocksdb/db/db_table_properties_test.cc
+++ b/thirdparty/rocksdb/db/db_table_properties_test.cc
@@ -13,6 +13,7 @@
 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/db.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
 
@@ -250,6 +251,80 @@ TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
   }
 }
 
+TEST_F(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+  int kNumKeys = 1000;
+  int kWindowSize = 100;
+  int kNumDelsTrigger = 90;
+  std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+      NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+  Options opts = CurrentOptions();
+  opts.table_properties_collector_factories.emplace_back(compact_on_del);
+  Reopen(opts);
+
+  // add an L1 file to prevent tombstones from dropping due to obsolescence
+  // during flush
+  Put(Key(0), "val");
+  Flush();
+  MoveFilesToLevel(1);
+
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+  // Change the window size and deletion trigger and ensure new values take
+  // effect
+  kWindowSize = 50;
+  kNumDelsTrigger = 40;
+  static_cast<CompactOnDeletionCollectorFactory*>
+      (compact_on_del.get())->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>
+      (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_GT(NumTableFilesAtLevel(1), 0);
+
+  // Change the window size to disable delete triggered compaction
+  kWindowSize = 0;
+  static_cast<CompactOnDeletionCollectorFactory*>
+      (compact_on_del.get())->SetWindowSize(kWindowSize);
+  static_cast<CompactOnDeletionCollectorFactory*>
+      (compact_on_del.get())->SetDeletionTrigger(kNumDelsTrigger);
+  for (int i = 0; i < kNumKeys; ++i) {
+    if (i >= kNumKeys - kWindowSize &&
+        i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+      Delete(Key(i));
+    } else {
+      Put(Key(i), "val");
+    }
+  }
+  Flush();
+
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
 }  // namespace rocksdb
 
 #endif  // ROCKSDB_LITE
diff --git a/thirdparty/rocksdb/db/db_tailing_iter_test.cc b/thirdparty/rocksdb/db/db_tailing_iter_test.cc
index d217828db9..62e60758fd 100644
--- a/thirdparty/rocksdb/db/db_tailing_iter_test.cc
+++ b/thirdparty/rocksdb/db/db_tailing_iter_test.cc
@@ -157,10 +157,10 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
   });
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "ForwardIterator::RenewIterators:Null",
-      [&](void* arg) { file_iters_renewed_null = true; });
+      [&](void* /*arg*/) { file_iters_renewed_null = true; });
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "ForwardIterator::RenewIterators:Copy",
-      [&](void* arg) { file_iters_renewed_copy = true; });
+      [&](void* /*arg*/) { file_iters_renewed_copy = true; });
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
   const int num_records = 1000;
   for (int i = 1; i < num_records; ++i) {
@@ -214,9 +214,9 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
   }
   ASSERT_TRUE(file_iters_renewed_null);
   ASSERT_TRUE(file_iters_renewed_copy);
-  iter = 0;
-  itern = 0;
-  iterh = 0;
+  iter = nullptr;
+  itern = nullptr;
+  iterh = nullptr;
   BlockBasedTableOptions table_options;
   table_options.no_block_cache = true;
   table_options.block_cache_compressed = nullptr;
@@ -229,7 +229,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
   Slice target1(buf5, 20);
   iteri->Seek(target1);
   ASSERT_TRUE(iteri->status().IsIncomplete());
-  iteri = 0;
+  iteri = nullptr;
   read_options.read_tier = kReadAllTier;
 
   options.table_factory.reset(NewBlockBasedTableFactory());
@@ -415,7 +415,7 @@ TEST_F(DBTestTailingIterator, TailingIteratorUpperBound) {
   int immutable_seeks = 0;
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
       "ForwardIterator::SeekInternal:Immutable",
-      [&](void* arg) { ++immutable_seeks; });
+      [&](void* /*arg*/) { ++immutable_seeks; });
 
   // Seek to 13. This should not require any immutable seeks.
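   // (Editor's note: a minimal sketch of the SyncPoint pattern used
   // throughout this patch -- register a callback on a named hook, enable
   // processing, run the workload, then disable it:
   //
   //   rocksdb::SyncPoint::GetInstance()->SetCallBack(
   //       "Some::Hook", [&](void* arg) { /* observe or mutate state */ });
   //   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
   //   ...  // workload
   //   rocksdb::SyncPoint::GetInstance()->DisableProcessing();
   //
   // "Some::Hook" is a placeholder name, not a real sync point.)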
   rocksdb::SyncPoint::GetInstance()->EnableProcessing();
@@ -479,275 +479,6 @@ TEST_F(DBTestTailingIterator, TailingIteratorGap) {
   ASSERT_EQ("40", it->key().ToString());
 }
 
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorSingle) {
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  ASSERT_TRUE(!iter->Valid());
-
-  // add a record and check that iter can see it
-  ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "mirko");
-
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorKeepAdding) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
-
-  const int num_records = 10000;
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "%016d", i);
-
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
-
-    iter->Seek(key);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
-  }
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToNext) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  std::string value(1024, 'a');
-
-  const int num_records = 1000;
-  for (int i = 1; i < num_records; ++i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
-
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
-
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
-    }
-
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
-  }
-  for (int i = 2 * num_records; i > 0; --i) {
-    char buf1[32];
-    char buf2[32];
-    snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
-
-    Slice key(buf1, 20);
-    ASSERT_OK(Put(1, key, value));
-
-    if (i % 100 == 99) {
-      ASSERT_OK(Flush(1));
-    }
-
-    snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
-    Slice target(buf2, 20);
-    iter->Seek(target);
-    ASSERT_TRUE(iter->Valid());
-    ASSERT_EQ(iter->key().compare(key), 0);
-  }
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorDeletes) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-
-  // write a single record, read it using the iterator, then delete it
-  ASSERT_OK(Put(1, "0test", "test"));
-  iter->SeekToFirst();
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0test");
-  ASSERT_OK(Delete(1, "0test"));
-
-  // write many more records
-  const int num_records = 10000;
-  std::string value(1024, 'A');
-
-  for (int i = 0; i < num_records; ++i) {
-    char buf[32];
-    snprintf(buf, sizeof(buf), "1%015d", i);
-
-    Slice key(buf, 16);
-    ASSERT_OK(Put(1, key, value));
-  }
-
-  // force a flush to make sure that no records are read from memtable
-  ASSERT_OK(Flush(1));
-
-  // skip "0test"
-  iter->Next();
-
-  // make sure we can read all new records using the existing iterator
-  int count = 0;
-  for (; iter->Valid(); iter->Next(), ++count) {
-  }
-
-  ASSERT_EQ(count, num_records);
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorPrefixSeek) {
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  Options options = CurrentOptions();
-  options.create_if_missing = true;
-  options.disable_auto_compactions = true;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(2));
-  options.memtable_factory.reset(NewHashSkipListRepFactory(16));
-  options.allow_concurrent_memtable_write = false;
-  DestroyAndReopen(options);
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
-  ASSERT_OK(Put(1, "0101", "test"));
-
-  ASSERT_OK(Flush(1));
-
-  ASSERT_OK(Put(1, "0202", "test"));
-
-  // Seek(0102) shouldn't find any records since 0202 has a different prefix
-  iter->Seek("0102");
-  ASSERT_TRUE(!iter->Valid());
-
-  iter->Seek("0202");
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(iter->key().ToString(), "0202");
-
-  iter->Next();
-  ASSERT_TRUE(!iter->Valid());
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorIncomplete) {
-  CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-  read_options.read_tier = kBlockCacheTier;
-
-  std::string key = "key";
-  std::string value = "value";
-
-  ASSERT_OK(db_->Put(WriteOptions(), key, value));
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  iter->SeekToFirst();
-  // we either see the entry or it's not in cache
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
-
-  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-  iter->SeekToFirst();
-  // should still be true after compaction
-  ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
-}
-
-TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) {
-  Options options = CurrentOptions();
-  options.compaction_style = kCompactionStyleUniversal;
-  options.write_buffer_size = 1000;
-  CreateAndReopenWithCF({"pikachu"}, options);
-
-  ReadOptions read_options;
-  read_options.tailing = true;
-  read_options.managed = true;
-
-  const int NROWS = 10000;
-  // Write rows with keys 00000, 00002, 00004 etc.
-  for (int i = 0; i < NROWS; ++i) {
-    char buf[100];
-    snprintf(buf, sizeof(buf), "%05d", 2 * i);
-    std::string key(buf);
-    std::string value("value");
-    ASSERT_OK(db_->Put(WriteOptions(), key, value));
-  }
-
-  std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-  // Seek to 00001. We expect to find 00002.
-  std::string start_key = "00001";
-  iter->Seek(start_key);
-  ASSERT_TRUE(iter->Valid());
-
-  std::string found = iter->key().ToString();
-  ASSERT_EQ("00002", found);
-
-  // Now seek to the same key. The iterator should remain in the same
-  // position.
-  iter->Seek(found);
-  ASSERT_TRUE(iter->Valid());
-  ASSERT_EQ(found, iter->key().ToString());
-}
-
-TEST_F(DBTestTailingIterator, ForwardIteratorVersionProperty) {
-  Options options = CurrentOptions();
-  options.write_buffer_size = 1000;
-
-  ReadOptions read_options;
-  read_options.tailing = true;
-
-  Put("foo", "bar");
-
-  uint64_t v1, v2, v3, v4;
-  {
-    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-    iter->Seek("foo");
-    std::string prop_value;
-    ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number",
-                                &prop_value));
-    v1 = static_cast<uint64_t>(std::atoi(prop_value.c_str()));
-
-    Put("foo1", "bar1");
-    Flush();
-
-    ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number",
-                                &prop_value));
-    v2 = static_cast<uint64_t>(std::atoi(prop_value.c_str()));
-
-    iter->Seek("f");
-
-    ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number",
-                                &prop_value));
-    v3 = static_cast<uint64_t>(std::atoi(prop_value.c_str()));
-
-    ASSERT_EQ(v1, v2);
-    ASSERT_GT(v3, v2);
-  }
-
-  {
-    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
-    iter->Seek("foo");
-    std::string prop_value;
-    ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number",
-                                &prop_value));
-    v4 = static_cast<uint64_t>(std::atoi(prop_value.c_str()));
-  }
-  ASSERT_EQ(v3, v4);
-}
-
 TEST_F(DBTestTailingIterator, SeekWithUpperBoundBug) {
   ReadOptions read_options;
   read_options.tailing = true;
@@ -809,6 +540,8 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 #else
+  (void) argc;
+  (void) argv;
   return 0;
 #endif
 }
diff --git a/thirdparty/rocksdb/db/db_test.cc b/thirdparty/rocksdb/db/db_test.cc
index 193101d460..60e66c6c33 100644
--- a/thirdparty/rocksdb/db/db_test.cc
+++ b/thirdparty/rocksdb/db/db_test.cc
@@ -60,7 +60,6 @@
 #include "util/compression.h"
 #include "util/file_reader_writer.h"
 #include "util/filename.h"
-#include "util/hash.h"
 #include "util/mutexlock.h"
 #include "util/rate_limiter.h"
 #include "util/string_util.h"
@@ -94,7 +93,7 @@ class DBTestWithParam
 };
 
 TEST_F(DBTest, MockEnvTest) {
-  unique_ptr<MockEnv> env{new MockEnv(Env::Default())};
+  std::unique_ptr<MockEnv> env{new MockEnv(Env::Default())};
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
@@ -144,7 +143,7 @@ TEST_F(DBTest, MockEnvTest) {
 // defined.
 #ifndef ROCKSDB_LITE
 TEST_F(DBTest, MemEnvTest) {
-  unique_ptr<Env> env{NewMemEnv(Env::Default())};
+  std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
   Options options;
   options.create_if_missing = true;
   options.env = env.get();
@@ -223,6 +222,10 @@ TEST_F(DBTest, SkipDelay) {
 
   for (bool sync : {true, false}) {
     for (bool disableWAL : {true, false}) {
+      if (sync && disableWAL) {
+        // sync and disableWAL are incompatible.
+        continue;
+      }
       // Use a small number to ensure a large delay that is still effective
       // when we do Put
       // TODO(myabandeh): this is time dependent and could potentially make
@@ -231,11 +234,11 @@ TEST_F(DBTest, SkipDelay) {
       std::atomic<int> sleep_count(0);
       rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "DBImpl::DelayWrite:Sleep",
-          [&](void* arg) { sleep_count.fetch_add(1); });
+          [&](void* /*arg*/) { sleep_count.fetch_add(1); });
       std::atomic<int> wait_count(0);
       rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "DBImpl::DelayWrite:Wait",
-          [&](void* arg) { wait_count.fetch_add(1); });
+          [&](void* /*arg*/) { wait_count.fetch_add(1); });
       rocksdb::SyncPoint::GetInstance()->EnableProcessing();
 
       WriteOptions wo;
@@ -259,6 +262,196 @@ TEST_F(DBTest, SkipDelay) {
   }
 }
 
+TEST_F(DBTest, MixedSlowdownOptions) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+  std::atomic<int> sleep_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:BeginWriteStallDone",
+      [&](void* /*arg*/) {
+        sleep_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_slowdown_func);
+          }
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+        }
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  dbfull()->Put(wo, "foo", "bar");
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
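+  // (Editor's note: with a delay token held, writers that set
+  //  wo.no_slowdown = true fail fast -- the ASSERT_NOK threads above expect
+  //  an Incomplete() status -- while writers that leave it false block in
+  //  DBImpl::DelayWrite until the token is released.)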
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_GE(sleep_count.load(), 1);
+
+  wo.no_slowdown = true;
+  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+  std::atomic<int> sleep_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Sleep",
+      [&](void* /*arg*/) {
+        sleep_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+          // Sleep for 3s to allow the threads to insert themselves into the
+          // write queue
+          env_->SleepForMicroseconds(3000000ULL);
+        }
+      });
+  std::atomic<int> wait_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* /*arg*/) { wait_count.fetch_add(1); });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  dbfull()->Put(wo, "foo", "bar");
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_EQ(sleep_count.load(), 1);
+  ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;
+  CreateAndReopenWithCF({"pikachu"}, options);
+  std::vector<port::Thread> threads;
+  std::atomic<int> thread_num(0);
+
+  std::function<void()> write_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = false;
+    ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> write_no_slowdown_func = [&]() {
+    int a = thread_num.fetch_add(1);
+    std::string key = "foo" + std::to_string(a);
+    WriteOptions wo;
+    wo.no_slowdown = true;
+    ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+  };
+  std::function<void()> wakeup_writer = [&]() {
+    dbfull()->mutex_.Lock();
+    dbfull()->bg_cv_.SignalAll();
+    dbfull()->mutex_.Unlock();
+  };
+  // Use a small number to ensure a large delay that is still effective
+  // when we do Put
+  // TODO(myabandeh): this is time dependent and could potentially make
+  // the test flaky
+  auto token = dbfull()->TEST_write_controler().GetStopToken();
+  std::atomic<int> wait_count(0);
+  rocksdb::SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::DelayWrite:Wait",
+      [&](void* /*arg*/) {
+        wait_count.fetch_add(1);
+        if (threads.empty()) {
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_slowdown_func);
+          }
+          for (int i = 0; i < 2; ++i) {
+            threads.emplace_back(write_no_slowdown_func);
+          }
+          // Sleep for 3s to allow the threads to insert themselves into the
+          // write queue
+          env_->SleepForMicroseconds(3000000ULL);
+        }
+        token.reset();
+        threads.emplace_back(wakeup_writer);
+      });
+  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
+  WriteOptions wo;
+  wo.sync = false;
+  wo.disableWAL = false;
+  wo.no_slowdown = false;
+  dbfull()->Put(wo, "foo", "bar");
+  // We need the 2nd write to trigger delay. This is because delay is
+  // estimated based on the last write size which is 0 for the first write.
+  ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+  token.reset();
+
+  for (auto& t : threads) {
+    t.join();
+  }
+  ASSERT_GE(wait_count.load(), 1);
+
+  wo.no_slowdown = true;
+  ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
 
 #ifndef ROCKSDB_LITE
 TEST_F(DBTest, LevelLimitReopen) {
@@ -294,11 +487,11 @@ TEST_F(DBTest, PutSingleDeleteGet) {
     ASSERT_EQ("v2", Get(1, "foo2"));
     ASSERT_OK(SingleDelete(1, "foo"));
     ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
-    // Skip HashCuckooRep as it does not support single delete. FIFO and
-    // universal compaction do not apply to the test case. Skip MergePut
-    // because single delete does not get removed when it encounters a merge.
-  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
-                         kSkipUniversalCompaction | kSkipMergePut));
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
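+    // (Editor's note: ChangeOptions() comes from DBTestBase -- it advances
+    //  option_config_ to the next configuration and returns false once every
+    //  configuration not excluded by the kSkip* mask has run, so the do/while
+    //  below repeats the body once per remaining config.)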
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
 }
 
 TEST_F(DBTest, ReadFromPersistedTier) {
@@ -411,7 +604,7 @@
         DestroyAndReopen(options);
       }
     }
-  } while (ChangeOptions(kSkipHashCuckoo));
+  } while (ChangeOptions());
 }
 
 TEST_F(DBTest, SingleDeleteFlush) {
@@ -447,11 +640,11 @@ TEST_F(DBTest, SingleDeleteFlush) {
   ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
   ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
 
-  // Skip HashCuckooRep as it does not support single delete. FIFO and
-  // universal compaction do not apply to the test case. Skip MergePut
-  // because merges cannot be combined with single deletions.
-  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
-                         kSkipUniversalCompaction | kSkipMergePut));
+  // Skip FIFO and universal compaction because they do not apply to the test
+  // case. Skip MergePut because single delete does not get removed when it
+  // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
 }
 
 TEST_F(DBTest, SingleDeletePutFlush) {
@@ -470,11 +663,41 @@ TEST_F(DBTest, SingleDeletePutFlush) {
   ASSERT_OK(Flush(1));
 
   ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
-  // Skip HashCuckooRep as it does not support single delete. FIFO and
-  // universal compaction do not apply to the test case. Skip MergePut
-  // because merges cannot be combined with single deletions.
-  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
-                         kSkipUniversalCompaction | kSkipMergePut));
+  // Skip FIFO and universal compaction because they do not apply to the test
+  // case. Skip MergePut because single delete does not get removed when it
+  // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
+}
+
+// Disable because not all platforms can run it.
+// It requires more than 9GB of memory to run it, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+  const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024};  // 4GB value
+  std::string raw(kValueSize, 'v');
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.merge_operator = MergeOperators::CreatePutOperator();
+  options.write_buffer_size = 100000;  // Small write buffer
+  options.paranoid_checks = true;
+  DestroyAndReopen(options);
+
+  ASSERT_OK(Put("boo", "v1"));
+  ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+  WriteBatch wb;
+  ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+  Slice value_slice = raw;
+  Slice key_slice = "foo";
+  SliceParts sp_key(&key_slice, 1);
+  SliceParts sp_value(&value_slice, 1);
+
+  ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+  ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
 }
 
 // Disable because not all platform can run it.
@@ -500,7 +723,9 @@ TEST_F(DBTest, DISABLED_VeryLargeValue) {
   ASSERT_OK(Put(key2, raw));
   dbfull()->TEST_WaitForFlushMemTable();
 
+#ifndef ROCKSDB_LITE
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif  // !ROCKSDB_LITE
 
   std::string value;
   Status s = db_->Get(ReadOptions(), key1, &value);
@@ -715,13 +940,13 @@ TEST_F(DBTest, FlushSchedule) {
 namespace {
 class KeepFilter : public CompactionFilter {
  public:
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value,
-                      bool* value_changed) const override {
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
     return false;
   }
 
-  virtual const char* Name() const override { return "KeepFilter"; }
+  const char* Name() const override { return "KeepFilter"; }
 };
 
 class KeepFilterFactory : public CompactionFilterFactory {
@@ -729,7 +954,7 @@ class KeepFilterFactory : public CompactionFilterFactory {
   explicit KeepFilterFactory(bool check_context = false)
      : check_context_(check_context) {}
 
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
      const CompactionFilter::Context& context) override {
    if (check_context_) {
      EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
@@ -738,7 +963,7 @@ class KeepFilterFactory : public CompactionFilterFactory {
    return std::unique_ptr<CompactionFilter>(new KeepFilter());
   }
 
-  virtual const char* Name() const override { return "KeepFilterFactory"; }
+  const char* Name() const override { return "KeepFilterFactory"; }
   bool check_context_;
   std::atomic_bool expect_full_compaction_;
   std::atomic_bool expect_manual_compaction_;
};
@@ -747,14 +972,14 @@ class KeepFilterFactory : public CompactionFilterFactory {
 class DelayFilter : public CompactionFilter {
  public:
   explicit DelayFilter(DBTestBase* d) : db_test(d) {}
-  virtual bool Filter(int level, const Slice& key, const Slice& value,
-                      std::string* new_value,
-                      bool* value_changed) const override {
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
     db_test->env_->addon_time_.fetch_add(1000);
     return true;
   }
 
-  virtual const char* Name() const override { return "DelayFilter"; }
+  const char* Name() const override { return "DelayFilter"; }
 
  private:
   DBTestBase* db_test;
@@ -763,12 +988,12 @@ class DelayFilter : public CompactionFilter {
 class DelayFilterFactory : public CompactionFilterFactory {
  public:
   explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
-  virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
-      const CompactionFilter::Context& context) override {
+  std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+      const CompactionFilter::Context& /*context*/) override {
     return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
   }
 
-  virtual const char* Name() const override { return "DelayFilterFactory"; }
+  const char* Name() const override { return "DelayFilterFactory"; }
 
 private:
   DBTestBase* db_test;
@@ -1344,7 +1569,7 @@ TEST_F(DBTest, Snapshot) {
     ASSERT_EQ(0U, GetNumSnapshots());
     ASSERT_EQ("0v4", Get(0, "foo"));
     ASSERT_EQ("1v4", Get(1, "foo"));
-  } while (ChangeOptions(kSkipHashCuckoo));
+  } while (ChangeOptions());
 }
 
 TEST_F(DBTest, HiddenValuesAreRemoved) {
@@ -1381,9 +1606,8 @@ TEST_F(DBTest, HiddenValuesAreRemoved) {
     ASSERT_TRUE(Between(Size("", "pastfoo", 1), 0, 1000));
     // ApproximateOffsetOf() is not yet implemented in plain table format,
     // which is used by Size().
-    // skip HashCuckooRep as it does not support snapshot
   } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
-                         kSkipPlainTable | kSkipHashCuckoo));
+                         kSkipPlainTable));
 }
 #endif  // ROCKSDB_LITE
 
@@ -1429,11 +1653,11 @@ TEST_F(DBTest, UnremovableSingleDelete) {
     ASSERT_EQ("first", Get(1, "foo", snapshot));
     ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
     db_->ReleaseSnapshot(snapshot);
-    // Skip HashCuckooRep as it does not support single delete. FIFO and
-    // universal compaction do not apply to the test case. Skip MergePut
-    // because single delete does not get removed when it encounters a merge.
-  } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction |
-                         kSkipUniversalCompaction | kSkipMergePut));
+    // Skip FIFO and universal compaction because they do not apply to the test
+    // case. Skip MergePut because single delete does not get removed when it
+    // encounters a merge.
+  } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+                         kSkipMergePut));
 }
 
 #ifndef ROCKSDB_LITE
@@ -1551,17 +1775,14 @@ TEST_F(DBTest, OverlapInLevel0) {
 TEST_F(DBTest, ComparatorCheck) {
   class NewComparator : public Comparator {
    public:
-    virtual const char* Name() const override {
-      return "rocksdb.NewComparator";
-    }
-    virtual int Compare(const Slice& a, const Slice& b) const override {
+    const char* Name() const override { return "rocksdb.NewComparator"; }
+    int Compare(const Slice& a, const Slice& b) const override {
      return BytewiseComparator()->Compare(a, b);
    }
-    virtual void FindShortestSeparator(std::string* s,
-                                       const Slice& l) const override {
+    void FindShortestSeparator(std::string* s, const Slice& l) const override {
      BytewiseComparator()->FindShortestSeparator(s, l);
    }
-    virtual void FindShortSuccessor(std::string* key) const override {
+    void FindShortSuccessor(std::string* key) const override {
      BytewiseComparator()->FindShortSuccessor(key);
    }
  };
@@ -1584,18 +1805,15 @@ TEST_F(DBTest, ComparatorCheck) {
 TEST_F(DBTest, CustomComparator) {
   class NumberComparator : public Comparator {
    public:
-    virtual const char* Name() const override {
-      return "test.NumberComparator";
-    }
-    virtual int Compare(const Slice& a, const Slice& b) const override {
+    const char* Name() const override { return "test.NumberComparator"; }
+    int Compare(const Slice& a, const Slice& b) const override {
      return ToNumber(a) - ToNumber(b);
    }
-    virtual void FindShortestSeparator(std::string* s,
-                                       const Slice& l) const override {
+    void FindShortestSeparator(std::string* s, const Slice& l) const override {
      ToNumber(*s);  // Check format
      ToNumber(l);   // Check format
    }
-    virtual void FindShortSuccessor(std::string* key) const override {
+    void FindShortSuccessor(std::string* key) const override {
      ToNumber(*key);  // Check format
    }
 
@@ -1647,7 +1865,7 @@ TEST_F(DBTest, CustomComparator) {
 TEST_F(DBTest, DBOpen_Options) {
   Options options = CurrentOptions();
-  std::string dbname = test::TmpDir(env_) + "/db_options_test";
+  std::string dbname = test::PerThreadDBPath("db_options_test");
   ASSERT_OK(DestroyDB(dbname, options));
 
   // Does not exist, and create_if_missing == false: error
@@ -1705,7 +1923,7 @@ TEST_F(DBTest, DBOpen_Change_NumLevels) {
 }
 
 TEST_F(DBTest, DestroyDBMetaDatabase) {
-  std::string dbname = test::TmpDir(env_) + "/db_meta";
+  std::string dbname = test::PerThreadDBPath("db_meta");
   ASSERT_OK(env_->CreateDirIfMissing(dbname));
   std::string metadbname = MetaDatabaseName(dbname, 0);
   ASSERT_OK(env_->CreateDirIfMissing(metadbname));
@@ -2029,15 +2247,12 @@ static void MTThreadBody(void* arg) {
 
 class MultiThreadedDBTest : public DBTest,
                            public ::testing::WithParamInterface<int> {
  public:
-  virtual void SetUp() override { option_config_ = GetParam(); }
+  void SetUp() override { option_config_ = GetParam(); }
 
   static std::vector<int> GenerateOptionConfigs() {
     std::vector<int> optionConfigs;
     for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
-      // skip as HashCuckooRep does not support snapshot
-      if (optionConfig != kHashCuckoo) {
-        optionConfigs.push_back(optionConfig);
-      }
+      optionConfigs.push_back(optionConfig);
     }
     return optionConfigs;
   }
@@ -2088,6 +2303,9 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // ROCKSDB_LITE
 
 // Group commit test:
+#if !defined(TRAVIS) && !defined(OS_WIN)
+// Disable this test temporarily on Travis and AppVeyor as it fails
+// intermittently. GitHub issue: #4151
 namespace {
 
 static const int kGCNumThreads = 4;
@@ -2118,10 +2336,16 @@ TEST_F(DBTest, GroupCommitTest) {
   do {
     Options options = CurrentOptions();
     options.env = env_;
-    env_->log_write_slowdown_.store(100);
     options.statistics = rocksdb::CreateDBStatistics();
     Reopen(options);
 
+    rocksdb::SyncPoint::GetInstance()->LoadDependency(
+        {{"WriteThread::JoinBatchGroup:BeganWaiting",
+          "DBImpl::WriteImpl:BeforeLeaderEnters"},
+         {"WriteThread::AwaitState:BlockingWaiting",
+          "WriteThread::EnterAsBatchGroupLeader:End"}});
+    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
+
     // Start threads
     GCThread thread[kGCNumThreads];
     for (int id = 0; id < kGCNumThreads; id++) {
@@ -2130,13 +2354,7 @@ TEST_F(DBTest, GroupCommitTest) {
       thread[id].done = false;
       env_->StartThread(GCThreadBody, &thread[id]);
     }
-
-    for (int id = 0; id < kGCNumThreads; id++) {
-      while (thread[id].done == false) {
-        env_->SleepForMicroseconds(100000);
-      }
-    }
-    env_->log_write_slowdown_.store(0);
+    env_->WaitForJoin();
 
     ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
 
@@ -2162,6 +2380,7 @@ TEST_F(DBTest, GroupCommitTest) {
     ASSERT_GT(hist_data.average, 0.0);
   } while (ChangeOptions(kSkipNoSeekToLast));
 }
+#endif  // TRAVIS
 
 namespace {
 typedef std::map<std::string, std::string> KVMap;
@@ -2173,7 +2392,7 @@ class ModelDB : public DB {
  public:
   KVMap map_;
 
-  virtual SequenceNumber GetSequenceNumber() const override {
+  SequenceNumber GetSequenceNumber() const override {
     // no need to call this
     assert(false);
     return 0;
@@ -2182,45 +2401,47 @@ class ModelDB : public DB {
   explicit ModelDB(const Options& options) : options_(options) {}
 
   using DB::Put;
-  virtual Status Put(const WriteOptions& o, ColumnFamilyHandle* cf,
-                     const Slice& k, const Slice& v) override {
+  Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+             const Slice& v) override {
     WriteBatch batch;
     batch.Put(cf, k, v);
     return Write(o, &batch);
   }
+  using DB::Close;
+  Status Close() override { return Status::OK(); }
   using DB::Delete;
-  virtual Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
-                        const Slice& key) override {
+  Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                const Slice& key) override {
     WriteBatch batch;
     batch.Delete(cf, key);
     return Write(o, &batch);
   }
   using DB::SingleDelete;
-  virtual Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
-                              const Slice& key) override {
+  Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+                      const Slice& key) override {
     WriteBatch batch;
     batch.SingleDelete(cf, key);
     return Write(o, &batch);
   }
   using DB::Merge;
-  virtual Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf,
-                       const Slice& k, const Slice& v) override {
+  Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+               const Slice& v) override {
     WriteBatch batch;
     batch.Merge(cf, k, v);
     return Write(o, &batch);
   }
   using DB::Get;
-  virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* cf,
-                     const Slice& key, PinnableSlice* value) override {
+  Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+             const Slice& key, PinnableSlice* /*value*/) override {
     return Status::NotSupported(key);
   }
 
   using DB::MultiGet;
-  virtual std::vector<Status> MultiGet(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
+  std::vector<Status> MultiGet(
+      const ReadOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
       const std::vector<Slice>& keys,
-      std::vector<std::string>* values) override {
+      std::vector<std::string>* /*values*/) override {
     std::vector<Status> s(keys.size(),
                           Status::NotSupported("Not implemented."));
     return s;
   }
@@ -2228,44 +2449,50 @@ class ModelDB : public DB {
 
 #ifndef ROCKSDB_LITE
   using DB::IngestExternalFile;
-  virtual Status IngestExternalFile(
-      ColumnFamilyHandle* column_family,
-      const std::vector<std::string>& external_files,
-      const IngestExternalFileOptions& options) override {
+  Status IngestExternalFile(
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*external_files*/,
+      const IngestExternalFileOptions& /*options*/) override {
     return Status::NotSupported("Not implemented.");
   }
 
-  virtual Status VerifyChecksum() override {
+  using DB::IngestExternalFiles;
+  Status IngestExternalFiles(
+      const std::vector<IngestExternalFileArg>& /*args*/) override {
+    return Status::NotSupported("Not implemented");
+  }
+
+  Status VerifyChecksum() override {
     return Status::NotSupported("Not implemented.");
   }
 
   using DB::GetPropertiesOfAllTables;
-  virtual Status GetPropertiesOfAllTables(
-      ColumnFamilyHandle* column_family,
-      TablePropertiesCollection* props) override {
+  Status GetPropertiesOfAllTables(
+      ColumnFamilyHandle* /*column_family*/,
+      TablePropertiesCollection* /*props*/) override {
     return Status();
   }
 
-  virtual Status GetPropertiesOfTablesInRange(
-      ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
-      TablePropertiesCollection* props) override {
+  Status GetPropertiesOfTablesInRange(
+      ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+      std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
     return Status();
   }
 #endif  // ROCKSDB_LITE
 
   using DB::KeyMayExist;
-  virtual bool KeyMayExist(const ReadOptions& options,
-                           ColumnFamilyHandle* column_family, const Slice& key,
-                           std::string* value,
-                           bool* value_found = nullptr) override {
+  bool KeyMayExist(const ReadOptions& /*options*/,
+                   ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+                   std::string* /*value*/,
+                   bool* value_found = nullptr) override {
     if (value_found != nullptr) {
       *value_found = false;
     }
     return true;  // Not Supported directly
   }
   using DB::NewIterator;
-  virtual Iterator* NewIterator(const ReadOptions& options,
-                                ColumnFamilyHandle* column_family) override {
+  Iterator* NewIterator(const ReadOptions& options,
+                        ColumnFamilyHandle* /*column_family*/) override {
     if (options.snapshot == nullptr) {
       KVMap* saved = new KVMap;
       *saved = map_;
@@ -2276,37 +2503,33 @@ class ModelDB : public DB {
       return new ModelIter(snapshot_state, false);
     }
   }
-  virtual Status NewIterators(
-      const ReadOptions& options,
-      const std::vector<ColumnFamilyHandle*>& column_family,
-      std::vector<Iterator*>* iterators) override {
+  Status NewIterators(const ReadOptions& /*options*/,
+                      const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+                      std::vector<Iterator*>* /*iterators*/) override {
     return Status::NotSupported("Not supported yet");
   }
-  virtual const Snapshot* GetSnapshot() override {
+  const Snapshot* GetSnapshot() override {
     ModelSnapshot* snapshot = new ModelSnapshot;
     snapshot->map_ = map_;
     return snapshot;
   }
 
-  virtual void ReleaseSnapshot(const Snapshot* snapshot) override {
+  void ReleaseSnapshot(const Snapshot* snapshot) override {
     delete reinterpret_cast<const ModelSnapshot*>(snapshot);
   }
 
-  virtual Status Write(const WriteOptions& options,
-                       WriteBatch* batch) override {
+  Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
     class Handler : public WriteBatch::Handler {
      public:
       KVMap* map_;
-      virtual void Put(const Slice& key, const Slice& value) override {
+      void Put(const Slice& key, const Slice& value) override {
        (*map_)[key.ToString()] = value.ToString();
      }
-      virtual void Merge(const Slice& key, const Slice& value) override {
+      void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
        // ignore merge for now
        // (*map_)[key.ToString()] = value.ToString();
      }
-      virtual void Delete(const Slice& key) override {
-        map_->erase(key.ToString());
-      }
+      void Delete(const Slice& key) override { map_->erase(key.ToString()); }
    };
    Handler handler;
    handler.map_ = &map_;
@@ -2314,62 +2537,64 @@ class ModelDB : public DB {
   }
 
   using DB::GetProperty;
-  virtual bool GetProperty(ColumnFamilyHandle* column_family,
-                           const Slice& property, std::string* value) override {
+  bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+                   const Slice& /*property*/, std::string* /*value*/) override {
     return false;
   }
   using DB::GetIntProperty;
-  virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
-                              const Slice& property, uint64_t* value) override {
+  bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/, uint64_t* /*value*/) override {
     return false;
   }
   using DB::GetMapProperty;
-  virtual bool GetMapProperty(ColumnFamilyHandle* column_family,
-                              const Slice& property,
-                              std::map<std::string, std::string>* value) override {
+  bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) override {
     return false;
   }
   using DB::GetAggregatedIntProperty;
-  virtual bool GetAggregatedIntProperty(const Slice& property,
-                                        uint64_t* value) override {
+  bool GetAggregatedIntProperty(const Slice& /*property*/,
+                                uint64_t* /*value*/) override {
     return false;
   }
   using DB::GetApproximateSizes;
-  virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
-                                   const Range* range, int n, uint64_t* sizes,
-                                   uint8_t include_flags
-                                   = INCLUDE_FILES) override {
+  void GetApproximateSizes(ColumnFamilyHandle* /*column_family*/,
+                           const Range* /*range*/, int n, uint64_t* sizes,
+                           uint8_t /*include_flags*/
+                           = INCLUDE_FILES) override {
     for (int i = 0; i < n; i++) {
       sizes[i] = 0;
     }
   }
   using DB::GetApproximateMemTableStats;
-  virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
-                                           const Range& range,
-                                           uint64_t* const count,
-                                           uint64_t* const size) override {
+  void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+                                   const Range& /*range*/,
+                                   uint64_t* const count,
+                                   uint64_t* const size) override {
     *count = 0;
     *size = 0;
   }
   using DB::CompactRange;
-  virtual Status CompactRange(const CompactRangeOptions& options,
-                              ColumnFamilyHandle* column_family,
-                              const Slice* start, const Slice* end) override {
+  Status CompactRange(const CompactRangeOptions& /*options*/,
+                      ColumnFamilyHandle* /*column_family*/,
+                      const Slice* /*start*/, const Slice* /*end*/) override {
     return Status::NotSupported("Not supported operation.");
   }
 
-  virtual Status SetDBOptions(
-      const std::unordered_map<std::string, std::string>& new_options)
+  Status SetDBOptions(
+      const std::unordered_map<std::string, std::string>& /*new_options*/)
+      override {
     return Status::NotSupported("Not supported operation.");
   }
 
   using DB::CompactFiles;
-  virtual Status CompactFiles(const CompactionOptions& compact_options,
-                              ColumnFamilyHandle* column_family,
-                              const std::vector<std::string>& input_file_names,
-                              const int output_level,
-                              const int output_path_id = -1) override {
+  Status CompactFiles(
+      const CompactionOptions& /*compact_options*/,
+      ColumnFamilyHandle* /*column_family*/,
+      const std::vector<std::string>& /*input_file_names*/,
+      const int /*output_level*/, const int /*output_path_id*/ = -1,
+      std::vector<std::string>* const /*output_file_names*/ = nullptr,
+      CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
     return Status::NotSupported("Not supported operation.");
   }
 
@@ -2382,113 +2607,115 @@ class ModelDB : public DB {
   }
 
   Status EnableAutoCompaction(
-      const std::vector<ColumnFamilyHandle*>& column_family_handles) override {
+      const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+      override {
     return Status::NotSupported("Not supported operation.");
   }
 
   using DB::NumberLevels;
-  virtual int NumberLevels(ColumnFamilyHandle* column_family) override {
-    return 1;
-  }
+  int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
 
   using DB::MaxMemCompactionLevel;
-  virtual int MaxMemCompactionLevel(
-      ColumnFamilyHandle* column_family) override {
+  int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
     return 1;
   }
 
   using DB::Level0StopWriteTrigger;
-  virtual int Level0StopWriteTrigger(
-      ColumnFamilyHandle* column_family) override {
+  int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
     return -1;
   }
 
-  virtual const std::string& GetName() const override { return name_; }
+  const std::string& GetName() const override { return name_; }
 
-  virtual Env* GetEnv() const override { return nullptr; }
+  Env* GetEnv() const override { return nullptr; }
 
   using DB::GetOptions;
-  virtual Options GetOptions(ColumnFamilyHandle* column_family) const override {
+  Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
     return options_;
   }
 
   using DB::GetDBOptions;
-  virtual DBOptions GetDBOptions() const override { return options_; }
+  DBOptions GetDBOptions() const override { return options_; }
 
   using DB::Flush;
-  virtual Status Flush(const rocksdb::FlushOptions& options,
-                       ColumnFamilyHandle* column_family) override {
+  Status Flush(const rocksdb::FlushOptions& /*options*/,
+               ColumnFamilyHandle* /*column_family*/) override {
     Status ret;
     return ret;
   }
+  Status Flush(
+      const rocksdb::FlushOptions& /*options*/,
+      const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+    return Status::OK();
+  }
 
-  virtual Status SyncWAL() override { return Status::OK(); }
+  Status SyncWAL() override { return Status::OK(); }
 
 #ifndef ROCKSDB_LITE
-  virtual Status DisableFileDeletions() override { return Status::OK(); }
+  Status DisableFileDeletions() override { return Status::OK(); }
 
-  virtual Status EnableFileDeletions(bool force) override {
-    return Status::OK();
-  }
-  virtual Status GetLiveFiles(std::vector<std::string>&, uint64_t* size,
-                              bool flush_memtable = true) override {
+  Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+  Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+                      bool /*flush_memtable*/ = true) override {
     return Status::OK();
   }
 
-  virtual Status GetSortedWalFiles(VectorLogPtr& files) override {
+  Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
    return Status::OK();
  }
 
-  virtual Status DeleteFile(std::string name) override { return Status::OK(); }
+  Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
 
-  virtual Status GetUpdatesSince(
-      rocksdb::SequenceNumber, unique_ptr<TransactionLogIterator>*,
-      const TransactionLogIterator::ReadOptions& read_options =
+  Status GetUpdatesSince(
+      rocksdb::SequenceNumber,
+      std::unique_ptr<TransactionLogIterator>*,
+      const TransactionLogIterator::ReadOptions& /*read_options*/ =
          TransactionLogIterator::ReadOptions()) override {
    return Status::NotSupported("Not supported in Model DB");
  }
 
-  virtual void GetColumnFamilyMetaData(
-      ColumnFamilyHandle* column_family,
-      ColumnFamilyMetaData* metadata) override {}
+  void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+                               ColumnFamilyMetaData* /*metadata*/) override {}
 #endif  // ROCKSDB_LITE
 
-  virtual Status GetDbIdentity(std::string& identity) const override {
+  Status GetDbIdentity(std::string& /*identity*/) const override {
    return Status::OK();
  }
 
-  virtual SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+  SequenceNumber GetLatestSequenceNumber() const override { return 0; }
 
-  virtual ColumnFamilyHandle* DefaultColumnFamily() const override {
-    return nullptr;
+  bool SetPreserveDeletesSequenceNumber(SequenceNumber /*seqnum*/) override {
+    return true;
   }
 
+  ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
  private:
   class ModelIter : public Iterator {
    public:
    ModelIter(const KVMap* map, bool owned)
        : map_(map), owned_(owned), iter_(map_->end()) {}
 
-    ~ModelIter() {
+    ~ModelIter() override {
      if (owned_) delete map_;
    }
-    virtual bool Valid() const override { return iter_ != map_->end(); }
-    virtual void SeekToFirst() override { iter_ = map_->begin(); }
-    virtual void SeekToLast() override {
+    bool Valid() const override { return iter_ != map_->end(); }
+    void SeekToFirst() override { iter_ = map_->begin(); }
+    void SeekToLast() override {
      if (map_->empty()) {
        iter_ = map_->end();
      } else {
        iter_ = map_->find(map_->rbegin()->first);
      }
    }
-    virtual void Seek(const Slice& k) override {
+    void Seek(const Slice& k) override {
      iter_ = map_->lower_bound(k.ToString());
    }
-    virtual void SeekForPrev(const Slice& k) override {
+    void SeekForPrev(const Slice& k) override {
      iter_ = map_->upper_bound(k.ToString());
      Prev();
    }
-    virtual void Next() override { ++iter_; }
-    virtual void Prev() override {
+    void Next() override { ++iter_; }
+    void Prev() override {
      if (iter_ == map_->begin()) {
        iter_ = map_->end();
        return;
@@ -2496,9 +2723,9 @@ class ModelDB : public DB {
      --iter_;
    }
 
-    virtual Slice key() const override { return iter_->first; }
-    virtual Slice value() const override { return iter_->second; }
-    virtual Status status() const override { return Status::OK(); }
+    Slice key() const override { return iter_->first; }
+    Slice value() const override { return iter_->second; }
+    Status status() const override { return Status::OK(); }
 
    private:
    const KVMap* const map_;
@@ -2510,6 +2737,7 @@ class ModelDB : public DB {
   std::string name_ = "";
 };
 
+#ifndef ROCKSDB_VALGRIND_RUN
 static std::string RandomKey(Random* rnd, int minimum = 0) {
   int len;
   do {
@@ -2565,15 +2793,14 @@ static bool CompareIterators(int step, DB* model, DB* db,
 class DBTestRandomized : public DBTest,
                          public ::testing::WithParamInterface<int> {
  public:
-  virtual void SetUp() override { option_config_ = GetParam(); }
+  void SetUp() override { option_config_ = GetParam(); }
 
   static std::vector<int> GenerateOptionConfigs() {
     std::vector<int> option_configs;
     // skip cuckoo hash as it does not support snapshot.
for (int option_config = kDefault; option_config < kEnd; ++option_config) { - if (!ShouldSkipOptions(option_config, kSkipDeletesFilterFirst | - kSkipNoSeekToLast | - kSkipHashCuckoo)) { + if (!ShouldSkipOptions(option_config, + kSkipDeletesFilterFirst | kSkipNoSeekToLast)) { option_configs.push_back(option_config); } } @@ -2603,7 +2830,6 @@ TEST_P(DBTestRandomized, Randomized) { int p = rnd.Uniform(100); int minimum = 0; if (option_config_ == kHashSkipList || option_config_ == kHashLinkList || - option_config_ == kHashCuckoo || option_config_ == kPlainTableFirstBytePrefix || option_config_ == kBlockBasedTableWithWholeKeyHashIndex || option_config_ == kBlockBasedTableWithPrefixHashIndex) { @@ -2666,6 +2892,7 @@ TEST_P(DBTestRandomized, Randomized) { if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); } +#endif // ROCKSDB_VALGRIND_RUN TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { // create a DB with block prefix index @@ -2856,7 +3083,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) { Options options; options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; - options.compaction_options_fifo.ttl = 600; // seconds + options.ttl = 600; // seconds // Check that it is not supported with max_open_files != -1. options.max_open_files = 100; @@ -2872,7 +3099,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) { Options options; options.compaction_style = kCompactionStyleFIFO; options.create_if_missing = true; - options.compaction_options_fifo.ttl = 600; // seconds + options.ttl = 600; // seconds options = CurrentOptions(options); options.table_factory.reset(NewBlockBasedTableFactory()); @@ -2882,10 +3109,6 @@ TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) { options.table_factory.reset(NewPlainTableFactory()); ASSERT_TRUE(TryReopen(options).IsNotSupported()); - Destroy(options); - options.table_factory.reset(NewCuckooTableFactory()); - ASSERT_TRUE(TryReopen(options).IsNotSupported()); - Destroy(options); options.table_factory.reset(NewAdaptiveTableFactory()); ASSERT_TRUE(TryReopen(options).IsNotSupported()); @@ -2907,7 +3130,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { env_->addon_time_.store(0); options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 1 * 60 * 60 ; // 1 hour + options.ttl = 1 * 60 * 60 ; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -2942,7 +3165,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { { options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour + options.ttl = 1 * 60 * 60; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -2984,7 +3207,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { options.write_buffer_size = 10 << 10; // 10KB options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = false; - options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour + options.ttl = 1 * 60 * 60; // 1 hour options = CurrentOptions(options); DestroyAndReopen(options); @@ -3021,7 +3244,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { { options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB options.compaction_options_fifo.allow_compaction = true; - 
options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour + options.ttl = 1 * 60 * 60; // 1 hour options.level0_file_num_compaction_trigger = 6; options = CurrentOptions(options); DestroyAndReopen(options); @@ -3065,7 +3288,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { options.write_buffer_size = 20 << 10; // 20K options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB options.compaction_options_fifo.allow_compaction = true; - options.compaction_options_fifo.ttl = 1 * 60 * 60; // 1 hour + options.ttl = 1 * 60 * 60; // 1 hour options.level0_file_num_compaction_trigger = 6; options = CurrentOptions(options); DestroyAndReopen(options); @@ -3257,8 +3480,14 @@ TEST_F(DBTest, SanitizeNumThreads) { (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); } - // Wait 100 milliseconds for they are scheduled. - env_->SleepForMicroseconds(100000); + // Wait up to 10s for them to be scheduled. + for (int i = 0; i < 10000; i++) { + if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 && + options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) { + break; + } + env_->SleepForMicroseconds(1000); + } // pool size 3, total task 4. Queue size should be 1. ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); @@ -3295,6 +3524,56 @@ TEST_F(DBTest, WriteSingleThreadEntry) { } } +TEST_F(DBTest, ConcurrentFlushWAL) { + const size_t cnt = 100; + Options options; + WriteOptions wopt; + ReadOptions ropt; + for (bool two_write_queues : {false, true}) { + for (bool manual_wal_flush : {false, true}) { + options.two_write_queues = two_write_queues; + options.manual_wal_flush = manual_wal_flush; + options.create_if_missing = true; + DestroyAndReopen(options); + std::vector<port::Thread> threads; + threads.emplace_back([&] { + for (size_t i = 0; i < cnt; i++) { + auto istr = ToString(i); + db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr, "b" + istr); + } + }); + if (two_write_queues) { + threads.emplace_back([&] { + for (size_t i = cnt; i < 2 * cnt; i++) { + auto istr = ToString(i); + WriteBatch batch; + batch.Put("a" + istr, "b" + istr); + dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true); + } + }); + } + threads.emplace_back([&] { + for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put + db_->FlushWAL(false); + } + }); + for (auto& t : threads) { + t.join(); + } + options.create_if_missing = false; + // Recover from the WAL and make sure that it is not corrupted + Reopen(options); + for (size_t i = 0; i < cnt; i++) { + PinnableSlice pval; + auto istr = ToString(i); + ASSERT_OK( + db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval)); + ASSERT_TRUE(pval == ("b" + istr)); + } + } + } +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, DynamicMemtableOptions) { const uint64_t k64KB = 1 << 16; @@ -3388,7 +3667,7 @@ TEST_F(DBTest, DynamicMemtableOptions) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::DelayWrite:Wait", - [&](void* arg) { sleeping_task_low.WakeUp(); }); + [&](void* /*arg*/) { sleeping_task_low.WakeUp(); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); while (!sleeping_task_low.WokenUp() && count < 256) { @@ -3481,12 +3760,16 @@ TEST_F(DBTest, GetThreadStatus) { const int kTestCount = 3; const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3}; + const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4}; for (int test = 0; test < kTestCount; ++test) { // Change the number of threads in high / low priority pool.
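// (Aside: the new Env::BOTTOM pool is sized exactly like the HIGH and LOW
// pools. A minimal standalone sketch of configuring all three on the default
// Env; the pool roles in the comments are the usual defaults, not something
// this test asserts:
//
//   rocksdb::Env* env = rocksdb::Env::Default();
//   env->SetBackgroundThreads(2, rocksdb::Env::HIGH);    // flush pool
//   env->SetBackgroundThreads(4, rocksdb::Env::LOW);     // compaction pool
//   env->SetBackgroundThreads(1, rocksdb::Env::BOTTOM);  // bottommost-compaction pool
// )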
env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); + env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM); // Wait to ensure the all threads has been registered unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; + // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after + // all threads have been registered. // Try up to 60 seconds. for (int num_try = 0; num_try < 60000; num_try++) { env_->SleepForMicroseconds(1000); @@ -3501,20 +3784,21 @@ TEST_F(DBTest, GetThreadStatus) { if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] == kHighPriCounts[test] && thread_type_counts[ThreadStatus::LOW_PRIORITY] == - kLowPriCounts[test]) { + kLowPriCounts[test] && + thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] == + kBottomPriCounts[test]) { break; } } - // Verify the total number of threades - ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY] + - thread_type_counts[ThreadStatus::LOW_PRIORITY], - kHighPriCounts[test] + kLowPriCounts[test]); // Verify the number of high-priority threads ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY], kHighPriCounts[test]); // Verify the number of low-priority threads ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY], kLowPriCounts[test]); + // Verify the number of bottom-priority threads + ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY], + kBottomPriCounts[test]); } if (i == 0) { // repeat the test with multiple column families @@ -4219,7 +4503,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { // Clean up memtable and L0. Block compaction threads. If continue to write // and flush memtables. We should see put stop after 8 memtable flushes // since level0_stop_writes_trigger = 8 - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); // Block compaction test::SleepingBackgroundTask sleeping_task_low; @@ -4232,7 +4516,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { WriteOptions wo; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4260,7 +4544,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { count = 0; while (count < 64) { ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024), wo)); - dbfull()->TEST_FlushMemTable(true); + dbfull()->TEST_FlushMemTable(true, true); count++; if (dbfull()->TEST_write_controler().IsStopped()) { sleeping_task_low.WakeUp(); @@ -4302,6 +4586,141 @@ TEST_F(DBTest, DynamicCompactionOptions) { dbfull()->TEST_WaitForCompact(); ASSERT_LT(NumTableFilesAtLevel(0), 4); } + +// Test dynamic FIFO compaction options. +// This test covers just option parsing and makes sure that the options are +// correctly assigned. Also look at DBOptionsTest.SetFIFOCompactionOptions +// test which makes sure that the FIFO compaction functionality is working +// as expected when dynamically changing the options. +// Even more FIFOCompactionTests are at DBTest.FIFOCompaction*.
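For context before the new test: FIFO compaction parameters are runtime-mutable through DB::SetOptions(), and nested options travel as an embedded "{key=value;...}" string, exactly as the assertions below exercise. A minimal usage sketch under that layout (TuneFifoOptions and the values are illustrative only, not part of this patch):

#include "rocksdb/db.h"

void TuneFifoOptions(rocksdb::DB* db) {
  // `ttl` is a top-level option now, set independently of the FIFO struct.
  db->SetOptions({{"ttl", "600"}});
  // Nested FIFO fields keep the embedded key=value syntax.
  db->SetOptions({{"compaction_options_fifo",
                   "{max_table_files_size=1073741824;allow_compaction=true;}"}});
}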
+TEST_F(DBTest, DynamicFIFOCompactionOptions) { + Options options; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Initial defaults + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 1024 * 1024 * 1024); + ASSERT_EQ(dbfull()->GetOptions().ttl, 0); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=23;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 0); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 97); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{allow_compaction=true;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 23); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", "{max_table_files_size=31;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 31); + ASSERT_EQ(dbfull()->GetOptions().ttl, 203); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_fifo", + "{max_table_files_size=51;allow_compaction=true;}"}})); + ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, + 51); + ASSERT_EQ(dbfull()->GetOptions().ttl, 49); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, + true); +} + +TEST_F(DBTest, DynamicUniversalCompactionOptions) { + Options options; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Initial defaults + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, + 2); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, + UINT_MAX); + ASSERT_EQ(dbfull() + ->GetOptions() + .compaction_options_universal.max_size_amplification_percent, + 200); + ASSERT_EQ(dbfull() + ->GetOptions() + .compaction_options_universal.compression_size_percent, + -1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style, + kCompactionStopStyleTotalSize); + ASSERT_EQ( + dbfull()->GetOptions().compaction_options_universal.allow_trivial_move, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_universal", "{size_ratio=7;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, + 2); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, + UINT_MAX); + ASSERT_EQ(dbfull() + ->GetOptions() + 
.compaction_options_universal.max_size_amplification_percent, + 200); + ASSERT_EQ(dbfull() + ->GetOptions() + .compaction_options_universal.compression_size_percent, + -1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style, + kCompactionStopStyleTotalSize); + ASSERT_EQ( + dbfull()->GetOptions().compaction_options_universal.allow_trivial_move, + false); + + ASSERT_OK(dbfull()->SetOptions( + {{"compaction_options_universal", "{min_merge_width=11;}"}})); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width, + 11); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width, + UINT_MAX); + ASSERT_EQ(dbfull() + ->GetOptions() + .compaction_options_universal.max_size_amplification_percent, + 200); + ASSERT_EQ(dbfull() + ->GetOptions() + .compaction_options_universal.compression_size_percent, + -1); + ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style, + kCompactionStopStyleTotalSize); + ASSERT_EQ( + dbfull()->GetOptions().compaction_options_universal.allow_trivial_move, + false); +} #endif // ROCKSDB_LITE TEST_F(DBTest, FileCreationRandomFailure) { @@ -4365,6 +4784,7 @@ TEST_F(DBTest, FileCreationRandomFailure) { } #ifndef ROCKSDB_LITE + TEST_F(DBTest, DynamicMiscOptions) { // Test max_sequential_skip_in_iterations Options options; @@ -4515,7 +4935,7 @@ TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { options.compression = comp; DestroyAndReopen(options); - int kNumKeysWritten = 100000; + int kNumKeysWritten = 1000; Random rnd(301); for (int i = 0; i < kNumKeysWritten; ++i) { @@ -4596,14 +5016,14 @@ class DelayedMergeOperator : public MergeOperator { public: explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {} - virtual bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { + bool FullMergeV2(const MergeOperationInput& /*merge_in*/, + MergeOperationOutput* merge_out) const override { db_test_->env_->addon_time_.fetch_add(1000); merge_out->new_value = ""; return true; } - virtual const char* Name() const override { return "DelayedMergeOperator"; } + const char* Name() const override { return "DelayedMergeOperator"; } }; TEST_F(DBTest, MergeTestTime) { @@ -4682,6 +5102,7 @@ TEST_P(DBTestWithParam, FilterCompactionTimeTest) { options.disable_auto_compactions = true; options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); + options.statistics->set_stats_level(kExceptTimeForMutex); options.max_subcompactions = max_subcompactions_; DestroyAndReopen(options); @@ -4738,7 +5159,7 @@ TEST_F(DBTest, EmptyCompactedDB) { TEST_F(DBTest, SuggestCompactRangeTest) { class CompactionFilterFactoryGetContext : public CompactionFilterFactory { public: - virtual std::unique_ptr CreateCompactionFilter( + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { saved_context = context; std::unique_ptr empty_filter; @@ -4908,56 +5329,148 @@ TEST_F(DBTest, PromoteL0Failure) { status = experimental::PromoteL0(db_, db_->DefaultColumnFamily()); ASSERT_TRUE(status.IsInvalidArgument()); } -#endif // ROCKSDB_LITE // Github issue #596 -TEST_F(DBTest, HugeNumberOfLevels) { +TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) { + const int kNumLevels = 2; + const int kNumL0Files = 2; Options options = CurrentOptions(); - options.write_buffer_size = 2 * 1024 * 1024; // 2MB - options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB - 
options.num_levels = 12; - options.max_background_compactions = 10; - options.max_bytes_for_level_multiplier = 2; - options.level_compaction_dynamic_level_bytes = true; + options.disable_auto_compactions = true; + options.num_levels = kNumLevels; DestroyAndReopen(options); Random rnd(301); - for (int i = 0; i < 300000; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + for (int i = 0; i < kNumL0Files; ++i) { + ASSERT_OK(Put(Key(0), RandomString(&rnd, 1024))); + Flush(); } + ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files); } +#endif // ROCKSDB_LITE TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { + const int kNumL0Files = 50; Options options = CurrentOptions(); - options.write_buffer_size = 2 * 1024 * 1024; // 2MB - options.max_bytes_for_level_base = 2 * 1024 * 1024; // 2MB - options.num_levels = 12; + options.level0_file_num_compaction_trigger = 4; + // never slowdown / stop + options.level0_slowdown_writes_trigger = 999999; + options.level0_stop_writes_trigger = 999999; options.max_background_compactions = 10; - options.max_bytes_for_level_multiplier = 2; - options.level_compaction_dynamic_level_bytes = true; DestroyAndReopen(options); - Random rnd(301); - for (int i = 0; i < 300000; ++i) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); - } - + // schedule automatic compactions after the manual one starts, but before it + // finishes to ensure conflict. + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction:Start", + "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"}, + {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts", + "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}}); std::atomic callback_count(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction()::Conflict", - [&](void* arg) { callback_count.fetch_add(1); }); + "DBImpl::MaybeScheduleFlushOrCompaction:Conflict", + [&](void* /*arg*/) { callback_count.fetch_add(1); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - CompactRangeOptions croptions; - croptions.exclusive_manual_compaction = false; - ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + + Random rnd(301); + for (int i = 0; i < 2; ++i) { + // put two keys to ensure no trivial move + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + } + ASSERT_OK(Flush()); + } + port::Thread manual_compaction_thread([this]() { + CompactRangeOptions croptions; + croptions.exclusive_manual_compaction = true; + ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + }); + + TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts"); + for (int i = 0; i < kNumL0Files; ++i) { + // put two keys to ensure no trivial move + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + } + ASSERT_OK(Flush()); + } + TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts"); + ASSERT_GE(callback_count.load(), 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - for (int i = 0; i < 300000; ++i) { + for (int i = 0; i < 2; ++i) { ASSERT_NE("NOT_FOUND", Get(Key(i))); } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + manual_compaction_thread.join(); + dbfull()->TEST_WaitForCompact(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) { + 
Options options = CurrentOptions(); + options.max_background_compactions = 1; + options.level0_file_num_compaction_trigger = 4; + options.level0_slowdown_writes_trigger = 36; + options.level0_stop_writes_trigger = 36; + DestroyAndReopen(options); + + // generate files for manual compaction + Random rnd(301); + for (int i = 0; i < 2; ++i) { + // put two keys to ensure no trivial move + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + } + ASSERT_OK(Flush()); + } + + rocksdb::ColumnFamilyMetaData cf_meta_data; + db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data); + + std::vector input_files; + input_files.push_back(cf_meta_data.levels[0].files[0].name); + + SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:0", + "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"}, + {"DBTest::CompactFilesShouldTriggerAutoCompaction:End", + "CompactFilesImpl:1"}, + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread manual_compaction_thread([&]() { + auto s = db_->CompactFiles(CompactionOptions(), + db_->DefaultColumnFamily(), input_files, 0); + }); + + TEST_SYNC_POINT( + "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"); + // generate enough files to trigger compaction + for (int i = 0; i < 20; ++i) { + for (int j = 0; j < 2; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 1024))); + } + ASSERT_OK(Flush()); + } + db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data); + ASSERT_GT(cf_meta_data.levels[0].files.size(), + options.level0_file_num_compaction_trigger); + TEST_SYNC_POINT( + "DBTest::CompactFilesShouldTriggerAutoCompaction:End"); + + manual_compaction_thread.join(); + dbfull()->TEST_WaitForCompact(); + + db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data); + ASSERT_LE(cf_meta_data.levels[0].files.size(), + options.level0_file_num_compaction_trigger); } +#endif // ROCKSDB_LITE // Github issue #595 // Large write batch with column families @@ -5146,7 +5659,7 @@ TEST_F(DBTest, HardLimit) { std::atomic callback_count(0); rocksdb::SyncPoint::GetInstance()->SetCallBack("DBImpl::DelayWrite:Wait", - [&](void* arg) { + [&](void* /*arg*/) { callback_count.fetch_add(1); sleeping_task_low.WakeUp(); }); @@ -5171,7 +5684,23 @@ TEST_F(DBTest, HardLimit) { sleeping_task_low.WaitUntilDone(); } -#ifndef ROCKSDB_LITE +#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +class WriteStallListener : public EventListener { + public: + WriteStallListener() : condition_(WriteStallCondition::kNormal) {} + void OnStallConditionsChanged(const WriteStallInfo& info) override { + MutexLock l(&mutex_); + condition_ = info.condition.cur; + } + bool CheckCondition(WriteStallCondition expected) { + MutexLock l(&mutex_); + return expected == condition_; + } + private: + port::Mutex mutex_; + WriteStallCondition condition_; +}; + TEST_F(DBTest, SoftLimit) { Options options = CurrentOptions(); options.env = env_; @@ -5187,6 +5716,43 @@ TEST_F(DBTest, SoftLimit) { options.max_bytes_for_level_multiplier = 10; options.max_background_compactions = 1; options.compression = kNoCompression; + WriteStallListener* listener = new WriteStallListener(); + options.listeners.emplace_back(listener); + + // FlushMemtable with opt.wait=true does not wait for + // `OnStallConditionsChanged` being called. The event listener is triggered + // on `JobContext::Clean`, which happens after flush result is installed. 
+ // We use sync point to create a custom WaitForFlush that waits for + // context cleanup. + port::Mutex flush_mutex; + port::CondVar flush_cv(&flush_mutex); + bool flush_finished = false; + auto InstallFlushCallback = [&]() { + { + MutexLock l(&flush_mutex); + flush_finished = false; + } + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) { + { + MutexLock l(&flush_mutex); + flush_finished = true; + } + flush_cv.SignalAll(); + }); + }; + auto WaitForFlush = [&]() { + { + MutexLock l(&flush_mutex); + while (!flush_finished) { + flush_cv.Wait(); + } + } + SyncPoint::GetInstance()->ClearCallBack( + "DBImpl::BackgroundCallFlush:ContextCleanedUp"); + }; + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); @@ -5194,7 +5760,7 @@ TEST_F(DBTest, SoftLimit) { for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5204,7 +5770,7 @@ TEST_F(DBTest, SoftLimit) { for (int i = 0; i < 72; i++) { Put(Key(i), std::string(5000, 'x')); if (i % 10 == 0) { - Flush(); + dbfull()->TEST_FlushMemTable(true, true); } } dbfull()->TEST_WaitForCompact(); @@ -5223,9 +5789,12 @@ TEST_F(DBTest, SoftLimit) { Put(Key(i), std::string(5000, 'x')); Put(Key(100 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + InstallFlushCallback(); + dbfull()->TEST_FlushMemTable(true, true); + WaitForFlush(); } ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); @@ -5236,18 +5805,17 @@ TEST_F(DBTest, SoftLimit) { // The L1 file size is around 30KB. ASSERT_EQ(NumTableFilesAtLevel(1), 1); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal)); // Only allow one compactin going through. rocksdb::SyncPoint::GetInstance()->SetCallBack( - "BackgroundCallCompaction:0", [&](void* arg) { + "BackgroundCallCompaction:0", [&](void* /*arg*/) { // Schedule a sleeping task. sleeping_task_low.Reset(); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); sleeping_task_low.WaitUntilSleeping(); @@ -5256,7 +5824,9 @@ TEST_F(DBTest, SoftLimit) { Put(Key(10 + i), std::string(5000, 'x')); Put(Key(90 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. - Flush(); + InstallFlushCallback(); + dbfull()->TEST_FlushMemTable(true, true); + WaitForFlush(); } // Wake up sleep task to enable compaction to run and waits @@ -5270,13 +5840,16 @@ TEST_F(DBTest, SoftLimit) { // doesn't trigger soft_pending_compaction_bytes_limit ASSERT_EQ(NumTableFilesAtLevel(1), 1); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal)); // Create 3 L0 files, making score of L0 to be 3, higher than L0. for (int i = 0; i < 3; i++) { Put(Key(20 + i), std::string(5000, 'x')); Put(Key(80 - i), std::string(5000, 'x')); // Flush the file. File size is around 30KB. 
- Flush(); + InstallFlushCallback(); + dbfull()->TEST_FlushMemTable(true, true); + WaitForFlush(); } // Wake up sleep task to enable compaction to run and waits // for it to go to sleep state again to make sure one compaction @@ -5290,11 +5863,13 @@ TEST_F(DBTest, SoftLimit) { // triggerring soft_pending_compaction_bytes_limit ASSERT_EQ(NumTableFilesAtLevel(1), 1); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilSleeping(); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal)); // shrink level base so L2 will hit soft limit easier. ASSERT_OK(dbfull()->SetOptions({ @@ -5304,6 +5879,7 @@ TEST_F(DBTest, SoftLimit) { Put("", ""); Flush(); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed)); sleeping_task_low.WaitUntilSleeping(); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); @@ -5345,7 +5921,7 @@ TEST_F(DBTest, LastWriteBufferDelay) { sleeping_task.WakeUp(); sleeping_task.WaitUntilDone(); } -#endif // ROCKSDB_LITE +#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { CompressionType compressions[] = {kZlibCompression, kBZip2Compression, @@ -5489,6 +6065,50 @@ TEST_F(DBTest, PauseBackgroundWorkTest) { // now it's done ASSERT_TRUE(done.load()); } + +// Keep spawning short-living threads that create an iterator and quit. +// Meanwhile in another thread keep flushing memtables. +// This used to cause a deadlock. +TEST_F(DBTest, ThreadLocalPtrDeadlock) { + std::atomic flushes_done{0}; + std::atomic threads_destroyed{0}; + auto done = [&] { + return flushes_done.load() > 10; + }; + + port::Thread flushing_thread([&] { + for (int i = 0; !done(); ++i) { + ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"), + Slice(std::to_string(i).c_str()))); + ASSERT_OK(db_->Flush(FlushOptions())); + int cnt = ++flushes_done; + fprintf(stderr, "Flushed %d times\n", cnt); + } + }); + + std::vector thread_spawning_threads(10); + for (auto& t: thread_spawning_threads) { + t = port::Thread([&] { + while (!done()) { + { + port::Thread tmp_thread([&] { + auto it = db_->NewIterator(ReadOptions()); + delete it; + }); + tmp_thread.join(); + } + ++threads_destroyed; + } + }); + } + + for (auto& t: thread_spawning_threads) { + t.join(); + } + flushing_thread.join(); + fprintf(stderr, "Done. 
Flushed %d times, destroyed %d threads\n", + flushes_done.load(), threads_destroyed.load()); +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_test2.cc b/thirdparty/rocksdb/db/db_test2.cc index 30afd5a690..6a00300eb5 100644 --- a/thirdparty/rocksdb/db/db_test2.cc +++ b/thirdparty/rocksdb/db/db_test2.cc @@ -11,6 +11,7 @@ #include #include "db/db_test_util.h" +#include "db/read_callback.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/persistent_cache.h" @@ -29,7 +30,7 @@ class PrefixFullBloomWithReverseComparator public: PrefixFullBloomWithReverseComparator() : DBTestBase("/prefix_bloom_reverse") {} - virtual void SetUp() override { if_cache_filter_ = GetParam(); } + void SetUp() override { if_cache_filter_ = GetParam(); } bool if_cache_filter_; }; @@ -60,7 +61,7 @@ TEST_P(PrefixFullBloomWithReverseComparator, bbto.block_cache->EraseUnRefEntries(); } - unique_ptr iter(db_->NewIterator(ReadOptions())); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); iter->Seek("bar345"); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); @@ -341,6 +342,7 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { ASSERT_GE(cache->GetUsage(), 1024 * 1024); Close(); options.write_buffer_manager.reset(); + last_options_.write_buffer_manager.reset(); ASSERT_LT(cache->GetUsage(), 1024 * 1024); } rocksdb::SyncPoint::GetInstance()->DisableProcessing(); @@ -353,7 +355,7 @@ INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs, std::make_tuple(false, true))); TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { - std::string dbname2 = test::TmpDir(env_) + "/db_shared_wb_db2"; + std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; // Avoid undeterministic value by malloc_usable_size(); @@ -453,6 +455,22 @@ TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) { + Options options = CurrentOptions(); + options.arena_block_size = 4096; + std::shared_ptr cache = + NewLRUCache(LRUCacheOptions(10000000, 1, false, 0.0)); + options.write_buffer_size = 50000; // this is never hit + // Use a write buffer total size so that the soft limit is about + // 105000. + options.write_buffer_manager.reset(new WriteBufferManager(0, cache)); + Reopen(options); + + ASSERT_OK(Put("foo", "bar")); + // One dummy entry is 1MB. 
+ ASSERT_GT(cache->GetUsage(), 500000); +} + namespace { void ValidateKeyExistence(DB* db, const std::vector& keys_must_exist, const std::vector& keys_must_not_exist) { @@ -497,9 +515,9 @@ TEST_F(DBTest2, WalFilterTest) { apply_option_at_record_index_(apply_option_for_record_index), current_record_index_(0) {} - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const override { + WalProcessingOption LogRecord(const WriteBatch& /*batch*/, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const override { WalFilter::WalProcessingOption option_to_return; if (current_record_index_ == apply_option_at_record_index_) { @@ -517,7 +535,7 @@ TEST_F(DBTest2, WalFilterTest) { return option_to_return; } - virtual const char* Name() const override { return "TestWalFilter"; } + const char* Name() const override { return "TestWalFilter"; } }; // Create 3 batches with two keys each @@ -669,7 +687,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { : new_write_batch_(new_write_batch), num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), num_keys_added_(0) {} - virtual void Put(const Slice& key, const Slice& value) override { + void Put(const Slice& key, const Slice& value) override { if (num_keys_added_ < num_keys_to_add_in_new_batch_) { new_write_batch_->Put(key, value); ++num_keys_added_; @@ -693,9 +711,9 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), current_record_index_(0) {} - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const override { + WalProcessingOption LogRecord(const WriteBatch& batch, + WriteBatch* new_batch, + bool* batch_changed) const override { if (current_record_index_ >= change_records_from_index_) { ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_); batch.Iterate(&handler); @@ -711,9 +729,7 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { return WalProcessingOption::kContinueProcessing; } - virtual const char* Name() const override { - return "TestWalFilterWithChangeBatch"; - } + const char* Name() const override { return "TestWalFilterWithChangeBatch"; } }; std::vector> batch_keys(3); @@ -791,18 +807,17 @@ TEST_F(DBTest2, WalFilterTestWithChangeBatch) { TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter { public: - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const override { - *new_batch = batch; - new_batch->Put("key_extra", "value_extra"); - *batch_changed = true; - return WalProcessingOption::kContinueProcessing; - } - - virtual const char* Name() const override { - return "WalFilterTestWithChangeBatchExtraKeys"; - } + WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, + bool* batch_changed) const override { + *new_batch = batch; + new_batch->Put("key_extra", "value_extra"); + *batch_changed = true; + return WalProcessingOption::kContinueProcessing; + } + + const char* Name() const override { + return "WalFilterTestWithChangeBatchExtraKeys"; + } }; std::vector> batch_keys(3); @@ -866,19 +881,19 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { // for verification against the keys we expect. 
std::map> cf_wal_keys_; public: - virtual void ColumnFamilyLogNumberMap( - const std::map& cf_lognumber_map, - const std::map& cf_name_id_map) override { - cf_log_number_map_ = cf_lognumber_map; - cf_name_id_map_ = cf_name_id_map; - } - - virtual WalProcessingOption LogRecordFound(unsigned long long log_number, - const std::string& log_file_name, - const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) override { - class LogRecordBatchHandler : public WriteBatch::Handler { + void ColumnFamilyLogNumberMap( + const std::map& cf_lognumber_map, + const std::map& cf_name_id_map) override { + cf_log_number_map_ = cf_lognumber_map; + cf_name_id_map_ = cf_name_id_map; + } + + WalProcessingOption LogRecordFound(unsigned long long log_number, + const std::string& /*log_file_name*/, + const WriteBatch& batch, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) override { + class LogRecordBatchHandler : public WriteBatch::Handler { private: const std::map & cf_log_number_map_; std::map> & cf_wal_keys_; @@ -891,8 +906,8 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { cf_wal_keys_(cf_wal_keys), log_number_(current_log_number){} - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& /*value*/) override { + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& /*value*/) override { auto it = cf_log_number_map_.find(column_family_id); assert(it != cf_log_number_map_.end()); unsigned long long log_number_for_cf = it->second; @@ -910,11 +925,11 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { batch.Iterate(&handler); return WalProcessingOption::kContinueProcessing; - } + } - virtual const char* Name() const override { - return "WalFilterTestWithColumnFamilies"; - } + const char* Name() const override { + return "WalFilterTestWithColumnFamilies"; + } const std::map>& GetColumnFamilyKeys() { return cf_wal_keys_; @@ -1021,7 +1036,10 @@ TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { ASSERT_TRUE(index == keys_cf.size()); } -TEST_F(DBTest2, PresetCompressionDict) { +// Temporarily disable it because the test is flaky. +TEST_F(DBTest2, DISABLED_PresetCompressionDict) { + // Verifies that compression ratio improves when dictionary is enabled, and + // improves even further when the dictionary is trained by ZSTD. const size_t kBlockSizeBytes = 4 << 10; const size_t kL0FileBytes = 128 << 10; const size_t kApproxPerBlockOverheadBytes = 50; @@ -1031,7 +1049,6 @@ TEST_F(DBTest2, PresetCompressionDict) { options.env = CurrentOptions().env; // Make sure to use any custom env that the test is configured with. options.allow_concurrent_memtable_write = false; options.arena_block_size = kBlockSizeBytes; - options.compaction_style = kCompactionStyleUniversal; options.create_if_missing = true; options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumL0Files; @@ -1058,43 +1075,63 @@ TEST_F(DBTest2, PresetCompressionDict) { for (auto compression_type : compression_types) { options.compression = compression_type; size_t prev_out_bytes; - for (int i = 0; i < 2; ++i) { + for (int i = 0; i < 3; ++i) { // First iteration: compress without preset dictionary // Second iteration: compress with preset dictionary - // To make sure the compression dictionary was actually used, we verify - // the compressed size is smaller in the second iteration. Also in the - // second iteration, verify the data we get out is the same data we put - // in. 
- if (i) { - options.compression_opts.max_dict_bytes = kBlockSizeBytes; - } else { - options.compression_opts.max_dict_bytes = 0; + // Third iteration (zstd only): compress with zstd-trained dictionary + // + // To make sure the compression dictionary has the intended effect, we + // verify the compressed size is smaller in successive iterations. Also in + // the non-first iterations, verify the data we get out is the same data + // we put in. + switch (i) { + case 0: + options.compression_opts.max_dict_bytes = 0; + options.compression_opts.zstd_max_train_bytes = 0; + break; + case 1: + options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + options.compression_opts.zstd_max_train_bytes = 0; + break; + case 2: + if (compression_type != kZSTD) { + continue; + } + options.compression_opts.max_dict_bytes = 4 * kBlockSizeBytes; + options.compression_opts.zstd_max_train_bytes = kL0FileBytes; + break; + default: + assert(false); } options.statistics = rocksdb::CreateDBStatistics(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); - std::string seq_data = - RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes); + std::string seq_datas[10]; + for (int j = 0; j < 10; ++j) { + seq_datas[j] = + RandomString(&rnd, kBlockSizeBytes - kApproxPerBlockOverheadBytes); + } ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); for (int j = 0; j < kNumL0Files; ++j) { for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) { - ASSERT_OK(Put(1, Key(static_cast( - j * (kL0FileBytes / kBlockSizeBytes) + k)), - seq_data)); + auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k; + ASSERT_OK(Put(1, Key(static_cast(key_num)), + seq_datas[(key_num / 10) % 10])); } dbfull()->TEST_WaitForFlushMemTable(handles_[1]); ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1)); } - db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1], + true /* disallow_trivial_move */); ASSERT_EQ(0, NumTableFilesAtLevel(0, 1)); ASSERT_GT(NumTableFilesAtLevel(1, 1), 0); size_t out_bytes = 0; std::vector files; - GetSstFiles(dbname_, &files); + GetSstFiles(env_, dbname_, &files); for (const auto& file : files) { uint64_t curr_bytes; env_->GetFileSize(dbname_ + "/" + file, &curr_bytes); @@ -1103,7 +1140,7 @@ TEST_F(DBTest2, PresetCompressionDict) { for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes); j++) { - ASSERT_EQ(seq_data, Get(1, Key(static_cast(j)))); + ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast(j)))); } if (i) { ASSERT_GT(prev_out_bytes, out_bytes); @@ -1114,6 +1151,70 @@ TEST_F(DBTest2, PresetCompressionDict) { } } +TEST_F(DBTest2, PresetCompressionDictLocality) { + if (!ZSTD_Supported()) { + return; + } + // Verifies that compression dictionary is generated from local data. The + // verification simply checks all output SSTs have different compression + // dictionaries. We do not verify effectiveness as that'd likely be flaky in + // the future. 
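// (Aside: the dictionary knobs exercised below are ordinary compression
// options; a minimal standalone configuration sketch, with purely
// illustrative sizes and DB path, would be:
//
//   rocksdb::Options opts;
//   opts.create_if_missing = true;
//   opts.compression = rocksdb::kZSTD;
//   opts.compression_opts.max_dict_bytes = 16 << 10;         // 16KB dictionary cap
//   opts.compression_opts.zstd_max_train_bytes = 256 << 10;  // enable ZSTD training
//   rocksdb::DB* db = nullptr;
//   rocksdb::Status s = rocksdb::DB::Open(opts, "/tmp/dict_db", &db);
// )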
+ const int kNumEntriesPerFile = 1 << 10; // 1KB + const int kNumBytesPerEntry = 1 << 10; // 1KB + const int kNumFiles = 4; + Options options = CurrentOptions(); + options.compression = kZSTD; + options.compression_opts.max_dict_bytes = 1 << 14; // 16KB + options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB + options.statistics = rocksdb::CreateDBStatistics(); + options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kNumEntriesPerFile; ++j) { + ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j), + RandomString(&rnd, kNumBytesPerEntry))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ(NumTableFilesAtLevel(1), i + 1); + } + + // Store all the dictionaries generated during a full compaction. + std::vector compression_dicts; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + [&](void* arg) { + compression_dicts.emplace_back(static_cast(arg)->ToString()); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + CompactRangeOptions compact_range_opts; + compact_range_opts.bottommost_level_compaction = + BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); + + // Dictionary compression should not be so good as to compress four totally + // random files into one. If it does then there's probably something wrong + // with the test. + ASSERT_GT(NumTableFilesAtLevel(1), 1); + + // Furthermore, there should be one compression dictionary generated per file. + // And they should all be different from each other. 
+ ASSERT_EQ(NumTableFilesAtLevel(1), + static_cast(compression_dicts.size())); + for (size_t i = 1; i < compression_dicts.size(); ++i) { + std::string& a = compression_dicts[i - 1]; + std::string& b = compression_dicts[i]; + size_t alen = a.size(); + size_t blen = b.size(); + ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0); + } +} + class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) @@ -1133,7 +1234,7 @@ class CompactionCompressionListener : public EventListener { } if (db_options_->bottommost_compression != kDisableCompressionOption && - ci.output_level == bottommost_level && ci.output_level >= 2) { + ci.output_level == bottommost_level) { ASSERT_EQ(ci.compression, db_options_->bottommost_compression); } else if (db_options_->compression_per_level.size() != 0) { ASSERT_EQ(ci.compression, @@ -1210,14 +1311,23 @@ TEST_F(DBTest2, CompressionOptions) { class CompactionStallTestListener : public EventListener { public: - CompactionStallTestListener() : compacted_files_cnt_(0) {} + CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {} - void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(ci.cf_name, "default"); + ASSERT_EQ(ci.base_input_level, 0); + ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); + compacting_files_cnt_ += ci.input_files.size(); + } + + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); compacted_files_cnt_ += ci.input_files.size(); } + + std::atomic compacting_files_cnt_; std::atomic compacted_files_cnt_; }; @@ -1226,6 +1336,8 @@ TEST_F(DBTest2, CompactionStall) { {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"}, {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"}, {"DBTest2::CompactionStall:2", + "DBImpl::NotifyOnCompactionBegin::UnlockMutex"}, + {"DBTest2::CompactionStall:3", "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}}); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -1267,14 +1379,18 @@ TEST_F(DBTest2, CompactionStall) { // Wait for another compaction to be triggered TEST_SYNC_POINT("DBTest2::CompactionStall:1"); - // Hold NotifyOnCompactionCompleted in the unlock mutex section + // Hold NotifyOnCompactionBegin in the unlock mutex section TEST_SYNC_POINT("DBTest2::CompactionStall:2"); + // Hold NotifyOnCompactionCompleted in the unlock mutex section + TEST_SYNC_POINT("DBTest2::CompactionStall:3"); + dbfull()->TEST_WaitForCompact(); ASSERT_LT(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); ASSERT_GT(listener->compacted_files_cnt_.load(), 10 - options.level0_file_num_compaction_trigger); + ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load()); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } @@ -1296,11 +1412,49 @@ TEST_F(DBTest2, FirstSnapshotTest) { db_->ReleaseSnapshot(s1); } -class PinL0IndexAndFilterBlocksTest : public DBTestBase, - public testing::WithParamInterface { +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, DuplicateSnapshot) { + Options options; + options = CurrentOptions(options); + std::vector snapshots; + DBImpl* dbi = reinterpret_cast(db_); + SequenceNumber oldest_ww_snap, first_ww_snap; + + Put("k", "v"); // inc seq + 
snapshots.push_back(db_->GetSnapshot()); + snapshots.push_back(db_->GetSnapshot()); + Put("k", "v"); // inc seq + snapshots.push_back(db_->GetSnapshot()); + snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); + first_ww_snap = snapshots.back()->GetSequenceNumber(); + Put("k", "v"); // inc seq + snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); + snapshots.push_back(db_->GetSnapshot()); + Put("k", "v"); // inc seq + snapshots.push_back(db_->GetSnapshot()); + + { + InstrumentedMutexLock l(dbi->mutex()); + auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap); + ASSERT_EQ(seqs.size(), 4); // duplicates are not counted + ASSERT_EQ(oldest_ww_snap, first_ww_snap); + } + + for (auto s : snapshots) { + db_->ReleaseSnapshot(s); + } +} +#endif // ROCKSDB_LITE + +class PinL0IndexAndFilterBlocksTest + : public DBTestBase, + public testing::WithParamInterface> { public: PinL0IndexAndFilterBlocksTest() : DBTestBase("/db_pin_l0_index_bloom_test") {} - virtual void SetUp() override { infinite_max_files_ = GetParam(); } + void SetUp() override { + infinite_max_files_ = std::get<0>(GetParam()); + disallow_preload_ = std::get<1>(GetParam()); + } void CreateTwoLevels(Options* options, bool close_afterwards) { if (infinite_max_files_) { @@ -1337,6 +1491,7 @@ class PinL0IndexAndFilterBlocksTest : public DBTestBase, } bool infinite_max_files_; + bool disallow_preload_; }; TEST_P(PinL0IndexAndFilterBlocksTest, @@ -1427,12 +1582,27 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + if (disallow_preload_) { + // Now we have two files. We narrow the max open files to allow 3 entries + // so that preloading SST files won't happen. + options.max_open_files = 13; + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = 13; + }); + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + // Reopen database. If max_open_files is set as -1, table readers will be // preloaded. This will trigger a BlockBasedTable::Open() and prefetch // L0 index and filter. Level 1's prefetching is disabled in DB::Open() TryReopenWithColumnFamilies({"default", "pikachu"}, options); - if (infinite_max_files_) { + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + if (!disallow_preload_) { // After reopen, cache miss are increased by one because we read (and only // read) filter and index on L0 ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); @@ -1460,7 +1630,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { // this should be read from L1 value = Get(1, "a"); - if (infinite_max_files_) { + if (!disallow_preload_) { // In inifinite max files case, there's a cache miss in executing Get() // because index and filter are not prefetched before. ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); @@ -1478,10 +1648,45 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } + + // Force a full compaction to one single file. There will be a block + // cache read for both of index and filter. 
If prefetch doesn't explicitly + // happen, it will happen when verifying the file. + Compact(1, "a", "zzzzz"); + dbfull()->TEST_WaitForCompact(); + + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } + + // Bloom and index hit will happen when a Get() happens. + value = Get(1, "a"); + if (!disallow_preload_) { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } else { + ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + } } INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, - PinL0IndexAndFilterBlocksTest, ::testing::Bool()); + PinL0IndexAndFilterBlocksTest, + ::testing::Values(std::make_tuple(true, false), + std::make_tuple(false, false), + std::make_tuple(false, true))); #ifndef ROCKSDB_LITE TEST_F(DBTest2, MaxCompactionBytesTest) { @@ -1549,7 +1754,7 @@ class MockPersistentCache : public PersistentCache { "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); } - virtual ~MockPersistentCache() {} + ~MockPersistentCache() override {} PersistentCache::StatsType Stats() override { return PersistentCache::StatsType(); @@ -1597,6 +1802,127 @@ class MockPersistentCache : public PersistentCache { const size_t max_size_ = 10 * 1024; // 10KiB }; +#ifdef OS_LINUX +// Make sure that in CPU time perf context counters, Env::NowCPUNanos() +// is used, rather than Env::CPUNanos(); +TEST_F(DBTest2, TestPerfContextGetCpuTime) { + // force resizing table cache so table handle is not preloaded so that + // we can measure find_table_nanos during Get(). + dbfull()->TEST_table_cache()->SetCapacity(0); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + env_->now_cpu_count_.store(0); + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + + uint64_t kDummyAddonTime = uint64_t{1000000000000}; + + // Add time to NowNanos() reading. 
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + ASSERT_EQ("bar", Get("foo")); + ASSERT_GT(env_->now_cpu_count_.load(), 2); + ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonTime); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestPerfContextIterCpuTime) { + DestroyAndReopen(CurrentOptions()); + // force resizing table cache so table handle is not preloaded so that + // we can measure find_table_nanos during iteration + dbfull()->TEST_table_cache()->SetCapacity(0); + + const size_t kNumEntries = 10; + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_OK(Put("k" + ToString(i), "v" + ToString(i))); + } + ASSERT_OK(Flush()); + for (size_t i = 0; i < kNumEntries; ++i) { + ASSERT_EQ("v" + ToString(i), Get("k" + ToString(i))); + } + std::string last_key = "k" + ToString(kNumEntries - 1); + std::string last_value = "v" + ToString(kNumEntries - 1); + env_->now_cpu_count_.store(0); + + // CPU timing is not enabled with kEnableTimeExceptForMutex + SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); + Iterator* iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); + ASSERT_EQ(0, env_->now_cpu_count_.load()); + delete iter; + + uint64_t kDummyAddonTime = uint64_t{1000000000000}; + + // Add time to NowNanos() reading. 
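// (Aside: outside these tests, the measurement pattern both of them rely on
// is just the public perf-context API; a minimal sketch, assuming an
// already-open rocksdb::DB* db:
//
//   #include "rocksdb/perf_context.h"
//   #include "rocksdb/perf_level.h"
//
//   rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
//   rocksdb::get_perf_context()->Reset();
//   std::string value;
//   db->Get(rocksdb::ReadOptions(), "foo", &value);
//   uint64_t cpu_ns = rocksdb::get_perf_context()->get_cpu_nanos;  // CPU, not wall time
//   rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
//
// The dummy addon time installed by the callback below then proves the
// counters track CPU time rather than the padded NowNanos() clock.)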
+ rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TableCache::FindTable:0", + [&](void* /*arg*/) { env_->addon_time_.fetch_add(kDummyAddonTime); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + iter = db_->NewIterator(ReadOptions()); + iter->Seek("k0"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonTime); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonTime); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); + ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonTime); + ASSERT_GE(env_->now_cpu_count_.load(), 12); + ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonTime); + + SetPerfLevel(PerfLevel::kDisable); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + delete iter; +} +#endif // OS_LINUX + #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, PersistentCache) { int num_iter = 80; @@ -1673,7 +1999,7 @@ TEST_F(DBTest2, SyncPointMarker) { std::atomic sync_point_called(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBTest2::MarkedPoint", - [&](void* arg) { sync_point_called.fetch_add(1); }); + [&](void* /*arg*/) { sync_point_called.fetch_add(1); }); // The first dependency enforces Marker can be loaded before MarkedPoint. // The second checks that thread 1's MarkedPoint should be disabled here. 
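The tests in this file lean heavily on the SyncPoint facility used above; the general idiom, sketched here for reference (SyncPoint lives in util/sync_point.h and is a test-only utility, not public API):

#include "util/sync_point.h"  // internal test utility

void SyncPointIdiom() {
  // Order two named points across threads, and hook a callback into a third.
  rocksdb::SyncPoint::GetInstance()->LoadDependency(
      {{"ThreadA:Reached", "ThreadB:MayProceed"}});
  rocksdb::SyncPoint::GetInstance()->SetCallBack(
      "SomeComponent:SomePoint",
      [](void* /*arg*/) { /* inspect or mutate state mid-operation */ });
  rocksdb::SyncPoint::GetInstance()->EnableProcessing();
  // ... run the scenario; TEST_SYNC_POINT("ThreadA:Reached") fires here ...
  rocksdb::SyncPoint::GetInstance()->DisableProcessing();
  rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
}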
@@ -1801,11 +2127,29 @@ TEST_F(DBTest2, ReadAmpBitmap) { #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { - if (dbname_.find("dev/shm") != std::string::npos) { - // /dev/shm dont support getting a unique file id, this mean that - // running this test on /dev/shm will fail because lru_cache will load - // the blocks again regardless of them being already in the cache - return; + { + const int kIdBufLen = 100; + char id_buf[kIdBufLen]; +#ifndef OS_WIN + // You can't open a directory on windows using random access file + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(dbname_, &file, EnvOptions())); + if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will load + // the blocks again regardless of them being already in the cache + return; + } +#else + std::unique_ptr dir; + ASSERT_OK(env_->NewDirectory(dbname_, &dir)); + if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { + // fs holding db directory doesn't support getting a unique file id, + // this means that running this test will fail because lru_cache will load + // the blocks again regardless of them being already in the cache + return; + } +#endif } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { @@ -1918,19 +2262,19 @@ TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); auto get_stat = [](std::string level_str, LevelStatType type, - std::map props) { + std::map props) { auto prop_str = - level_str + "." + + "compaction." + level_str + "." + InternalStats::compaction_level_stats.at(type).property_name.c_str(); auto prop_item = props.find(prop_str); - return prop_item == props.end() ? 0 : prop_item->second; + return prop_item == props.end() ? 0 : std::stod(prop_item->second); }; // Trivial move 2 files to L2 ASSERT_EQ("0,0,2", FilesPerLevel()); // Also test that the stats GetMapProperty API reporting the same result { - std::map prop; + std::map prop; ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop)); ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop)); @@ -1942,7 +2286,7 @@ TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { // can fit in L2, these 2 files will be moved to L2 and overlap with // the running compaction and break the LSM consistency. rocksdb::SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::Run():Start", [&](void* arg) { + "CompactionJob::Run():Start", [&](void* /*arg*/) { ASSERT_OK( dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}, {"max_bytes_for_level_base", "1"}})); @@ -1966,7 +2310,7 @@ TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { // Test that the stats GetMapProperty API reporting 1 file in L2 { - std::map prop; + std::map prop; ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop)); } @@ -2008,7 +2352,7 @@ TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) { // the running compaction and break the LSM consistency. 
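The get_stat() helper below tracks an API change worth noting: GetMapProperty() now returns string values, so numeric stats are parsed by the caller, and per-level compaction stats carry a "compaction." key prefix. A caller-side sketch (the exact key name follows the LevelStat table in internal_stats.cc; treat it as illustrative):

#include <map>
#include <string>

#include "rocksdb/db.h"

double L0FileCount(rocksdb::DB* db) {
  std::map<std::string, std::string> prop;
  if (!db->GetMapProperty("rocksdb.cfstats", &prop)) {
    return 0;
  }
  // Keys look like "compaction.<level>.<stat>", e.g. "compaction.L0.NumFiles".
  auto it = prop.find("compaction.L0.NumFiles");
  return it == prop.end() ? 0 : std::stod(it->second);
}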
 std::atomic<bool> flag(false);
   rocksdb::SyncPoint::GetInstance()->SetCallBack(
-      "CompactionJob::Run():Start", [&](void* arg) {
+      "CompactionJob::Run():Start", [&](void* /*arg*/) {
         if (flag.exchange(true)) {
           // We want to make sure to call this callback only once
           return;
@@ -2260,7 +2604,8 @@ TEST_F(DBTest2, RateLimitedCompactionReads) {
                           kBytesPerKey) /* rate_bytes_per_sec */,
         10 * 1000 /* refill_period_us */, 10 /* fairness */,
         RateLimiter::Mode::kReadsOnly));
-    options.use_direct_io_for_flush_and_compaction = use_direct_io;
+    options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+        use_direct_io;
     BlockBasedTableOptions bbto;
     bbto.block_size = 16384;
     bbto.no_block_cache = true;
@@ -2282,11 +2627,11 @@
     // chose 1MB as the upper bound on the total bytes read.
     size_t rate_limited_bytes =
         options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
-    // Include the explict prefetch of the footer in direct I/O case.
+    // Include the explicit prefetch of the footer in direct I/O case.
     size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
-    ASSERT_GE(rate_limited_bytes,
-              static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files +
-                                  direct_io_extra));
+    ASSERT_GE(
+        rate_limited_bytes,
+        static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
     ASSERT_LT(
         rate_limited_bytes,
        static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
@@ -2331,6 +2676,1033 @@ TEST_F(DBTest2, ReduceLevel) {
   ASSERT_EQ("0,1", FilesPerLevel());
 #endif  // !ROCKSDB_LITE
 }
+
+// Test that ReadCallback is actually used in both memtable and sst tables
+TEST_F(DBTest2, ReadCallbackTest) {
+  Options options;
+  options.disable_auto_compactions = true;
+  options.num_levels = 7;
+  Reopen(options);
+  std::vector<const Snapshot*> snapshots;
+  // Try to create a db with multiple layers and a memtable
+  const std::string key = "foo";
+  const std::string value = "bar";
+  // This test assumes that the seq starts with 1 and is increased by 1 after
+  // each write batch of size 1. If that behavior changes, the test needs to
+  // be updated as well.
+  // TODO(myabandeh): update this test to use the seq number that is returned
+  // by the DB instead of assuming what seq the DB used.
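The TODO above points at the public alternative to hard-coding sequence numbers; a sketch of what that would look like (illustrative only, "db" being an open rocksdb::DB*):

#include "rocksdb/db.h"

rocksdb::SequenceNumber SeqAfterPut(rocksdb::DB* db) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), "foo", "bar");
  (void)s;
  // Ask the DB which sequence number it actually assigned, instead of
  // assuming the 1, 2, 3, ... progression the test relies on.
  const rocksdb::Snapshot* snap = db->GetSnapshot();
  rocksdb::SequenceNumber seq = snap->GetSequenceNumber();
  db->ReleaseSnapshot(snap);
  return seq;  // equivalently: db->GetLatestSequenceNumber()
}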
+ int i = 1; + for (; i < 10; i++) { + Put(key, value + std::to_string(i)); + // Take a snapshot to avoid the value being removed during compaction + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); + for (; i < 20; i++) { + Put(key, value + std::to_string(i)); + // Take a snapshot to avoid the value being removed during compaction + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); + MoveFilesToLevel(6); +#ifndef ROCKSDB_LITE + ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + for (; i < 30; i++) { + Put(key, value + std::to_string(i)); + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + Flush(); +#ifndef ROCKSDB_LITE + ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); +#endif // !ROCKSDB_LITE + // And also add some values to the memtable + for (; i < 40; i++) { + Put(key, value + std::to_string(i)); + auto snapshot = dbfull()->GetSnapshot(); + snapshots.push_back(snapshot); + } + + class TestReadCallback : public ReadCallback { + public: + explicit TestReadCallback(SequenceNumber snapshot) + : ReadCallback(snapshot), snapshot_(snapshot) {} + bool IsVisibleFullCheck(SequenceNumber seq) override { + return seq <= snapshot_; + } + + private: + SequenceNumber snapshot_; + }; + + for (int seq = 1; seq < i; seq++) { + PinnableSlice pinnable_val; + ReadOptions roptions; + TestReadCallback callback(seq); + bool dont_care = true; + Status s = dbfull()->GetImpl(roptions, dbfull()->DefaultColumnFamily(), key, + &pinnable_val, &dont_care, &callback); + ASSERT_TRUE(s.ok()); + // Assuming that after each Put the DB increased seq by one, the value and + // seq number must be equal since we also inc value by 1 after each Put. + ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString()); + } + + for (auto snapshot : snapshots) { + dbfull()->ReleaseSnapshot(snapshot); + } +} + +#ifndef ROCKSDB_LITE + +TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { + // Regression test for race condition where an obsolete file is returned to + // user as a "live file" but then deleted, all while file deletions are + // disabled. + // + // It happened like this: + // + // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles + // 2. [user thread] DisableFileDeletions, GetSortedWalFiles are called and the + // latter returned "x.log" + // 3. [flush thread] PurgeObsoleteFiles deleted "x.log" + // 4. [user thread] Reading "x.log" failed + // + // Unfortunately the only regression test I can come up with involves sleep. + // We cannot set SyncPoints to repro since, once the fix is applied, the + // SyncPoints would cause a deadlock as the repro's sequence of events is now + // prohibited. + // + // Instead, if we sleep for a second between Find and Purge, and ensure the + // read attempt happens after purge, then the sequence of events will almost + // certainly happen on the old code. 
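For context, the user-facing contract that this race used to break, sketched as an application would rely on it (names are illustrative):

#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

// File deletions must stay disabled for the whole window in which the
// captured WAL names are used; re-enabling ends the guarantee.
rocksdb::Status CaptureLiveWalFiles(rocksdb::DB* db,
                                    std::vector<std::string>* names) {
  rocksdb::Status s = db->DisableFileDeletions();
  if (!s.ok()) {
    return s;
  }
  rocksdb::VectorLogPtr wal_files;
  s = db->GetSortedWalFiles(wal_files);
  if (s.ok()) {
    for (const auto& wal : wal_files) {
      names->push_back(wal->PathName());  // guaranteed to still exist
    }
  }
  // force=false: deletions resume only once every caller that disabled
  // them has re-enabled them.
  rocksdb::Status enable_status = db->EnableFileDeletions(false);
  return s.ok() ? enable_status : s;
}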
+ rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BackgroundCallFlush:FilesFound", + "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"}, + {"DBImpl::PurgeObsoleteFiles:End", + "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"}, + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PurgeObsoleteFiles:Begin", + [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put("key", "val"); + FlushOptions flush_opts; + flush_opts.wait = false; + db_->Flush(flush_opts); + TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"); + + db_->DisableFileDeletions(); + VectorLogPtr log_files; + db_->GetSortedWalFiles(log_files); + TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"); + for (const auto& log_file : log_files) { + ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); + } + + db_->EnableFileDeletions(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestNumPread) { + Options options = CurrentOptions(); + // disable block cache + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + env_->count_random_reads_ = true; + + env_->random_file_open_counter_.store(0); + ASSERT_OK(Put("bar", "foo")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + ASSERT_EQ(1, env_->random_file_open_counter_.load()); + + // One pread per a normal data block read + env_->random_file_open_counter_.store(0); + env_->random_read_counter_.Reset(); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ(1, env_->random_read_counter_.Read()); + // All files are already opened. + ASSERT_EQ(0, env_->random_file_open_counter_.load()); + + env_->random_file_open_counter_.store(0); + env_->random_read_counter_.Reset(); + ASSERT_OK(Put("bar2", "foo2")); + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_OK(Flush()); + // After flush, we'll open the file and read footer, meta block, + // property block and index block. + ASSERT_EQ(4, env_->random_read_counter_.Read()); + ASSERT_EQ(1, env_->random_file_open_counter_.load()); + + // Compaction needs two input blocks, which requires 2 preads, and + // generate a new SST file which needs 4 preads (footer, meta block, + // property block and index block). In total 6. + env_->random_file_open_counter_.store(0); + env_->random_read_counter_.Reset(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(6, env_->random_read_counter_.Read()); + // All compactin input files should have already been opened. + ASSERT_EQ(1, env_->random_file_open_counter_.load()); + + // One pread per a normal data block read + env_->random_file_open_counter_.store(0); + env_->random_read_counter_.Reset(); + ASSERT_EQ("foo2", Get("bar2")); + ASSERT_EQ(1, env_->random_read_counter_.Read()); + // SST files are already opened. 
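The read counters used throughout this test come from SpecialEnv (defined in db_test_util.h later in this diff); the wrapping idiom behind random_read_counter_, sketched with illustrative names:

#include <atomic>
#include <cstdint>
#include <memory>

#include "rocksdb/env.h"

// Wrap every RandomAccessFile the Env hands out, so each Read() call
// (one pread() on POSIX) bumps a shared counter.
class CountingRandomAccessFile : public rocksdb::RandomAccessFile {
 public:
  CountingRandomAccessFile(std::unique_ptr<rocksdb::RandomAccessFile>&& target,
                           std::atomic<uint64_t>* counter)
      : target_(std::move(target)), counter_(counter) {}

  rocksdb::Status Read(uint64_t offset, size_t n, rocksdb::Slice* result,
                       char* scratch) const override {
    counter_->fetch_add(1);  // one logical pread
    return target_->Read(offset, n, result, scratch);
  }

 private:
  std::unique_ptr<rocksdb::RandomAccessFile> target_;
  std::atomic<uint64_t>* counter_;
};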
+ ASSERT_EQ(0, env_->random_file_open_counter_.load()); +} + +TEST_F(DBTest2, TraceAndReplay) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + ASSERT_TRUE(db_->EndTrace().IsIOError()); + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); + single_iter->SeekForPrev("g"); + delete single_iter; + + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + Put("hello", "world"); + Merge("foo", "bar"); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
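Zooming out from the replay boilerplate that follows, the whole trace-and-replay flow these tests exercise has this shape (a sketch; Replayer comes from util/trace_replay.h, which is internal in this version, and the db/handle arguments are assumed to exist already):

#include <memory>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/trace_reader_writer.h"
#include "util/trace_replay.h"  // internal: provides Replayer

rocksdb::Status TraceThenReplay(
    rocksdb::DB* db, rocksdb::DB* replay_db,
    const std::vector<rocksdb::ColumnFamilyHandle*>& handles,
    rocksdb::Env* env, const std::string& path) {
  std::unique_ptr<rocksdb::TraceWriter> writer;
  rocksdb::Status s =
      rocksdb::NewFileTraceWriter(env, rocksdb::EnvOptions(), path, &writer);
  if (!s.ok()) return s;
  s = db->StartTrace(rocksdb::TraceOptions(), std::move(writer));
  if (!s.ok()) return s;
  // ... run the workload; reads and writes are appended to the trace file ...
  s = db->EndTrace();
  if (!s.ok()) return s;

  std::unique_ptr<rocksdb::TraceReader> reader;
  s = rocksdb::NewFileTraceReader(env, rocksdb::EnvOptions(), path, &reader);
  if (!s.ok()) return s;
  rocksdb::Replayer replayer(replay_db, handles, std::move(reader));
  return replayer.Replay();
}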
+ DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + Replayer replayer(db2, handles_, std::move(trace_reader)); + ASSERT_OK(replayer.Replay()); + + ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); + ASSERT_EQ("1", value); + ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); + ASSERT_EQ("12", value); + ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); + + ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); + ASSERT_EQ("bar", value); + ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); + ASSERT_EQ("rocks", value); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +TEST_F(DBTest2, TraceWithLimit) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + + // test the max trace file size options + trace_opts.max_trace_file_size = 5; + std::string trace_filename = dbname_ + "/rocksdb.trace1"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Put(0, "b", "1")); + ASSERT_OK(Put(0, "c", "1")); + ASSERT_OK(db_->EndTrace()); + + std::string dbname2 = test::TmpDir(env_) + "/db_replay2"; + std::string value; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
+ DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + cf_options.merge_operator = MergeOperators::CreatePutOperator(); + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + // Verify that the keys don't already exist + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + Replayer replayer(db2, handles_, std::move(trace_reader)); + ASSERT_OK(replayer.Replay()); + + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +TEST_F(DBTest2, TraceWithSampling) { + Options options = CurrentOptions(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + + // test the trace file sampling options + trace_opts.sampling_frequency = 2; + std::string trace_filename = dbname_ + "/rocksdb.trace_sampling"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Put(0, "b", "2")); + ASSERT_OK(Put(0, "c", "3")); + ASSERT_OK(Put(0, "d", "4")); + ASSERT_OK(Put(0, "e", "5")); + ASSERT_OK(db_->EndTrace()); + + std::string dbname2 = test::TmpDir(env_) + "/db_replay_sampling"; + std::string value; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
+ DB* db2_init = nullptr; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname2, &db2_init)); + ColumnFamilyHandle* cf; + ASSERT_OK( + db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); + delete cf; + delete db2_init; + + DB* db2 = nullptr; + std::vector column_families; + ColumnFamilyOptions cf_options; + column_families.push_back(ColumnFamilyDescriptor("default", cf_options)); + column_families.push_back( + ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions())); + std::vector handles; + ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2)); + + env_->SleepForMicroseconds(100); + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound()); + + std::unique_ptr trace_reader; + ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); + Replayer replayer(db2, handles_, std::move(trace_reader)); + ASSERT_OK(replayer.Replay()); + + ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); + ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); + ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound()); + ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound()); + + for (auto handle : handles) { + delete handle; + } + delete db2; + ASSERT_OK(DestroyDB(dbname2, options)); +} + +TEST_F(DBTest2, TraceWithFilter) { + Options options = CurrentOptions(); + options.merge_operator = MergeOperators::CreatePutOperator(); + ReadOptions ro; + WriteOptions wo; + TraceOptions trace_opts; + EnvOptions env_opts; + CreateAndReopenWithCF({"pikachu"}, options); + Random rnd(301); + Iterator* single_iter = nullptr; + + trace_opts.filter = TraceFilterType::kTraceFilterWrite; + + std::string trace_filename = dbname_ + "/rocksdb.trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); + ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); + + ASSERT_OK(Put(0, "a", "1")); + ASSERT_OK(Merge(0, "b", "2")); + ASSERT_OK(Delete(0, "c")); + ASSERT_OK(SingleDelete(0, "d")); + ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); + + WriteBatch batch; + ASSERT_OK(batch.Put("f", "11")); + ASSERT_OK(batch.Merge("g", "12")); + ASSERT_OK(batch.Delete("h")); + ASSERT_OK(batch.SingleDelete("i")); + ASSERT_OK(batch.DeleteRange("j", "k")); + ASSERT_OK(db_->Write(wo, &batch)); + + single_iter = db_->NewIterator(ro); + single_iter->Seek("f"); + single_iter->SeekForPrev("g"); + delete single_iter; + + ASSERT_EQ("1", Get(0, "a")); + ASSERT_EQ("12", Get(0, "g")); + + ASSERT_OK(Put(1, "foo", "bar")); + ASSERT_OK(Put(1, "rocksdb", "rocks")); + ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); + + ASSERT_OK(db_->EndTrace()); + // These should not get into the trace file as it is after EndTrace. + Put("hello", "world"); + Merge("foo", "bar"); + + // Open another db, replay, and verify the data + std::string value; + std::string dbname2 = test::TmpDir(env_) + "/db_replay"; + ASSERT_OK(DestroyDB(dbname2, options)); + + // Using a different name than db2, to pacify infer's use-after-lifetime + // warnings (http://fbinfer.com). 
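The three TraceOptions knobs exercised across TraceWithLimit, TraceWithSampling, and TraceWithFilter, collected in one sketch:

#include "rocksdb/options.h"

rocksdb::TraceOptions MakeTraceOptions() {
  rocksdb::TraceOptions trace_opts;
  // Stop tracing once the trace file exceeds this many bytes.
  trace_opts.max_trace_file_size = 5;
  // Record roughly one out of every two requests.
  trace_opts.sampling_frequency = 2;
  // Drop write ops from the trace entirely.
  trace_opts.filter = rocksdb::TraceFilterType::kTraceFilterWrite;
  return trace_opts;
}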
+  DB* db2_init = nullptr;
+  options.create_if_missing = true;
+  ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+  ColumnFamilyHandle* cf;
+  ASSERT_OK(
+      db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+  delete cf;
+  delete db2_init;
+
+  DB* db2 = nullptr;
+  std::vector<ColumnFamilyDescriptor> column_families;
+  ColumnFamilyOptions cf_options;
+  cf_options.merge_operator = MergeOperators::CreatePutOperator();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  std::vector<ColumnFamilyHandle*> handles;
+  ASSERT_OK(DB::Open(DBOptions(), dbname2, column_families, &handles, &db2));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  std::unique_ptr<TraceReader> trace_reader;
+  ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+  Replayer replayer(db2, handles_, std::move(trace_reader));
+  ASSERT_OK(replayer.Replay());
+
+  // None of the key-values should be present, since the WRITE ops were
+  // filtered out of the trace.
+  ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+  ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+  for (auto handle : handles) {
+    delete handle;
+  }
+  delete db2;
+  ASSERT_OK(DestroyDB(dbname2, options));
+
+  // Set up a new db.
+  std::string dbname3 = test::TmpDir(env_) + "/db_not_trace_read";
+  ASSERT_OK(DestroyDB(dbname3, options));
+
+  DB* db3_init = nullptr;
+  options.create_if_missing = true;
+  ColumnFamilyHandle* cf3;
+  ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+  ASSERT_OK(
+      db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+  delete cf3;
+  delete db3_init;
+
+  column_families.clear();
+  column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+  column_families.push_back(
+      ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+  handles.clear();
+
+  DB* db3 = nullptr;
+  ASSERT_OK(DB::Open(DBOptions(), dbname3, column_families, &handles, &db3));
+
+  env_->SleepForMicroseconds(100);
+  // Verify that the keys don't already exist
+  ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+  ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+  // The tracer will not record the READ ops.
+ trace_opts.filter = TraceFilterType::kTraceFilterGet; + std::string trace_filename3 = dbname_ + "/rocksdb.trace_3"; + std::unique_ptr trace_writer3; + ASSERT_OK( + NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3)); + ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3))); + + ASSERT_OK(db3->Put(wo, handles[0], "a", "1")); + ASSERT_OK(db3->Merge(wo, handles[0], "b", "2")); + ASSERT_OK(db3->Delete(wo, handles[0], "c")); + ASSERT_OK(db3->SingleDelete(wo, handles[0], "d")); + + ASSERT_OK(db3->Get(ro, handles[0], "a", &value)); + ASSERT_EQ(value, "1"); + ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound()); + + ASSERT_OK(db3->EndTrace()); + + for (auto handle : handles) { + delete handle; + } + delete db3; + ASSERT_OK(DestroyDB(dbname3, options)); + + std::unique_ptr trace_reader3; + ASSERT_OK( + NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3)); + + // Count the number of records in the trace file; + int count = 0; + std::string data; + Status s; + while (true) { + s = trace_reader3->Read(&data); + if (!s.ok()) { + break; + } + count += 1; + } + // We also need to count the header and footer + // 4 WRITE + HEADER + FOOTER = 6 + ASSERT_EQ(count, 6); +} + +#endif // ROCKSDB_LITE + +TEST_F(DBTest2, PinnableSliceAndMmapReads) { + Options options = CurrentOptions(); + options.allow_mmap_reads = true; + options.max_open_files = 100; + options.compression = kNoCompression; + Reopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + + PinnableSlice pinned_value; + ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + // It is not safe to pin mmap files as they might disappear by compaction + ASSERT_FALSE(pinned_value.IsPinned()); + ASSERT_EQ(pinned_value.ToString(), "bar"); + + dbfull()->TEST_CompactRange(0 /* level */, nullptr /* begin */, + nullptr /* end */, nullptr /* column_family */, + true /* disallow_trivial_move */); + + // Ensure pinned_value doesn't rely on memory munmap'd by the above + // compaction. It crashes if it does. + ASSERT_EQ(pinned_value.ToString(), "bar"); + +#ifndef ROCKSDB_LITE + pinned_value.Reset(); + // Unsafe to pin mmap files when they could be kicked out of table cache + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + ASSERT_FALSE(pinned_value.IsPinned()); + ASSERT_EQ(pinned_value.ToString(), "bar"); + + pinned_value.Reset(); + // In read-only mode with infinite capacity on table cache it should pin the + // value and avoid the memcpy + Close(); + options.max_open_files = -1; + ASSERT_OK(ReadOnlyReopen(options)); + ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); + ASSERT_TRUE(pinned_value.IsPinned()); + ASSERT_EQ(pinned_value.ToString(), "bar"); +#endif +} + +TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.no_block_cache = false; + bbto.cache_index_and_filter_blocks = false; + bbto.block_cache = NewLRUCache(100000); + bbto.block_size = 400; // small block size + options.table_factory.reset(new BlockBasedTableFactory(bbto)); + Reopen(options); + + Random rnd(301); + std::string v = RandomString(&rnd, 400); + + // Since v is the size of a block, each key should take a block + // of 400+ bytes. 
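The PinnableSlice behavior guarded by the mmap test above, seen from the caller's side (a sketch; "db" stands for an open rocksdb::DB*):

#include <string>

#include "rocksdb/db.h"

std::string ReadWithPin(rocksdb::DB* db) {
  rocksdb::PinnableSlice pinned;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(),
                              db->DefaultColumnFamily(), "foo", &pinned);
  if (!s.ok()) {
    return "";
  }
  // IsPinned() reports whether Get() pinned the underlying block (zero-copy)
  // or fell back to copying, as it must for mmap'd or evictable blocks.
  bool zero_copy = pinned.IsPinned();
  (void)zero_copy;
  std::string out = pinned.ToString();  // always safe: copies out
  pinned.Reset();  // drop the pin before the block can be released
  return out;
}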
+ Put("1", v); + Put("3", v); + Put("5", v); + Put("7", v); + ASSERT_OK(Flush()); + + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Verify that iterators don't pin more than one data block in block cache + // at each time. + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + + for (int i = 0; i < 4; i++) { + ASSERT_TRUE(iter->Valid()); + // Block cache should contain exactly one block. + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + + iter->Seek("4"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + + iter->Seek("3"); + ASSERT_TRUE(iter->Valid()); + + ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); + ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); + } + ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); + + // Test compaction case + Put("2", v); + Put("5", v); + Put("6", v); + Put("8", v); + ASSERT_OK(Flush()); + + // Clear existing data in block cache + bbto.block_cache->SetCapacity(0); + bbto.block_cache->SetCapacity(100000); + + // Verify compaction input iterators don't hold more than one data blocks at + // one time. + std::atomic finished(false); + std::atomic block_newed(0); + std::atomic block_destroyed(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::Block:0", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load()); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); + block_newed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "Block::~Block", [&](void* /*arg*/) { + if (finished) { + return; + } + // Two iterators. At most 2 outstanding blocks. + EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); + EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); + block_destroyed.fetch_add(1); + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run:BeforeVerify", + [&](void* /*arg*/) { finished = true; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + // Two input files. Each of them has 4 data blocks. + ASSERT_EQ(8, block_newed.load()); + ASSERT_EQ(8, block_destroyed.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBTest2, TestBBTTailPrefetch) { + std::atomic called(false); + size_t expected_lower_bound = 512 * 1024; + size_t expected_higher_bound = 512 * 1024; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + EXPECT_LE(expected_lower_bound, *prefetch_size); + EXPECT_GE(expected_higher_bound, *prefetch_size); + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + expected_lower_bound = 0; + expected_higher_bound = 8 * 1024; + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + // Full compaction to make sure there is no L0 file after the open. 
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + called = false; + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); + + std::atomic first_call(true); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + size_t* prefetch_size = static_cast(arg); + if (first_call) { + EXPECT_EQ(4 * 1024, *prefetch_size); + first_call = false; + } else { + EXPECT_GE(4 * 1024, *prefetch_size); + } + called = true; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.max_file_opening_threads = 1; // one thread + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.max_open_files = -1; + Reopen(options); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + Put("1", "1"); + Put("9", "1"); + Flush(); + + ASSERT_TRUE(called.load()); + called = false; + + // Parallel loading SST files + options.max_file_opening_threads = 16; + Reopen(options); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + ASSERT_TRUE(called.load()); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { + // Setup sync point dependency to reproduce the race condition of + // DBImpl::GetColumnFamilyHandleUnlocked + rocksdb::SyncPoint::GetInstance()->LoadDependency( + { {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1", + "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"}, + {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2", + "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateColumnFamilies({"test1", "test2"}, Options()); + ASSERT_EQ(handles_.size(), 2); + + DBImpl* dbi = reinterpret_cast(db_); + port::Thread user_thread1([&]() { + auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); + ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); + TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1"); + TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"); + ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); + }); + + port::Thread user_thread2([&]() { + TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"); + auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID()); + ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); + TEST_SYNC_POINT("TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2"); + ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); + }); + + user_thread1.join(); + user_thread2.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, TestCompactFiles) { + // Setup sync point dependency to reproduce the race condition of + // DBImpl::GetColumnFamilyHandleUnlocked + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"TestCompactFiles::IngestExternalFile1", + "TestCompactFiles::IngestExternalFile2"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options; + options.num_levels = 2; + options.disable_auto_compactions = true; + Reopen(options); + auto* handle = 
db_->DefaultColumnFamily(); + ASSERT_EQ(db_->NumberLevels(handle), 2); + + rocksdb::SstFileWriter sst_file_writer{rocksdb::EnvOptions(), options}; + std::string external_file1 = dbname_ + "/test_compact_files1.sst_t"; + std::string external_file2 = dbname_ + "/test_compact_files2.sst_t"; + std::string external_file3 = dbname_ + "/test_compact_files3.sst_t"; + + ASSERT_OK(sst_file_writer.Open(external_file1)); + ASSERT_OK(sst_file_writer.Put("1", "1")); + ASSERT_OK(sst_file_writer.Put("2", "2")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK(sst_file_writer.Open(external_file2)); + ASSERT_OK(sst_file_writer.Put("3", "3")); + ASSERT_OK(sst_file_writer.Put("4", "4")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK(sst_file_writer.Open(external_file3)); + ASSERT_OK(sst_file_writer.Put("5", "5")); + ASSERT_OK(sst_file_writer.Put("6", "6")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3}, + IngestExternalFileOptions())); + ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2); + std::vector files; + GetSstFiles(env_, dbname_, &files); + ASSERT_EQ(files.size(), 2); + + port::Thread user_thread1( + [&]() { db_->CompactFiles(CompactionOptions(), handle, files, 1); }); + + port::Thread user_thread2([&]() { + ASSERT_OK(db_->IngestExternalFile(handle, {external_file2}, + IngestExternalFileOptions())); + TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1"); + }); + + user_thread1.join(); + user_thread2.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); +} +#endif // ROCKSDB_LITE + +// TODO: figure out why this test fails in appveyor +#ifndef OS_WIN +TEST_F(DBTest2, MultiDBParallelOpenTest) { + const int kNumDbs = 2; + Options options = CurrentOptions(); + std::vector dbnames; + for (int i = 0; i < kNumDbs; ++i) { + dbnames.emplace_back(test::TmpDir(env_) + "/db" + ToString(i)); + ASSERT_OK(DestroyDB(dbnames.back(), options)); + } + + // Verify empty DBs can be created in parallel + std::vector open_threads; + std::vector dbs{static_cast(kNumDbs), nullptr}; + options.create_if_missing = true; + for (int i = 0; i < kNumDbs; ++i) { + open_threads.emplace_back( + [&](int dbnum) { + ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum])); + }, + i); + } + + // Now add some data and close, so next we can verify non-empty DBs can be + // recovered in parallel + for (int i = 0; i < kNumDbs; ++i) { + open_threads[i].join(); + ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua")); + delete dbs[i]; + } + + // Verify non-empty DBs can be recovered in parallel + dbs.clear(); + open_threads.clear(); + for (int i = 0; i < kNumDbs; ++i) { + open_threads.emplace_back( + [&](int dbnum) { + ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum])); + }, + i); + } + + // Wait and cleanup + for (int i = 0; i < kNumDbs; ++i) { + open_threads[i].join(); + delete dbs[i]; + ASSERT_OK(DestroyDB(dbnames[i], options)); + } +} +#endif // OS_WIN + +namespace { +class DummyOldStats : public Statistics { + public: + uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; } + void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override { + num_rt++; + } + void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {} + uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override { + return 0; + } + void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override { + num_mt++; + } + void histogramData(uint32_t 
/*histogram_type*/, + rocksdb::HistogramData* const /*data*/) const override {} + std::string getHistogramString(uint32_t /*type*/) const override { + return ""; + } + bool HistEnabledForType(uint32_t /*type*/) const override { return false; } + std::string ToString() const override { return ""; } + int num_rt = 0; + int num_mt = 0; +}; +} // namespace + +TEST_F(DBTest2, OldStatsInterface) { + DummyOldStats* dos = new DummyOldStats(); + std::shared_ptr stats(dos); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = stats; + Reopen(options); + + Put("foo", "bar"); + ASSERT_EQ("bar", Get("foo")); + ASSERT_OK(Flush()); + ASSERT_EQ("bar", Get("foo")); + + ASSERT_GT(dos->num_rt, 0); + ASSERT_GT(dos->num_mt, 0); +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/db_test_util.cc b/thirdparty/rocksdb/db/db_test_util.cc index c4d465ba11..9ef82fd2e1 100644 --- a/thirdparty/rocksdb/db/db_test_util.cc +++ b/thirdparty/rocksdb/db/db_test_util.cc @@ -63,7 +63,7 @@ DBTestBase::DBTestBase(const std::string path) option_config_(kDefault) { env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); - dbname_ = test::TmpDir(env_) + path; + dbname_ = test::PerThreadDBPath(env_, path); alternative_wal_dir_ = dbname_ + "/wal"; alternative_db_log_dir_ = dbname_ + "/db_log_dir"; auto options = CurrentOptions(); @@ -101,24 +101,25 @@ DBTestBase::~DBTestBase() { bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { #ifdef ROCKSDB_LITE // These options are not supported in ROCKSDB_LITE - if (option_config == kHashSkipList || - option_config == kPlainTableFirstBytePrefix || - option_config == kPlainTableCappedPrefix || - option_config == kPlainTableCappedPrefixNonMmap || - option_config == kPlainTableAllBytesPrefix || - option_config == kVectorRep || option_config == kHashLinkList || - option_config == kHashCuckoo || option_config == kUniversalCompaction || - option_config == kUniversalCompactionMultiLevel || - option_config == kUniversalSubcompactions || - option_config == kFIFOCompaction || - option_config == kConcurrentSkipList) { - return true; + if (option_config == kHashSkipList || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap || + option_config == kPlainTableAllBytesPrefix || + option_config == kVectorRep || option_config == kHashLinkList || + option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions || + option_config == kFIFOCompaction || + option_config == kConcurrentSkipList) { + return true; } #endif if ((skip_mask & kSkipUniversalCompaction) && (option_config == kUniversalCompaction || - option_config == kUniversalCompactionMultiLevel)) { + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions)) { return true; } if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { @@ -140,9 +141,6 @@ bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { option_config == kBlockBasedTableWithWholeKeyHashIndex)) { return true; } - if ((skip_mask & kSkipHashCuckoo) && (option_config == kHashCuckoo)) { - return true; - } if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { return true; } @@ -258,6 +256,47 @@ bool DBTestBase::ChangeFilterOptions() { return true; } +// Switch between different DB options for file ingestion tests. 
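Returning to the Statistics hook tested in OldStatsInterface above: a custom implementation such as DummyOldStats plugs into the same shared_ptr slot as the stock one. A sketch with the built-in implementation:

#include <memory>

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

rocksdb::Options MakeOptionsWithStats() {
  rocksdb::Options options;
  // Every ticker and histogram update in the DB is routed through this object.
  options.statistics = rocksdb::CreateDBStatistics();
  return options;
}

// After running a workload, counters can be read back, e.g.:
//   uint64_t misses =
//       options.statistics->getTickerCount(rocksdb::BLOCK_CACHE_MISS);
//   rocksdb::HistogramData get_latency;
//   options.statistics->histogramData(rocksdb::DB_GET, &get_latency);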
+bool DBTestBase::ChangeOptionsForFileIngestionTest() { + if (option_config_ == kDefault) { + option_config_ = kUniversalCompaction; + Destroy(last_options_); + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); + return true; + } else if (option_config_ == kUniversalCompaction) { + option_config_ = kUniversalCompactionMultiLevel; + Destroy(last_options_); + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); + return true; + } else if (option_config_ == kUniversalCompactionMultiLevel) { + option_config_ = kLevelSubcompactions; + Destroy(last_options_); + auto options = CurrentOptions(); + assert(options.max_subcompactions > 1); + TryReopen(options); + return true; + } else if (option_config_ == kLevelSubcompactions) { + option_config_ = kUniversalSubcompactions; + Destroy(last_options_); + auto options = CurrentOptions(); + assert(options.max_subcompactions > 1); + TryReopen(options); + return true; + } else if (option_config_ == kUniversalSubcompactions) { + option_config_ = kDirectIO; + Destroy(last_options_); + auto options = CurrentOptions(); + TryReopen(options); + return true; + } else { + return false; + } +} + // Return the current option configuration. Options DBTestBase::CurrentOptions( const anon::OptionsOverride& options_override) const { @@ -288,12 +327,11 @@ Options DBTestBase::GetOptions( Options options = default_options; BlockBasedTableOptions table_options; bool set_block_based_table_factory = true; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ - !defined(OS_AIX) +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) rocksdb::SyncPoint::GetInstance()->ClearCallBack( "NewRandomAccessFile:O_DIRECT"); - rocksdb::SyncPoint::GetInstance()->ClearCallBack( - "NewWritableFile:O_DIRECT"); + rocksdb::SyncPoint::GetInstance()->ClearCallBack("NewWritableFile:O_DIRECT"); #endif bool can_allow_mmap = IsMemoryMappedAccessSupported(); @@ -342,11 +380,26 @@ Options DBTestBase::GetOptions( NewHashLinkListRepFactory(4, 0, 3, true, 4)); options.allow_concurrent_memtable_write = false; break; - case kHashCuckoo: - options.memtable_factory.reset( - NewHashCuckooRepFactory(options.write_buffer_size)); - options.allow_concurrent_memtable_write = false; - break; + case kDirectIO: { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + options.compaction_readahead_size = 2 * 1024 * 1024; + #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "NewWritableFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + #endif + break; + } #endif // ROCKSDB_LITE case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); @@ -410,6 +463,10 @@ Options DBTestBase::GetOptions( table_options.checksum = kxxHash; break; } + case kxxHash64Checksum: { + table_options.checksum = kxxHash64; + break; + } case kFIFOCompaction: { options.compaction_style = kCompactionStyleFIFO; break; @@ -429,6 +486,18 @@ Options DBTestBase::GetOptions( options.prefix_extractor.reset(NewNoopTransform()); break; } + case kBlockBasedTableWithPartitionedIndexFormat4: { + 
table_options.format_version = 4; + // Format 4 changes the binary index format. Since partitioned index is a + // super-set of simple indexes, we are also using kTwoLevelIndexSearch to + // test this format. + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + // The top-level index in partition filters are also affected by format 4. + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + table_options.partition_filters = true; + table_options.index_block_restart_interval = 8; + break; + } case kBlockBasedTableWithIndexRestartInterval: { table_options.index_block_restart_interval = 8; break; @@ -461,33 +530,13 @@ Options DBTestBase::GetOptions( options.enable_write_thread_adaptive_yield = true; break; } - case kDirectIO: { - options.use_direct_reads = true; - options.use_direct_io_for_flush_and_compaction = true; - options.compaction_readahead_size = 2 * 1024 * 1024; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ - !defined(OS_AIX) - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "NewWritableFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - rocksdb::SyncPoint::GetInstance()->SetCallBack( - "NewRandomAccessFile:O_DIRECT", [&](void* arg) { - int* val = static_cast(arg); - *val &= ~O_DIRECT; - }); - rocksdb::SyncPoint::GetInstance()->EnableProcessing(); -#endif - break; - } case kPipelinedWrite: { options.enable_pipelined_write = true; break; } case kConcurrentWALWrites: { // This options optimize 2PC commit path - options.concurrent_prepare = true; + options.two_write_queues = true; options.manual_wal_flush = true; break; } @@ -547,6 +596,7 @@ Status DBTestBase::TryReopenWithColumnFamilies( column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); } DBOptions db_opts = DBOptions(options[0]); + last_options_ = options[0]; return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); } @@ -576,9 +626,17 @@ void DBTestBase::DestroyAndReopen(const Options& options) { ASSERT_OK(TryReopen(options)); } -void DBTestBase::Destroy(const Options& options) { +void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { + std::vector column_families; + if (delete_cf_paths) { + for (size_t i = 0; i < handles_.size(); ++i) { + ColumnFamilyDescriptor cfdescriptor; + handles_[i]->GetDescriptor(&cfdescriptor); + column_families.push_back(cfdescriptor); + } + } Close(); - ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DestroyDB(dbname_, options, column_families)); } Status DBTestBase::ReadOnlyReopen(const Options& options) { @@ -588,30 +646,19 @@ Status DBTestBase::ReadOnlyReopen(const Options& options) { Status DBTestBase::TryReopen(const Options& options) { Close(); last_options_.table_factory.reset(); - // Note: operator= is an unsafe approach here since it destructs shared_ptr in - // the same order of their creation, in contrast to destructors which - // destructs them in the opposite order of creation. One particular problme is - // that the cache destructor might invoke callback functions that use Option - // members such as statistics. To work around this problem, we manually call - // destructor of table_facotry which eventually clears the block cache. + // Note: operator= is an unsafe approach here since it destructs + // std::shared_ptr in the same order of their creation, in contrast to + // destructors which destructs them in the opposite order of creation. 
One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually call the destructor of table_factory, which
+  // eventually clears the block cache.
   last_options_ = options;
   return DB::Open(options, dbname_, &db_);
 }

 bool DBTestBase::IsDirectIOSupported() {
-  EnvOptions env_options;
-  env_options.use_mmap_writes = false;
-  env_options.use_direct_writes = true;
-  std::string tmp = TempFileName(dbname_, 999);
-  Status s;
-  {
-    std::unique_ptr<WritableFile> file;
-    s = env_->NewWritableFile(tmp, &file, env_options);
-  }
-  if (s.ok()) {
-    s = env_->DeleteFile(tmp);
-  }
-  return s.ok();
+  return test::IsDirectIOSupported(env_, dbname_);
 }

 bool DBTestBase::IsMemoryMappedAccessSupported() const {
@@ -626,6 +673,13 @@ Status DBTestBase::Flush(int cf) {
   }
 }

+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+  std::vector<ColumnFamilyHandle*> cfhs;
+  std::for_each(cf_ids.begin(), cf_ids.end(),
+                [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+  return db_->Flush(FlushOptions(), cfhs);
+}
+
 Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
   if (kMergePut == option_config_) {
     return db_->Merge(wo, k, v);
@@ -668,6 +722,10 @@ Status DBTestBase::SingleDelete(int cf, const std::string& k) {
   return db_->SingleDelete(WriteOptions(), handles_[cf], k);
 }

+bool DBTestBase::SetPreserveDeletesSequenceNumber(SequenceNumber sn) {
+  return db_->SetPreserveDeletesSequenceNumber(sn);
+}
+
 std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
   ReadOptions options;
   options.verify_checksums = true;
@@ -697,6 +755,31 @@ std::string DBTestBase::Get(int cf, const std::string& k,
   return result;
 }

+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+                                              const std::vector<std::string>& k,
+                                              const Snapshot* snapshot) {
+  ReadOptions options;
+  options.verify_checksums = true;
+  options.snapshot = snapshot;
+  std::vector<ColumnFamilyHandle*> handles;
+  std::vector<Slice> keys;
+  std::vector<std::string> result;
+
+  for (unsigned int i = 0; i < cfs.size(); ++i) {
+    handles.push_back(handles_[cfs[i]]);
+    keys.push_back(k[i]);
+  }
+  std::vector<Status> s = db_->MultiGet(options, handles, keys, &result);
+  for (unsigned int i = 0; i < s.size(); ++i) {
+    if (s[i].IsNotFound()) {
+      result[i] = "NOT_FOUND";
+    } else if (!s[i].ok()) {
+      result[i] = s[i].ToString();
+    }
+  }
+  return result;
+}
+
 Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
   ReadOptions options;
   options.verify_checksums = true;
@@ -749,13 +832,15 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
   Arena arena;
   auto options = CurrentOptions();
   InternalKeyComparator icmp(options.comparator);
-  RangeDelAggregator range_del_agg(icmp, {} /* snapshots */);
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
   ScopedArenaIterator iter;
   if (cf == 0) {
-    iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg));
+    iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+                                           kMaxSequenceNumber));
   } else {
-    iter.set(
-        dbfull()->NewInternalIterator(&arena, &range_del_agg, handles_[cf]));
+    iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg,
+                                           kMaxSequenceNumber, handles_[cf]));
   }
   InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
   iter->Seek(target.Encode());
@@ -864,7 +949,6 @@ size_t DBTestBase::CountLiveFiles() {
   db_->GetLiveFilesMetaData(&metadata);
   return metadata.size();
 }
-#endif  // ROCKSDB_LITE

 int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
   std::string property;
@@ -925,6 +1009,7 @@
std::string DBTestBase::FilesPerLevel(int cf) {
   result.resize(last_non_zero_offset);
   return result;
 }
+#endif  // !ROCKSDB_LITE

 size_t DBTestBase::CountFiles() {
   std::vector<std::string> files;
@@ -994,6 +1079,7 @@ void DBTestBase::MoveFilesToLevel(int level, int cf) {
   }
 }

+#ifndef ROCKSDB_LITE
 void DBTestBase::DumpFileCounts(const char* label) {
   fprintf(stderr, "---\n%s:\n", label);
   fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
@@ -1005,6 +1091,7 @@ void DBTestBase::DumpFileCounts(const char* label) {
     }
   }
 }
+#endif  // !ROCKSDB_LITE

 std::string DBTestBase::DumpSSTableList() {
   std::string property;
@@ -1012,9 +1099,9 @@ std::string DBTestBase::DumpSSTableList() {
   return property;
 }

-void DBTestBase::GetSstFiles(std::string path,
+void DBTestBase::GetSstFiles(Env* env, std::string path,
                              std::vector<std::string>* files) {
-  env_->GetChildren(path, files);
+  env->GetChildren(path, files);
   files->erase(
       std::remove_if(files->begin(), files->end(), [](std::string name) {
@@ -1026,7 +1113,7 @@ void DBTestBase::GetSstFiles(std::string path,

 int DBTestBase::GetSstFileCount(std::string path) {
   std::vector<std::string> files;
-  GetSstFiles(path, &files);
+  DBTestBase::GetSstFiles(env_, path, &files);
   return static_cast<int>(files.size());
 }
@@ -1138,39 +1225,44 @@ UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
   }
 }

-UpdateStatus DBTestBase::updateInPlaceLargerSize(char* prevValue,
-                                                 uint32_t* prevSize,
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+                                                 uint32_t* /*prevSize*/,
                                                  Slice delta,
                                                  std::string* newValue) {
   *newValue = std::string(delta.size(), 'c');
   return UpdateStatus::UPDATED;
 }

-UpdateStatus DBTestBase::updateInPlaceNoAction(char* prevValue,
-                                               uint32_t* prevSize, Slice delta,
-                                               std::string* newValue) {
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+                                               uint32_t* /*prevSize*/,
+                                               Slice /*delta*/,
+                                               std::string* /*newValue*/) {
   return UpdateStatus::UPDATE_FAILED;
 }

 // Utility method to test InplaceUpdate
 void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
-  ScopedArenaIterator iter;
   Arena arena;
   auto options = CurrentOptions();
   InternalKeyComparator icmp(options.comparator);
-  RangeDelAggregator range_del_agg(icmp, {} /* snapshots */);
+  ReadRangeDelAggregator range_del_agg(&icmp,
+                                       kMaxSequenceNumber /* upper_bound */);
+  // This must be declared after range_del_agg, so that the iterator it holds
+  // is destroyed before range_del_agg is destroyed.
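The declaration-order comment above relies on a general C++ rule worth spelling out: locals are destroyed in reverse declaration order. A generic sketch (not RocksDB code) showing the rule:

#include <cstdio>

struct Tracer {
  const char* name;
  ~Tracer() { std::printf("destroying %s\n", name); }
};

int main() {
  Tracer first{"first"};    // destroyed last
  Tracer second{"second"};  // destroyed first, may still reference "first"
  return 0;  // prints "destroying second", then "destroying first"
}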
+ ScopedArenaIterator iter; if (cf != 0) { - iter.set( - dbfull()->NewInternalIterator(&arena, &range_del_agg, handles_[cf])); + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber, handles_[cf])); } else { - iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg)); + iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, + kMaxSequenceNumber)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); int seq = numValues; while (iter->Valid()) { ParsedInternalKey ikey; - ikey.sequence = -1; + ikey.clear(); ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); // checks sequence number for updates @@ -1183,9 +1275,9 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { void DBTestBase::CopyFile(const std::string& source, const std::string& destination, uint64_t size) { const EnvOptions soptions; - unique_ptr srcfile; + std::unique_ptr srcfile; ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions)); - unique_ptr destfile; + std::unique_ptr destfile; ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions)); if (size == 0) { @@ -1363,8 +1455,10 @@ void DBTestBase::VerifyDBInternal( std::vector> true_data) { Arena arena; InternalKeyComparator icmp(last_options_.comparator); - RangeDelAggregator range_del_agg(icmp, {}); - auto iter = dbfull()->NewInternalIterator(&arena, &range_del_agg); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + auto iter = + dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); iter->SeekToFirst(); for (auto p : true_data) { ASSERT_TRUE(iter->Valid()); diff --git a/thirdparty/rocksdb/db/db_test_util.h b/thirdparty/rocksdb/db/db_test_util.h index f2caa46ca2..1ba1f0a964 100644 --- a/thirdparty/rocksdb/db/db_test_util.h +++ b/thirdparty/rocksdb/db/db_test_util.h @@ -46,6 +46,7 @@ #include "table/scoped_arena_iterator.h" #include "util/compression.h" #include "util/filename.h" +#include "util/mock_time_env.h" #include "util/mutexlock.h" #include "util/string_util.h" @@ -109,8 +110,6 @@ struct OptionsOverride { // These will be used only if filter_policy is set bool partition_filters = false; uint64_t metadata_block_size = 1024; - BlockBasedTableOptions::IndexType index_type = - BlockBasedTableOptions::IndexType::kBinarySearch; // Used as a bit mask of individual enums in which to skip an XF test point int skip_policy = 0; @@ -137,8 +136,8 @@ class SpecialMemTableRep : public MemTableRep { // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. virtual void Insert(KeyHandle handle) override { - memtable_->Insert(handle); num_entries_++; + memtable_->Insert(handle); } // Returns true iff an entry that compares equal to key is in the list. 
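+ // (Presumably the counter is bumped before delegating so that any + // flush-triggering size check made from inside Insert() already observes + // the updated entry count; this rationale is inferred, not stated.)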
@@ -170,7 +169,7 @@ class SpecialMemTableRep : public MemTableRep { virtual ~SpecialMemTableRep() override {} private: - unique_ptr memtable_; + std::unique_ptr memtable_; int num_entries_flush_; int num_entries_; }; @@ -187,7 +186,7 @@ class SpecialSkipListFactory : public MemTableRepFactory { using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override { + const SliceTransform* transform, Logger* /*logger*/) override { return new SpecialMemTableRep( allocator, factory_.CreateMemTableRep(compare, allocator, transform, 0), num_entries_flush_); @@ -208,15 +207,15 @@ class SpecialEnv : public EnvWrapper { public: explicit SpecialEnv(Env* base); - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& soptions) override { class SSTableFile : public WritableFile { private: SpecialEnv* env_; - unique_ptr base_; + std::unique_ptr base_; public: - SSTableFile(SpecialEnv* env, unique_ptr&& base) + SSTableFile(SpecialEnv* env, std::unique_ptr&& base) : env_(env), base_(std::move(base)) {} Status Append(const Slice& data) override { if (env_->table_write_callback_) { @@ -296,7 +295,7 @@ class SpecialEnv : public EnvWrapper { }; class ManifestFile : public WritableFile { public: - ManifestFile(SpecialEnv* env, unique_ptr&& b) + ManifestFile(SpecialEnv* env, std::unique_ptr&& b) : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) override { if (env_->manifest_write_error_.load(std::memory_order_acquire)) { @@ -317,14 +316,17 @@ class SpecialEnv : public EnvWrapper { } } uint64_t GetFileSize() override { return base_->GetFileSize(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; - unique_ptr base_; + std::unique_ptr base_; }; class WalFile : public WritableFile { public: - WalFile(SpecialEnv* env, unique_ptr&& b) + WalFile(SpecialEnv* env, std::unique_ptr&& b) : env_(env), base_(std::move(b)) { env_->num_open_wal_file_.fetch_add(1); } @@ -370,10 +372,13 @@ class SpecialEnv : public EnvWrapper { bool IsSyncThreadSafe() const override { return env_->is_wal_sync_thread_safe_.load(); } + Status Allocate(uint64_t offset, uint64_t len) override { + return base_->Allocate(offset, len); + } private: SpecialEnv* env_; - unique_ptr base_; + std::unique_ptr base_; }; if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { @@ -415,11 +420,11 @@ class SpecialEnv : public EnvWrapper { } Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& soptions) override { class CountingFile : public RandomAccessFile { public: - CountingFile(unique_ptr&& target, + CountingFile(std::unique_ptr&& target, anon::AtomicCounter* counter, std::atomic* bytes_read) : target_(std::move(target)), @@ -434,7 +439,7 @@ class SpecialEnv : public EnvWrapper { } private: - unique_ptr target_; + std::unique_ptr target_; anon::AtomicCounter* counter_; std::atomic* bytes_read_; }; @@ -445,14 +450,18 @@ class SpecialEnv : public EnvWrapper { r->reset(new CountingFile(std::move(*r), &random_read_counter_, &random_read_bytes_counter_)); } + if (s.ok() && soptions.compaction_readahead_size > 0) { + compaction_readahead_size_ = soptions.compaction_readahead_size; + } return s; } - Status NewSequentialFile(const std::string& f, unique_ptr* r, - const EnvOptions& 
soptions) override { + virtual Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, + const EnvOptions& soptions) override { class CountingFile : public SequentialFile { public: - CountingFile(unique_ptr&& target, + CountingFile(std::unique_ptr&& target, anon::AtomicCounter* counter) : target_(std::move(target)), counter_(counter) {} virtual Status Read(size_t n, Slice* result, char* scratch) override { @@ -462,7 +471,7 @@ class SpecialEnv : public EnvWrapper { virtual Status Skip(uint64_t n) override { return target_->Skip(n); } private: - unique_ptr target_; + std::unique_ptr target_; anon::AtomicCounter* counter_; }; @@ -494,6 +503,11 @@ class SpecialEnv : public EnvWrapper { return s; } + virtual uint64_t NowCPUNanos() override { + now_cpu_count_.fetch_add(1); + return target()->NowCPUNanos(); + } + virtual uint64_t NowNanos() override { return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) + addon_time_.load() * 1000; @@ -563,44 +577,17 @@ class SpecialEnv : public EnvWrapper { std::atomic addon_time_; + std::atomic now_cpu_count_; + std::atomic delete_count_; - bool time_elapse_only_sleep_; + std::atomic time_elapse_only_sleep_; bool no_slowdown_; std::atomic is_wal_sync_thread_safe_{true}; -}; - -class MockTimeEnv : public EnvWrapper { - public: - explicit MockTimeEnv(Env* base) : EnvWrapper(base) {} - - virtual Status GetCurrentTime(int64_t* time) override { - assert(time != nullptr); - assert(current_time_ <= - static_cast(std::numeric_limits::max())); - *time = static_cast(current_time_); - return Status::OK(); - } - - virtual uint64_t NowMicros() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000); - return current_time_ * 1000000; - } - - virtual uint64_t NowNanos() override { - assert(current_time_ <= std::numeric_limits::max() / 1000000000); - return current_time_ * 1000000000; - } - void set_current_time(uint64_t time) { - assert(time >= current_time_); - current_time_ = time; - } - - private: - uint64_t current_time_ = 0; + std::atomic compaction_readahead_size_{}; }; #ifndef ROCKSDB_LITE @@ -665,36 +652,38 @@ class DBTestBase : public testing::Test { kPlainTableAllBytesPrefix = 6, kVectorRep = 7, kHashLinkList = 8, - kHashCuckoo = 9, - kMergePut = 10, - kFilter = 11, - kFullFilterWithNewTableReaderForCompactions = 12, - kUncompressed = 13, - kNumLevel_3 = 14, - kDBLogDir = 15, - kWalDirAndMmapReads = 16, - kManifestFileSize = 17, - kPerfOptions = 18, - kHashSkipList = 19, - kUniversalCompaction = 20, - kUniversalCompactionMultiLevel = 21, - kCompressedBlockCache = 22, - kInfiniteMaxOpenFiles = 23, - kxxHashChecksum = 24, - kFIFOCompaction = 25, - kOptimizeFiltersForHits = 26, - kRowCache = 27, - kRecycleLogFiles = 28, - kConcurrentSkipList = 29, - kPipelinedWrite = 30, - kConcurrentWALWrites = 31, - kEnd = 32, - kDirectIO = 33, - kLevelSubcompactions = 34, - kUniversalSubcompactions = 35, - kBlockBasedTableWithIndexRestartInterval = 36, - kBlockBasedTableWithPartitionedIndex = 37, - kPartitionedFilterWithNewTableReaderForCompactions = 38, + kMergePut = 9, + kFilter = 10, + kFullFilterWithNewTableReaderForCompactions = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kPerfOptions = 17, + kHashSkipList = 18, + kUniversalCompaction = 19, + kUniversalCompactionMultiLevel = 20, + kCompressedBlockCache = 21, + kInfiniteMaxOpenFiles = 22, + kxxHashChecksum = 23, + kFIFOCompaction = 24, + kOptimizeFiltersForHits = 25, + kRowCache = 26, + kRecycleLogFiles = 27, + 
kConcurrentSkipList = 28, + kPipelinedWrite = 29, + kConcurrentWALWrites = 30, + kDirectIO, + kLevelSubcompactions, + kBlockBasedTableWithIndexRestartInterval, + kBlockBasedTableWithPartitionedIndex, + kBlockBasedTableWithPartitionedIndexFormat4, + kPartitionedFilterWithNewTableReaderForCompactions, + kUniversalSubcompactions, + kxxHash64Checksum, + // This must be the last line + kEnd, }; public: @@ -720,11 +709,17 @@ class DBTestBase : public testing::Test { kSkipPlainTable = 8, kSkipHashIndex = 16, kSkipNoSeekToLast = 32, - kSkipHashCuckoo = 64, kSkipFIFOCompaction = 128, kSkipMmapReads = 256, }; + const int kRangeDelSkipConfigs = + // Plain tables do not support range deletions. + kSkipPlainTable | + // MmapReads disables the iterator pinning that RangeDelAggregator + // requires. + kSkipMmapReads; + explicit DBTestBase(const std::string path); ~DBTestBase(); @@ -757,6 +752,9 @@ class DBTestBase : public testing::Test { // Jump from kDefault to kFilter to kFullFilter bool ChangeFilterOptions(); + // Switch between different DB options for file ingestion tests. + bool ChangeOptionsForFileIngestionTest(); + // Return the current option configuration. Options CurrentOptions(const anon::OptionsOverride& options_override = anon::OptionsOverride()) const; @@ -798,7 +796,7 @@ class DBTestBase : public testing::Test { void DestroyAndReopen(const Options& options); - void Destroy(const Options& options); + void Destroy(const Options& options, bool delete_cf_paths = false); Status ReadOnlyReopen(const Options& options); @@ -810,6 +808,8 @@ class DBTestBase : public testing::Test { Status Flush(int cf = 0); + Status Flush(const std::vector& cf_ids); + Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()); Status Put(int cf, const Slice& k, const Slice& v, @@ -829,6 +829,8 @@ class DBTestBase : public testing::Test { Status SingleDelete(int cf, const std::string& k); + bool SetPreserveDeletesSequenceNumber(SequenceNumber sn); + std::string Get(const std::string& k, const Snapshot* snapshot = nullptr); std::string Get(int cf, const std::string& k, @@ -836,6 +838,10 @@ class DBTestBase : public testing::Test { Status Get(const std::string& k, PinnableSlice* v); + std::vector MultiGet(std::vector cfs, + const std::vector& k, + const Snapshot* snapshot = nullptr); + uint64_t GetNumSnapshots(); uint64_t GetTimeOldestSnapshots(); @@ -856,13 +862,13 @@ class DBTestBase : public testing::Test { size_t TotalLiveFiles(int cf = 0); size_t CountLiveFiles(); -#endif // ROCKSDB_LITE int NumTableFilesAtLevel(int level, int cf = 0); double CompressionRatioAtLevel(int level, int cf = 0); int TotalTableFiles(int cf = 0, int levels = -1); +#endif // ROCKSDB_LITE // Return spread of files per level std::string FilesPerLevel(int cf = 0); @@ -890,11 +896,14 @@ class DBTestBase : public testing::Test { void MoveFilesToLevel(int level, int cf = 0); +#ifndef ROCKSDB_LITE void DumpFileCounts(const char* label); +#endif // ROCKSDB_LITE std::string DumpSSTableList(); - void GetSstFiles(std::string path, std::vector* files); + static void GetSstFiles(Env* env, std::string path, + std::vector* files); int GetSstFileCount(std::string path); diff --git a/thirdparty/rocksdb/db/db_universal_compaction_test.cc b/thirdparty/rocksdb/db/db_universal_compaction_test.cc index 58fda80d54..2bd8af684e 100644 --- a/thirdparty/rocksdb/db/db_universal_compaction_test.cc +++ b/thirdparty/rocksdb/db/db_universal_compaction_test.cc @@ -10,6 +10,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #if 
!defined(ROCKSDB_LITE) +#include "rocksdb/utilities/table_properties_collectors.h" #include "util/sync_point.h" namespace rocksdb { @@ -26,7 +27,7 @@ class DBTestUniversalCompactionBase public: explicit DBTestUniversalCompactionBase( const std::string& path) : DBTestBase(path) {} - virtual void SetUp() override { + void SetUp() override { num_levels_ = std::get<0>(GetParam()); exclusive_manual_compaction_ = std::get<1>(GetParam()); } @@ -40,6 +41,12 @@ class DBTestUniversalCompaction : public DBTestUniversalCompactionBase { DBTestUniversalCompactionBase("/db_universal_compaction_test") {} }; +class DBTestUniversalDeleteTrigCompaction : public DBTestBase { + public: + DBTestUniversalDeleteTrigCompaction() + : DBTestBase("/db_universal_compaction_test") {} +}; + namespace { void VerifyCompactionResult( const ColumnFamilyMetaData& cf_meta, @@ -56,13 +63,13 @@ void VerifyCompactionResult( class KeepFilter : public CompactionFilter { public: - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, bool* value_changed) const - override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { return false; } - virtual const char* Name() const override { return "KeepFilter"; } + const char* Name() const override { return "KeepFilter"; } }; class KeepFilterFactory : public CompactionFilterFactory { @@ -70,7 +77,7 @@ class KeepFilterFactory : public CompactionFilterFactory { explicit KeepFilterFactory(bool check_context = false) : check_context_(check_context) {} - virtual std::unique_ptr CreateCompactionFilter( + std::unique_ptr CreateCompactionFilter( const CompactionFilter::Context& context) override { if (check_context_) { EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); @@ -79,7 +86,7 @@ class KeepFilterFactory : public CompactionFilterFactory { return std::unique_ptr(new KeepFilter()); } - virtual const char* Name() const override { return "KeepFilterFactory"; } + const char* Name() const override { return "KeepFilterFactory"; } bool check_context_; std::atomic_bool expect_full_compaction_; std::atomic_bool expect_manual_compaction_; @@ -88,14 +95,14 @@ class KeepFilterFactory : public CompactionFilterFactory { class DelayFilter : public CompactionFilter { public: explicit DelayFilter(DBTestBase* d) : db_test(d) {} - virtual bool Filter(int level, const Slice& key, const Slice& value, - std::string* new_value, - bool* value_changed) const override { + bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/, + std::string* /*new_value*/, + bool* /*value_changed*/) const override { db_test->env_->addon_time_.fetch_add(1000); return true; } - virtual const char* Name() const override { return "DelayFilter"; } + const char* Name() const override { return "DelayFilter"; } private: DBTestBase* db_test; @@ -104,12 +111,12 @@ class DelayFilter : public CompactionFilter { class DelayFilterFactory : public CompactionFilterFactory { public: explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {} - virtual std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override { + std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& /*context*/) override { return std::unique_ptr(new DelayFilter(db_test)); } - virtual const char* Name() const override { return "DelayFilterFactory"; } + const char* Name() const override { return "DelayFilterFactory"; } private: DBTestBase* db_test; @@ -374,6 
+381,181 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) { ASSERT_EQ(NumSortedRuns(1), 1); } +TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 3; + // Initial setup of compaction_options_universal will prevent universal + // compaction from happening + options.compaction_options_universal.size_ratio = 100; + options.compaction_options_universal.min_merge_width = 100; + DestroyAndReopen(options); + + int total_picked_compactions = 0; + int total_size_amp_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + if (arg) { + total_picked_compactions++; + Compaction* c = static_cast(arg); + if (c->compaction_reason() == + CompactionReason::kUniversalSizeAmplification) { + total_size_amp_compactions++; + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + MutableCFOptions mutable_cf_options; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + int key_idx = 0; + + // Generate two files in Level 0. Both files are approx the same size. + for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; + num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumSortedRuns(1), num + 1); + } + ASSERT_EQ(NumSortedRuns(1), 2); + + // Flush whatever is remaining in memtable. This is typically + // small, which should not trigger size ratio based compaction + // but could instead trigger size amplification if it's set + // to 110. 
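+ // Rough arithmetic, assuming universal size amplification is computed as + // 100 * (sum of all newer runs) / (size of the oldest run): with runs of + // roughly s, 110KB and 110KB (newest to oldest), that is + // 100 * (s + 110) / 110 percent, which clears a 110% threshold once the + // small flushed run is nontrivial (s >= ~11KB) yet stays far below the + // default of 200%.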
+ ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + // Verify compaction did not happen + ASSERT_EQ(NumSortedRuns(1), 3); + + // Trigger compaction if size amplification exceeds 110% without reopening DB + ASSERT_EQ(dbfull() + ->GetOptions(handles_[1]) + .compaction_options_universal.max_size_amplification_percent, + 200); + ASSERT_OK(dbfull()->SetOptions(handles_[1], + {{"compaction_options_universal", + "{max_size_amplification_percent=110;}"}})); + ASSERT_EQ(dbfull() + ->GetOptions(handles_[1]) + .compaction_options_universal.max_size_amplification_percent, + 110); + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_EQ(110, mutable_cf_options.compaction_options_universal + .max_size_amplification_percent); + + dbfull()->TEST_WaitForCompact(); + // Verify that size amplification did happen + ASSERT_EQ(NumSortedRuns(1), 1); + ASSERT_EQ(total_picked_compactions, 1); + ASSERT_EQ(total_size_amp_compactions, 1); +} + +TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + options.write_buffer_size = 100 << 10; // 100KB + options.target_file_size_base = 32 << 10; // 32KB + options.level0_file_num_compaction_trigger = 3; + // Initial setup of compaction_options_universal will prevent universal + // compaction from happening + options.compaction_options_universal.max_size_amplification_percent = 2000; + options.compaction_options_universal.size_ratio = 0; + options.compaction_options_universal.min_merge_width = 100; + DestroyAndReopen(options); + + int total_picked_compactions = 0; + int total_size_ratio_compactions = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickCompaction:Return", [&](void* arg) { + if (arg) { + total_picked_compactions++; + Compaction* c = static_cast(arg); + if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) { + total_size_ratio_compactions++; + } + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + MutableCFOptions mutable_cf_options; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + int key_idx = 0; + + // Generate three files in Level 0. All files are approx the same size. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + // Write 110KB (11 values, each 10K) + for (int i = 0; i < 11; i++) { + ASSERT_OK(Put(1, Key(key_idx), RandomString(&rnd, 10000))); + key_idx++; + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + ASSERT_EQ(NumSortedRuns(1), num + 1); + } + ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger); + + // Flush whatever is remaining in memtable. This is typically small, about + // 30KB. 
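+ // Under the initial options above this extra small run cannot trigger + // anything: min_merge_width=100 would require a window of 100 sorted runs + // for the size-ratio path, and the 2000% size-amp threshold is nowhere + // near met.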
+ ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + // Verify compaction did not happen + ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1); + ASSERT_EQ(total_picked_compactions, 0); + + ASSERT_OK(dbfull()->SetOptions( + handles_[1], + {{"compaction_options_universal", + "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}})); + ASSERT_EQ(dbfull() + ->GetOptions(handles_[1]) + .compaction_options_universal.min_merge_width, + 2); + ASSERT_EQ(dbfull() + ->GetOptions(handles_[1]) + .compaction_options_universal.max_merge_width, + 2); + ASSERT_EQ( + dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio, + 100); + + ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1], + &mutable_cf_options)); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width, 2); + ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width, 2); + + dbfull()->TEST_WaitForCompact(); + + // Files in L0 are approx: 0.3 (30KB), 1, 1, 1. + // On compaction: the files are below the size amp threshold, so we + // fall through to checking read amp conditions. The configured size ratio is + // not big enough to take 0.3 into consideration. So the next files 1 and 1 + // are compacted together first as they satisfy the size ratio condition and + // the (min_merge_width, max_merge_width) condition, producing a file of + // size 2. Next, the newly generated 2 and the last file 1 are compacted + // together. So at the end: #sortedRuns = 2, #picked_compactions = 2, and + // all the picked ones are size ratio based compactions. + ASSERT_EQ(NumSortedRuns(1), 2); + // If max_merge_width had not been changed dynamically above, and if it + // continued to be the default value of UINT_MAX, total_picked_compactions + // would have been 1.
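+ // Worked example, assuming the usual size-ratio rule (extend the window + // while window_size * (100 + size_ratio) / 100 >= next_run_size): starting + // at 0.3, 0.3 * 2 = 0.6 < 1 and the one-file window is below + // min_merge_width, so it is skipped; starting at the first 1, 1 * 2 = 2 >= 1 + // and the window {1, 1} hits max_merge_width=2, producing a run of size 2; + // then {2, 1} merges the same way, leaving runs {0.3, 3}. The same + // string-based tuning is available to applications, e.g. + // db->SetOptions(cfh, {{"compaction_options_universal", + // "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}});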
+ ASSERT_EQ(total_picked_compactions, 2); + ASSERT_EQ(total_size_ratio_compactions, 2); +} + TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) { const int kTestKeySize = 16; const int kTestValueSize = 984; @@ -480,7 +662,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) { ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); } - +#ifndef ROCKSDB_VALGRIND_RUN class DBTestUniversalCompactionMultiLevels : public DBTestUniversalCompactionBase { public: @@ -516,13 +698,14 @@ TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) { ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i)); } } + // Tests universal compaction with trivial move enabled TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { non_trivial_move++; @@ -594,7 +777,7 @@ TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { std::atomic num_compactions_running(0); std::atomic has_parallel(false); rocksdb::SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Start", - [&](void* arg) { + [&](void* /*arg*/) { if (num_compactions_running.fetch_add(1) > 0) { has_parallel.store(true); return; @@ -609,7 +792,7 @@ TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) { }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():End", - [&](void* arg) { num_compactions_running.fetch_add(-1); }); + [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); options = CurrentOptions(options); @@ -758,6 +941,7 @@ INSTANTIATE_TEST_CASE_P(DBTestUniversalCompactionParallel, DBTestUniversalCompactionParallel, ::testing::Combine(::testing::Values(1, 10), ::testing::Values(false))); +#endif // ROCKSDB_VALGRIND_RUN TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) { Options options = CurrentOptions(); @@ -970,16 +1154,17 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) { dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_WaitForCompact(); } - ASSERT_LT(TotalSize(), 120000U * 12 * 0.8 + 120000 * 2); + ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2); } +#ifndef ROCKSDB_VALGRIND_RUN // Test that checks trivial move in universal compaction TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) { int32_t trivial_move = 0; int32_t non_trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { non_trivial_move++; @@ -1025,7 +1210,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { int32_t trivial_move = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:TrivialMove", - [&](void* arg) { trivial_move++; }); + [&](void* /*arg*/) { trivial_move++; }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) { ASSERT_TRUE(arg != nullptr); @@ -1065,6 +1250,7 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) { 
rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +#endif // ROCKSDB_VALGRIND_RUN TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { Options options = CurrentOptions(); @@ -1168,6 +1354,146 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) { Destroy(options); } +TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.memtable_factory.reset( + new SpecialSkipListFactory(KNumKeysByGenerateNewFile - 1)); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 10; + options.write_buffer_size = 111 << 10; // 114KB + options.arena_block_size = 4 << 10; + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + + std::vector<Options> option_vector; + option_vector.emplace_back(options); + ColumnFamilyOptions cf_opt1(options), cf_opt2(options); + // Configure CF1 specific paths. + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024); + cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt1); + CreateColumnFamilies({"one"}, option_vector[1]); + + // Configure CF2 specific paths. + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024); + cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024); + option_vector.emplace_back(DBOptions(options), cf_opt2); + CreateColumnFamilies({"two"}, option_vector[2]); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + Random rnd(301); + int key_idx = 0; + int key_idx1 = 0; + int key_idx2 = 0; + + auto generate_file = [&]() { + GenerateNewFile(0, &rnd, &key_idx); + GenerateNewFile(1, &rnd, &key_idx1); + GenerateNewFile(2, &rnd, &key_idx2); + }; + + auto check_sstfilecount = [&](int path_id, int expected) { + ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path)); + ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path)); + }; + + auto check_getvalues = [&]() { + for (int i = 0; i < key_idx; i++) { + auto v = Get(0, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx1; i++) { + auto v = Get(1, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + + for (int i = 0; i < key_idx2; i++) { + auto v = Get(2, Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 990); + } + }; + + // First three 110KB files are not going to second path.
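+ // (The placement below relies on the db_paths/cf_paths capacity heuristic: + // an output file lands in the first listed path whose size target can hold + // it, so e.g. a ~400KB compaction output skips the two 300KB paths and goes + // to the 500KB one, path index 2.)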
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + generate_file(); + } + + // Another 110KB triggers a compaction to 400K file to second path + generate_file(); + check_sstfilecount(2, 1); + + // (1, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(0, 1); + + // (1,1,4) -> (2, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 2, 4) -> (3, 4) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 3, 4) -> (8) + generate_file(); + check_sstfilecount(3, 1); + + // (1, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(0, 1); + + // (1, 1, 8) -> (2, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(1, 1); + + // (1, 2, 8) -> (3, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(1, 1); + check_sstfilecount(0, 0); + + // (1, 3, 8) -> (4, 8) + generate_file(); + check_sstfilecount(2, 1); + check_sstfilecount(3, 1); + + // (1, 4, 8) -> (5, 8) + generate_file(); + check_sstfilecount(3, 1); + check_sstfilecount(2, 1); + check_sstfilecount(0, 0); + + check_getvalues(); + + ReopenWithColumnFamilies({"default", "one", "two"}, option_vector); + + check_getvalues(); + + Destroy(options, true); +} + TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { std::function verify_func = [&](int num_keys_in_db) { std::string keys_in_db; @@ -1365,50 +1691,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) { Destroy(options); } -TEST_P(DBTestUniversalCompaction, FullCompactionInBottomPriThreadPool) { - const int kNumFilesTrigger = 3; - Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM); - for (bool allow_ingest_behind : {false, true}) { - Options options = CurrentOptions(); - options.allow_ingest_behind = allow_ingest_behind; - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = num_levels_; - options.write_buffer_size = 100 << 10; // 100KB - options.target_file_size_base = 32 << 10; // 32KB - options.level0_file_num_compaction_trigger = kNumFilesTrigger; - // Trigger compaction if size amplification exceeds 110% - options.compaction_options_universal.max_size_amplification_percent = 110; - DestroyAndReopen(options); - - int num_bottom_pri_compactions = 0; - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BGWorkBottomCompaction", - [&](void* arg) { ++num_bottom_pri_compactions; }); - SyncPoint::GetInstance()->EnableProcessing(); - - Random rnd(301); - for (int num = 0; num < kNumFilesTrigger; num++) { - ASSERT_EQ(NumSortedRuns(), num); - int key_idx = 0; - GenerateNewFile(&rnd, &key_idx); - } - dbfull()->TEST_WaitForCompact(); - - if (allow_ingest_behind || num_levels_ > 1) { - // allow_ingest_behind increases number of levels while sanitizing. - ASSERT_EQ(1, num_bottom_pri_compactions); - } else { - // for single-level universal, everything's bottom level so nothing should - // be executed in bottom-pri thread pool. 
- ASSERT_EQ(0, num_bottom_pri_compactions); - } - // Verify that size amplification did occur - ASSERT_EQ(NumSortedRuns(), 1); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); - } - Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM); -} - TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) { if (num_levels_ == 1) { // for single-level universal, everything's bottom level so nothing should @@ -1481,7 +1763,7 @@ TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) { std::atomic num_compactions_attempted(0); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::BackgroundCompaction:Start", [&](void* arg) { + "DBImpl::BackgroundCompaction:Start", [&](void* /*arg*/) { ++num_compactions_attempted; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -1499,6 +1781,63 @@ TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) { ASSERT_EQ(NumSortedRuns(), 5); } +TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) { + // Regression test for conflict between: + // (1) Running CompactFiles including file in the final sorted run; and + // (2) Picking universal size-amp-triggered compaction, which always includes + // the final sorted run. + if (exclusive_manual_compaction_) { + return; + } + + Options opts = CurrentOptions(); + opts.compaction_style = kCompactionStyleUniversal; + opts.compaction_options_universal.max_size_amplification_percent = 50; + opts.compaction_options_universal.min_merge_width = 2; + opts.compression = kNoCompression; + opts.level0_file_num_compaction_trigger = 2; + opts.max_background_compactions = 2; + opts.num_levels = num_levels_; + Reopen(opts); + + // make sure compaction jobs can be parallelized + auto stop_token = + dbfull()->TEST_write_controler().GetCompactionPressureToken(); + + Put("key", "val"); + Flush(); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1); + ColumnFamilyMetaData cf_meta; + ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily(); + dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta); + ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size()); + std::string first_sst_filename = + cf_meta.levels[num_levels_ - 1].files[0].name; + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"CompactFilesImpl:0", + "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"}, + {"DBImpl::BackgroundCompaction():AfterPickCompaction", + "CompactFilesImpl:1"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + port::Thread compact_files_thread([&]() { + ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh, + {first_sst_filename}, num_levels_ - 1)); + }); + + TEST_SYNC_POINT( + "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"); + for (int i = 0; i < 2; ++i) { + Put("key", "val"); + Flush(); + } + dbfull()->TEST_WaitForCompact(); + + compact_files_thread.join(); +} + INSTANTIATE_TEST_CASE_P(UniversalCompactionNumLevels, DBTestUniversalCompaction, ::testing::Combine(::testing::Values(1, 3, 5), ::testing::Bool())); @@ -1574,6 +1913,241 @@ INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId, ::testing::Combine(::testing::Values(1, 8), ::testing::Bool())); +TEST_F(DBTestUniversalDeleteTrigCompaction, BasicL0toL1) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + 
NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + Put(Key(i), "val"); + } + Flush(); + // MoveFilesToLevel(6); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +TEST_F(DBTestUniversalDeleteTrigCompaction, SingleLevel) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.num_levels = 1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + Put(Key(i), "val"); + } + Flush(); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +} + +TEST_F(DBTestUniversalDeleteTrigCompaction, MultipleLevels) { + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 4; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 500; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 500; i < 1000; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 1000; i < 1500; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 1500; i < 2000; ++i) { + Put(Key(i), "val"); + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); + + for (i = 1999; i < 2333; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 2333; i < 2666; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 2666; i < 2999; ++i) { + Put(Key(i), "val"); + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + 
ASSERT_GT(NumTableFilesAtLevel(6), 0); + ASSERT_GT(NumTableFilesAtLevel(5), 0); + + for (i = 1900; i < 2100; ++i) { + Delete(Key(i)); + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(0, NumTableFilesAtLevel(2)); + ASSERT_EQ(0, NumTableFilesAtLevel(3)); + ASSERT_EQ(0, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +TEST_F(DBTestUniversalDeleteTrigCompaction, OverlappingL0) { + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 5; + opts.compression = kNoCompression; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 2000; i < 3000; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 3500; i < 4000; ++i) { + Put(Key(i), "val"); + } + Flush(); + for (i = 2900; i < 3100; ++i) { + Delete(Key(i)); + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_GT(NumTableFilesAtLevel(6), 0); +} + +TEST_F(DBTestUniversalDeleteTrigCompaction, IngestBehind) { + const int kNumKeys = 3000; + const int kWindowSize = 100; + const int kNumDelsTrigger = 90; + + Options opts = CurrentOptions(); + opts.table_properties_collector_factories.emplace_back( + NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger)); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 2; + opts.compression = kNoCompression; + opts.allow_ingest_behind = true; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + Reopen(opts); + + // add an L1 file to prevent tombstones from dropping due to obsolescence + // during flush + int i; + for (i = 0; i < 2000; ++i) { + Put(Key(i), "val"); + } + Flush(); + // MoveFilesToLevel(6); + dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + for (i = 1999; i < kNumKeys; ++i) { + if (i >= kNumKeys - kWindowSize && + i < kNumKeys - kWindowSize + kNumDelsTrigger) { + Delete(Key(i)); + } else { + Put(Key(i), "val"); + } + } + Flush(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(6)); + ASSERT_GT(NumTableFilesAtLevel(5), 0); +} + } // namespace rocksdb #endif // !defined(ROCKSDB_LITE) @@ -1584,6 +2158,8 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); #else + (void) argc; + (void) argv; return 0; #endif } diff --git a/thirdparty/rocksdb/db/db_wal_test.cc b/thirdparty/rocksdb/db/db_wal_test.cc index 461fe46739..78f72b4a0e 100644 --- a/thirdparty/rocksdb/db/db_wal_test.cc +++ b/thirdparty/rocksdb/db/db_wal_test.cc @@ -18,8 +18,119 @@ namespace rocksdb { class DBWALTest : public DBTestBase { public: DBWALTest() : DBTestBase("/db_wal_test") {} + +#if defined(ROCKSDB_PLATFORM_POSIX) + 
uint64_t GetAllocatedFileSize(std::string file_name) { + struct stat sbuf; + int err = stat(file_name.c_str(), &sbuf); + assert(err == 0); + return sbuf.st_blocks * 512; + } +#endif +}; + +// A SpecialEnv enriched to give more insight about deleted files +class EnrichedSpecialEnv : public SpecialEnv { + public: + explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {} + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, + const EnvOptions& soptions) override { + InstrumentedMutexLock l(&env_mutex_); + if (f == skipped_wal) { + deleted_wal_reopened = true; + if (IsWAL(f) && largest_deleted_wal.size() != 0 && + f.compare(largest_deleted_wal) <= 0) { + gap_in_wals = true; + } + } + return SpecialEnv::NewSequentialFile(f, r, soptions); + } + Status DeleteFile(const std::string& fname) override { + if (IsWAL(fname)) { + deleted_wal_cnt++; + InstrumentedMutexLock l(&env_mutex_); + // If this is the first WAL, remember its name and skip deleting it. We + // remember its name partly because the application might attempt to + // delete the file again. + if (skipped_wal.size() != 0 && skipped_wal != fname) { + if (largest_deleted_wal.size() == 0 || + largest_deleted_wal.compare(fname) < 0) { + largest_deleted_wal = fname; + } + } else { + skipped_wal = fname; + return Status::OK(); + } + } + return SpecialEnv::DeleteFile(fname); + } + bool IsWAL(const std::string& fname) { + // printf("iswal %s\n", fname.c_str()); + return fname.compare(fname.size() - 3, 3, "log") == 0; + } + + InstrumentedMutex env_mutex_; + // the WAL whose actual delete was skipped by the env + std::string skipped_wal = ""; + // the largest WAL that was requested to be deleted + std::string largest_deleted_wal = ""; + // number of WALs that were successfully deleted + std::atomic<int> deleted_wal_cnt = {0}; + // the WAL whose delete from fs was skipped is reopened during recovery + std::atomic<bool> deleted_wal_reopened = {false}; + // whether a gap in the WALs was detected during recovery + std::atomic<bool> gap_in_wals = {false}; +}; + +class DBWALTestWithEnrichedEnv : public DBTestBase { + public: + DBWALTestWithEnrichedEnv() : DBTestBase("/db_wal_test") { + enriched_env_ = new EnrichedSpecialEnv(env_->target()); + auto options = CurrentOptions(); + options.env = enriched_env_; + options.allow_2pc = true; + Reopen(options); + delete env_; + // to be deleted by the parent class + env_ = enriched_env_; + } + + protected: + EnrichedSpecialEnv* enriched_env_; }; + +// Test that the recovery would successfully avoid the gaps between the logs. +// One known scenario that could cause this is that the application issues the +// WAL deletions out of order. For the sake of simplicity in the test, here we +// create the gap by manipulating the env to skip deletion of the first WAL but +// not the ones after it.
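+// A concrete instance of the scenario (hypothetical file numbers): WALs +// 000003.log, 000004.log and 000005.log exist; the env skips the delete of +// 000003.log while the later ones really are deleted. After reopen, +// 000003.log is still on disk even though newer WALs are gone, so replaying +// it could resurrect data already covered by flushed SSTs; recovery has to +// recognize the stale WAL and skip it.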
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) { + auto options = last_options_; + // To cause frequent WAL deletion + options.write_buffer_size = 128; + Reopen(options); + + WriteOptions writeOpt = WriteOptions(); + for (int i = 0; i < 128 * 5; i++) { + ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); + } + FlushOptions fo; + fo.wait = true; + ASSERT_OK(db_->Flush(fo)); + + // some wals are deleted + ASSERT_NE(0, enriched_env_->deleted_wal_cnt); + // but not the first one + ASSERT_NE(0, enriched_env_->skipped_wal.size()); + + // Test that the WAL that was not deleted will be skipped during recovery + options = last_options_; + Reopen(options); + ASSERT_FALSE(enriched_env_->deleted_wal_reopened); + ASSERT_FALSE(enriched_env_->gap_in_wals); +} + TEST_F(DBWALTest, WAL) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -172,7 +283,31 @@ TEST_F(DBWALTest, RecoverWithTableHandle) { ASSERT_OK(Put(1, "bar", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "big", std::string(100, 'a'))); - ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + + options = CurrentOptions(); + const int kSmallMaxOpenFiles = 13; + if (option_config_ == kDBLogDir) { + // Use this option to check not preloading files + // Set the max open files to be small enough so no preload will + // happen. + options.max_open_files = kSmallMaxOpenFiles; + // RocksDB sanitize max open files to at least 20. Modify it back. + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = static_cast(arg); + *max_open_files = kSmallMaxOpenFiles; + }); + + } else if (option_config_ == kWalDirAndMmapReads) { + // Use this option to check always loading all files. + options.max_open_files = 100; + } else { + options.max_open_files = -1; + } + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); std::vector> files; dbfull()->TEST_GetFilesMetaData(handles_[1], &files); @@ -183,10 +318,10 @@ TEST_F(DBWALTest, RecoverWithTableHandle) { ASSERT_EQ(total_files, 3); for (const auto& level : files) { for (const auto& file : level) { - if (kInfiniteMaxOpenFiles == option_config_) { - ASSERT_TRUE(file.table_reader_handle != nullptr); - } else { + if (options.max_open_files == kSmallMaxOpenFiles) { ASSERT_TRUE(file.table_reader_handle == nullptr); + } else { + ASSERT_TRUE(file.table_reader_handle != nullptr); } } } @@ -693,12 +828,12 @@ class RecoveryTestHelper { *count = 0; - shared_ptr table_cache = NewLRUCache(50, 0); + std::shared_ptr table_cache = NewLRUCache(50, 0); EnvOptions env_options; WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); - unique_ptr versions; - unique_ptr wal_manager; + std::unique_ptr versions; + std::unique_ptr wal_manager; WriteController write_controller; versions.reset(new VersionSet(test->dbname_, &db_options, env_options, @@ -712,10 +847,10 @@ class RecoveryTestHelper { for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) { uint64_t current_log_number = j; std::string fname = LogFileName(test->dbname_, current_log_number); - unique_ptr file; + std::unique_ptr file; ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), fname, 
env_options)); current_log_writer.reset( new log::Writer(std::move(file_writer), current_log_number, db_options.recycle_log_file_num > 0)); @@ -730,7 +865,8 @@ class RecoveryTestHelper { batch.Put(key, value); WriteBatchInternal::SetSequence(&batch, seq); current_log_writer->AddRecord(WriteBatchInternal::Contents(&batch)); - versions->SetLastToBeWrittenSequence(seq); + versions->SetLastAllocatedSequence(seq); + versions->SetLastPublishedSequence(seq); versions->SetLastSequence(seq); } } @@ -876,6 +1012,39 @@ TEST_F(DBWALTest, kAbsoluteConsistency) { } } +// Test scope: +// We don't expect the data store to be opened if there is any inconsistency +// between WAL and SST files +TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) { + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + + // Create DB with multiple column families. + CreateAndReopenWithCF({"one", "two"}, options); + ASSERT_OK(Put(1, "key1", "val1")); + ASSERT_OK(Put(2, "key2", "val2")); + + // Record the offset at this point + Env* env = options.env; + uint64_t wal_file_id = dbfull()->TEST_LogfileNumber(); + std::string fname = LogFileName(dbname_, wal_file_id); + uint64_t offset_to_corrupt; + ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt)); + ASSERT_GT(offset_to_corrupt, 0); + + ASSERT_OK(Put(1, "key3", "val3")); + // Corrupt WAL at location of key3 + RecoveryTestHelper::InduceCorruption( + fname, static_cast(offset_to_corrupt), static_cast(4)); + ASSERT_OK(Put(2, "key4", "val4")); + ASSERT_OK(Put(1, "key5", "val5")); + Flush(2); + + // PIT recovery & verify + options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; + ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options)); +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered @@ -1007,7 +1176,7 @@ TEST_F(DBWALTest, AvoidFlushDuringRecovery) { Reopen(options); ASSERT_EQ("v11", Get("foo")); ASSERT_EQ("v12", Get("bar")); - ASSERT_EQ(2, TotalTableFiles()); + ASSERT_EQ(3, TotalTableFiles()); } TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) { @@ -1195,6 +1364,99 @@ TEST_F(DBWALTest, RecoverFromCorruptedWALWithoutFlush) { } } +// Tests that total log size is recovered if we set +// avoid_flush_during_recovery=true. +// Flush should trigger if max_total_wal_size is reached. +TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) { + class TestFlushListener : public EventListener { + public: + std::atomic count{0}; + + TestFlushListener() = default; + + void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override { + count++; + assert(FlushReason::kWriteBufferManager == flush_job_info.flush_reason); + } + }; + std::shared_ptr test_listener = + std::make_shared(); + + constexpr size_t kKB = 1024; + constexpr size_t kMB = 1024 * 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + options.max_total_wal_size = 1 * kMB; + options.listeners.push_back(test_listener); + // Have to open DB in multi-CF mode to trigger flush when + // max_total_wal_size is reached. + CreateAndReopenWithCF({"one"}, options); + // Write some keys and we will end up with one log file which is slightly + // smaller than 1MB. + std::string value_100k(100 * kKB, 'v'); + std::string value_300k(300 * kKB, 'v'); + ASSERT_OK(Put(0, "foo", "v1")); + for (int i = 0; i < 9; i++) { + ASSERT_OK(Put(1, "key" + ToString(i), value_100k)); + } + // Get log files before reopen. 
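+ // At this point the nine 100KB values plus overhead should leave a single + // live WAL just under the 1MB max_total_wal_size; the bounds are asserted + // below before the reopen.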
+ VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + uint64_t log_size_before = log_files_before[0]->SizeFileBytes(); + ASSERT_GT(log_size_before, 900 * kKB); + ASSERT_LT(log_size_before, 1 * kMB); + ReopenWithColumnFamilies({"default", "one"}, options); + // Write one more value to make log larger than 1MB. + ASSERT_OK(Put(1, "bar", value_300k)); + // Get log files again. A new log file will be opened. + VectorLogPtr log_files_after_reopen; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen)); + ASSERT_EQ(2, log_files_after_reopen.size()); + ASSERT_EQ(log_files_before[0]->LogNumber(), + log_files_after_reopen[0]->LogNumber()); + ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() + + log_files_after_reopen[1]->SizeFileBytes(), + 1 * kMB); + // Write one more key to trigger flush. + ASSERT_OK(Put(0, "foo", "v2")); + dbfull()->TEST_WaitForFlushMemTable(); + // Flushed two column families. + ASSERT_EQ(2, test_listener->count.load()); +} + +#if defined(ROCKSDB_PLATFORM_POSIX) +#if defined(ROCKSDB_FALLOCATE_PRESENT) +// Tests that we will truncate the preallocated space of the last log from the +// previous run. +TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) { + constexpr size_t kKB = 1024; + Options options = CurrentOptions(); + options.avoid_flush_during_recovery = true; + DestroyAndReopen(options); + size_t preallocated_size = + dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size); + ASSERT_OK(Put("foo", "v1")); + VectorLogPtr log_files_before; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before)); + ASSERT_EQ(1, log_files_before.size()); + auto& file_before = log_files_before[0]; + ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB); + // The log file has preallocated space. + ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); + Reopen(options); + VectorLogPtr log_files_after; + ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after)); + ASSERT_EQ(1, log_files_after.size()); + ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB); + // The preallocated space should be truncated. + ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()), + preallocated_size); +} +#endif // ROCKSDB_FALLOCATE_PRESENT +#endif // ROCKSDB_PLATFORM_POSIX + #endif // ROCKSDB_LITE TEST_F(DBWALTest, WalTermTest) { diff --git a/thirdparty/rocksdb/db/db_write_test.cc index 726f444fa1..e6bab87511 100644 --- a/thirdparty/rocksdb/db/db_write_test.cc +++ b/thirdparty/rocksdb/db/db_write_test.cc @@ -3,12 +3,17 @@ +#include #include #include #include "db/db_test_util.h" #include "db/write_batch_internal.h" +#include "db/write_thread.h" +#include "port/port.h" #include "port/stack_trace.h" +#include "util/fault_injection_test_env.h" +#include "util/string_util.h" #include "util/sync_point.h" namespace rocksdb { @@ -18,53 +23,166 @@ class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> { public: DBWriteTest() : DBTestBase("/db_write_test") {} - void Open() { DBTestBase::Reopen(GetOptions(GetParam())); } + Options GetOptions() { return DBTestBase::GetOptions(GetParam()); } + + void Open() { DBTestBase::Reopen(GetOptions()); } }; -// Sequence number should be return through input write batch.
-TEST_P(DBWriteTest, ReturnSeuqneceNumber) { - Random rnd(4422); - Open(); - for (int i = 0; i < 100; i++) { - WriteBatch batch; - batch.Put("key" + ToString(i), test::RandomHumanReadableString(&rnd, 10)); - ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); - ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), - WriteBatchInternal::Sequence(&batch)); - } +// It is invalid to do sync write while disabling WAL. +TEST_P(DBWriteTest, SyncAndDisableWAL) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = true; + ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument()); + WriteBatch batch; + ASSERT_OK(batch.Put("foo", "bar")); + ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument()); } -TEST_P(DBWriteTest, ReturnSeuqneceNumberMultiThreaded) { - constexpr size_t kThreads = 16; - constexpr size_t kNumKeys = 1000; - Open(); - ASSERT_EQ(0, dbfull()->GetLatestSequenceNumber()); - // Check each sequence is used once and only once. - std::vector<std::atomic_flag> flags(kNumKeys * kThreads + 1); - for (size_t i = 0; i < flags.size(); i++) { - flags[i].clear(); - } - auto writer = [&](size_t id) { - Random rnd(4422 + static_cast<uint32_t>(id)); - for (size_t k = 0; k < kNumKeys; k++) { - WriteBatch batch; - batch.Put("key" + ToString(id) + "-" + ToString(k), - test::RandomHumanReadableString(&rnd, 10)); - ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); - SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); - ASSERT_GT(sequence, 0); - ASSERT_LE(sequence, kNumKeys * kThreads); - // The sequence isn't consumed by someone else. - ASSERT_FALSE(flags[sequence].test_and_set()); - } - }; +TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { + constexpr int kNumThreads = 5; + std::unique_ptr<FaultInjectionTestEnv> mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + std::atomic<int> ready_count{0}; + std::atomic<int> leader_count{0}; std::vector<port::Thread> threads; - for (size_t i = 0; i < kThreads; i++) { - threads.emplace_back(writer, i); + mock_env->SetFilesystemActive(false); + + // Wait until all threads linked to write threads, to make sure + // all threads join the same batch group. + SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + ready_count++; + auto* w = reinterpret_cast<WriteThread::Writer*>(arg); + if (w->state == WriteThread::STATE_GROUP_LEADER) { + leader_count++; + while (ready_count < kNumThreads) { + // busy waiting + } + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + for (int i = 0; i < kNumThreads; i++) { + threads.push_back(port::Thread( + [&](int index) { + // All threads should fail. + auto res = Put("key" + ToString(index), "value"); + if (options.manual_wal_flush) { + ASSERT_TRUE(res.ok()); + // we should see fs error when we do the flush + + // TSAN reports a false alarm for lock-order-inversion but Open and + // FlushWAL are not run concurrently. Disabling this until TSAN is + // fixed. + // res = dbfull()->FlushWAL(false); + // ASSERT_FALSE(res.ok()); + } else { + ASSERT_FALSE(res.ok()); + } + }, + i)); } - for (size_t i = 0; i < kThreads; i++) { + for (int i = 0; i < kNumThreads; i++) { threads[i].join(); } + ASSERT_EQ(1, leader_count); + // Close before mock_env destruct.
+ Close(); +} + +TEST_P(DBWriteTest, ManualWalFlushInEffect) { + Options options = GetOptions(); + Reopen(options); + // try the 1st WAL created during open + ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); + ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); + ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); + ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); + // try the 2nd wal created during SwitchWAL + dbfull()->TEST_SwitchWAL(); + ASSERT_TRUE(Put("key" + ToString(0), "value").ok()); + ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); + ASSERT_TRUE(dbfull()->FlushWAL(false).ok()); + ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty()); +} + +TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) { + std::unique_ptr<FaultInjectionTestEnv> mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + Reopen(options); + for (int i = 0; i < 2; i++) { + // Forcibly fail WAL write for the first Put only. Subsequent Puts should + // fail due to read-only mode + mock_env->SetFilesystemActive(i != 0); + auto res = Put("key" + ToString(i), "value"); + // TSAN reports a false alarm for lock-order-inversion but Open and + // FlushWAL are not run concurrently. Disabling this until TSAN is + // fixed. + /* + if (options.manual_wal_flush && i == 0) { + // even with manual_wal_flush the 2nd Put should return error because of + // the read-only mode + ASSERT_TRUE(res.ok()); + // we should see fs error when we do the flush + res = dbfull()->FlushWAL(false); + } + */ + if (!options.manual_wal_flush) { + ASSERT_FALSE(res.ok()); + } + } + // Close before mock_env destruct. + Close(); +} + +TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { + Random rnd(301); + std::unique_ptr<FaultInjectionTestEnv> mock_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetOptions(); + options.env = mock_env.get(); + options.writable_file_max_buffer_size = 4 * 1024 * 1024; + options.write_buffer_size = 3 * 512 * 1024; + options.wal_bytes_per_sync = 256 * 1024; + options.manual_wal_flush = true; + Reopen(options); + mock_env->SetFilesystemActive(false, Status::IOError("Not active")); + Status s; + for (int i = 0; i < 4 * 512; ++i) { + s = Put(Key(i), RandomString(&rnd, 1024)); + if (!s.ok()) { + break; + } + } + ASSERT_EQ(s.severity(), Status::Severity::kFatalError); + + mock_env->SetFilesystemActive(true); + // Close before mock_env destruct. + Close(); +} + +// Test that db->LockWAL() flushes the WAL after locking.
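As an aside, here is a minimal usage sketch of the `LockWAL()`/`UnlockWAL()` pair the next test exercises. The helper name and the "copy" step are illustrative only, and this assumes the pair is exposed on the `DB` handle as the test suggests:

```cpp
#include "rocksdb/db.h"

// While LockWAL() is held, the in-memory WAL buffer has been flushed and
// further WAL writes are blocked, so the log files on disk stay stable.
rocksdb::Status InspectWalWhileLocked(rocksdb::DB* db) {
  rocksdb::Status s = db->LockWAL();
  if (!s.ok()) return s;
  // ... copy or checksum the WAL files here (illustrative placeholder) ...
  return db->UnlockWAL();
}
```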
+TEST_P(DBWriteTest, LockWalInEffect) { + Options options = GetOptions(); + Reopen(options); + // try the 1st WAL created during open + ASSERT_OK(Put("key" + ToString(0), "value")); + ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); + ASSERT_OK(dbfull()->LockWAL()); + ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); + ASSERT_OK(dbfull()->UnlockWAL()); + // try the 2nd wal created during SwitchWAL + dbfull()->TEST_SwitchWAL(); + ASSERT_OK(Put("key" + ToString(0), "value")); + ASSERT_TRUE(options.manual_wal_flush != dbfull()->TEST_WALBufferIsEmpty()); + ASSERT_OK(dbfull()->LockWAL()); + ASSERT_TRUE(dbfull()->TEST_WALBufferIsEmpty(false)); + ASSERT_OK(dbfull()->UnlockWAL()); } INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, diff --git a/thirdparty/rocksdb/db/dbformat.cc b/thirdparty/rocksdb/db/dbformat.cc index f287ae9f4e..cd2878198c 100644 --- a/thirdparty/rocksdb/db/dbformat.cc +++ b/thirdparty/rocksdb/db/dbformat.cc @@ -36,6 +36,36 @@ uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { return (seq << 8) | t; } +EntryType GetEntryType(ValueType value_type) { + switch (value_type) { + case kTypeValue: + return kEntryPut; + case kTypeDeletion: + return kEntryDelete; + case kTypeSingleDeletion: + return kEntrySingleDelete; + case kTypeMerge: + return kEntryMerge; + case kTypeRangeDeletion: + return kEntryRangeDeletion; + case kTypeBlobIndex: + return kEntryBlobIndex; + default: + return kEntryOther; + } +} + +bool ParseFullKey(const Slice& internal_key, FullKey* fkey) { + ParsedInternalKey ikey; + if (!ParseInternalKey(internal_key, &ikey)) { + return false; + } + fkey->user_key = ikey.user_key; + fkey->sequence = ikey.sequence; + fkey->type = GetEntryType(ikey.type); + return true; +} + void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t) { *seq = packed >> 8; *t = static_cast<ValueType>(packed & 0xff); @@ -76,28 +106,7 @@ std::string InternalKey::DebugString(bool hex) const { return result; } -const char* InternalKeyComparator::Name() const { - return name_.c_str(); -} - -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - PERF_COUNTER_ADD(user_key_comparison_count, 1); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} +const char* InternalKeyComparator::Name() const { return name_.c_str(); } int InternalKeyComparator::Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const { @@ -105,8 +114,7 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a, // increasing user key (according to user-supplied comparator) // decreasing sequence number // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(a.user_key, b.user_key); - PERF_COUNTER_ADD(user_key_comparison_count, 1); + int r = user_comparator_.Compare(a.user_key, b.user_key); if (r == 0) { if (a.sequence > b.sequence) { r = -1; @@ -121,19 +129,19 @@ int InternalKeyComparator::Compare(const ParsedInternalKey& a, return r; } -void InternalKeyComparator::FindShortestSeparator( - std::string* start, - const Slice& limit) const {
+void InternalKeyComparator::FindShortestSeparator(std::string* start, + const Slice& limit) const { // Attempt to shorten the user portion of the key Slice user_start = ExtractUserKey(*start); Slice user_limit = ExtractUserKey(limit); std::string tmp(user_start.data(), user_start.size()); - user_comparator_->FindShortestSeparator(&tmp, user_limit); + user_comparator_.FindShortestSeparator(&tmp, user_limit); if (tmp.size() <= user_start.size() && - user_comparator_->Compare(user_start, tmp) < 0) { + user_comparator_.Compare(user_start, tmp) < 0) { // User key has become shorter physically, but larger logically. // Tack on the earliest possible number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + PutFixed64(&tmp, + PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); assert(this->Compare(*start, tmp) < 0); assert(this->Compare(tmp, limit) < 0); start->swap(tmp); @@ -143,12 +151,13 @@ void InternalKeyComparator::FindShortestSeparator( void InternalKeyComparator::FindShortSuccessor(std::string* key) const { Slice user_key = ExtractUserKey(*key); std::string tmp(user_key.data(), user_key.size()); - user_comparator_->FindShortSuccessor(&tmp); + user_comparator_.FindShortSuccessor(&tmp); if (tmp.size() <= user_key.size() && - user_comparator_->Compare(user_key, tmp) < 0) { + user_comparator_.Compare(user_key, tmp) < 0) { // User key has become shorter physically, but larger logically. // Tack on the earliest possible number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); + PutFixed64(&tmp, + PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek)); assert(this->Compare(*key, tmp) < 0); key->swap(tmp); } @@ -174,4 +183,13 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { end_ = dst; } +void IterKey::EnlargeBuffer(size_t key_size) { + // If size is smaller than buffer size, continue using current buffer, + // or the static allocated one, as default + assert(key_size > buf_size_); + // Need to enlarge the buffer. + ResetBuffer(); + buf_ = new char[key_size]; + buf_size_ = key_size; +} } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/dbformat.h b/thirdparty/rocksdb/db/dbformat.h index c58b8363ab..7a5ddc1ad0 100644 --- a/thirdparty/rocksdb/db/dbformat.h +++ b/thirdparty/rocksdb/db/dbformat.h @@ -11,6 +11,7 @@ #include <stdio.h> #include <string> #include <utility> +#include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" @@ -20,6 +21,7 @@ #include "rocksdb/types.h" #include "util/coding.h" #include "util/logging.h" +#include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -49,7 +51,16 @@ enum ValueType : unsigned char { kTypeRangeDeletion = 0xF, // meta block kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only kTypeBlobIndex = 0x11, // Blob DB only - kMaxValue = 0x7F // Not used for storing records. + // When the prepared record is also persisted in db, we use a different + // record. This is to ensure that the WAL that is generated by a WritePolicy + // is not mistakenly read by another, which would result into data + // inconsistency. + kTypeBeginPersistedPrepareXID = 0x12, // WAL only. + // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL + // generated by WriteUnprepared write policy is not mistakenly read by + // another. + kTypeBeginUnprepareXID = 0x13, // WAL only. + kMaxValue = 0x7F // Not used for storing records.
}; // Defined in dbformat.cc @@ -70,8 +81,7 @@ inline bool IsExtendedValueType(ValueType t) { // We leave eight bits empty at the bottom so a type and sequence# // can be packed together into 64-bits. -static const SequenceNumber kMaxSequenceNumber = - ((0x1ull << 56) - 1); +static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1); static const SequenceNumber kDisableGlobalSequenceNumber = port::kMaxUint64; @@ -84,7 +94,7 @@ struct ParsedInternalKey { : sequence(kMaxSequenceNumber) // Make code analyzer happy {} // Intentionally left uninitialized (for speed) ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } + : user_key(u), sequence(seq), type(t) {} std::string DebugString(bool hex = false) const; void clear() { @@ -106,6 +116,8 @@ extern uint64_t PackSequenceAndType(uint64_t seq, ValueType t); // and the ValueType in *t. extern void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, ValueType* t); +EntryType GetEntryType(ValueType value_type); + // Append the serialization of "key" to *result. extern void AppendInternalKey(std::string* result, const ParsedInternalKey& key); @@ -128,39 +140,52 @@ inline Slice ExtractUserKey(const Slice& internal_key) { return Slice(internal_key.data(), internal_key.size() - 8); } -inline ValueType ExtractValueType(const Slice& internal_key) { +inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { assert(internal_key.size() >= 8); const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); + return DecodeFixed64(internal_key.data() + n - 8); +} + +inline ValueType ExtractValueType(const Slice& internal_key) { + uint64_t num = ExtractInternalKeyFooter(internal_key); unsigned char c = num & 0xff; return static_cast<ValueType>(c); } // A comparator for internal keys that uses a specified comparator for // the user key portion and breaks ties by decreasing sequence number.
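Since these hunks lean on the footer layout produced by `PackSequenceAndType()` above, here is a small standalone sketch (not part of the diff) of how the 8-byte footer packs a sequence number and type, and why comparing footers in reverse yields newest-first ordering:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t kTypeValueTag = 0x1;            // stands in for kTypeValue
  uint64_t newer = (42u << 8) | kTypeValueTag;   // seq 42
  uint64_t older = (7u << 8) | kTypeValueTag;    // seq 7
  // Unpacking mirrors UnPackSequenceAndType(): sequence in the top 56 bits,
  // type in the low byte.
  assert((newer >> 8) == 42 && (newer & 0xff) == kTypeValueTag);
  // For equal user keys the comparator returns -1 for the larger footer,
  // i.e. higher sequence numbers (newer entries) sort first.
  assert(newer > older);
  return 0;
}
```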
-class InternalKeyComparator : public Comparator { +class InternalKeyComparator +#ifdef NDEBUG + final +#endif + : public Comparator { private: - const Comparator* user_comparator_; + UserComparatorWrapper user_comparator_; std::string name_; + public: - explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c), - name_("rocksdb.InternalKeyComparator:" + - std::string(user_comparator_->Name())) { - } + explicit InternalKeyComparator(const Comparator* c) + : user_comparator_(c), + name_("rocksdb.InternalKeyComparator:" + + std::string(user_comparator_.Name())) {} virtual ~InternalKeyComparator() {} virtual const char* Name() const override; virtual int Compare(const Slice& a, const Slice& b) const override; + // Same as Compare except that it excludes the value type from comparison + virtual int CompareKeySeq(const Slice& a, const Slice& b) const; virtual void FindShortestSeparator(std::string* start, const Slice& limit) const override; virtual void FindShortSuccessor(std::string* key) const override; - const Comparator* user_comparator() const { return user_comparator_; } + const Comparator* user_comparator() const { + return user_comparator_.user_comparator(); + } int Compare(const InternalKey& a, const InternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; virtual const Comparator* GetRootComparator() const override { - return user_comparator_->GetRootComparator(); + return user_comparator_.GetRootComparator(); } }; @@ -170,8 +195,9 @@ class InternalKeyComparator : public Comparator { class InternalKey { private: std::string rep_; + public: - InternalKey() { } // Leave rep_ as empty to indicate it is invalid + InternalKey() {} // Leave rep_ as empty to indicate it is invalid InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) { AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t)); } @@ -179,15 +205,15 @@ class InternalKey { // sets the internal key to be bigger or equal to all internal keys with this // user key void SetMaxPossibleForUserKey(const Slice& _user_key) { - AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber, - kValueTypeForSeek)); + AppendInternalKey( + &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0))); } // sets the internal key to be smaller or equal to all internal keys with this // user key void SetMinPossibleForUserKey(const Slice& _user_key) { - AppendInternalKey( - &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0))); + AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber, + kValueTypeForSeek)); } bool Valid() const { @@ -228,8 +254,8 @@ class InternalKey { std::string DebugString(bool hex = false) const; }; -inline int InternalKeyComparator::Compare( - const InternalKey& a, const InternalKey& b) const { +inline int InternalKeyComparator::Compare(const InternalKey& a, + const InternalKey& b) const { return Compare(a.Encode(), b.Encode()); } @@ -266,7 +292,6 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) { return num >> 8; } - // A helper class useful for DBImpl::Get() class LookupKey { public: @@ -302,7 +327,7 @@ class LookupKey { const char* start_; const char* kstart_; const char* end_; - char space_[200]; // Avoid allocation for short keys + char space_[200]; // Avoid allocation for short keys // No copying allowed LookupKey(const LookupKey&); @@ -317,13 +342,19 @@ class IterKey { public: IterKey() : buf_(space_), - buf_size_(sizeof(space_)), key_(buf_), key_size_(0), + buf_size_(sizeof(space_)), is_user_key_(true)
{} ~IterKey() { ResetBuffer(); } + // The bool will be picked up by the next calls to SetKey + void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; } + + // Returns the key in whichever format that was provided to KeyIter + Slice GetKey() const { return Slice(key_, key_size_); } + Slice GetInternalKey() const { assert(!IsUserKey()); return Slice(key_, key_size_); @@ -373,6 +404,11 @@ class IterKey { key_size_ = total_size; } + Slice SetKey(const Slice& key, bool copy = true) { + // is_user_key_ expected to be set already via SetIsUserKey + return SetKeyImpl(key, copy); + } + Slice SetUserKey(const Slice& key, bool copy = true) { is_user_key_ = true; return SetKeyImpl(key, copy); @@ -463,9 +499,9 @@ class IterKey { private: char* buf_; - size_t buf_size_; const char* key_; size_t key_size_; + size_t buf_size_; char space_[32]; // Avoid allocation for short keys bool is_user_key_; @@ -502,13 +538,12 @@ class IterKey { // If size is smaller than buffer size, continue using current buffer, // or the static allocated one, as default if (key_size > buf_size_) { - // Need to enlarge the buffer. - ResetBuffer(); - buf_ = new char[key_size]; - buf_size_ = key_size; + EnlargeBuffer(key_size); } } + void EnlargeBuffer(size_t key_size); + // No copying allowed IterKey(const IterKey&) = delete; void operator=(const IterKey&) = delete; @@ -589,10 +624,66 @@ struct RangeTombstone { return InternalKey(start_key_, seq_, kTypeRangeDeletion); } + // The tombstone end-key is exclusive, so we generate an internal-key here + // which has a similar property. Using kMaxSequenceNumber guarantees that + // the returned internal-key will compare less than any other internal-key + // with the same user-key. This in turn guarantees that the serialized + // end-key for a tombstone such as [a-b] will compare less than the key "b". 
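+  // For example, the end key of RangeTombstone("a", "b", 2) serializes to
+  // ("b", kMaxSequenceNumber, kTypeRangeDeletion), which sorts before the
+  // internal key ("b", 3, kTypeValue); RangeTombstoneSerializeEndKey in
+  // dbformat_test.cc below asserts exactly this.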
+ // // be careful to use SerializeEndKey(), allocates new memory InternalKey SerializeEndKey() const { - return InternalKey(end_key_, seq_, kTypeRangeDeletion); + return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion); } }; +inline int InternalKeyComparator::Compare(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + // decreasing type (though sequence# should be enough to disambiguate) + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, + const Slice& bkey) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); + if (r == 0) { + // Shift the number to exclude the last byte which contains the value type + const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8) >> 8; + const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8) >> 8; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + +struct ParsedInternalKeyComparator { + explicit ParsedInternalKeyComparator(const InternalKeyComparator* c) + : cmp(c) {} + + bool operator()(const ParsedInternalKey& a, + const ParsedInternalKey& b) const { + return cmp->Compare(a, b) < 0; + } + + const InternalKeyComparator* cmp; +}; + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/dbformat_test.cc b/thirdparty/rocksdb/db/dbformat_test.cc index d96b5757af..0b16c13f57 100644 --- a/thirdparty/rocksdb/db/dbformat_test.cc +++ b/thirdparty/rocksdb/db/dbformat_test.cc @@ -192,6 +192,13 @@ TEST_F(FormatTest, UpdateInternalKey) { ASSERT_EQ(new_val_type, decoded.type); } +TEST_F(FormatTest, RangeTombstoneSerializeEndKey) { + RangeTombstone t("a", "b", 2); + InternalKey k("b", 3, kTypeValue); + const InternalKeyComparator cmp(BytewiseComparator()); + ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/deletefile_test.cc b/thirdparty/rocksdb/db/deletefile_test.cc index 989c0c4118..3ae464c584 100644 --- a/thirdparty/rocksdb/db/deletefile_test.cc +++ b/thirdparty/rocksdb/db/deletefile_test.cc @@ -45,7 +45,7 @@ class DeleteFileTest : public testing::Test { options_.max_bytes_for_level_base = 1024*1024*1000; options_.WAL_ttl_seconds = 300; // Used to test log files options_.WAL_size_limit_MB = 1024; // Used to test log files - dbname_ = test::TmpDir() + "/deletefile_test"; + dbname_ = test::PerThreadDBPath("deletefile_test"); options_.wal_dir = dbname_ + "/wal_files"; // clean up all the files that might have been there before @@ -71,7 +71,9 @@ class DeleteFileTest : public testing::Test { } db_ = nullptr; options_.create_if_missing = create; - return DB::Open(options_, dbname_, &db_); + Status s = DB::Open(options_, dbname_, &db_); + assert(db_); + return s; } void CloseDB() { @@ -159,7 +161,7 @@ class DeleteFileTest : public testing::Test { } // An empty job to guard all jobs are processed - static void GuardFinish(void* arg) { + static void GuardFinish(void* /*arg*/) { 
TEST_SYNC_POINT("DeleteFileTest::GuardFinish"); } }; @@ -228,7 +230,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { // this time, we keep an iterator alive ReopenDB(true); - Iterator *itr = 0; + Iterator *itr = nullptr; CreateTwoLevels(); itr = db_->NewIterator(ReadOptions()); db_->CompactRange(compact_options, &first_slice, &last_slice); @@ -241,7 +243,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CloseDB(); } -TEST_F(DeleteFileTest, BackgroundPurgeTest) { +TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -249,7 +251,7 @@ Slice first_slice(first), last_slice(last); // We keep an iterator alive - Iterator* itr = 0; + Iterator* itr = nullptr; CreateTwoLevels(); ReadOptions options; options.background_purge_on_iterator_cleanup = true; @@ -279,6 +281,53 @@ CloseDB(); } +TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { + auto do_test = [&](bool bg_purge) { + ColumnFamilyOptions co; + WriteOptions wo; + FlushOptions fo; + ColumnFamilyHandle* cfh = nullptr; + + ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh)); + + ASSERT_OK(db_->Put(wo, cfh, "pika", "chu")); + ASSERT_OK(db_->Flush(fo, cfh)); + // Expect 1 sst file. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + ASSERT_OK(db_->DropColumnFamily(cfh)); + // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + delete cfh; + test::SleepingBackgroundTask sleeping_task_after; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_after, Env::Priority::HIGH); + // If background purge is enabled, the file should still be there. + CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + + // Execute background purges. + sleeping_task_after.WakeUp(); + sleeping_task_after.WaitUntilDone(); + // The file should have been deleted. + CheckFileTypeCounts(dbname_, 0, 0, 1); + }; + + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = false"); + do_test(false); + } + + options_.avoid_unnecessary_blocking_io = true; + ASSERT_OK(ReopenDB(false)); + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); + do_test(true); + } + + CloseDB(); +} + // This test is to reproduce a bug that read invalid ReadOption in iterator // cleanup function TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { @@ -289,7 +338,7 @@ Slice first_slice(first), last_slice(last); // We keep an iterator alive - Iterator* itr = 0; + Iterator* itr = nullptr; CreateTwoLevels(); ReadOptions* options = new ReadOptions(); options->background_purge_on_iterator_cleanup = true; @@ -500,7 +549,7 @@ int main(int argc, char** argv) { #else #include <stdio.h> -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n"); return 0; diff --git a/thirdparty/rocksdb/db/error_handler.cc b/thirdparty/rocksdb/db/error_handler.cc new file mode 100644 index 0000000000..afec14edcb --- /dev/null +++ b/thirdparty/rocksdb/db/error_handler.cc @@ -0,0 +1,345 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+// +#include "db/error_handler.h" +#include "db/db_impl.h" +#include "db/event_helpers.h" +#include "util/sst_file_manager_impl.h" + +namespace rocksdb { + +// Maps to help decide the severity of an error based on the +// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks +// is set or not. There are 3 maps, going from most specific to least specific +// (i.e from all 4 fields in a tuple to only the BackgroundErrorReason and +// paranoid_checks). The less specific map serves as a catch all in case we miss +// a specific error code or subcode. +std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>, + Status::Severity> + ErrorSeverityMap = { + // Errors during BG compaction + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kSoftError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, Status::SubCode::kSpaceLimit, + true), + Status::Severity::kHardError}, + // Errors during BG flush + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kNoSpace, true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kNoSpace, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError, + Status::SubCode::kSpaceLimit, true), + Status::Severity::kHardError}, + // Errors during Write + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kNoSpace, + true), + Status::Severity::kHardError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, Status::SubCode::kNoSpace, + false), + Status::Severity::kHardError}, +}; + +std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>, Status::Severity> + DefaultErrorSeverityMap = { + // Errors during BG compaction + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kCorruption, true), + Status::Severity::kUnrecoverableError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kCorruption, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, + Status::Code::kIOError, false), + Status::Severity::kNoError}, + // Errors during BG flush + {std::make_tuple(BackgroundErrorReason::kFlush, + Status::Code::kCorruption, true), + Status::Severity::kUnrecoverableError}, + {std::make_tuple(BackgroundErrorReason::kFlush, + Status::Code::kCorruption, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kFlush, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlush, + Status::Code::kIOError, false), + Status::Severity::kNoError}, + // Errors during Write + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kCorruption, true), + Status::Severity::kUnrecoverableError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kCorruption, false), + Status::Severity::kNoError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, + Status::Code::kIOError, false), + Status::Severity::kNoError}, +};
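Read together with `DefaultReasonMap` just below, the three maps form a most-specific-first lookup. A simplified sketch (not part of the diff) of the cascade that `SetBGError()` performs over them:

```cpp
// Falls back from (reason, code, subcode, paranoid) to (reason, code,
// paranoid) to (reason, paranoid); kFatalError when nothing matches.
Status::Severity LookupSeverity(BackgroundErrorReason reason,
                                Status::Code code, Status::SubCode subcode,
                                bool paranoid) {
  auto it = ErrorSeverityMap.find(
      std::make_tuple(reason, code, subcode, paranoid));
  if (it != ErrorSeverityMap.end()) return it->second;
  auto it2 = DefaultErrorSeverityMap.find(
      std::make_tuple(reason, code, paranoid));
  if (it2 != DefaultErrorSeverityMap.end()) return it2->second;
  auto it3 = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
  if (it3 != DefaultReasonMap.end()) return it3->second;
  return Status::Severity::kFatalError;
}
```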
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity> + DefaultReasonMap = { + // Errors during BG compaction + {std::make_tuple(BackgroundErrorReason::kCompaction, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kCompaction, false), + Status::Severity::kNoError}, + // Errors during BG flush + {std::make_tuple(BackgroundErrorReason::kFlush, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kFlush, false), + Status::Severity::kNoError}, + // Errors during Write + {std::make_tuple(BackgroundErrorReason::kWriteCallback, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kWriteCallback, false), + Status::Severity::kFatalError}, + // Errors during Memtable update + {std::make_tuple(BackgroundErrorReason::kMemTable, true), + Status::Severity::kFatalError}, + {std::make_tuple(BackgroundErrorReason::kMemTable, false), + Status::Severity::kFatalError}, +}; + +void ErrorHandler::CancelErrorRecovery() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // We'll release the lock before calling sfm, so make sure no new + // recovery gets scheduled at that point + auto_recovery_ = false; + SstFileManagerImpl* sfm = reinterpret_cast<SstFileManagerImpl*>( + db_options_.sst_file_manager.get()); + if (sfm) { + // This may or may not cancel a pending recovery + db_mutex_->Unlock(); + bool cancelled = sfm->CancelErrorRecovery(this); + db_mutex_->Lock(); + if (cancelled) { + recovery_in_prog_ = false; + } + } +#endif +} + +// This is the main function for looking at an error during a background +// operation and deciding the severity, and error recovery strategy. The high +// level algorithm is as follows - +// 1. Classify the severity of the error based on the ErrorSeverityMap, +// DefaultErrorSeverityMap and DefaultReasonMap defined earlier +// 2. Call a Status code specific override function to adjust the severity +// if needed. The reason for this is our ability to recover may depend on +// the exact options enabled in DBOptions +// 3. Determine if auto recovery is possible. A listener notification callback +// is called, which can disable the auto recovery even if we decide its +// feasible +// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control +// the actual recovery. If no sst file manager is specified in DBOptions, +// a default one is allocated during DB::Open(), so there will always be +// one. +// This can also get called as part of a recovery operation. In that case, we +// also track the error separately in recovery_error_ so we can tell in the +// end whether recovery succeeded or not +Status ErrorHandler::SetBGError(const Status& bg_err, BackgroundErrorReason reason) { + db_mutex_->AssertHeld(); + + if (bg_err.ok()) { + return Status::OK(); + } + + // Check if recovery is currently in progress.
If it is, we will save this + // error so we can check it at the end to see if recovery succeeded or not + if (recovery_in_prog_ && recovery_error_.ok()) { + recovery_error_ = bg_err; + } + + bool paranoid = db_options_.paranoid_checks; + Status::Severity sev = Status::Severity::kFatalError; + Status new_bg_err; + bool found = false; + + { + auto entry = ErrorSeverityMap.find(std::make_tuple(reason, bg_err.code(), + bg_err.subcode(), paranoid)); + if (entry != ErrorSeverityMap.end()) { + sev = entry->second; + found = true; + } + } + + if (!found) { + auto entry = DefaultErrorSeverityMap.find(std::make_tuple(reason, + bg_err.code(), paranoid)); + if (entry != DefaultErrorSeverityMap.end()) { + sev = entry->second; + found = true; + } + } + + if (!found) { + auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid)); + if (entry != DefaultReasonMap.end()) { + sev = entry->second; + } + } + + new_bg_err = Status(bg_err, sev); + + bool auto_recovery = auto_recovery_; + if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) { + auto_recovery = false; + ; + } + + // Allow some error specific overrides + if (new_bg_err == Status::NoSpace()) { + new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery); + } + + if (!new_bg_err.ok()) { + Status s = new_bg_err; + EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s, + db_mutex_, &auto_recovery); + if (!s.ok() && (s.severity() > bg_error_.severity())) { + bg_error_ = s; + } else { + // This error is less severe than previously encountered error. Don't + // take any further action + return bg_error_; + } + } + + if (auto_recovery) { + recovery_in_prog_ = true; + + // Kick-off error specific recovery + if (bg_error_ == Status::NoSpace()) { + RecoverFromNoSpace(); + } + } + return bg_error_; +} + +Status ErrorHandler::OverrideNoSpaceError(Status bg_error, + bool* auto_recovery) { +#ifndef ROCKSDB_LITE + if (bg_error.severity() >= Status::Severity::kFatalError) { + return bg_error; + } + + if (db_options_.sst_file_manager.get() == nullptr) { + // We rely on SFM to poll for enough disk space and recover + *auto_recovery = false; + return bg_error; + } + + if (db_options_.allow_2pc && + (bg_error.severity() <= Status::Severity::kSoftError)) { + // Don't know how to recover, as the contents of the current WAL file may + // be inconsistent, and it may be needed for 2PC. 
If 2PC is not enabled, + // we can just flush the memtable and discard the log + *auto_recovery = false; + return Status(bg_error, Status::Severity::kFatalError); + } + + { + uint64_t free_space; + if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path, + &free_space) == Status::NotSupported()) { + *auto_recovery = false; + } + } + + return bg_error; +#else + (void)auto_recovery; + return Status(bg_error, Status::Severity::kFatalError); +#endif +} + +void ErrorHandler::RecoverFromNoSpace() { +#ifndef ROCKSDB_LITE + SstFileManagerImpl* sfm = + reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get()); + + // Inform SFM of the error, so it can kick-off the recovery + if (sfm) { + sfm->StartErrorRecovery(this, bg_error_); + } +#endif +} + +Status ErrorHandler::ClearBGError() { +#ifndef ROCKSDB_LITE + db_mutex_->AssertHeld(); + + // Signal that recovery succeeded + if (recovery_error_.ok()) { + Status old_bg_error = bg_error_; + bg_error_ = Status::OK(); + recovery_in_prog_ = false; + EventHelpers::NotifyOnErrorRecoveryCompleted(db_options_.listeners, + old_bg_error, db_mutex_); + } + return recovery_error_; +#else + return bg_error_; +#endif +} + +Status ErrorHandler::RecoverFromBGError(bool is_manual) { +#ifndef ROCKSDB_LITE + InstrumentedMutexLock l(db_mutex_); + if (is_manual) { + // If its a manual recovery and there's a background recovery in progress + // return busy status + if (recovery_in_prog_) { + return Status::Busy(); + } + recovery_in_prog_ = true; + } + + if (bg_error_.severity() == Status::Severity::kSoftError) { + // Simply clear the background error and return + recovery_error_ = Status::OK(); + return ClearBGError(); + } + + // Reset recovery_error_. We will use this to record any errors that happen + // during the recovery process. While recovering, the only operations that + // can generate background errors should be the flush operations + recovery_error_ = Status::OK(); + Status s = db_->ResumeImpl(); + // For manual recover, shutdown, and fatal error cases, set + // recovery_in_prog_ to false. For automatic background recovery, leave it + // as is regardless of success or failure as it will be retried + if (is_manual || s.IsShutdownInProgress() || + bg_error_.severity() >= Status::Severity::kFatalError) { + recovery_in_prog_ = false; + } + return s; +#else + (void)is_manual; + return bg_error_; +#endif +} +} diff --git a/thirdparty/rocksdb/db/error_handler.h b/thirdparty/rocksdb/db/error_handler.h new file mode 100644 index 0000000000..c2af809fc6 --- /dev/null +++ b/thirdparty/rocksdb/db/error_handler.h @@ -0,0 +1,75 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+#pragma once + +#include "monitoring/instrumented_mutex.h" +#include "options/db_options.h" +#include "rocksdb/listener.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class DBImpl; + +class ErrorHandler { + public: + ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options, + InstrumentedMutex* db_mutex) + : db_(db), + db_options_(db_options), + bg_error_(Status::OK()), + recovery_error_(Status::OK()), + db_mutex_(db_mutex), + auto_recovery_(false), + recovery_in_prog_(false) {} + ~ErrorHandler() {} + + void EnableAutoRecovery() { auto_recovery_ = true; } + + Status::Severity GetErrorSeverity(BackgroundErrorReason reason, + Status::Code code, + Status::SubCode subcode); + + Status SetBGError(const Status& bg_err, BackgroundErrorReason reason); + + Status GetBGError() { return bg_error_; } + + Status GetRecoveryError() { return recovery_error_; } + + Status ClearBGError(); + + bool IsDBStopped() { + return !bg_error_.ok() && + bg_error_.severity() >= Status::Severity::kHardError; + } + + bool IsBGWorkStopped() { + return !bg_error_.ok() && + (bg_error_.severity() >= Status::Severity::kHardError || + !auto_recovery_); + } + + bool IsRecoveryInProgress() { return recovery_in_prog_; } + + Status RecoverFromBGError(bool is_manual = false); + void CancelErrorRecovery(); + + private: + DBImpl* db_; + const ImmutableDBOptions& db_options_; + Status bg_error_; + // A separate Status variable used to record any errors during the + // recovery process from hard errors + Status recovery_error_; + InstrumentedMutex* db_mutex_; + // A flag indicating whether automatic recovery from errors is enabled + bool auto_recovery_; + bool recovery_in_prog_; + + Status OverrideNoSpaceError(Status bg_error, bool* auto_recovery); + void RecoverFromNoSpace(); +}; + +} diff --git a/thirdparty/rocksdb/db/error_handler_test.cc b/thirdparty/rocksdb/db/error_handler_test.cc new file mode 100644 index 0000000000..d33e19df5d --- /dev/null +++ b/thirdparty/rocksdb/db/error_handler_test.cc @@ -0,0 +1,691 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
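Before the tests, a minimal sketch (not part of the diff) of the recovery flow they exercise, assuming the public `DB::Resume()` entry point that forwards to `ErrorHandler::RecoverFromBGError()`:

```cpp
#include "rocksdb/db.h"

// A hard background error puts the DB into read-only mode; once the operator
// clears the underlying cause (e.g. frees disk space), Resume() attempts to
// clear the background error and re-enable writes.
rocksdb::Status PutWithManualRecovery(rocksdb::DB* db,
                                      const rocksdb::Slice& key,
                                      const rocksdb::Slice& value) {
  rocksdb::Status s = db->Put(rocksdb::WriteOptions(), key, value);
  if (!s.ok() && s.severity() >= rocksdb::Status::Severity::kHardError) {
    // ... fix the environment here (illustrative placeholder) ...
    s = db->Resume();
  }
  return s;
}
```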
+#ifndef ROCKSDB_LITE + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/sst_file_manager.h" +#include "util/fault_injection_test_env.h" +#if !defined(ROCKSDB_LITE) +#include "util/sync_point.h" +#endif + +namespace rocksdb { + +class DBErrorHandlingTest : public DBTestBase { + public: + DBErrorHandlingTest() : DBTestBase("/db_error_handling_test") {} +}; + +class DBErrorHandlingEnv : public EnvWrapper { + public: + DBErrorHandlingEnv() : EnvWrapper(Env::Default()), + trig_no_space(false), trig_io_error(false) {} + + void SetTrigNoSpace() {trig_no_space = true;} + void SetTrigIoError() {trig_io_error = true;} + private: + bool trig_no_space; + bool trig_io_error; +}; + +class ErrorHandlerListener : public EventListener { + public: + ErrorHandlerListener() + : mutex_(), + cv_(&mutex_), + no_auto_recovery_(false), + recovery_complete_(false), + file_creation_started_(false), + override_bg_error_(false), + file_count_(0), + fault_env_(nullptr) {} + + void OnTableFileCreationStarted( + const TableFileCreationBriefInfo& /*ti*/) override { + InstrumentedMutexLock l(&mutex_); + file_creation_started_ = true; + if (file_count_ > 0) { + if (--file_count_ == 0) { + fault_env_->SetFilesystemActive(false, file_creation_error_); + file_creation_error_ = Status::OK(); + } + } + cv_.SignalAll(); + } + + void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, + Status /*bg_error*/, + bool* auto_recovery) override { + if (*auto_recovery && no_auto_recovery_) { + *auto_recovery = false; + } + } + + void OnErrorRecoveryCompleted(Status /*old_bg_error*/) override { + InstrumentedMutexLock l(&mutex_); + recovery_complete_ = true; + cv_.SignalAll(); + } + + bool WaitForRecovery(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!recovery_complete_) { + cv_.Wait(/*abs_time_us*/); + } + if (recovery_complete_) { + recovery_complete_ = false; + return true; + } + return false; + } + + void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) { + InstrumentedMutexLock l(&mutex_); + while (!file_creation_started_) { + cv_.Wait(/*abs_time_us*/); + } + file_creation_started_ = false; + } + + void OnBackgroundError(BackgroundErrorReason /*reason*/, + Status* bg_error) override { + if (override_bg_error_) { + *bg_error = bg_error_; + override_bg_error_ = false; + } + } + + void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; } + + void OverrideBGError(Status bg_err) { + bg_error_ = bg_err; + override_bg_error_ = true; + } + + void InjectFileCreationError(FaultInjectionTestEnv* env, int file_count, + Status s) { + fault_env_ = env; + file_count_ = file_count; + file_creation_error_ = s; + } + + private: + InstrumentedMutex mutex_; + InstrumentedCondVar cv_; + bool no_auto_recovery_; + bool recovery_complete_; + bool file_creation_started_; + bool override_bg_error_; + int file_count_; + Status file_creation_error_; + Status bg_error_; + FaultInjectionTestEnv* fault_env_; +}; + +TEST_F(DBErrorHandlingTest, FLushWriteError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::Start", [&](void *) { +
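        // (Injects an out-of-space error at the start of the flush job; the
        // Flush() below must then surface a hard background error.)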
fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_EQ(s, Status::OK()); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, CompactionWriteError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.listeners.emplace_back(listener); + options.env = fault_env.get(); + Status s; + DestroyAndReopen(options); + + Put(Key(0), "va;"); + Put(Key(2), "va;"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + listener->OverrideBGError( + Status(Status::NoSpace(), Status::Severity::kHardError) + ); + listener->EnableAutoRecovery(false); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"FlushMemTableFinished", "BackgroundCallCompaction:0"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void *) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put(Key(1), "val"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + + fault_env->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_EQ(s, Status::OK()); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, CorruptionError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = 2; + options.env = fault_env.get(); + Status s; + DestroyAndReopen(options); + + Put(Key(0), "va;"); + Put(Key(2), "va;"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"FlushMemTableFinished", "BackgroundCallCompaction:0"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BackgroundCallCompaction:0", [&](void *) { + fault_env->SetFilesystemActive(false, Status::Corruption("Corruption")); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Put(Key(1), "val"); + s = Flush(); + ASSERT_EQ(s, Status::OK()); + + s = dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kUnrecoverableError); + + fault_env->SetFilesystemActive(true); + s = dbfull()->Resume(); + ASSERT_NE(s, Status::OK()); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, AutoRecoverFlushError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); +
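  // Clear the injected fault; with auto recovery enabled, the listener
  // should observe the background recovery completing.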
SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + s = Put(Key(1), "val"); + ASSERT_EQ(s, Status::OK()); + + Reopen(options); + ASSERT_EQ("val", Get(Key(0))); + ASSERT_EQ("val", Get(Key(1))); + Destroy(options); +} + +TEST_F(DBErrorHandlingTest, FailRecoverFlushError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + Put(Key(0), "val"); + SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + }); + SyncPoint::GetInstance()->EnableProcessing(); + s = Flush(); + ASSERT_EQ(s.severity(), rocksdb::Status::Severity::kHardError); + // We should be able to shutdown the database while auto recovery is going + // on in the background + Close(); + DestroyDB(dbname_, options); +} + +TEST_F(DBErrorHandlingTest, WALWriteError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + DestroyAndReopen(options); + + { + WriteBatch batch; + + for (auto i = 0; i<100; ++i) { + batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + for (auto i = 100; i<199; ++i) { + batch.Put(Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack("WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Reopen(options); + for (auto i=0; i<199; ++i) { + if (i < 100) { + ASSERT_NE(Get(Key(i)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiCFWALWriteError) { + std::unique_ptr<FaultInjectionTestEnv> fault_env( + new FaultInjectionTestEnv(Env::Default())); + std::shared_ptr<ErrorHandlerListener> listener(new ErrorHandlerListener()); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.writable_file_max_buffer_size = 32768; + options.env = fault_env.get(); + options.listeners.emplace_back(listener); + Status s; + Random rnd(301); + + listener->EnableAutoRecovery(); + CreateAndReopenWithCF({"one", "two", "three"}, options); + + { + WriteBatch batch; + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 100; ++j) { + batch.Put(handles_[i], Key(j), RandomString(&rnd, 1024)); + } + }
+ + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(dbfull()->Write(wopts, &batch), Status::OK()); + }; + + { + WriteBatch batch; + int write_error = 0; + + // Write to one CF + for (auto i = 100; i < 199; ++i) { + batch.Put(handles_[2], Key(i), RandomString(&rnd, 1024)); + } + + SyncPoint::GetInstance()->SetCallBack( + "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) { + write_error++; + if (write_error > 2) { + fault_env->SetFilesystemActive(false, + Status::NoSpace("Out of space")); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions wopts; + wopts.sync = true; + s = dbfull()->Write(wopts, &batch); + ASSERT_EQ(s, s.NoSpace()); + } + SyncPoint::GetInstance()->DisableProcessing(); + fault_env->SetFilesystemActive(true); + ASSERT_EQ(listener->WaitForRecovery(5000000), true); + + for (auto i = 1; i < 4; ++i) { + // Every CF should have been flushed + ASSERT_EQ(NumTableFilesAtLevel(0, i), 1); + } + + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + ReopenWithColumnFamilies({"default", "one", "two", "three"}, options); + for (auto i = 1; i < 4; ++i) { + for (auto j = 0; j < 199; ++j) { + if (j < 100) { + ASSERT_NE(Get(i, Key(j)), "NOT_FOUND"); + } else { + ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND"); + } + } + } + Close(); +} + +TEST_F(DBErrorHandlingTest, MultiDBCompactionError) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env; + std::vector<Options> options; + std::vector<std::shared_ptr<ErrorHandlerListener>> listener; + std::vector<DB*> db; + std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true); + ASSERT_EQ(s.severity(),
Status::Severity::kSoftError); + fault_env[i]->SetFilesystemActive(true); + } + + def_env->SetFilesystemActive(true); + for (auto i = 0; i < kNumDbInstances; ++i) { + std::string prop; + ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(0), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 0); + EXPECT_TRUE(db[i]->GetProperty( + "rocksdb.num-files-at-level" + NumberToString(1), &prop)); + EXPECT_EQ(atoi(prop.c_str()), 1); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + char buf[16]; + snprintf(buf, sizeof(buf), "_%d", i); + delete db[i]; + fault_env[i]->SetFilesystemActive(true); + if (getenv("KEEP_DB")) { + printf("DB is still at %s%s\n", dbname_.c_str(), buf); + } else { + Status s = DestroyDB(dbname_ + std::string(buf), options[i]); + } + } + options.clear(); + sfm.reset(); + delete def_env; +} + +TEST_F(DBErrorHandlingTest, MultiDBVariousErrors) { + FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(Env::Default()); + std::vector<std::unique_ptr<FaultInjectionTestEnv>> fault_env; + std::vector<Options> options; + std::vector<std::shared_ptr<ErrorHandlerListener>> listener; + std::vector<DB*> db; + std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env)); + int kNumDbInstances = 3; + Random rnd(301); + + for (auto i = 0; i < kNumDbInstances; ++i) { + listener.emplace_back(new ErrorHandlerListener()); + options.emplace_back(GetDefaultOptions()); + fault_env.emplace_back(new FaultInjectionTestEnv(Env::Default())); + options[i].create_if_missing = true; + options[i].level0_file_num_compaction_trigger = 2; + options[i].writable_file_max_buffer_size = 32768; + options[i].env = fault_env[i].get(); + options[i].listeners.emplace_back(listener[i]); + options[i].sst_file_manager = sfm; + DB* dbptr; + char buf[16]; + + listener[i]->EnableAutoRecovery(); + switch (i) { + case 0: + // Setup for returning error for the 3rd SST, which would be level 1 + listener[i]->InjectFileCreationError(fault_env[i].get(), 3, + Status::NoSpace("Out of space")); + break; + case 1: + // Setup for returning error after the 1st SST, which would result + // in a hard error + listener[i]->InjectFileCreationError(fault_env[i].get(), 2, + Status::NoSpace("Out of space")); + break; + default: + break; + } + snprintf(buf, sizeof(buf), "_%d", i); + DestroyDB(dbname_ + std::string(buf), options[i]); + ASSERT_EQ(DB::Open(options[i], dbname_ + std::string(buf), &dbptr), + Status::OK()); + db.emplace_back(dbptr); + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + for (auto j = 0; j <= 100; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } + + def_env->SetFilesystemActive(false, Status::NoSpace("Out of space")); + for (auto i = 0; i < kNumDbInstances; ++i) { + WriteBatch batch; + + // Write to one CF + for (auto j = 100; j < 199; ++j) { + batch.Put(Key(j), RandomString(&rnd, 1024)); + } + + WriteOptions wopts; + wopts.sync = true; + ASSERT_EQ(db[i]->Write(wopts, &batch), Status::OK()); + if (i != 1) { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::OK()); + } else { + ASSERT_EQ(db[i]->Flush(FlushOptions()), Status::NoSpace()); + } + } + + for (auto i = 0; i < kNumDbInstances; ++i) { + Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true); + switch (i) { + case 0: + ASSERT_EQ(s.severity(), Status::Severity::kSoftError); + break; + case 1: + ASSERT_EQ(s.severity(), Status::Severity::kHardError); + break; + case 2: + ASSERT_EQ(s,
+  def_env->SetFilesystemActive(true);
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    std::string prop;
+    if (i < 2) {
+      ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+    }
+    if (i == 1) {
+      ASSERT_EQ(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true),
+                Status::OK());
+    }
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(0), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 0);
+    EXPECT_TRUE(db[i]->GetProperty(
+        "rocksdb.num-files-at-level" + NumberToString(1), &prop));
+    EXPECT_EQ(atoi(prop.c_str()), 1);
+  }
+
+  for (auto i = 0; i < kNumDbInstances; ++i) {
+    char buf[16];
+    snprintf(buf, sizeof(buf), "_%d", i);
+    fault_env[i]->SetFilesystemActive(true);
+    delete db[i];
+    if (getenv("KEEP_DB")) {
+      printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+    } else {
+      DestroyDB(dbname_ + std::string(buf), options[i]);
+    }
+  }
+  options.clear();
+  delete def_env;
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  rocksdb::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+  fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+  return 0;
+}
+
+#endif  // ROCKSDB_LITE
diff --git a/thirdparty/rocksdb/db/event_helpers.cc b/thirdparty/rocksdb/db/event_helpers.cc
index 1b79acb0f2..c80c5aefb7 100644
--- a/thirdparty/rocksdb/db/event_helpers.cc
+++ b/thirdparty/rocksdb/db/event_helpers.cc
@@ -8,7 +8,7 @@
 namespace rocksdb {
 namespace {
-template<class T>
+template <class T>
 inline T SafeDivide(T a, T b) {
   return b == 0 ? 0 : a / b;
 }
@@ -17,7 +17,8 @@ inline T SafeDivide(T a, T b) {
 void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
   *jwriter << "time_micros"
            << std::chrono::duration_cast<std::chrono::microseconds>(
-                  std::chrono::system_clock::now().time_since_epoch()).count();
+                  std::chrono::system_clock::now().time_since_epoch())
+                  .count();
 }
 
 #ifndef ROCKSDB_LITE
@@ -39,8 +40,8 @@ void EventHelpers::NotifyTableFileCreationStarted(
 
 void EventHelpers::NotifyOnBackgroundError(
     const std::vector<std::shared_ptr<EventListener>>& listeners,
-    BackgroundErrorReason reason, Status* bg_error,
-    InstrumentedMutex* db_mutex) {
+    BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+    bool* auto_recovery) {
 #ifndef ROCKSDB_LITE
   if (listeners.size() == 0U) {
     return;
@@ -50,8 +51,17 @@ void EventHelpers::NotifyOnBackgroundError(
   db_mutex->Unlock();
   for (auto& listener : listeners) {
     listener->OnBackgroundError(reason, bg_error);
+    if (*auto_recovery) {
+      listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+    }
   }
   db_mutex->Lock();
+#else
+  (void)listeners;
+  (void)reason;
+  (void)bg_error;
+  (void)db_mutex;
+  (void)auto_recovery;
 #endif  // ROCKSDB_LITE
 }
 
@@ -117,20 +127,25 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
   for (auto& listener : listeners) {
     listener->OnTableFileCreated(info);
   }
+#else
+  (void)listeners;
+  (void)db_name;
+  (void)cf_name;
+  (void)file_path;
+  (void)reason;
 #endif  // !ROCKSDB_LITE
 }
 
 void EventHelpers::LogAndNotifyTableFileDeletion(
-    EventLogger* event_logger, int job_id,
-    uint64_t file_number, const std::string& file_path,
-    const Status& status, const std::string& dbname,
+    EventLogger* event_logger, int job_id, uint64_t file_number,
+    const std::string& file_path, const Status& status,
+    const std::string& dbname,
     const std::vector<std::shared_ptr<EventListener>>& listeners) {
-
   JSONWriter jwriter;
   AppendCurrentTime(&jwriter);
-  jwriter << "job" << job_id
-          << "event"
<< "table_file_deletion" + jwriter << "job" << job_id << "event" + << "table_file_deletion" << "file_number" << file_number; if (!status.ok()) { jwriter << "status" << status.ToString(); @@ -149,7 +164,32 @@ void EventHelpers::LogAndNotifyTableFileDeletion( for (auto& listener : listeners) { listener->OnTableFileDeleted(info); } +#else + (void)file_path; + (void)dbname; + (void)listeners; #endif // !ROCKSDB_LITE } +void EventHelpers::NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status old_bg_error, InstrumentedMutex* db_mutex) { +#ifndef ROCKSDB_LITE + if (listeners.size() == 0U) { + return; + } + db_mutex->AssertHeld(); + // release lock while notifying events + db_mutex->Unlock(); + for (auto& listener : listeners) { + listener->OnErrorRecoveryCompleted(old_bg_error); + } + db_mutex->Lock(); +#else + (void)listeners; + (void)old_bg_error; + (void)db_mutex; +#endif // ROCKSDB_LITE +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/event_helpers.h b/thirdparty/rocksdb/db/event_helpers.h index 674e6c5f6f..ea35b4b5b1 100644 --- a/thirdparty/rocksdb/db/event_helpers.h +++ b/thirdparty/rocksdb/db/event_helpers.h @@ -28,7 +28,7 @@ class EventHelpers { static void NotifyOnBackgroundError( const std::vector>& listeners, BackgroundErrorReason reason, Status* bg_error, - InstrumentedMutex* db_mutex); + InstrumentedMutex* db_mutex, bool* auto_recovery); static void LogAndNotifyTableFileCreationFinished( EventLogger* event_logger, const std::vector>& listeners, @@ -41,6 +41,9 @@ class EventHelpers { uint64_t file_number, const std::string& file_path, const Status& status, const std::string& db_name, const std::vector>& listeners); + static void NotifyOnErrorRecoveryCompleted( + const std::vector>& listeners, + Status bg_error, InstrumentedMutex* db_mutex); private: static void LogAndNotifyTableFileCreation( diff --git a/thirdparty/rocksdb/db/experimental.cc b/thirdparty/rocksdb/db/experimental.cc index effe9d7c35..d509a37bf2 100644 --- a/thirdparty/rocksdb/db/experimental.cc +++ b/thirdparty/rocksdb/db/experimental.cc @@ -30,12 +30,13 @@ Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { #else // ROCKSDB_LITE -Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { +Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, const Slice* /*end*/) { return Status::NotSupported("Not supported in RocksDB LITE"); } -Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { +Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { return Status::NotSupported("Not supported in RocksDB LITE"); } diff --git a/thirdparty/rocksdb/db/external_sst_file_basic_test.cc b/thirdparty/rocksdb/db/external_sst_file_basic_test.cc index 534e8a0bf7..256db0728b 100644 --- a/thirdparty/rocksdb/db/external_sst_file_basic_test.cc +++ b/thirdparty/rocksdb/db/external_sst_file_basic_test.cc @@ -14,9 +14,11 @@ namespace rocksdb { #ifndef ROCKSDB_LITE -class ExternalSSTFileBasicTest : public DBTestBase { +class ExternalSSTFileBasicTest + : public DBTestBase, + public ::testing::WithParamInterface> { public: - ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_test") { + ExternalSSTFileBasicTest() : DBTestBase("/external_sst_file_basic_test") { sst_files_dir_ = dbname_ + "/sst_files/"; DestroyAndRecreateExternalSSTFilesDir(); } @@ -39,7 +41,9 @@ class ExternalSSTFileBasicTest : public DBTestBase { Status 
   Status GenerateAndAddExternalFile(
       const Options options, std::vector<int> keys,
-      const std::vector<ValueType>& value_types, int file_id,
+      const std::vector<ValueType>& value_types,
+      std::vector<std::pair<int, int>> range_deletions, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
       std::map<std::string, std::string>* true_data) {
     assert(value_types.size() == 1 || keys.size() == value_types.size());
     std::string file_path = sst_files_dir_ + ToString(file_id);
@@ -49,6 +53,29 @@ class ExternalSSTFileBasicTest : public DBTestBase {
     if (!s.ok()) {
       return s;
     }
+    for (size_t i = 0; i < range_deletions.size(); i++) {
+      // Account for the effect of range deletions on true_data before
+      // all point operators, even though sst_file_writer.DeleteRange
+      // must be called before other sst_file_writer methods. This is
+      // because point writes take precedence over range deletions
+      // in the same ingested sst.
+      std::string start_key = Key(range_deletions[i].first);
+      std::string end_key = Key(range_deletions[i].second);
+      s = sst_file_writer.DeleteRange(start_key, end_key);
+      if (!s.ok()) {
+        sst_file_writer.Finish();
+        return s;
+      }
+      auto start_key_it = true_data->find(start_key);
+      if (start_key_it == true_data->end()) {
+        start_key_it = true_data->upper_bound(start_key);
+      }
+      auto end_key_it = true_data->find(end_key);
+      if (end_key_it == true_data->end()) {
+        end_key_it = true_data->upper_bound(end_key);
+      }
+      true_data->erase(start_key_it, end_key_it);
+    }
     for (size_t i = 0; i < keys.size(); i++) {
       std::string key = Key(keys[i]);
       std::string value = Key(keys[i]) + ToString(file_id);
@@ -81,20 +108,35 @@ class ExternalSSTFileBasicTest : public DBTestBase {
     if (s.ok()) {
       IngestExternalFileOptions ifo;
       ifo.allow_global_seqno = true;
+      ifo.write_global_seqno = write_global_seqno;
+      ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
       s = db_->IngestExternalFile({file_path}, ifo);
     }
     return s;
   }
 
+  Status GenerateAndAddExternalFile(
+      const Options options, std::vector<int> keys,
+      const std::vector<ValueType>& value_types, int file_id,
+      bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, value_types, {}, file_id, write_global_seqno,
+        verify_checksums_before_ingest, true_data);
+  }
+
   Status GenerateAndAddExternalFile(
       const Options options, std::vector<int> keys, const ValueType value_type,
-      int file_id, std::map<std::string, std::string>* true_data) {
-    return GenerateAndAddExternalFile(options, keys,
-                                      std::vector<ValueType>(1, value_type),
-                                      file_id, true_data);
+      int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+      std::map<std::string, std::string>* true_data) {
+    return GenerateAndAddExternalFile(
+        options, keys, std::vector<ValueType>(1, value_type), file_id,
+        write_global_seqno, verify_checksums_before_ingest, true_data);
   }
 
-  ~ExternalSSTFileBasicTest() { test::DestroyDir(env_, sst_files_dir_); }
+  ~ExternalSSTFileBasicTest() override {
+    test::DestroyDir(env_, sst_files_dir_);
+  }
 
  protected:
   std::string sst_files_dir_;
@@ -126,9 +168,14 @@ TEST_F(ExternalSSTFileBasicTest, Basic) {
   ASSERT_EQ(file1_info.num_entries, 100);
   ASSERT_EQ(file1_info.smallest_key, Key(0));
   ASSERT_EQ(file1_info.largest_key, Key(99));
+  ASSERT_EQ(file1_info.num_range_del_entries, 0);
+  ASSERT_EQ(file1_info.smallest_range_del_key, "");
+  ASSERT_EQ(file1_info.largest_range_del_key, "");
   // sst_file_writer already finished, cannot add this value
   s = sst_file_writer.Put(Key(100), "bad_val");
   ASSERT_FALSE(s.ok()) << s.ToString();
+  s = sst_file_writer.DeleteRange(Key(100), Key(200));
+  ASSERT_FALSE(s.ok()) << s.ToString();
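+  // The new range-deletion fields of ExternalSstFileInfo stay empty for a
+  // file of point writes only, and DeleteRange after Finish() is rejected
+  // just like the Put above.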
 
   DestroyAndReopen(options);
   // Add file using file path
@@ -189,6 +236,7 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) {
   ASSERT_EQ(file3_info.num_entries, 15);
   ASSERT_EQ(file3_info.smallest_key, Key(110));
   ASSERT_EQ(file3_info.largest_key, Key(124));
+
   s = DeprecatedAddFile({file1}, true /* move file */);
   ASSERT_TRUE(s.ok()) << s.ToString();
   ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
@@ -197,8 +245,8 @@ TEST_F(ExternalSSTFileBasicTest, NoCopy) {
   ASSERT_TRUE(s.ok()) << s.ToString();
   ASSERT_OK(env_->FileExists(file2));
 
-  // This file have overlapping values with the existing data
-  s = DeprecatedAddFile({file2}, true /* move file */);
+  // This file has overlapping values with the existing data
+  s = DeprecatedAddFile({file3}, true /* move file */);
   ASSERT_FALSE(s.ok()) << s.ToString();
   ASSERT_OK(env_->FileExists(file3));
 
@@ -207,7 +255,9 @@
   }
 }
 
-TEST_F(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
   do {
     Options options = CurrentOptions();
    DestroyAndReopen(options);
@@ -215,37 +265,40 @@
 
     int file_id = 1;
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2, 3, 4, 5, 6},
-                                         ValueType::kTypeValue, file_id++,
-                                         &true_data));
-    // File dont overwrite any keys, No seqno needed
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {10, 11, 12, 13},
-                                         ValueType::kTypeValue, file_id++,
-
-                                         &true_data));
-    // File dont overwrite any keys, No seqno needed
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1, 4, 6}, ValueType::kTypeValue, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {11, 15, 19}, ValueType::kTypeValue, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {120, 130}, ValueType::kTypeValue, file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        options, {120, 130}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1, 130}, ValueType::kTypeValue, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
 
     // Write some keys through normal write path
@@ -256,18 +309,21 @@
     SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {60, 61, 62}, ValueType::kTypeValue, file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {40, 41, 42}, ValueType::kTypeValue, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {20, 30, 40}, ValueType::kTypeValue, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
 
     const Snapshot* snapshot = db_->GetSnapshot();
@@ -275,34 +331,39 @@
     // We will need a seqno for the file regardless if the file overwrite
     // keys in the DB or not because we have a snapshot
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1000, 1002}, ValueType::kTypeValue, file_id++, &true_data));
+        options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {2000, 3002}, ValueType::kTypeValue, file_id++, &true_data));
+        options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 20, 40, 100, 150},
-                                         ValueType::kTypeValue, file_id++,
-                                         &true_data));
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
     db_->ReleaseSnapshot(snapshot);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {5000, 5001}, ValueType::kTypeValue, file_id++, &true_data));
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // No snapshot anymore, no need to assign a seqno
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
     size_t kcnt = 0;
     VerifyDBFromMap(true_data, &kcnt, false);
-  } while (ChangeCompactOptions());
+  } while (ChangeOptionsForFileIngestionTest());
 }
 
-TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
   do {
     Options options = CurrentOptions();
     options.merge_operator.reset(new TestPutOperator());
@@ -311,40 +372,62 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
 
     int file_id = 1;
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2, 3, 4, 5, 6},
-                                         ValueType::kTypeValue, file_id++,
-                                         &true_data));
-    // File dont overwrite any keys, No seqno needed
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {10, 11, 12, 13},
-                                         ValueType::kTypeValue, file_id++,
-
-                                         &true_data));
-    // File dont overwrite any keys, No seqno needed
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1, 4, 6}, ValueType::kTypeMerge, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {11, 15, 19},
-                                         ValueType::kTypeDeletion, file_id++,
-                                         &true_data));
-    // File overwrite some keys, a seqno will be assigned
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {120, 130}, ValueType::kTypeMerge, file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        options, {120, 130}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1, 130}, ValueType::kTypeDeletion, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
 
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // The range deletion ends on a key, but it doesn't actually delete
+    // this key because the largest key in the range is exclusive. Still,
+    // it counts as an overlap so a new seqno will be assigned.
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
     // Write some keys through normal write path
     for (int i = 0; i < 50; i++) {
       ASSERT_OK(Put(Key(i), "memtable"));
@@ -353,19 +436,21 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
     SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {60, 61, 62}, ValueType::kTypeValue, file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {40, 41, 42}, ValueType::kTypeMerge, file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {20, 30, 40},
-                                         ValueType::kTypeDeletion, file_id++,
-                                         &true_data));
-    // File overwrite some keys, a seqno will be assigned
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
 
     const Snapshot* snapshot = db_->GetSnapshot();
@@ -373,34 +458,39 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
     // We will need a seqno for the file regardless if the file overwrite
     // keys in the DB or not because we have a snapshot
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1000, 1002}, ValueType::kTypeMerge, file_id++, &true_data));
+        options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {2000, 3002}, ValueType::kTypeMerge, file_id++, &true_data));
+        options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
 
-    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 20, 40, 100, 150},
-                                         ValueType::kTypeMerge, file_id++,
-                                         &true_data));
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
     db_->ReleaseSnapshot(snapshot);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {5000, 5001}, ValueType::kTypeValue, file_id++, &true_data));
+        options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data));
     // No snapshot anymore, no need to assign a seqno
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
     size_t kcnt = 0;
     VerifyDBFromMap(true_data, &kcnt, false);
-  } while (ChangeCompactOptions());
+  } while (ChangeOptionsForFileIngestionTest());
 }
 
-TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
   do {
     Options options = CurrentOptions();
     options.merge_operator.reset(new TestPutOperator());
@@ -413,44 +503,78 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
         options, {1, 2, 3, 4, 5, 6},
         {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
          ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
-        file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {10, 11, 12, 13},
         {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
          ValueType::kTypeMerge},
-        file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {1, 4, 6}, {ValueType::kTypeDeletion, ValueType::kTypeValue,
-                             ValueType::kTypeMerge},
-        file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {1, 4, 6},
+        {ValueType::kTypeDeletion, ValueType::kTypeValue,
+         ValueType::kTypeMerge},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {11, 15, 19}, {ValueType::kTypeDeletion, ValueType::kTypeMerge,
-                                ValueType::kTypeValue},
-        file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {11, 15, 19},
+        {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+         ValueType::kTypeValue},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
-        file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
-        file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
 
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File doesn't overwrite any keys, no seqno needed
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {150, 151, 152},
+        {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+        {{200, 250}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+    ASSERT_OK(GenerateAndAddExternalFile(
+        options, {300, 301, 302},
+        {ValueType::kTypeValue, ValueType::kTypeMerge,
+         ValueType::kTypeDeletion},
+        {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+        verify_checksums_before_ingest, &true_data));
+    // File overwrites some keys, a seqno will be assigned
+    ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
     // Write some keys through normal write path
     for (int i = 0; i < 50; i++) {
       ASSERT_OK(Put(Key(i), "memtable"));
@@ -461,23 +585,27 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {60, 61, 62},
         {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
-        file_id++, &true_data));
-    // File dont overwrite any keys, No seqno needed
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File doesn't overwrite any keys, no seqno needed
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
 
     ASSERT_OK(GenerateAndAddExternalFile(
-        options, {40, 41, 42}, {ValueType::kTypeValue, ValueType::kTypeDeletion,
-                                ValueType::kTypeDeletion},
-        file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        options, {40, 41, 42},
+        {ValueType::kTypeValue, ValueType::kTypeDeletion,
+         ValueType::kTypeDeletion},
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {20, 30, 40},
         {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
          ValueType::kTypeDeletion},
-        file_id++, &true_data));
-    // File overwrite some keys, a seqno will be assigned
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
+    // File overwrites some keys, a seqno will be assigned
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
 
     const Snapshot* snapshot = db_->GetSnapshot();
@@ -486,13 +614,15 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
     // keys in the DB or not because we have a snapshot
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
-        file_id++, &true_data));
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
-        file_id++, &true_data));
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
 
@@ -500,7 +630,8 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
         options, {1, 20, 40, 100, 150},
         {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
          ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
-        file_id++, &true_data));
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
     // A global seqno will be assigned anyway because of the snapshot
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
@@ -508,13 +639,14 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
 
     ASSERT_OK(GenerateAndAddExternalFile(
         options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
-        file_id++, &true_data));
+        file_id++, write_global_seqno, verify_checksums_before_ingest,
+        &true_data));
     // No snapshot anymore, no need to assign a seqno
     ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
 
     size_t kcnt = 0;
     VerifyDBFromMap(true_data, &kcnt, false);
-  } while (ChangeCompactOptions());
+  } while (ChangeOptionsForFileIngestionTest());
 }
 
 TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
@@ -557,9 +689,11 @@ TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
   rocksdb::SyncPoint::GetInstance()->DisableProcessing();
 }
 
-TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+  int kNumLevels = 7;
   Options options = CurrentOptions();
   options.disable_auto_compactions = true;
+  options.num_levels = kNumLevels;
   Reopen(options);
 
   std::map<std::string, std::string> true_data;
@@ -567,45 +701,232 @@ TEST_F(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
   // prevent range deletions from being dropped due to becoming obsolete.
   const Snapshot* snapshot = db_->GetSnapshot();
 
-  // range del [0, 50) in L0 file, [50, 100) in memtable
-  for (int i = 0; i < 2; i++) {
-    if (i == 1) {
+  // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+  for (int i = 0; i < 3; i++) {
+    if (i != 0) {
       db_->Flush(FlushOptions());
+      if (i == 1) {
+        MoveFilesToLevel(kNumLevels - 1);
+      }
     }
     ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
                                Key(50 * i), Key(50 * (i + 1))));
   }
   ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
 
-  // overlaps with L0 file but not memtable, so flush is skipped
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // overlaps with L0 file but not memtable, so flush is skipped and file is
+  // ingested into L0
   SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
+  ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+  // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+  // file is ingested into L5
   ASSERT_OK(GenerateAndAddExternalFile(
       options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
-      file_id++, &true_data));
+      file_id++, write_global_seqno, verify_checksums_before_ingest,
+      &true_data));
   ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
   ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
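+
+  // So far each file landed one level above its first overlap: the file
+  // overlapping the L0 file stayed in L0, while the one overlapping only
+  // the L6 file was placed in L5.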
+
+  // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+  // file is ingested into L4
+  ASSERT_OK(GenerateAndAddExternalFile(
+      options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
   ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
   ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
 
-  // overlaps with memtable, so flush is triggered (thus file count increases by
-  // two at this step).
+  // ingested file overlaps with memtable, so flush is triggered before the file
+  // is ingested such that the ingested data is considered newest. So L0 file
+  // count increases by two.
   ASSERT_OK(GenerateAndAddExternalFile(
-      options, {50, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
-      file_id++, &true_data));
+      options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      file_id++, write_global_seqno, verify_checksums_before_ingest,
+      &true_data));
   ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
   ASSERT_EQ(4, NumTableFilesAtLevel(0));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
 
-  // snapshot unneeded now that both range deletions are persisted
+  // snapshot unneeded now that all range deletions are persisted
   db_->ReleaseSnapshot(snapshot);
 
   // overlaps with nothing, so places at bottom level and skips incrementing
   // seqnum.
   ASSERT_OK(GenerateAndAddExternalFile(
-      options, {101, 125}, {ValueType::kTypeValue, ValueType::kTypeValue},
-      file_id++, &true_data));
+      options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+      {{160, 200}}, file_id++, write_global_seqno,
+      verify_checksums_before_ingest, &true_data));
   ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
   ASSERT_EQ(4, NumTableFilesAtLevel(0));
-  ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+  ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+  ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+  bool change_checksum_called = false;
+  const auto& change_checksum = [&](void* arg) {
+    if (!change_checksum_called) {
+      char* buf = reinterpret_cast<char*>(arg);
+      assert(nullptr != buf);
+      buf[0] ^= 0x1;
+      change_checksum_called = true;
+    }
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum",
+      change_checksum);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  bool write_global_seqno = std::get<0>(GetParam());
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  do {
+    Options options = CurrentOptions();
+    DestroyAndReopen(options);
+    std::map<std::string, std::string> true_data;
+    Status s = GenerateAndAddExternalFile(
+        options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+        write_global_seqno, verify_checksums_before_ingest, &true_data);
+    if (verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+    change_checksum_called = false;
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+  SyncPoint::GetInstance()->DisableProcessing();
+  int file_id = 0;
+  EnvOptions env_options;
+  do {
+    Options options = CurrentOptions();
+    std::string file_path = sst_files_dir_ + ToString(file_id++);
+    SstFileWriter sst_file_writer(env_options, options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + ToString(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+    {
+      // Get file size
+      uint64_t file_size = 0;
+      ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+      ASSERT_GT(file_size, 8);
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      // We deterministically corrupt the first byte because we currently
+      // cannot choose a random offset. The reason for this limitation is that
+      // we do not checksum property block at present.
+      const uint64_t offset = 0;
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+    s = db_->IngestExternalFile({file_path}, ifo);
+    if (ifo.verify_checksums_before_ingest) {
+      ASSERT_NOK(s);
+    } else {
+      ASSERT_OK(s);
+    }
+  } while (ChangeOptionsForFileIngestionTest());
 }
 
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+  bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  if (!verify_checksums_before_ingest) {
+    return;
+  }
+  uint64_t props_block_offset = 0;
+  size_t props_block_size = 0;
+  const auto& get_props_block_offset = [&](void* arg) {
+    props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+  };
+  const auto& get_props_block_size = [&](void* arg) {
+    props_block_size = *reinterpret_cast<size_t*>(arg);
+  };
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+      get_props_block_offset);
+  SyncPoint::GetInstance()->SetCallBack(
+      "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+      get_props_block_size);
+  SyncPoint::GetInstance()->EnableProcessing();
+  int file_id = 0;
+  Random64 rand(time(nullptr));
+  do {
+    std::string file_path = sst_files_dir_ + ToString(file_id++);
+    Options options = CurrentOptions();
+    SstFileWriter sst_file_writer(EnvOptions(), options);
+    Status s = sst_file_writer.Open(file_path);
+    ASSERT_OK(s);
+    for (int i = 0; i != 100; ++i) {
+      std::string key = Key(i);
+      std::string value = Key(i) + ToString(0);
+      ASSERT_OK(sst_file_writer.Put(key, value));
+    }
+    ASSERT_OK(sst_file_writer.Finish());
+
+    {
+      std::unique_ptr<RandomRWFile> rwfile;
+      ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+      // Manually corrupt the file
+      ASSERT_GT(props_block_size, 8);
+      uint64_t offset =
+          props_block_offset + rand.Next() % (props_block_size - 8);
+      char scratch[8] = {0};
+      Slice buf;
+      ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+      scratch[0] ^= 0xff;  // flip one bit
+      ASSERT_OK(rwfile->Write(offset, buf));
+    }
+
+    // Ingest file.
+    IngestExternalFileOptions ifo;
+    ifo.write_global_seqno = std::get<0>(GetParam());
+    ifo.verify_checksums_before_ingest = true;
+    s = db_->IngestExternalFile({file_path}, ifo);
+    ASSERT_NOK(s);
+  } while (ChangeOptionsForFileIngestionTest());
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+                        testing::Values(std::make_tuple(true, true),
+                                        std::make_tuple(true, false),
+                                        std::make_tuple(false, true),
+                                        std::make_tuple(false, false)));
+
 #endif  // ROCKSDB_LITE
 
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/external_sst_file_ingestion_job.cc b/thirdparty/rocksdb/db/external_sst_file_ingestion_job.cc
index 58fa354463..28b481678a 100644
--- a/thirdparty/rocksdb/db/external_sst_file_ingestion_job.cc
+++ b/thirdparty/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -29,13 +29,14 @@
 namespace rocksdb {
 
 Status ExternalSstFileIngestionJob::Prepare(
-    const std::vector<std::string>& external_files_paths) {
+    const std::vector<std::string>& external_files_paths,
+    uint64_t next_file_number, SuperVersion* sv) {
   Status status;
 
   // Read the information of files we are ingesting
   for (const std::string& file_path : external_files_paths) {
     IngestedFileInfo file_to_ingest;
-    status = GetIngestedFileInfo(file_path, &file_to_ingest);
+    status = GetIngestedFileInfo(file_path, &file_to_ingest, sv);
     if (!status.ok()) {
       return status;
     }
@@ -78,7 +79,7 @@ Status ExternalSstFileIngestionJob::Prepare(
   }
 
   for (IngestedFileInfo& f : files_to_ingest_) {
-    if (f.num_entries == 0) {
+    if (f.num_entries == 0 && f.num_range_deletions == 0) {
       return Status::InvalidArgument("File contain no entries");
     }
 
@@ -90,11 +91,12 @@ Status ExternalSstFileIngestionJob::Prepare(
 
   // Copy/Move external files into DB
   for (IngestedFileInfo& f : files_to_ingest_) {
-    f.fd = FileDescriptor(versions_->NewFileNumber(), 0, f.file_size);
+    f.fd = FileDescriptor(next_file_number++, 0, f.file_size);
 
     const std::string path_outside_db = f.external_file_path;
     const std::string path_inside_db =
-        TableFileName(db_options_.db_paths, f.fd.GetNumber(), f.fd.GetPathId());
+        TableFileName(cfd_->ioptions()->cf_paths, f.fd.GetNumber(),
+                      f.fd.GetPathId());
 
     if (ingestion_options_.move_files) {
      status = env_->LinkFile(path_outside_db, path_inside_db);
@@ -102,12 +104,16 @@ Status ExternalSstFileIngestionJob::Prepare(
         // Original file is on a different FS, use copy instead of hard linking
         status = CopyFile(env_, path_outside_db, path_inside_db, 0,
                           db_options_.use_fsync);
+        f.copy_file = true;
+      } else {
+        f.copy_file = false;
       }
     } else {
       status = CopyFile(env_, path_outside_db, path_inside_db, 0,
                        db_options_.use_fsync);
+      f.copy_file = true;
    }
-    TEST_SYNC_POINT("DBImpl::AddFile:FileCopied");
+    TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
     if (!status.ok()) {
       break;
     }
@@ -117,7 +123,7 @@ Status ExternalSstFileIngestionJob::Prepare(
   if (!status.ok()) {
     // We failed, remove all files that we copied into the db
     for (IngestedFileInfo& f : files_to_ingest_) {
-      if (f.internal_file_path == "") {
+      if (f.internal_file_path.empty()) {
         break;
       }
       Status s = env_->DeleteFile(f.internal_file_path);
@@ -132,11 +138,15 @@ Status ExternalSstFileIngestionJob::Prepare(
   return status;
 }
 
-Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed) {
-  SuperVersion* super_version = cfd_->GetSuperVersion();
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+                                               SuperVersion* super_version) {
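+  // Gather the user-key range of every file in this job; the column family
+  // checks these ranges against both the active and immutable memtables,
+  // replacing the removed IngestedFilesOverlapWithMemtables helper below.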
+  autovector<Range> ranges;
+  for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+    ranges.emplace_back(file_to_ingest.smallest_user_key,
+                        file_to_ingest.largest_user_key);
+  }
   Status status =
-      IngestedFilesOverlapWithMemtables(super_version, flush_needed);
-
+      cfd_->RangesOverlapWithMemtables(ranges, super_version, flush_needed);
   if (status.ok() && *flush_needed &&
       !ingestion_options_.allow_blocking_flush) {
     status = Status::InvalidArgument("External file requires flush");
@@ -148,15 +158,15 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed) {
 // nonmem_write_thread_
 Status ExternalSstFileIngestionJob::Run() {
   Status status;
+  SuperVersion* super_version = cfd_->GetSuperVersion();
 #ifndef NDEBUG
   // We should never run the job with a memtable that is overlapping
   // with the files we are ingesting
   bool need_flush = false;
-  status = NeedsFlush(&need_flush);
+  status = NeedsFlush(&need_flush, super_version);
   assert(status.ok() && need_flush == false);
 #endif
 
-  bool consumed_seqno = false;
   bool force_global_seqno = false;
 
   if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
@@ -164,10 +174,9 @@ Status ExternalSstFileIngestionJob::Run() {
     // if the dont overlap with any ranges since we have snapshots
     force_global_seqno = true;
   }
-  // It is safe to use this instead of LastToBeWrittenSequence since we are
+  // It is safe to use this instead of LastAllocatedSequence since we are
   // the only active writer, and hence they are equal
   const SequenceNumber last_seqno = versions_->LastSequence();
-  SuperVersion* super_version = cfd_->GetSuperVersion();
   edit_.SetColumnFamily(cfd_->GetID());
   // The levels that the files will be ingested into
 
@@ -187,7 +196,7 @@ Status ExternalSstFileIngestionJob::Run() {
     TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
                              &assigned_seqno);
     if (assigned_seqno == last_seqno + 1) {
-      consumed_seqno = true;
+      consumed_seqno_ = true;
     }
     if (!status.ok()) {
       return status;
@@ -197,12 +206,6 @@ Status ExternalSstFileIngestionJob::Run() {
                   f.largest_internal_key(), f.assigned_seqno, f.assigned_seqno,
                   false);
   }
-
-  if (consumed_seqno) {
-    versions_->SetLastToBeWrittenSequence(last_seqno + 1);
-    versions_->SetLastSequence(last_seqno + 1);
-  }
-
   return status;
 }
 
@@ -212,11 +215,20 @@ void ExternalSstFileIngestionJob::UpdateStats() {
   uint64_t total_l0_files = 0;
   uint64_t total_time = env_->NowMicros() - job_start_time_;
   for (IngestedFileInfo& f : files_to_ingest_) {
-    InternalStats::CompactionStats stats(1);
+    InternalStats::CompactionStats stats(
+        CompactionReason::kExternalSstIngestion, 1);
     stats.micros = total_time;
-    stats.bytes_written = f.fd.GetFileSize();
+    // If actual copy occurred for this file, then we need to count the file
+    // size as the actual bytes written. If the file was linked, then we ignore
+    // the bytes written for file metadata.
+    // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
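+    // e.g. a hard-linked 64 MB file is reported as bytes_moved = 64 MB with
+    // bytes_written = 0, while a copied file reports the opposite.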
+    if (f.copy_file) {
+      stats.bytes_written = f.fd.GetFileSize();
+    } else {
+      stats.bytes_moved = f.fd.GetFileSize();
+    }
     stats.num_output_files = 1;
-    cfd_->internal_stats()->AddCompactionStats(f.picked_level, stats);
+    cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+                                               Env::Priority::USER, stats);
     cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
                                        f.fd.GetFileSize());
     total_keys += f.num_entries;
@@ -250,6 +262,7 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
                        f.internal_file_path.c_str(), s.ToString().c_str());
       }
     }
+    consumed_seqno_ = false;
   } else if (status.ok() && ingestion_options_.move_files) {
     // The files were moved and added successfully, remove original file links
     for (IngestedFileInfo& f : files_to_ingest_) {
@@ -266,7 +279,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
 }
 
 Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
-    const std::string& external_file, IngestedFileInfo* file_to_ingest) {
+    const std::string& external_file, IngestedFileInfo* file_to_ingest,
+    SuperVersion* sv) {
   file_to_ingest->external_file_path = external_file;
 
   // Get external file size
@@ -288,13 +302,21 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
                                   external_file));
 
   status = cfd_->ioptions()->table_factory->NewTableReader(
-      TableReaderOptions(*cfd_->ioptions(), env_options_,
-                         cfd_->internal_comparator()),
+      TableReaderOptions(*cfd_->ioptions(),
+                         sv->mutable_cf_options.prefix_extractor.get(),
+                         env_options_, cfd_->internal_comparator()),
      std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
   if (!status.ok()) {
     return status;
   }
 
+  if (ingestion_options_.verify_checksums_before_ingest) {
+    status = table_reader->VerifyChecksum();
+  }
+  if (!status.ok()) {
+    return status;
+  }
+
   // Get the external file properties
   auto props = table_reader->GetTableProperties();
   const auto& uprops = props->user_collected_properties;
@@ -316,12 +338,14 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
 
     // Set the global sequence number
     file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
-    file_to_ingest->global_seqno_offset = props->properties_offsets.at(
+    auto offsets_iter = props->properties_offsets.find(
         ExternalSstFilePropertyNames::kGlobalSeqno);
-
-    if (file_to_ingest->global_seqno_offset == 0) {
+    if (offsets_iter == props->properties_offsets.end() ||
+        offsets_iter->second == 0) {
+      file_to_ingest->global_seqno_offset = 0;
       return Status::Corruption("Was not able to find file global seqno field");
     }
+    file_to_ingest->global_seqno_offset = static_cast<size_t>(offsets_iter->second);
   } else if (file_to_ingest->version == 1) {
     // SST file V1 should not have global seqno field
     assert(seqno_iter == uprops.end());
@@ -336,6 +360,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   }
   // Get number of entries in table
   file_to_ingest->num_entries = props->num_entries;
+  file_to_ingest->num_range_deletions = props->num_range_deletions;
 
   ParsedInternalKey key;
   ReadOptions ro;
@@ -345,27 +370,57 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   // We need to disable fill_cache so that we read from the file without
   // updating the block cache.
   ro.fill_cache = false;
-  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(ro));
+  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+      ro, sv->mutable_cf_options.prefix_extractor.get()));
+  std::unique_ptr<InternalIterator> range_del_iter(
+      table_reader->NewRangeTombstoneIterator(ro));
 
-  // Get first (smallest) key from file
+  // Get first (smallest) and last (largest) key from file.
+  bool bounds_set = false;
   iter->SeekToFirst();
-  if (!ParseInternalKey(iter->key(), &key)) {
-    return Status::Corruption("external file have corrupted keys");
-  }
-  if (key.sequence != 0) {
-    return Status::Corruption("external file have non zero sequence number");
-  }
-  file_to_ingest->smallest_user_key = key.user_key.ToString();
+  if (iter->Valid()) {
+    if (!ParseInternalKey(iter->key(), &key)) {
+      return Status::Corruption("external file have corrupted keys");
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption("external file have non zero sequence number");
+    }
+    file_to_ingest->smallest_user_key = key.user_key.ToString();
+
+    iter->SeekToLast();
+    if (!ParseInternalKey(iter->key(), &key)) {
+      return Status::Corruption("external file have corrupted keys");
+    }
+    if (key.sequence != 0) {
+      return Status::Corruption("external file have non zero sequence number");
+    }
+    file_to_ingest->largest_user_key = key.user_key.ToString();
 
-  // Get last (largest) key from file
-  iter->SeekToLast();
-  if (!ParseInternalKey(iter->key(), &key)) {
-    return Status::Corruption("external file have corrupted keys");
+    bounds_set = true;
   }
-  if (key.sequence != 0) {
-    return Status::Corruption("external file have non zero sequence number");
+
+  // We may need to adjust these key bounds, depending on whether any range
+  // deletion tombstones extend past them.
+  const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+  if (range_del_iter != nullptr) {
+    for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+         range_del_iter->Next()) {
+      if (!ParseInternalKey(range_del_iter->key(), &key)) {
+        return Status::Corruption("external file have corrupted keys");
+      }
+      RangeTombstone tombstone(key, range_del_iter->value());
+
+      if (!bounds_set || ucmp->Compare(tombstone.start_key_,
+                                       file_to_ingest->smallest_user_key) < 0) {
+        file_to_ingest->smallest_user_key = tombstone.start_key_.ToString();
+      }
+      if (!bounds_set || ucmp->Compare(tombstone.end_key_,
+                                       file_to_ingest->largest_user_key) > 0) {
+        file_to_ingest->largest_user_key = tombstone.end_key_.ToString();
+      }
+      bounds_set = true;
+    }
  }
-  file_to_ingest->largest_user_key = key.user_key.ToString();
 
   file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
 
@@ -374,46 +429,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
   return status;
 }
 
-Status ExternalSstFileIngestionJob::IngestedFilesOverlapWithMemtables(
-    SuperVersion* sv, bool* overlap) {
-  // Create an InternalIterator over all memtables
-  Arena arena;
-  ReadOptions ro;
-  ro.total_order_seek = true;
-  MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), &arena);
-  merge_iter_builder.AddIterator(sv->mem->NewIterator(ro, &arena));
-  sv->imm->AddIterators(ro, &merge_iter_builder);
-  ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
-
-  std::vector<InternalIterator*> memtable_range_del_iters;
-  auto* active_range_del_iter = sv->mem->NewRangeTombstoneIterator(ro);
-  if (active_range_del_iter != nullptr) {
-    memtable_range_del_iters.push_back(active_range_del_iter);
-  }
-  sv->imm->AddRangeTombstoneIterators(ro, &memtable_range_del_iters);
-  std::unique_ptr<InternalIterator> memtable_range_del_iter(NewMergingIterator(
-      &cfd_->internal_comparator(),
-      memtable_range_del_iters.empty() ? nullptr : &memtable_range_del_iters[0],
-      static_cast<int>(memtable_range_del_iters.size())));
-
-  Status status;
-  *overlap = false;
-  for (IngestedFileInfo& f : files_to_ingest_) {
-    status =
-        IngestedFileOverlapWithIteratorRange(&f, memtable_iter.get(), overlap);
-    if (!status.ok() || *overlap == true) {
-      break;
-    }
-    status = IngestedFileOverlapWithRangeDeletions(
-        &f, memtable_range_del_iter.get(), overlap);
-    if (!status.ok() || *overlap == true) {
-      break;
-    }
-  }
-
-  return status;
-}
-
 Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
     SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
     IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno) {
@@ -442,8 +457,9 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
 
     if (vstorage->NumLevelFiles(lvl) > 0) {
       bool overlap_with_level = false;
-      status = IngestedFileOverlapWithLevel(sv, file_to_ingest, lvl,
-                                            &overlap_with_level);
+      status = sv->current->OverlapWithLevelIterator(ro, env_options_,
+          file_to_ingest->smallest_user_key, file_to_ingest->largest_user_key,
+          lvl, &overlap_with_level);
       if (!status.ok()) {
         return status;
       }
@@ -461,10 +477,13 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
         const SequenceNumber level_largest_seqno =
             (*max_element(level_files.begin(), level_files.end(),
                           [](FileMetaData* f1, FileMetaData* f2) {
-                            return f1->largest_seqno < f2->largest_seqno;
+                            return f1->fd.largest_seqno < f2->fd.largest_seqno;
                           }))
-                ->largest_seqno;
-        if (level_largest_seqno != 0) {
+                ->fd.largest_seqno;
+        // should only assign seqno to current level's largest seqno when
+        // the file fits
+        if (level_largest_seqno != 0 &&
+            IngestedFileFitInLevel(file_to_ingest, lvl)) {
           *assigned_seqno = level_largest_seqno;
         } else {
           continue;
@@ -505,7 +524,7 @@ Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
   // at some upper level
   for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
     for (auto file : vstorage->LevelFiles(lvl)) {
-      if (file->smallest_seqno == 0) {
+      if (file->fd.smallest_seqno == 0) {
         return Status::InvalidArgument(
             "Can't ingest_behind file as despite allow_ingest_behind=true "
             "there are files with 0 seqno in database at upper levels!");
@@ -530,76 +549,26 @@ Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
         "field");
   }
 
-  std::unique_ptr<RandomRWFile> rwfile;
-  Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path,
-                                        &rwfile, env_options_);
-  if (!status.ok()) {
-    return status;
-  }
-
-  // Write the new seqno in the global sequence number field in the file
-  std::string seqno_val;
-  PutFixed64(&seqno_val, seqno);
-  status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val);
-  if (status.ok()) {
-    file_to_ingest->assigned_seqno = seqno;
-  }
-  return status;
-}
-
-Status ExternalSstFileIngestionJob::IngestedFileOverlapWithIteratorRange(
-    const IngestedFileInfo* file_to_ingest, InternalIterator* iter,
-    bool* overlap) {
-  auto* vstorage = cfd_->current()->storage_info();
-  auto* ucmp = vstorage->InternalComparator()->user_comparator();
-  InternalKey range_start(file_to_ingest->smallest_user_key, kMaxSequenceNumber,
-                          kValueTypeForSeek);
-  iter->Seek(range_start.Encode());
-  if (!iter->status().ok()) {
-    return iter->status();
-  }
-
-  *overlap = false;
-  if (iter->Valid()) {
-    ParsedInternalKey seek_result;
-    if (!ParseInternalKey(iter->key(), &seek_result)) {
-      return Status::Corruption("DB have corrupted keys");
corrupted keys"); - } - - if (ucmp->Compare(seek_result.user_key, file_to_ingest->largest_user_key) <= - 0) { - *overlap = true; - } - } - - return iter->status(); -} - -Status ExternalSstFileIngestionJob::IngestedFileOverlapWithRangeDeletions( - const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter, - bool* overlap) { - auto* vstorage = cfd_->current()->storage_info(); - auto* ucmp = vstorage->InternalComparator()->user_comparator(); - - *overlap = false; - if (range_del_iter != nullptr) { - for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); - range_del_iter->Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(range_del_iter->key(), &parsed_key)) { - return Status::Corruption("corrupted range deletion key: " + - range_del_iter->key().ToString()); - } - RangeTombstone range_del(parsed_key, range_del_iter->value()); - if (ucmp->Compare(range_del.start_key_, - file_to_ingest->largest_user_key) <= 0 && - ucmp->Compare(file_to_ingest->smallest_user_key, - range_del.end_key_) <= 0) { - *overlap = true; - break; + if (ingestion_options_.write_global_seqno) { + // Determine if we can write global_seqno to a given offset of file. + // If the file system does not support random write, then we should not. + // Otherwise we should. + std::unique_ptr rwfile; + Status status = env_->NewRandomRWFile(file_to_ingest->internal_file_path, + &rwfile, env_options_); + if (status.ok()) { + std::string seqno_val; + PutFixed64(&seqno_val, seqno); + status = rwfile->Write(file_to_ingest->global_seqno_offset, seqno_val); + if (!status.ok()) { + return status; } + } else if (!status.IsNotSupported()) { + return status; } } + + file_to_ingest->assigned_seqno = seqno; return Status::OK(); } @@ -631,35 +600,6 @@ bool ExternalSstFileIngestionJob::IngestedFileFitInLevel( return true; } -Status ExternalSstFileIngestionJob::IngestedFileOverlapWithLevel( - SuperVersion* sv, IngestedFileInfo* file_to_ingest, int lvl, - bool* overlap_with_level) { - Arena arena; - ReadOptions ro; - ro.total_order_seek = true; - MergeIteratorBuilder merge_iter_builder(&cfd_->internal_comparator(), - &arena); - sv->current->AddIteratorsForLevel(ro, env_options_, &merge_iter_builder, lvl, - nullptr /* range_del_agg */); - ScopedArenaIterator level_iter(merge_iter_builder.Finish()); - - std::vector level_range_del_iters; - sv->current->AddRangeDelIteratorsForLevel(ro, env_options_, lvl, - &level_range_del_iters); - std::unique_ptr level_range_del_iter(NewMergingIterator( - &cfd_->internal_comparator(), - level_range_del_iters.empty() ? 
nullptr : &level_range_del_iters[0], - static_cast<int>(level_range_del_iters.size()))); - - Status status = IngestedFileOverlapWithIteratorRange( - file_to_ingest, level_iter.get(), overlap_with_level); - if (status.ok() && *overlap_with_level == false) { - status = IngestedFileOverlapWithRangeDeletions( - file_to_ingest, level_range_del_iter.get(), overlap_with_level); - } - return status; -} - } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/external_sst_file_ingestion_job.h b/thirdparty/rocksdb/db/external_sst_file_ingestion_job.h index 2d0fadeed7..baa8e9f0f6 100644 --- a/thirdparty/rocksdb/db/external_sst_file_ingestion_job.h +++ b/thirdparty/rocksdb/db/external_sst_file_ingestion_job.h @@ -36,6 +36,8 @@ struct IngestedFileInfo { uint64_t file_size; // total number of keys in external file uint64_t num_entries; + // total number of range deletions in external file + uint64_t num_range_deletions; // Id of column family this file shoule be ingested into uint32_t cf_id; // TableProperties read from external file @@ -46,11 +48,18 @@ struct IngestedFileInfo { // FileDescriptor for the file inside the DB FileDescriptor fd; // file path that we picked for file inside the DB - std::string internal_file_path = ""; + std::string internal_file_path; // Global sequence number that we picked for the file inside the DB SequenceNumber assigned_seqno = 0; // Level inside the DB we picked for the external file. int picked_level = 0; + // Whether to copy or link the external sst file. copy_file will be set to + // false if ingestion_options.move_files is true and underlying FS + // supports link operation. Need to provide a default value to make the + // undefined-behavior sanity check of llvm happy. Since + // ingestion_options.move_files is false by default, thus copy_file is true + // by default. + bool copy_file = true; InternalKey smallest_internal_key() const { return InternalKey(smallest_user_key, assigned_seqno, @@ -76,16 +85,22 @@ class ExternalSstFileIngestionJob { env_options_(env_options), db_snapshots_(db_snapshots), ingestion_options_(ingestion_options), - job_start_time_(env_->NowMicros()) {} + job_start_time_(env_->NowMicros()), + consumed_seqno_(false) {} // Prepare the job by copying external files into the DB. - Status Prepare(const std::vector<std::string>& external_files_paths); + Status Prepare(const std::vector<std::string>& external_files_paths, + uint64_t next_file_number, SuperVersion* sv); // Check if we need to flush the memtable before running the ingestion job // This will be true if the files we are ingesting are overlapping with any // key range in the memtable. - // REQUIRES: Mutex held - Status NeedsFlush(bool* flush_needed); + // + // @param super_version A referenced SuperVersion that will be held for the + // duration of this function. + // + // Thread-safe + Status NeedsFlush(bool* flush_needed, SuperVersion* super_version); // Will execute the ingestion job and prepare edit() to be applied. // REQUIRES: Mutex held @@ -104,15 +119,15 @@ class ExternalSstFileIngestionJob { return files_to_ingest_; } + // Whether to increment VersionSet's seqno after this job runs + bool ShouldIncrementLastSequence() const { return consumed_seqno_; } + private: // Open the external file and populate `file_to_ingest` with all the // external information we need to ingest this file. Status GetIngestedFileInfo(const std::string& external_file, - IngestedFileInfo* file_to_ingest); - - // Check if the files we are ingesting overlap with any memtable.
- // REQUIRES: Mutex held - Status IngestedFilesOverlapWithMemtables(SuperVersion* sv, bool* overlap); + IngestedFileInfo* file_to_ingest, + SuperVersion* sv); // Assign `file_to_ingest` the appropriate sequence number and the lowest // possible level that it can be ingested to according to compaction_style. @@ -133,24 +148,6 @@ class ExternalSstFileIngestionJob { Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest, SequenceNumber seqno); - // Check if `file_to_ingest` key range overlap with the range `iter` represent - // REQUIRES: Mutex held - Status IngestedFileOverlapWithIteratorRange( - const IngestedFileInfo* file_to_ingest, InternalIterator* iter, - bool* overlap); - - // Check if `file_to_ingest` key range overlaps with any range deletions - // specified by `iter`. - // REQUIRES: Mutex held - Status IngestedFileOverlapWithRangeDeletions( - const IngestedFileInfo* file_to_ingest, InternalIterator* range_del_iter, - bool* overlap); - - // Check if `file_to_ingest` key range overlap with level - // REQUIRES: Mutex held - Status IngestedFileOverlapWithLevel(SuperVersion* sv, - IngestedFileInfo* file_to_ingest, int lvl, bool* overlap_with_level); - // Check if `file_to_ingest` can fit in level `level` // REQUIRES: Mutex held bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest, @@ -166,6 +163,7 @@ class ExternalSstFileIngestionJob { const IngestExternalFileOptions& ingestion_options_; VersionEdit edit_; uint64_t job_start_time_; + bool consumed_seqno_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/external_sst_file_test.cc b/thirdparty/rocksdb/db/external_sst_file_test.cc index 4a4e82e792..cbbb2fa262 100644 --- a/thirdparty/rocksdb/db/external_sst_file_test.cc +++ b/thirdparty/rocksdb/db/external_sst_file_test.cc @@ -10,11 +10,15 @@ #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/sst_file_writer.h" +#include "util/fault_injection_test_env.h" +#include "util/filename.h" #include "util/testutil.h" namespace rocksdb { -class ExternalSSTFileTest : public DBTestBase { +class ExternalSSTFileTest + : public DBTestBase, + public ::testing::WithParamInterface<std::tuple<bool, bool>> { public: ExternalSSTFileTest() : DBTestBase("/external_sst_file_test") { sst_files_dir_ = dbname_ + "/sst_files/"; @@ -26,18 +30,15 @@ class ExternalSSTFileTest : public DBTestBase { env_->CreateDir(sst_files_dir_); } - Status GenerateAndAddExternalFile( - const Options options, - std::vector<std::pair<std::string, std::string>> data, int file_id = -1, - bool allow_global_seqno = false, bool sort_data = false, - std::map<std::string, std::string>* true_data = nullptr, - ColumnFamilyHandle* cfh = nullptr) { + Status GenerateOneExternalFile( + const Options& options, ColumnFamilyHandle* cfh, + std::vector<std::pair<std::string, std::string>>& data, int file_id, + bool sort_data, std::string* external_file_path, + std::map<std::string, std::string>* true_data) { // Generate a file id if not provided - if (file_id == -1) { - file_id = last_file_id_ + 1; - last_file_id_++; + if (-1 == file_id) { + file_id = (++last_file_id_); } - // Sort data if asked to do so if (sort_data) { std::sort(data.begin(), data.end(), @@ -55,12 +56,11 @@ class ExternalSSTFileTest : public DBTestBase { } std::string file_path = sst_files_dir_ + ToString(file_id); SstFileWriter sst_file_writer(EnvOptions(), options, cfh); - Status s = sst_file_writer.Open(file_path); if (!s.ok()) { return s; } - for (auto& entry : data) { + for (const auto& entry : data) { s = sst_file_writer.Put(entry.first, entry.second); if (!s.ok()) { sst_file_writer.Finish(); return s; } }
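// A minimal sketch of the stand-alone SstFileWriter flow that the helper
// above wraps, using the public RocksDB API; the /tmp path is hypothetical
// and keys must be added in the comparator's order:
//
//   SstFileWriter writer(EnvOptions(), options);
//   Status st = writer.Open("/tmp/example.sst");     // hypothetical path
//   if (st.ok()) st = writer.Put("key1", "value1");  // keys in sorted order
//   if (st.ok()) st = writer.Finish();  // seals the file and its properties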
s = sst_file_writer.Finish(); - - if (s.ok()) { - IngestExternalFileOptions ifo; - ifo.allow_global_seqno = allow_global_seqno; - if (cfh) { - s = db_->IngestExternalFile(cfh, {file_path}, ifo); - } else { - s = db_->IngestExternalFile({file_path}, ifo); - } + if (s.ok() && external_file_path != nullptr) { + *external_file_path = file_path; } - - if (s.ok() && true_data) { - for (auto& entry : data) { - (*true_data)[entry.first] = entry.second; + if (s.ok() && nullptr != true_data) { + for (const auto& entry : data) { + true_data->insert({entry.first, entry.second}); } } - return s; } - Status GenerateAndAddExternalFileIngestBehind( - const Options options, const IngestExternalFileOptions ifo, + Status GenerateAndAddExternalFile( + const Options options, std::vector> data, int file_id = -1, + bool allow_global_seqno = false, bool write_global_seqno = false, + bool verify_checksums_before_ingest = true, bool ingest_behind = false, bool sort_data = false, std::map* true_data = nullptr, ColumnFamilyHandle* cfh = nullptr) { @@ -132,6 +125,11 @@ class ExternalSSTFileTest : public DBTestBase { s = sst_file_writer.Finish(); if (s.ok()) { + IngestExternalFileOptions ifo; + ifo.allow_global_seqno = allow_global_seqno; + ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false; + ifo.verify_checksums_before_ingest = verify_checksums_before_ingest; + ifo.ingest_behind = ingest_behind; if (cfh) { s = db_->IngestExternalFile(cfh, {file_path}, ifo); } else { @@ -148,11 +146,47 @@ class ExternalSSTFileTest : public DBTestBase { return s; } + Status GenerateAndAddExternalFiles( + const Options& options, + const std::vector& column_families, + const std::vector& ifos, + std::vector>>& data, + int file_id, bool sort_data, + std::vector>& true_data) { + if (-1 == file_id) { + file_id = (++last_file_id_); + } + // Generate external SST files, one for each column family + size_t num_cfs = column_families.size(); + assert(ifos.size() == num_cfs); + assert(data.size() == num_cfs); + Status s; + std::vector args(num_cfs); + for (size_t i = 0; i != num_cfs; ++i) { + std::string external_file_path; + s = GenerateOneExternalFile( + options, column_families[i], data[i], file_id, sort_data, + &external_file_path, + true_data.size() == num_cfs ? 
&true_data[i] : nullptr); + if (!s.ok()) { + return s; + } + ++file_id; + args[i].column_family = column_families[i]; + args[i].external_files.push_back(external_file_path); + args[i].options = ifos[i]; + } + s = db_->IngestExternalFiles(args); + return s; + } Status GenerateAndAddExternalFile( const Options options, std::vector> data, - int file_id = -1, bool allow_global_seqno = false, bool sort_data = false, + int file_id = -1, bool allow_global_seqno = false, + bool write_global_seqno = false, + bool verify_checksums_before_ingest = true, bool ingest_behind = false, + bool sort_data = false, std::map* true_data = nullptr, ColumnFamilyHandle* cfh = nullptr) { std::vector> file_data; @@ -160,13 +194,16 @@ class ExternalSSTFileTest : public DBTestBase { file_data.emplace_back(Key(entry.first), entry.second); } return GenerateAndAddExternalFile(options, file_data, file_id, - allow_global_seqno, sort_data, true_data, - cfh); + allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, + ingest_behind, sort_data, true_data, cfh); } Status GenerateAndAddExternalFile( const Options options, std::vector keys, int file_id = -1, - bool allow_global_seqno = false, bool sort_data = false, + bool allow_global_seqno = false, bool write_global_seqno = false, + bool verify_checksums_before_ingest = true, bool ingest_behind = false, + bool sort_data = false, std::map* true_data = nullptr, ColumnFamilyHandle* cfh = nullptr) { std::vector> file_data; @@ -174,22 +211,25 @@ class ExternalSSTFileTest : public DBTestBase { file_data.emplace_back(Key(k), Key(k) + ToString(file_id)); } return GenerateAndAddExternalFile(options, file_data, file_id, - allow_global_seqno, sort_data, true_data, - cfh); + allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, + ingest_behind, sort_data, true_data, cfh); } Status DeprecatedAddFile(const std::vector& files, bool move_files = false, - bool skip_snapshot_check = false) { + bool skip_snapshot_check = false, + bool skip_write_global_seqno = false) { IngestExternalFileOptions opts; opts.move_files = move_files; opts.snapshot_consistency = !skip_snapshot_check; opts.allow_global_seqno = false; opts.allow_blocking_flush = false; + opts.write_global_seqno = !skip_write_global_seqno; return db_->IngestExternalFile(files, opts); } - ~ExternalSSTFileTest() { test::DestroyDir(env_, sst_files_dir_); } + ~ExternalSSTFileTest() override { test::DestroyDir(env_, sst_files_dir_); } protected: int last_file_id_ = 0; @@ -222,6 +262,9 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_EQ(file1_info.num_entries, 100); ASSERT_EQ(file1_info.smallest_key, Key(0)); ASSERT_EQ(file1_info.largest_key, Key(99)); + ASSERT_EQ(file1_info.num_range_del_entries, 0); + ASSERT_EQ(file1_info.smallest_range_del_key, ""); + ASSERT_EQ(file1_info.largest_range_del_key, ""); // sst_file_writer already finished, cannot add this value s = sst_file_writer.Put(Key(100), "bad_val"); ASSERT_FALSE(s.ok()) << s.ToString(); @@ -290,6 +333,58 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_EQ(file5_info.smallest_key, Key(400)); ASSERT_EQ(file5_info.largest_key, Key(499)); + // file6.sst (delete 400 => 500) + std::string file6 = sst_files_dir_ + "file6.sst"; + ASSERT_OK(sst_file_writer.Open(file6)); + sst_file_writer.DeleteRange(Key(400), Key(500)); + ExternalSstFileInfo file6_info; + s = sst_file_writer.Finish(&file6_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file6_info.file_path, file6); + ASSERT_EQ(file6_info.num_entries, 0); + ASSERT_EQ(file6_info.smallest_key, ""); + 
ASSERT_EQ(file6_info.largest_key, ""); + ASSERT_EQ(file6_info.num_range_del_entries, 1); + ASSERT_EQ(file6_info.smallest_range_del_key, Key(400)); + ASSERT_EQ(file6_info.largest_range_del_key, Key(500)); + + // file7.sst (delete 500 => 570, put 520 => 599 divisible by 2) + std::string file7 = sst_files_dir_ + "file7.sst"; + ASSERT_OK(sst_file_writer.Open(file7)); + sst_file_writer.DeleteRange(Key(500), Key(550)); + for (int k = 520; k < 560; k += 2) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + sst_file_writer.DeleteRange(Key(525), Key(575)); + for (int k = 560; k < 600; k += 2) { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + ExternalSstFileInfo file7_info; + s = sst_file_writer.Finish(&file7_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file7_info.file_path, file7); + ASSERT_EQ(file7_info.num_entries, 40); + ASSERT_EQ(file7_info.smallest_key, Key(520)); + ASSERT_EQ(file7_info.largest_key, Key(598)); + ASSERT_EQ(file7_info.num_range_del_entries, 2); + ASSERT_EQ(file7_info.smallest_range_del_key, Key(500)); + ASSERT_EQ(file7_info.largest_range_del_key, Key(575)); + + // file8.sst (delete 600 => 700) + std::string file8 = sst_files_dir_ + "file8.sst"; + ASSERT_OK(sst_file_writer.Open(file8)); + sst_file_writer.DeleteRange(Key(600), Key(700)); + ExternalSstFileInfo file8_info; + s = sst_file_writer.Finish(&file8_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file8_info.file_path, file8); + ASSERT_EQ(file8_info.num_entries, 0); + ASSERT_EQ(file8_info.smallest_key, ""); + ASSERT_EQ(file8_info.largest_key, ""); + ASSERT_EQ(file8_info.num_range_del_entries, 1); + ASSERT_EQ(file8_info.smallest_range_del_key, Key(600)); + ASSERT_EQ(file8_info.largest_range_del_key, Key(700)); + // Cannot create an empty sst file std::string file_empty = sst_files_dir_ + "file_empty.sst"; ExternalSstFileInfo file_empty_info; @@ -336,6 +431,16 @@ TEST_F(ExternalSSTFileTest, Basic) { // Key range of file5 (400 => 499) dont overlap with any keys in DB ASSERT_OK(DeprecatedAddFile({file5})); + // This file has overlapping values with the existing data + s = DeprecatedAddFile({file6}); + ASSERT_FALSE(s.ok()) << s.ToString(); + + // Key range of file7 (500 => 598) dont overlap with any keys in DB + ASSERT_OK(DeprecatedAddFile({file7})); + + // Key range of file7 (600 => 700) dont overlap with any keys in DB + ASSERT_OK(DeprecatedAddFile({file8})); + // Make sure values are correct before and after flush/compaction for (int i = 0; i < 2; i++) { for (int k = 0; k < 200; k++) { @@ -349,6 +454,13 @@ TEST_F(ExternalSSTFileTest, Basic) { std::string value = Key(k) + "_val"; ASSERT_EQ(Get(Key(k)), value); } + for (int k = 500; k < 600; k++) { + std::string value = Key(k) + "_val"; + if (k < 520 || k % 2 == 1) { + value = "NOT_FOUND"; + } + ASSERT_EQ(Get(Key(k)), value); + } ASSERT_OK(Flush()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); } @@ -377,8 +489,10 @@ TEST_F(ExternalSSTFileTest, Basic) { ASSERT_EQ(Get(Key(k)), value); } DestroyAndRecreateExternalSSTFilesDir(); - } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); } + class SstFileWriterCollector : public TablePropertiesCollector { public: explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) { @@ -388,20 +502,22 @@ class SstFileWriterCollector : public TablePropertiesCollector { const char* Name() const override { return name_.c_str(); } Status 
Finish(UserCollectedProperties* properties) override { + std::string count = std::to_string(count_); *properties = UserCollectedProperties{ {prefix_ + "_SstFileWriterCollector", "YES"}, - {prefix_ + "_Count", std::to_string(count_)}, + {prefix_ + "_Count", count}, }; return Status::OK(); } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/, + EntryType /*type*/, SequenceNumber /*seq*/, + uint64_t /*file_size*/) override { ++count_; return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } @@ -415,8 +531,8 @@ class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory { public: explicit SstFileWriterCollectorFactory(std::string prefix) : prefix_(prefix), num_created_(0) {} - virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { num_created_++; return new SstFileWriterCollector(prefix_); } @@ -516,17 +632,57 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(file5_info.smallest_key, Key(200)); ASSERT_EQ(file5_info.largest_key, Key(299)); + // file6.sst (delete 0 => 100) + std::string file6 = sst_files_dir_ + "file6.sst"; + ASSERT_OK(sst_file_writer.Open(file6)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75))); + ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100))); + ExternalSstFileInfo file6_info; + s = sst_file_writer.Finish(&file6_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file6_info.file_path, file6); + ASSERT_EQ(file6_info.num_entries, 0); + ASSERT_EQ(file6_info.smallest_key, ""); + ASSERT_EQ(file6_info.largest_key, ""); + ASSERT_EQ(file6_info.num_range_del_entries, 2); + ASSERT_EQ(file6_info.smallest_range_del_key, Key(0)); + ASSERT_EQ(file6_info.largest_range_del_key, Key(100)); + + // file7.sst (delete 100 => 200) + std::string file7 = sst_files_dir_ + "file7.sst"; + ASSERT_OK(sst_file_writer.Open(file7)); + ASSERT_OK(sst_file_writer.DeleteRange(Key(100), Key(200))); + ExternalSstFileInfo file7_info; + s = sst_file_writer.Finish(&file7_info); + ASSERT_TRUE(s.ok()) << s.ToString(); + ASSERT_EQ(file7_info.file_path, file7); + ASSERT_EQ(file7_info.num_entries, 0); + ASSERT_EQ(file7_info.smallest_key, ""); + ASSERT_EQ(file7_info.largest_key, ""); + ASSERT_EQ(file7_info.num_range_del_entries, 1); + ASSERT_EQ(file7_info.smallest_range_del_key, Key(100)); + ASSERT_EQ(file7_info.largest_range_del_key, Key(200)); + // list 1 has internal key range conflict std::vector file_list0({file1, file2}); std::vector file_list1({file3, file2, file1}); std::vector file_list2({file5}); std::vector file_list3({file3, file4}); + std::vector file_list4({file5, file7}); + std::vector file_list5({file6, file7}); DestroyAndReopen(options); - // This list of files have key ranges are overlapping with each other + // These lists of files have key ranges that overlap with each other s = DeprecatedAddFile(file_list1); ASSERT_FALSE(s.ok()) << s.ToString(); + // Both of the following overlap on the end key of a range deletion + // tombstone. This is a limitation because these tombstones have exclusive + // end keys that should not count as overlapping with other keys. 
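+ // For example, file7's tombstone covers [Key(100), Key(200)) while file5's
+ // smallest key is Key(200): the two do not truly overlap, yet the
+ // conservative bound check treats the exclusive end key as inclusive and
+ // rejects the pair.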
+ s = DeprecatedAddFile(file_list4); + ASSERT_FALSE(s.ok()) << s.ToString(); + s = DeprecatedAddFile(file_list5); + ASSERT_FALSE(s.ok()) << s.ToString(); // Add files using file path list s = DeprecatedAddFile(file_list0); @@ -617,7 +773,8 @@ TEST_F(ExternalSSTFileTest, AddList) { ASSERT_EQ(Get(Key(k)), value); } DestroyAndRecreateExternalSSTFilesDir(); - } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); } TEST_F(ExternalSSTFileTest, AddListAtomicity) { @@ -687,7 +844,7 @@ TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) { DestroyAndReopen(options); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::AddFile:FileCopied", [&](void* arg) { + "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) { ASSERT_OK(Put("aaa", "bbb")); ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", "xxx")); @@ -895,11 +1052,11 @@ TEST_F(ExternalSSTFileTest, MultiThreaded) { TEST_F(ExternalSSTFileTest, OverlappingRanges) { Random rnd(301); - int picked_level = 0; + SequenceNumber assigned_seqno = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "ExternalSstFileIngestionJob::Run", [&picked_level](void* arg) { + "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) { ASSERT_TRUE(arg != nullptr); - picked_level = *(static_cast(arg)); + assigned_seqno = *(static_cast(arg)); }); bool need_flush = false; rocksdb::SyncPoint::GetInstance()->SetCallBack( @@ -923,7 +1080,7 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { printf("Option config = %d\n", option_config_); std::vector> key_ranges; - for (int i = 0; i < 500; i++) { + for (int i = 0; i < 100; i++) { int range_start = rnd.Uniform(20000); int keys_per_range = 10 + rnd.Uniform(41); @@ -969,7 +1126,8 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { s = DeprecatedAddFile({file_name}); auto it = true_data.lower_bound(Key(range_start)); if (option_config_ != kUniversalCompaction && - option_config_ != kUniversalCompactionMultiLevel) { + option_config_ != kUniversalCompactionMultiLevel && + option_config_ != kUniversalSubcompactions) { if (it != true_data.end() && it->first <= Key(range_end)) { // This range overlap with data already exist in DB ASSERT_NOK(s); @@ -980,7 +1138,7 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { } } else { if ((it != true_data.end() && it->first <= Key(range_end)) || - need_flush || picked_level > 0 || overlap_with_db) { + need_flush || assigned_seqno > 0 || overlap_with_db) { // This range overlap with data already exist in DB ASSERT_NOK(s); failed_add_file++; @@ -1023,7 +1181,7 @@ TEST_F(ExternalSSTFileTest, OverlappingRanges) { } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction)); } -TEST_F(ExternalSSTFileTest, PickedLevel) { +TEST_P(ExternalSSTFileTest, PickedLevel) { Options options = CurrentOptions(); options.disable_auto_compactions = false; options.level0_file_num_compaction_trigger = 4; @@ -1033,13 +1191,13 @@ TEST_F(ExternalSSTFileTest, PickedLevel) { std::map true_data; // File 0 will go to last level (L3) - ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true, + false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "0,0,0,1"); // File 1 will go to level L2 (since it overlap with file 0 in L3) - ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true, + 
false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "0,0,1,1"); rocksdb::SyncPoint::GetInstance()->LoadDependency({ @@ -1068,13 +1226,13 @@ TEST_F(ExternalSSTFileTest, PickedLevel) { // This file overlaps with file 0 (L3), file 1 (L2) and the // output of compaction going to L1 - ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true, + false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5,0,1,1"); // This file does not overlap with any file or with the running compaction ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false, - &true_data)); + false, false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5,0,1,2"); // Hold compaction from finishing @@ -1126,7 +1284,7 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { std::atomic bg_compact_started(false); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:Start", - [&](void* arg) { bg_compact_started.store(true); }); + [&](void* /*arg*/) { bg_compact_started.store(true); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -1175,6 +1333,40 @@ TEST_F(ExternalSSTFileTest, PickedLevelBug) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(ExternalSSTFileTest, IngestNonExistingFile) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + Status s = db_->IngestExternalFile({"non_existing_file"}, + IngestExternalFileOptions()); + ASSERT_NOK(s); + + // Verify file deletion is not impacted (verify a bug fix) + ASSERT_OK(Put(Key(1), Key(1))); + ASSERT_OK(Put(Key(9), Key(9))); + ASSERT_OK(Flush()); + + ASSERT_OK(Put(Key(1), Key(1))); + ASSERT_OK(Put(Key(9), Key(9))); + ASSERT_OK(Flush()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + + // After full compaction, there should be only 1 file. 
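+ // (ParseFileName below keeps only kTableFile entries, i.e. the *.sst data
+ // files; WAL, MANIFEST and OPTIONS files in dbname_ are not counted.)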
+ std::vector files; + env_->GetChildren(dbname_, &files); + int num_sst_files = 0; + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kTableFile) { + num_sst_files++; + } + } + ASSERT_EQ(1, num_sst_files); +} + TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { Options options = CurrentOptions(); options.disable_auto_compactions = false; @@ -1192,8 +1384,9 @@ TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id)); }; + const int num_of_ranges = 1000; std::vector threads; - while (range_id < 5000) { + while (range_id < num_of_ranges) { int range_start = range_id * 10; int range_end = range_start + 10; @@ -1218,7 +1411,7 @@ TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) { range_id++; } - for (int rid = 0; rid < 5000; rid++) { + for (int rid = 0; rid < num_of_ranges; rid++) { int range_start = rid * 10; int range_end = range_start + 10; @@ -1270,12 +1463,12 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { // This file overlaps with the output of the compaction (going to L3) // so the file will be added to L0 since L3 is the base level ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false, - false, &true_data)); + false, true, false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5"); // This file does not overlap with the current running compactiong ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false, - &true_data)); + true, false, false, &true_data)); EXPECT_EQ(FilesPerLevel(), "5,0,0,1"); // Hold compaction from finishing @@ -1290,25 +1483,25 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { Reopen(options); ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false, - &true_data)); + true, false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "1,0,0,3"); ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false, - false, &true_data)); + false, true, false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "1,0,0,4"); ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false, - false, &true_data)); + false, true, false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "1,0,0,5"); // File 5 overlaps with file 2 (L3 / base level) - ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true, + false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "2,0,0,5"); // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0) - ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true, + false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "3,0,0,5"); // Verify data in files @@ -1327,7 +1520,7 @@ TEST_F(ExternalSSTFileTest, PickedLevelDynamic) { // File 7 overlaps with file 4 (L3) ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false, - false, &true_data)); + false, true, false, false, &true_data)); ASSERT_EQ(FilesPerLevel(), "5,0,0,5"); VerifyDBFromMap(true_data, &kcnt, false); @@ -1407,12 +1600,16 @@ TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) { ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2 rocksdb::SyncPoint::GetInstance()->SetCallBack( - "CompactionJob::Run():Start", [&](void* arg) { + "CompactionJob::Run():Start", [&](void* /*arg*/) { // fit in L3 but will overlap with compaction so 
will be added // to L2 but a compaction will trivially move it to L3 // and break LSM consistency - ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}})); - ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7)); + static std::atomic called = {false}; + if (!called) { + called = true; + ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}})); + ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7)); + } }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -1457,19 +1654,21 @@ TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) { ASSERT_OK(DeprecatedAddFile({file_path})); } -TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { Options options = CurrentOptions(); options.IncreaseParallelism(20); options.level0_slowdown_writes_trigger = 256; options.level0_stop_writes_trigger = 256; + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); for (int iter = 0; iter < 2; iter++) { bool write_to_memtable = (iter == 0); DestroyAndReopen(options); Random rnd(301); std::map true_data; - for (int i = 0; i < 2000; i++) { + for (int i = 0; i < 500; i++) { std::vector> random_data; for (int j = 0; j < 100; j++) { std::string k; @@ -1486,8 +1685,9 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { true_data[entry.first] = entry.second; } } else { - ASSERT_OK(GenerateAndAddExternalFile(options, random_data, -1, true, - true, &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, random_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, &true_data)); } } size_t kcnt = 0; @@ -1497,7 +1697,7 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) { } } -TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { Options options = CurrentOptions(); options.num_levels = 5; options.disable_auto_compactions = true; @@ -1516,8 +1716,11 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { for (int i = 0; i <= 20; i++) { file_data.emplace_back(Key(i), "L4"); } - ASSERT_OK(GenerateAndAddExternalFile(options, file_data, -1, true, false, - &true_data)); + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); // This file dont overlap with anything in the DB, will go to L4 ASSERT_EQ("0,0,0,0,1", FilesPerLevel()); @@ -1527,8 +1730,9 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { for (int i = 80; i <= 130; i++) { file_data.emplace_back(Key(i), "L0"); } - ASSERT_OK(GenerateAndAddExternalFile(options, file_data, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); // This file overlap with the memtable, so it will flush it and add // it self to L0 @@ -1539,8 +1743,9 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { for (int i = 30; i <= 50; i++) { file_data.emplace_back(Key(i), "L4"); } - ASSERT_OK(GenerateAndAddExternalFile(options, file_data, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + 
verify_checksums_before_ingest, false, false, &true_data)); // This file dont overlap with anything in the DB and fit in L4 as well ASSERT_EQ("2,0,0,0,2", FilesPerLevel()); @@ -1550,8 +1755,9 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { for (int i = 10; i <= 40; i++) { file_data.emplace_back(Key(i), "L3"); } - ASSERT_OK(GenerateAndAddExternalFile(options, file_data, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); // This file overlap with files in L4, we will ingest it in L3 ASSERT_EQ("2,0,0,1,2", FilesPerLevel()); @@ -1560,7 +1766,7 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) { VerifyDBFromMap(true_data, &kcnt, false); } -TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { +TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { Options options = CurrentOptions(); DestroyAndReopen(options); uint64_t entries_in_memtable; @@ -1574,16 +1780,20 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { &entries_in_memtable); ASSERT_GE(entries_in_memtable, 1); + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); // No need for flush - ASSERT_OK(GenerateAndAddExternalFile(options, {90, 100, 110}, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, {90, 100, 110}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, &entries_in_memtable); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable - ASSERT_OK(GenerateAndAddExternalFile(options, {19, 20, 21}, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, {19, 20, 21}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, &entries_in_memtable); ASSERT_EQ(entries_in_memtable, 0); @@ -1597,15 +1807,17 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { ASSERT_GE(entries_in_memtable, 1); // No need for flush, this file keys fit between the memtable keys - ASSERT_OK(GenerateAndAddExternalFile(options, {202, 203, 204}, -1, true, - false, &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, {202, 203, 204}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, &entries_in_memtable); ASSERT_GE(entries_in_memtable, 1); // This file will flush the memtable - ASSERT_OK(GenerateAndAddExternalFile(options, {206, 207}, -1, true, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, {206, 207}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false, &true_data)); db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable, &entries_in_memtable); ASSERT_EQ(entries_in_memtable, 0); @@ -1614,7 +1826,7 @@ TEST_F(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) { VerifyDBFromMap(true_data, &kcnt, false); } -TEST_F(ExternalSSTFileTest, L0SortingIssue) { +TEST_P(ExternalSSTFileTest, L0SortingIssue) { Options options = CurrentOptions(); options.num_levels = 2; DestroyAndReopen(options); @@ -1623,10 +1835,16 @@ TEST_F(ExternalSSTFileTest, L0SortingIssue) { ASSERT_OK(Put(Key(1), "memtable")); 
ASSERT_OK(Put(Key(10), "memtable")); + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); // No Flush needed, No global seqno needed, Ingest in L1 - ASSERT_OK(GenerateAndAddExternalFile(options, {7, 8}, -1, true, false)); + ASSERT_OK( + GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false)); // No Flush needed, but need a global seqno, Ingest in L0 - ASSERT_OK(GenerateAndAddExternalFile(options, {7, 8}, -1, true, false)); + ASSERT_OK( + GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, false)); printf("%s\n", FilesPerLevel().c_str()); // Overwrite what we added using external files @@ -1795,9 +2013,59 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo)); } +/* + * Test and verify the functionality of ingestion_options.move_files. + */ +TEST_F(ExternalSSTFileTest, LinkExternalSst) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + DestroyAndReopen(options); + const int kNumKeys = 10000; + + std::string file_path = sst_files_dir_ + "file1.sst"; + // Create SstFileWriter for default column family + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(file_path)); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value")); + } + ASSERT_OK(sst_file_writer.Finish()); + uint64_t file_size = 0; + ASSERT_OK(env_->GetFileSize(file_path, &file_size)); + + IngestExternalFileOptions ifo; + ifo.move_files = true; + ASSERT_OK(db_->IngestExternalFile({file_path}, ifo)); + + ColumnFamilyHandleImpl* cfh = + static_cast(dbfull()->DefaultColumnFamily()); + ColumnFamilyData* cfd = cfh->cfd(); + const InternalStats* internal_stats_ptr = cfd->internal_stats(); + const std::vector& comp_stats = + internal_stats_ptr->TEST_GetCompactionStats(); + uint64_t bytes_copied = 0; + uint64_t bytes_moved = 0; + for (const auto& stats : comp_stats) { + bytes_copied += stats.bytes_written; + bytes_moved += stats.bytes_moved; + } + // If bytes_moved > 0, it means external sst resides on the same FS + // supporting hard link operation. Therefore, + // 0 bytes should be copied, and the bytes_moved == file_size. + // Otherwise, FS does not support hard link, or external sst file resides on + // a different file system, then the bytes_copied should be equal to + // file_size. 
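+ // (Ingestion falls back to copying when the hard link cannot be created,
+ // e.g. EXDEV across filesystems, which is why both outcomes are asserted.)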
+ if (bytes_moved > 0) { + ASSERT_EQ(0, bytes_copied); + ASSERT_EQ(file_size, bytes_moved); + } else { + ASSERT_EQ(file_size, bytes_copied); + } +} + class TestIngestExternalFileListener : public EventListener { public: - void OnExternalFileIngested(DB* db, + void OnExternalFileIngested(DB* /*db*/, const ExternalFileIngestionInfo& info) override { ingested_files.push_back(info); } @@ -1805,16 +2073,19 @@ class TestIngestExternalFileListener : public EventListener { std::vector ingested_files; }; -TEST_F(ExternalSSTFileTest, IngestionListener) { +TEST_P(ExternalSSTFileTest, IngestionListener) { Options options = CurrentOptions(); TestIngestExternalFileListener* listener = new TestIngestExternalFileListener(); options.listeners.emplace_back(listener); CreateAndReopenWithCF({"koko", "toto"}, options); + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); // Ingest into default cf - ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}, -1, true, true, nullptr, - handles_[0])); + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[0])); ASSERT_EQ(listener->ingested_files.size(), 1); ASSERT_EQ(listener->ingested_files.back().cf_name, "default"); ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); @@ -1824,8 +2095,9 @@ TEST_F(ExternalSSTFileTest, IngestionListener) { "default"); // Ingest into cf1 - ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}, -1, true, true, nullptr, - handles_[1])); + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[1])); ASSERT_EQ(listener->ingested_files.size(), 2); ASSERT_EQ(listener->ingested_files.back().cf_name, "koko"); ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); @@ -1835,8 +2107,9 @@ TEST_F(ExternalSSTFileTest, IngestionListener) { "koko"); // Ingest into cf2 - ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}, -1, true, true, nullptr, - handles_[2])); + ASSERT_OK(GenerateAndAddExternalFile( + options, {1, 2}, -1, true, write_global_seqno, + verify_checksums_before_ingest, false, true, nullptr, handles_[2])); ASSERT_EQ(listener->ingested_files.size(), 3); ASSERT_EQ(listener->ingested_files.back().cf_name, "toto"); ASSERT_EQ(listener->ingested_files.back().global_seqno, 0); @@ -1878,7 +2151,7 @@ TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) { db_->ReleaseSnapshot(snap); } -TEST_F(ExternalSSTFileTest, IngestBehind) { +TEST_P(ExternalSSTFileTest, IngestBehind) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.num_levels = 3; @@ -1899,14 +2172,16 @@ TEST_F(ExternalSSTFileTest, IngestBehind) { file_data.emplace_back(Key(i), "ingest_behind"); } - IngestExternalFileOptions ifo; - ifo.allow_global_seqno = true; - ifo.ingest_behind = true; + bool allow_global_seqno = true; + bool ingest_behind = true; + bool write_global_seqno = std::get<0>(GetParam()); + bool verify_checksums_before_ingest = std::get<1>(GetParam()); // Can't ingest behind since allow_ingest_behind isn't set to true - ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo, - file_data, -1, false, - &true_data)); + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, ingest_behind, false /*sort_data*/, + &true_data)); options.allow_ingest_behind = true; // check that 
we still can open the DB, as num_levels should be @@ -1924,14 +2199,16 @@ TEST_F(ExternalSSTFileTest, IngestBehind) { db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); // Universal picker should go at second from the bottom level ASSERT_EQ("0,1", FilesPerLevel()); - ASSERT_OK(GenerateAndAddExternalFileIngestBehind(options, ifo, - file_data, -1, false, - &true_data)); + ASSERT_OK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); // this time ingest should fail as the file doesn't fit to the bottom level - ASSERT_NOK(GenerateAndAddExternalFileIngestBehind(options, ifo, - file_data, -1, false, - &true_data)); + ASSERT_NOK(GenerateAndAddExternalFile( + options, file_data, -1, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, true /*ingest_behind*/, + false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); // bottom level should be empty @@ -1940,6 +2217,455 @@ TEST_F(ExternalSSTFileTest, IngestBehind) { size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); } + +TEST_F(ExternalSSTFileTest, SkipBloomFilter) { + Options options = CurrentOptions(); + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + + // Create external SST file and include bloom filters + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + { + std::string file_path = sst_files_dir_ + "sst_with_bloom.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK( + db_->IngestExternalFile({file_path}, IngestExternalFileOptions())); + + ASSERT_EQ(Get("Key1"), "Value1"); + ASSERT_GE( + options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1); + } + + // Create external SST file but skip bloom filters + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + { + std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst"; + SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true, + Env::IOPriority::IO_TOTAL, + true /* skip_filters */); + ASSERT_OK(sst_file_writer.Open(file_path)); + ASSERT_OK(sst_file_writer.Put("Key1", "Value1")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK( + db_->IngestExternalFile({file_path}, IngestExternalFileOptions())); + + ASSERT_EQ(Get("Key1"), "Value1"); + ASSERT_EQ( + options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0); + } +} + +TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) { + if (!ZSTD_Supported()) { + return; + } + const int kNumEntries = 1 << 10; + const int kNumBytesPerEntry = 1 << 10; + Options options = CurrentOptions(); + options.compression = kZSTD; + options.compression_opts.max_dict_bytes = 1 << 14; // 16KB + options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB + DestroyAndReopen(options); + + std::atomic num_compression_dicts(0); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + [&](void* /* arg */) { ++num_compression_dicts; }); + 
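+ // The callback above fires once per compression dictionary block written;
+ // asserting exactly one firing below verifies that the external file was
+ // built with its own trained ZSTD dictionary.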
rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Random rnd(301); + std::vector> random_data; + for (int i = 0; i < kNumEntries; i++) { + std::string val; + test::RandomString(&rnd, kNumBytesPerEntry, &val); + random_data.emplace_back(Key(i), std::move(val)); + } + ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data))); + ASSERT_EQ(1, num_compression_dicts); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_OK(s); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead", + "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + CreateAndReopenWithCF({"pikachu"}, options); + const std::vector> data_before_ingestion = + {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}}, + {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}}}; + for (size_t i = 0; i != handles_.size(); ++i) { + int cf = static_cast(i); + const auto& orig_data = data_before_ingestion[i]; + for (const auto& kv : orig_data) { + ASSERT_OK(Put(cf, kv.first, kv.second)); + } + ASSERT_OK(Flush(cf)); + } + + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify checksums 
before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + // Take snapshot before ingestion starts + ReadOptions read_opts; + read_opts.total_order_seek = true; + read_opts.snapshot = dbfull()->GetSnapshot(); + std::vector iters(handles_.size()); + + // Range scan checks first kv of each CF before ingestion starts. + for (size_t i = 0; i != handles_.size(); ++i) { + iters[i] = dbfull()->NewIterator(read_opts, handles_[i]); + iters[i]->SeekToFirst(); + ASSERT_TRUE(iters[i]->Valid()); + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + const std::map& orig_data = + data_before_ingestion[i]; + std::map::const_iterator it = orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + iters[i]->Next(); + } + port::Thread ingest_thread([&]() { + ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data)); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "BeforeRead"); + // Should see only data before ingestion + for (size_t i = 0; i != handles_.size(); ++i) { + const auto& orig_data = data_before_ingestion[i]; + for (; iters[i]->Valid(); iters[i]->Next()) { + const std::string& key = iters[i]->key().ToString(); + const std::string& value = iters[i]->value().ToString(); + std::map::const_iterator it = + orig_data.find(key); + ASSERT_NE(orig_data.end(), it); + ASSERT_EQ(it->second, value); + } + } + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:" + "AfterRead"); + ingest_thread.join(); + for (auto* iter : iters) { + delete iter; + } + iters.clear(); + dbfull()->ReleaseSnapshot(read_opts.snapshot); + + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + // Should see consistent state after ingestion for all column families even + // without snapshot. 
+ ASSERT_EQ(2, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + const std::string& value = elem.second; + ASSERT_EQ(value, Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + std::vector ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingest + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) { + std::unique_ptr fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::IngestExternalFiles:BeforeJobsRun:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1", + "DBImpl::IngestExternalFiles:BeforeJobsRun:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + CreateAndReopenWithCF({"pikachu"}, options); + std::vector column_families; + 
column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + std::vector<IngestExternalFileOptions> ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector<std::vector<std::pair<std::string, std::string>>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector<std::map<std::string, std::string>> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "0"); + fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:" + "1"); + ingest_thread.join(); + + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +TEST_P(ExternalSSTFileTest, + IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) { + std::unique_ptr<FaultInjectionTestEnv> fault_injection_env( + new FaultInjectionTestEnv(env_)); + Options options = CurrentOptions(); + options.env = fault_injection_env.get(); + + CreateAndReopenWithCF({"pikachu"}, options); + + SyncPoint::GetInstance()->ClearTrace(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0", + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:0"}, + {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:1", + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector<ColumnFamilyHandle*> column_families; + column_families.push_back(handles_[0]); + column_families.push_back(handles_[1]); + std::vector<IngestExternalFileOptions> ifos(column_families.size()); + for (auto& ifo : ifos) { + ifo.allow_global_seqno = true; // Always allow global_seqno + // May or may not write global_seqno + ifo.write_global_seqno = std::get<0>(GetParam()); + // Whether to verify block checksums before ingestion + ifo.verify_checksums_before_ingest = std::get<1>(GetParam()); + } + std::vector<std::vector<std::pair<std::string, std::string>>> data; + data.push_back( + {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")}); + data.push_back( + {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")}); + // Resize the true_data vector upon construction to avoid re-alloc + std::vector<std::map<std::string, std::string>> true_data( + column_families.size()); + port::Thread ingest_thread([&]() { + Status s = GenerateAndAddExternalFiles(options, column_families, ifos, data, + -1, true, true_data); + ASSERT_NOK(s); + }); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:0"); +
fault_injection_env->SetFilesystemActive(false); + TEST_SYNC_POINT( + "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_" + "PartialManifestWriteFail:1"); + ingest_thread.join(); + + fault_injection_env->DropUnsyncedFileData(); + fault_injection_env->SetFilesystemActive(true); + Close(); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); + ASSERT_EQ(2, handles_.size()); + int cf = 0; + for (const auto& verify_map : true_data) { + for (const auto& elem : verify_map) { + const std::string& key = elem.first; + ASSERT_EQ("NOT_FOUND", Get(cf, key)); + } + ++cf; + } + Close(); + Destroy(options, true /* delete_cf_paths */); +} + +INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest, + testing::Values(std::make_tuple(false, false), + std::make_tuple(false, true), + std::make_tuple(true, false), + std::make_tuple(true, true))); + } // namespace rocksdb int main(int argc, char** argv) { @@ -1951,7 +2677,7 @@ int main(int argc, char** argv) { #else #include <stdio.h> -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as External SST File Writer and Ingestion are not supported " "in ROCKSDB_LITE\n"); diff --git a/thirdparty/rocksdb/db/fault_injection_test.cc b/thirdparty/rocksdb/db/fault_injection_test.cc index adfcb4db5a..53de312c01 100644 --- a/thirdparty/rocksdb/db/fault_injection_test.cc +++ b/thirdparty/rocksdb/db/fault_injection_test.cc @@ -34,19 +34,22 @@ static const int kValueSize = 1000; static const int kMaxNumValues = 2000; static const size_t kNumIterations = 3; -class FaultInjectionTest : public testing::Test, - public testing::WithParamInterface<bool> { +enum FaultInjectionOptionConfig { + kDefault, + kDifferentDataDir, + kWalDir, + kSyncWal, + kWalDirSyncWal, + kMultiLevels, + kEnd, +}; +class FaultInjectionTest + : public testing::Test, + public testing::WithParamInterface<std::tuple<bool, int, int>> { protected: - enum OptionConfig { - kDefault, - kDifferentDataDir, - kWalDir, - kSyncWal, - kWalDirSyncWal, - kMultiLevels, - kEnd, - }; int option_config_; + int non_inclusive_end_range_; // kEnd or equivalent to that // When need to make sure data is persistent, sync WAL bool sync_use_wal_; // When need to make sure data is persistent, call DB::CompactRange() @@ -67,27 +70,27 @@ class FaultInjectionTest : public testing::Test, std::unique_ptr<Env> base_env_; FaultInjectionTestEnv* env_; std::string dbname_; - shared_ptr<Cache> tiny_cache_; + std::shared_ptr<Cache> tiny_cache_; Options options_; DB* db_; FaultInjectionTest() - : option_config_(kDefault), + : option_config_(std::get<1>(GetParam())), + non_inclusive_end_range_(std::get<2>(GetParam())), sync_use_wal_(false), sync_use_compact_(true), base_env_(nullptr), - env_(NULL), - db_(NULL) { - } + env_(nullptr), + db_(nullptr) {} - ~FaultInjectionTest() { + ~FaultInjectionTest() override { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); } bool ChangeOptions() { option_config_++; - if (option_config_ >= kEnd) { + if (option_config_ >= non_inclusive_end_range_) { return false; } else { if (option_config_ == kMultiLevels) { @@ -104,18 +107,18 @@ class FaultInjectionTest, Options options; switch (option_config_) { case kWalDir: - options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal"); break; case kDifferentDataDir: - options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data", - 1000000U); + options.db_paths.emplace_back( +
test::PerThreadDBPath(env_, "fault_test_data"), 1000000U); break; case kSyncWal: sync_use_wal_ = true; sync_use_compact_ = false; break; case kWalDirSyncWal: - options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal"); sync_use_wal_ = true; sync_use_compact_ = false; break; @@ -139,9 +142,9 @@ class FaultInjectionTest, } Status NewDB() { - assert(db_ == NULL); + assert(db_ == nullptr); assert(tiny_cache_ == nullptr); - assert(env_ == NULL); + assert(env_ == nullptr); env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : Env::Default()); @@ -155,7 +158,7 @@ class FaultInjectionTest, table_options.block_cache = tiny_cache_; options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); - dbname_ = test::TmpDir() + "/fault_test"; + dbname_ = test::PerThreadDBPath("fault_test"); EXPECT_OK(DestroyDB(dbname_, options_)); @@ -166,7 +169,7 @@ class FaultInjectionTest, } void SetUp() override { - sequential_order_ = GetParam(); + sequential_order_ = std::get<0>(GetParam()); ASSERT_OK(NewDB()); } @@ -176,7 +179,7 @@ class FaultInjectionTest, Status s = DestroyDB(dbname_, options_); delete env_; - env_ = NULL; + env_ = nullptr; tiny_cache_.reset(); @@ -228,16 +231,9 @@ class FaultInjectionTest, return Status::OK(); } -#ifdef ROCKSDB_UBSAN_RUN -#if defined(__clang__) -__attribute__((__no_sanitize__("shift"), no_sanitize("signed-integer-overflow"))) -#elif defined(__GNUC__) -__attribute__((__no_sanitize_undefined__)) -#endif -#endif // Return the ith key Slice Key(int i, std::string* storage) const { - int num = i; + unsigned long long num = i; if (!sequential_order_) { // random transfer const int m = 0x5bd1e995; num *= m; num ^= num << 24; } char buf[100]; - snprintf(buf, sizeof(buf), "%016d", num); + snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num)); storage->assign(buf, strlen(buf)); return Slice(*storage); } @@ -350,7 +346,9 @@ class FaultInjectionTest, } }; -TEST_P(FaultInjectionTest, FaultTest) { +class FaultInjectionTestSplitted : public FaultInjectionTest {}; + +TEST_P(FaultInjectionTestSplitted, FaultTest) { do { Random rnd(301); @@ -463,10 +461,10 @@ TEST_P(FaultInjectionTest, UninstalledCompaction) { std::atomic<bool> opened(false); rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::Open:Opened", [&](void* arg) { opened.store(true); }); + "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); }); rocksdb::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BGWorkCompaction", - [&](void* arg) { ASSERT_TRUE(opened.load()); }); + [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(OpenDB()); ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound)); @@ -537,7 +535,17 @@ TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) { ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound()); } -INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool()); +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTest, + ::testing::Values(std::make_tuple(false, kDefault, kEnd), + std::make_tuple(true, kDefault, kEnd))); + +INSTANTIATE_TEST_CASE_P( + FaultTest, FaultInjectionTestSplitted, + ::testing::Values(std::make_tuple(false, kDefault, kSyncWal), + std::make_tuple(true, kDefault, kSyncWal), + std::make_tuple(false, kSyncWal, kEnd), +
std::make_tuple(true, kSyncWal, kEnd))); } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/file_indexer_test.cc b/thirdparty/rocksdb/db/file_indexer_test.cc index 5cd8c2d2cf..935a01ef8d 100644 --- a/thirdparty/rocksdb/db/file_indexer_test.cc +++ b/thirdparty/rocksdb/db/file_indexer_test.cc @@ -36,10 +36,10 @@ class IntComparator : public Comparator { const char* Name() const override { return "IntComparator"; } - void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, - + const Slice& /*limit*/) const override {} - void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; class FileIndexerTest : public testing::Test { @@ -47,7 +47,7 @@ FileIndexerTest() : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {} - ~FileIndexerTest() { + ~FileIndexerTest() override { ClearFiles(); delete[] files; } diff --git a/thirdparty/rocksdb/db/flush_job.cc b/thirdparty/rocksdb/db/flush_job.cc index 778c9eca12..f03188141a 100644 --- a/thirdparty/rocksdb/db/flush_job.cc +++ b/thirdparty/rocksdb/db/flush_job.cc @@ -24,15 +24,15 @@ #include "db/event_helpers.h" #include "db/log_reader.h" #include "db/log_writer.h" +#include "db/memtable.h" #include "db/memtable_list.h" #include "db/merge_context.h" +#include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" -#include "port/likely.h" #include "port/port.h" -#include "db/memtable.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" @@ -55,28 +55,65 @@ namespace rocksdb { +const char* GetFlushReasonString (FlushReason flush_reason) { + switch (flush_reason) { + case FlushReason::kOthers: + return "Other Reasons"; + case FlushReason::kGetLiveFiles: + return "Get Live Files"; + case FlushReason::kShutDown: + return "Shut down"; + case FlushReason::kExternalFileIngestion: + return "External File Ingestion"; + case FlushReason::kManualCompaction: + return "Manual Compaction"; + case FlushReason::kWriteBufferManager: + return "Write Buffer Manager"; + case FlushReason::kWriteBufferFull: + return "Write Buffer Full"; + case FlushReason::kTest: + return "Test"; + case FlushReason::kDeleteFiles: + return "Delete Files"; + case FlushReason::kAutoCompaction: + return "Auto Compaction"; + case FlushReason::kManualFlush: + return "Manual Flush"; + case FlushReason::kErrorRecovery: + return "Error Recovery"; + default: + return "Invalid"; + } +} + FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const MutableCFOptions& mutable_cf_options, + const uint64_t* max_memtable_id, const EnvOptions& env_options, VersionSet* versions, InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down, std::vector<SequenceNumber> existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, - JobContext* job_context, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_file_directory, + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, CompressionType output_compression, Statistics* stats, - EventLogger* event_logger, bool measure_io_stats) + EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority 
thread_pri) : dbname_(dbname), cfd_(cfd), db_options_(db_options), mutable_cf_options_(mutable_cf_options), + max_memtable_id_(max_memtable_id), env_options_(env_options), versions_(versions), db_mutex_(db_mutex), shutting_down_(shutting_down), existing_snapshots_(std::move(existing_snapshots)), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), + snapshot_checker_(snapshot_checker), job_context_(job_context), log_buffer_(log_buffer), db_directory_(db_directory), @@ -85,7 +122,12 @@ FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, stats_(stats), event_logger_(event_logger), measure_io_stats_(measure_io_stats), - pick_memtable_called(false) { + sync_output_directory_(sync_output_directory), + write_manifest_(write_manifest), + edit_(nullptr), + base_(nullptr), + pick_memtable_called(false), + thread_pri_(thread_pri) { // Update the thread status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -127,7 +169,7 @@ void FlushJob::PickMemTable() { assert(!pick_memtable_called); pick_memtable_called = true; // Save the contents of the earliest memtable as a new Table - cfd_->imm()->PickMemtablesToFlush(&mems_); + cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_); if (mems_.empty()) { return; } @@ -152,7 +194,9 @@ void FlushJob::PickMemTable() { base_->Ref(); // it is likely that we do not need this reference } -Status FlushJob::Run(FileMetaData* file_meta) { +Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, + FileMetaData* file_meta) { + TEST_SYNC_POINT("FlushJob::Start"); db_mutex_->AssertHeld(); assert(pick_memtable_called); AutoThreadOperationStageUpdater stage_run( @@ -169,6 +213,8 @@ Status FlushJob::Run(FileMetaData* file_meta) { uint64_t prev_fsync_nanos = 0; uint64_t prev_range_sync_nanos = 0; uint64_t prev_prepare_write_nanos = 0; + uint64_t prev_cpu_write_nanos = 0; + uint64_t prev_cpu_read_nanos = 0; if (measure_io_stats_) { prev_perf_level = GetPerfLevel(); SetPerfLevel(PerfLevel::kEnableTime); @@ -176,6 +222,8 @@ Status FlushJob::Run(FileMetaData* file_meta) { prev_fsync_nanos = IOSTATS(fsync_nanos); prev_range_sync_nanos = IOSTATS(range_sync_nanos); prev_prepare_write_nanos = IOSTATS(prepare_write_nanos); + prev_cpu_write_nanos = IOSTATS(cpu_write_nanos); + prev_cpu_read_nanos = IOSTATS(cpu_read_nanos); } // This will release and re-acquire the mutex. 
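The prev_* locals captured just above follow a snapshot-and-delta pattern: Run() records each IOSTATS counter before the flush and later reports counter-minus-snapshot, so only I/O attributable to this job is logged. A minimal self-contained sketch of the same pattern follows; the IoCounters type and its fields are illustrative stand-ins, not RocksDB API.

  #include <cstdint>
  #include <cstdio>

  // Hypothetical stand-in for RocksDB's thread-local IOSTATS counters.
  struct IoCounters {
    uint64_t fsync_nanos = 0;
    uint64_t cpu_write_nanos = 0;
  };

  // Snapshot the counters before the measured region, then report only the
  // deltas afterwards, mirroring the prev_fsync_nanos / prev_cpu_write_nanos
  // locals in FlushJob::Run().
  void MeasuredWork(IoCounters& io) {
    const IoCounters before = io;
    io.fsync_nanos += 1200000;       // stands in for real work advancing counters
    io.cpu_write_nanos += 800000;
    std::printf("fsync_nanos=%llu cpu_write_nanos=%llu\n",
                static_cast<unsigned long long>(io.fsync_nanos - before.fsync_nanos),
                static_cast<unsigned long long>(io.cpu_write_nanos - before.cpu_write_nanos));
  }

  int main() {
    IoCounters io;
    MeasuredWork(io);
    return 0;
  }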
@@ -189,11 +237,11 @@ Status FlushJob::Run(FileMetaData* file_meta) { if (!s.ok()) { cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); - } else { + } else if (write_manifest_) { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table - s = cfd_->imm()->InstallMemtableFlushResults( - cfd_, mutable_cf_options_, mems_, versions_, db_mutex_, + s = cfd_->imm()->TryInstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, log_buffer_); } @@ -206,6 +254,8 @@ Status FlushJob::Run(FileMetaData* file_meta) { auto stream = event_logger_->LogToBuffer(log_buffer_); stream << "job" << job_context_->job_id << "event" << "flush_finished"; + stream << "output_compression" + << CompressionTypeToString(output_compression_); stream << "lsm_state"; stream.StartArray(); auto vstorage = cfd_->current()->storage_info(); @@ -225,6 +275,10 @@ Status FlushJob::Run(FileMetaData* file_meta) { stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos); stream << "file_prepare_write_nanos" << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos); + stream << "file_cpu_write_nanos" + << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos); + stream << "file_cpu_read_nanos" + << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos); } return s; @@ -241,8 +295,10 @@ Status FlushJob::WriteLevel0Table() { ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); const uint64_t start_micros = db_options_.env->NowMicros(); + const uint64_t start_cpu_micros = db_options_.env->NowCPUNanos() / 1000; Status s; { + auto write_hint = cfd_->CalculateSSTWriteHint(0); db_mutex_->Unlock(); if (log_buffer_) { log_buffer_->FlushBufferToLog(); @@ -251,11 +307,13 @@ Status FlushJob::WriteLevel0Table() { // memtable and its associated range deletion memtable, respectively, at // corresponding indexes. 
std::vector<InternalIterator*> memtables; - std::vector<InternalIterator*> range_del_iters; + std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> + range_del_iters; ReadOptions ro; ro.total_order_seek = true; Arena arena; uint64_t total_num_entries = 0, total_num_deletes = 0; + uint64_t total_data_size = 0; size_t total_memory_usage = 0; for (MemTable* m : mems_) { ROCKS_LOG_INFO( @@ -263,12 +321,14 @@ Status FlushJob::WriteLevel0Table() { "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); memtables.push_back(m->NewIterator(ro, &arena)); - auto* range_del_iter = m->NewRangeTombstoneIterator(ro); + auto* range_del_iter = + m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); if (range_del_iter != nullptr) { - range_del_iters.push_back(range_del_iter); + range_del_iters.emplace_back(range_del_iter); } total_num_entries += m->num_entries(); total_num_deletes += m->num_deletes(); + total_data_size += m->get_data_size(); total_memory_usage += m->ApproximateMemoryUsage(); } @@ -276,17 +336,15 @@ Status FlushJob::WriteLevel0Table() { << "flush_started" << "num_memtables" << mems_.size() << "num_entries" << total_num_entries << "num_deletes" - << total_num_deletes << "memory_usage" - << total_memory_usage; + << total_num_deletes << "total_data_size" + << total_data_size << "memory_usage" + << total_memory_usage << "flush_reason" + << GetFlushReasonString(cfd_->GetFlushReason()); { ScopedArenaIterator iter( NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], static_cast<int>(memtables.size()), &arena)); - std::unique_ptr<InternalIterator> range_del_iter(NewMergingIterator( - &cfd_->internal_comparator(), - range_del_iters.empty() ? nullptr : &range_del_iters[0], - static_cast<int>(range_del_iters.size()))); ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", cfd_->GetName().c_str(), job_context_->job_id, @@ -294,27 +352,34 @@ Status FlushJob::WriteLevel0Table() { TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); - EnvOptions optimized_env_options = - db_options_.env->OptimizeForCompactionTableWrite(env_options_, db_options_); - int64_t _current_time = 0; - db_options_.env->GetCurrentTime(&_current_time); // ignore error + auto status = db_options_.env->GetCurrentTime(&_current_time); + // Safe to proceed even if GetCurrentTime fails. So, log and proceed. + if (!status.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "Failed to get current time to populate creation_time property. " "Status: %s", + status.ToString().c_str()); + } const uint64_t current_time = static_cast<uint64_t>(_current_time); - uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime(); + uint64_t oldest_key_time = + mems_.front()->ApproximateOldestKeyTime(); s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, - optimized_env_options, cfd_->table_cache(), iter.get(), - std::move(range_del_iter), &meta_, cfd_->internal_comparator(), + env_options_, cfd_->table_cache(), iter.get(), + std::move(range_del_iters), &meta_, cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), cfd_->GetName(), existing_snapshots_, - earliest_write_conflict_snapshot_, output_compression_, + earliest_write_conflict_snapshot_, snapshot_checker_, + output_compression_, mutable_cf_options_.sample_for_compression, cfd_->ioptions()->compression_opts, mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), TableFileCreationReason::kFlush, event_logger_, job_context_->job_id, Env::IO_HIGH, &table_properties_, 0 /* level */, current_time, - oldest_key_time); + oldest_key_time, write_hint); LogFlush(db_options_.info_log); } ROCKS_LOG_INFO(db_options_.info_log, @@ -326,8 +391,8 @@ Status FlushJob::WriteLevel0Table() { s.ToString().c_str(), meta_.marked_for_compaction ? " (needs compaction)" : ""); - if (output_file_directory_ != nullptr) { - output_file_directory_->Fsync(); + if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) { + s = output_file_directory_->Fsync(); } TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); db_mutex_->Lock(); @@ -344,15 +409,17 @@ Status FlushJob::WriteLevel0Table() { // Add file to L0 edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(), meta_.fd.GetFileSize(), meta_.smallest, meta_.largest, - meta_.smallest_seqno, meta_.largest_seqno, + meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction); } // Note that here we treat flush as level 0 compaction in internal stats - InternalStats::CompactionStats stats(1); + InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); stats.micros = db_options_.env->NowMicros() - start_micros; + stats.cpu_micros = db_options_.env->NowCPUNanos() / 1000 - start_cpu_micros; stats.bytes_written = meta_.fd.GetFileSize(); - cfd_->internal_stats()->AddCompactionStats(0 /* level */, stats); + RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros); + cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats); cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, meta_.fd.GetFileSize()); RecordFlushIOStats(); diff --git a/thirdparty/rocksdb/db/flush_job.h b/thirdparty/rocksdb/db/flush_job.h index 4698ae7b03..c408194562 100644 --- a/thirdparty/rocksdb/db/flush_job.h +++ b/thirdparty/rocksdb/db/flush_job.h @@ -22,6 +22,7 @@ #include "db/internal_stats.h" #include "db/job_context.h" #include "db/log_writer.h" +#include "db/logs_with_prep_tracker.h" #include "db/memtable_list.h" #include "db/snapshot_impl.h" #include "db/version_edit.h" @@ -42,7 +43,9 @@ namespace rocksdb { +class DBImpl; class MemTable; +class SnapshotChecker; class TableCache; class Version; class VersionEdit; @@ -56,39 +59,52 @@ class FlushJob { FlushJob(const std::string& dbname, ColumnFamilyData* cfd, const ImmutableDBOptions& db_options, const MutableCFOptions& mutable_cf_options, - const EnvOptions& env_options, VersionSet* versions, - InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down, + const uint64_t* max_memtable_id, const 
EnvOptions& env_options, + VersionSet* versions, InstrumentedMutex* db_mutex, + std::atomic<bool>* shutting_down, std::vector<SequenceNumber> existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, - JobContext* job_context, LogBuffer* log_buffer, - Directory* db_directory, Directory* output_file_directory, - CompressionType output_compression, Statistics* stats, - EventLogger* event_logger, bool measure_io_stats); + SnapshotChecker* snapshot_checker, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, CompressionType output_compression, + Statistics* stats, EventLogger* event_logger, bool measure_io_stats, + const bool sync_output_directory, const bool write_manifest, + Env::Priority thread_pri); ~FlushJob(); // Require db_mutex held. // Once PickMemTable() is called, either Run() or Cancel() has to be called. void PickMemTable(); - Status Run(FileMetaData* file_meta = nullptr); + Status Run(LogsWithPrepTracker* prep_tracker = nullptr, + FileMetaData* file_meta = nullptr); void Cancel(); TableProperties GetTableProperties() const { return table_properties_; } + const autovector<MemTable*>& GetMemTables() const { return mems_; } private: void ReportStartedFlush(); void ReportFlushInputSize(const autovector<MemTable*>& mems); void RecordFlushIOStats(); Status WriteLevel0Table(); + const std::string& dbname_; ColumnFamilyData* cfd_; const ImmutableDBOptions& db_options_; const MutableCFOptions& mutable_cf_options_; - const EnvOptions& env_options_; + // Pointer to a variable storing the largest memtable id to flush in this + // flush job. RocksDB uses this variable to select the memtables to flush in + // this job. All memtables in this column family with an ID smaller than or + // equal to *max_memtable_id_ will be selected for flush. If null, then all + // memtables in the column family will be selected. + const uint64_t* max_memtable_id_; + const EnvOptions env_options_; VersionSet* versions_; InstrumentedMutex* db_mutex_; std::atomic<bool>* shutting_down_; std::vector<SequenceNumber> existing_snapshots_; SequenceNumber earliest_write_conflict_snapshot_; + SnapshotChecker* snapshot_checker_; JobContext* job_context_; LogBuffer* log_buffer_; Directory* db_directory_; @@ -98,6 +114,23 @@ class FlushJob { EventLogger* event_logger_; TableProperties table_properties_; bool measure_io_stats_; + // True if this flush job should call fsync on the output directory. False + // otherwise. + // Usually sync_output_directory_ is true. A flush job needs to call sync on + // the output directory before committing to the MANIFEST. + // However, an individual flush job does not have to call sync on the output + // directory if it is part of an atomic flush. After all flush jobs in the + // atomic flush succeed, call sync once on each distinct output directory. + const bool sync_output_directory_; + // True if this flush job should write to MANIFEST after successfully + // flushing memtables. False otherwise. + // Usually write_manifest_ is true. A flush job commits to the MANIFEST after + // flushing the memtables. + // However, an individual flush job cannot rashly write to the MANIFEST + // immediately after it finishes the flush if it is part of an atomic flush. + // In this case, only after all flush jobs succeed in flush can RocksDB + // commit to the MANIFEST. 
+ const bool write_manifest_; // Variables below are set by PickMemTable(): FileMetaData meta_; @@ -105,6 +138,7 @@ class FlushJob { VersionEdit* edit_; Version* base_; bool pick_memtable_called; + Env::Priority thread_pri_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/flush_job_test.cc b/thirdparty/rocksdb/db/flush_job_test.cc index 34a3c983c3..199ed29cac 100644 --- a/thirdparty/rocksdb/db/flush_job_test.cc +++ b/thirdparty/rocksdb/db/flush_job_test.cc @@ -27,9 +27,10 @@ class FlushJobTest : public testing::Test { public: FlushJobTest() : env_(Env::Default()), - dbname_(test::TmpDir() + "/flush_job_test"), + dbname_(test::PerThreadDBPath("flush_job_test")), options_(), db_options_(options_), + column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), versions_(new VersionSet(dbname_, &db_options_, env_options_, @@ -40,11 +41,14 @@ class FlushJobTest : public testing::Test { EXPECT_OK(env_->CreateDirIfMissing(dbname_)); db_options_.db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max()); + db_options_.statistics = rocksdb::CreateDBStatistics(); // TODO(icanadi) Remove this once we mock out VersionSet NewDB(); std::vector<ColumnFamilyDescriptor> column_families; cf_options_.table_factory = mock_table_factory_; - column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + for (const auto& cf_name : column_family_names_) { + column_families.emplace_back(cf_name, cf_options_); + } EXPECT_OK(versions_->Recover(column_families, false)); } @@ -55,18 +59,38 @@ class FlushJobTest : public testing::Test { new_db.SetNextFile(2); new_db.SetLastSequence(0); + autovector<VersionEdit> new_cfs; + SequenceNumber last_seq = 1; + uint32_t cf_id = 1; + for (size_t i = 1; i != column_family_names_.size(); ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(column_family_names_[i]); + new_cf.SetColumnFamily(cf_id++); + new_cf.SetLogNumber(0); + new_cf.SetNextFile(2); + new_cf.SetLastSequence(last_seq++); + new_cfs.emplace_back(new_cf); + } + const std::string manifest = DescriptorFileName(dbname_, 1); - unique_ptr<WritableFile> file; + std::unique_ptr<WritableFile> file; Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); - unique_ptr<WritableFileWriter> file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + std::unique_ptr<WritableFileWriter> file_writer( + new WritableFileWriter(std::move(file), manifest, EnvOptions())); { log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + + for (const auto& e : new_cfs) { + record.clear(); + e.EncodeTo(&record); + s = log.AddRecord(record); + ASSERT_OK(s); + } } ASSERT_OK(s); // Make "CURRENT" file that points to the new manifest file. 
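The sync_output_directory_ and write_manifest_ comments above describe a two-phase protocol for atomic flush: each job writes its L0 file but neither fsyncs the output directory nor touches the MANIFEST; only after every job succeeds does the caller sync each distinct output directory once and make a single MANIFEST commit. A compact sketch of that control flow under hypothetical names (Job, SyncOutputDirsOnce, and CommitAllToManifest are stand-ins, not the real FlushJob API; the real multi-column-family flow is exercised by the FlushMemtablesMultipleColumnFamilies test further below):

  #include <vector>

  // Hypothetical stand-ins for FlushJob, the output Directory, and VersionSet.
  struct Job {
    bool FlushWithoutManifest() { return true; }  // write_manifest = false
  };
  bool SyncOutputDirsOnce() { return true; }      // fsync each distinct dir once
  bool CommitAllToManifest(const std::vector<Job>&) { return true; }

  // Two-phase atomic flush: no job commits on its own; a single commit happens
  // only after every flush succeeded, so the MANIFEST never sees a partial set.
  bool AtomicFlush(std::vector<Job>& jobs) {
    for (auto& job : jobs) {
      if (!job.FlushWithoutManifest()) return false;  // abort before any commit
    }
    if (!SyncOutputDirsOnce()) return false;
    return CommitAllToManifest(jobs);
  }

  int main() {
    std::vector<Job> jobs(3);
    return AtomicFlush(jobs) ? 0 : 1;
  }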
@@ -78,6 +102,7 @@ class FlushJobTest : public testing::Test { EnvOptions env_options_; Options options_; ImmutableDBOptions db_options_; + const std::vector<std::string> column_family_names_; std::shared_ptr<Cache> table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -92,11 +117,15 @@ TEST_F(FlushJobTest, Empty) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), - env_options_, versions_.get(), &mutex_, &shutting_down_, - {}, kMaxSequenceNumber, &job_context, nullptr, nullptr, - nullptr, kNoCompression, nullptr, &event_logger, false); + nullptr /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, + kNoCompression, nullptr, &event_logger, false, + true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER); { InstrumentedMutexLock l(&mutex_); flush_job.PickMemTable(); @@ -136,42 +165,227 @@ TEST_F(FlushJobTest, NonEmpty) { } EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, *cfd->GetLatestMutableCFOptions(), - env_options_, versions_.get(), &mutex_, &shutting_down_, - {}, kMaxSequenceNumber, &job_context, nullptr, nullptr, - nullptr, kNoCompression, nullptr, &event_logger, true); - FileMetaData fd; + nullptr /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, {}, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, + kNoCompression, db_options_.statistics.get(), + &event_logger, true, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER); + + HistogramData hist; + FileMetaData file_meta; mutex_.Lock(); flush_job.PickMemTable(); - ASSERT_OK(flush_job.Run(&fd)); + ASSERT_OK(flush_job.Run(nullptr, &file_meta)); mutex_.Unlock(); - ASSERT_EQ(ToString(0), fd.smallest.user_key().ToString()); - ASSERT_EQ("9999a", - fd.largest.user_key().ToString()); // range tombstone end key - ASSERT_EQ(1, fd.smallest_seqno); - ASSERT_EQ(10000, fd.largest_seqno); // range tombstone seqnum 10000 + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + + ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ( + "9999a", + file_meta.largest.user_key().ToString()); // range tombstone end key + ASSERT_EQ(1, file_meta.fd.smallest_seqno); + ASSERT_EQ(10000, file_meta.fd.largest_seqno); // range tombstone seqnum 10000 mock_table_factory_->AssertSingleFile(inserted_keys); job_context.Clean(); } +TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) { + const size_t num_mems = 2; + const size_t num_mems_to_flush = 1; + const size_t num_keys_per_table = 100; + JobContext job_context(0); + ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); + std::vector<uint64_t> memtable_ids; + std::vector<MemTable*> new_mems; + for (size_t i = 0; i != num_mems; ++i) { + MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), + kMaxSequenceNumber); + mem->SetID(i); + mem->Ref(); + new_mems.emplace_back(mem); + memtable_ids.push_back(mem->GetID()); + + for (size_t j = 0; j < num_keys_per_table; ++j) { + 
std::string key(ToString(j + i * num_keys_per_table)); + std::string value("value" + key); + mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue, key, + value); + } + } + + autovector<MemTable*> to_delete; + for (auto mem : new_mems) { + cfd->imm()->Add(mem, &to_delete); + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant + + assert(memtable_ids.size() == num_mems); + uint64_t smallest_memtable_id = memtable_ids.front(); + uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1; + + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + &flush_memtable_id, env_options_, versions_.get(), &mutex_, + &shutting_down_, {}, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER); + HistogramData hist; + FileMetaData file_meta; + mutex_.Lock(); + flush_job.PickMemTable(); + ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta)); + mutex_.Unlock(); + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + + ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ("99", file_meta.largest.user_key().ToString()); + ASSERT_EQ(0, file_meta.fd.smallest_seqno); + ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1), + file_meta.fd.largest_seqno); + + for (auto m : to_delete) { + delete m; + } + to_delete.clear(); + job_context.Clean(); +} + +TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { + autovector<ColumnFamilyData*> all_cfds; + for (auto cfd : *versions_->GetColumnFamilySet()) { + all_cfds.push_back(cfd); + } + const std::vector<size_t> num_memtables = {2, 1, 3}; + assert(num_memtables.size() == column_family_names_.size()); + const size_t num_keys_per_memtable = 1000; + JobContext job_context(0); + std::vector<uint64_t> memtable_ids; + std::vector<SequenceNumber> smallest_seqs; + std::vector<SequenceNumber> largest_seqs; + autovector<MemTable*> to_delete; + SequenceNumber curr_seqno = 0; + size_t k = 0; + for (auto cfd : all_cfds) { + smallest_seqs.push_back(curr_seqno); + for (size_t i = 0; i != num_memtables[k]; ++i) { + MemTable* mem = cfd->ConstructNewMemtable( + *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); + mem->SetID(i); + mem->Ref(); + + for (size_t j = 0; j != num_keys_per_memtable; ++j) { + std::string key(ToString(j + i * num_keys_per_memtable)); + std::string value("value" + key); + mem->Add(curr_seqno++, kTypeValue, key, value); + } + + cfd->imm()->Add(mem, &to_delete); + } + largest_seqs.push_back(curr_seqno - 1); + memtable_ids.push_back(num_memtables[k++] - 1); + } + + EventLogger event_logger(db_options_.info_log.get()); + SnapshotChecker* snapshot_checker = nullptr; // not relevant + std::vector<FlushJob> flush_jobs; + k = 0; + for (auto cfd : all_cfds) { + std::vector<SequenceNumber> snapshot_seqs; + flush_jobs.emplace_back( + dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), + &memtable_ids[k], env_options_, versions_.get(), &mutex_, + &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker, + &job_context, nullptr, nullptr, nullptr, kNoCompression, + db_options_.statistics.get(), &event_logger, true, + false /* sync_output_directory */, false /* write_manifest */, + Env::Priority::USER); + k++; + } + HistogramData hist; + std::vector<FileMetaData> file_metas; + // Call reserve to avoid auto-resizing + 
file_metas.reserve(flush_jobs.size()); + mutex_.Lock(); + for (auto& job : flush_jobs) { + job.PickMemTable(); + } + for (auto& job : flush_jobs) { + FileMetaData meta; + // Run will release and re-acquire mutex + ASSERT_OK(job.Run(nullptr /**/, &meta)); + file_metas.emplace_back(meta); + } + autovector<FileMetaData*> file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } + autovector<const autovector<MemTable*>*> mems_list; + for (size_t i = 0; i != all_cfds.size(); ++i) { + const auto& mems = flush_jobs[i].GetMemTables(); + mems_list.push_back(&mems); + } + autovector<const MutableCFOptions*> mutable_cf_options_list; + for (auto cfd : all_cfds) { + mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); + } + + Status s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, + versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, + nullptr /* db_directory */, nullptr /* log_buffer */); + ASSERT_OK(s); + + mutex_.Unlock(); + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); + k = 0; + for (const auto& file_meta : file_metas) { + ASSERT_EQ(ToString(0), file_meta.smallest.user_key().ToString()); + ASSERT_EQ("999", file_meta.largest.user_key() + .ToString()); // max key by bytewise comparator + ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno); + ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno); + // Verify that imm is empty + ASSERT_EQ(std::numeric_limits<uint64_t>::max(), + all_cfds[k]->imm()->GetEarliestMemTableID()); + ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID()); + ++k; + } + + for (auto m : to_delete) { + delete m; + } + to_delete.clear(); + job_context.Clean(); +} + TEST_F(FlushJobTest, Snapshots) { JobContext job_context(0); auto cfd = versions_->GetColumnFamilySet()->GetDefault(); auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); - std::vector<SequenceNumber> snapshots; std::set<SequenceNumber> snapshots_set; int keys = 10000; int max_inserts_per_keys = 8; Random rnd(301); for (int i = 0; i < keys / 2; ++i) { - snapshots.push_back(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1); - snapshots_set.insert(snapshots.back()); + snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1); } - std::sort(snapshots.begin(), snapshots.end()); + // set has already removed the duplicate snapshots + std::vector<SequenceNumber> snapshots(snapshots_set.begin(), + snapshots_set.end()); new_mem->Ref(); SequenceNumber current_seqno = 0; @@ -202,16 +416,23 @@ TEST_F(FlushJobTest, Snapshots) { } EventLogger event_logger(db_options_.info_log.get()); - FlushJob flush_job( - dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_, - *cfd->GetLatestMutableCFOptions(), env_options_, versions_.get(), &mutex_, - &shutting_down_, snapshots, kMaxSequenceNumber, &job_context, nullptr, - nullptr, nullptr, kNoCompression, nullptr, &event_logger, true); + SnapshotChecker* snapshot_checker = nullptr; // not relevant + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + nullptr /* memtable_id */, env_options_, versions_.get(), + &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber, + snapshot_checker, &job_context, nullptr, nullptr, nullptr, + kNoCompression, db_options_.statistics.get(), + &event_logger, true, true /* sync_output_directory */, + true /* write_manifest */, Env::Priority::USER); mutex_.Lock(); flush_job.PickMemTable(); ASSERT_OK(flush_job.Run()); mutex_.Unlock(); 
mock_table_factory_->AssertSingleFile(inserted_keys); + HistogramData hist; + db_options_.statistics->histogramData(FLUSH_TIME, &hist); + ASSERT_GT(hist.average, 0.0); job_context.Clean(); } diff --git a/thirdparty/rocksdb/db/forward_iterator.cc b/thirdparty/rocksdb/db/forward_iterator.cc index 65fff95956..d1c073468b 100644 --- a/thirdparty/rocksdb/db/forward_iterator.cc +++ b/thirdparty/rocksdb/db/forward_iterator.cc @@ -15,6 +15,8 @@ #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -25,24 +27,26 @@ namespace rocksdb { // Usage: -// LevelIterator iter; +// ForwardLevelIterator iter; // iter.SetFileIndex(file_index); -// iter.Seek(target); +// iter.Seek(target); // or iter.SeekToFirst(); // iter.Next() -class LevelIterator : public InternalIterator { +class ForwardLevelIterator : public InternalIterator { public: - LevelIterator(const ColumnFamilyData* const cfd, - const ReadOptions& read_options, - const std::vector<FileMetaData*>& files) + ForwardLevelIterator(const ColumnFamilyData* const cfd, + const ReadOptions& read_options, + const std::vector<FileMetaData*>& files, + const SliceTransform* prefix_extractor) : cfd_(cfd), read_options_(read_options), files_(files), valid_(false), file_index_(std::numeric_limits<uint32_t>::max()), file_iter_(nullptr), - pinned_iters_mgr_(nullptr) {} + pinned_iters_mgr_(nullptr), + prefix_extractor_(prefix_extractor) {} - ~LevelIterator() { + ~ForwardLevelIterator() override { // Reset current pointer if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { pinned_iters_mgr_->PinIterator(file_iter_); @@ -53,11 +57,11 @@ class LevelIterator : public InternalIterator { void SetFileIndex(uint32_t file_index) { assert(file_index < files_.size()); + status_ = Status::OK(); if (file_index != file_index_) { file_index_ = file_index; Reset(); } - valid_ = false; } void Reset() { assert(file_index_ < files_.size()); // Reset current pointer if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { pinned_iters_mgr_->PinIterator(file_iter_); } else { delete file_iter_; } - RangeDelAggregator range_del_agg( - cfd_->internal_comparator(), {} /* snapshots */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); file_iter_ = cfd_->table_cache()->NewIterator( read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), - files_[file_index_]->fd, + *files_[file_index_], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - nullptr /* table_reader_ptr */, nullptr, false); + prefix_extractor_, nullptr /* table_reader_ptr */, nullptr, false); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); + valid_ = false; if (!range_del_agg.IsEmpty()) { status_ = Status::NotSupported( "Range tombstones unsupported with ForwardIterator"); - valid_ = false; } } void SeekToLast() override { - status_ = Status::NotSupported("LevelIterator::SeekToLast()"); + status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()"); valid_ = false; } void Prev() override { - status_ = Status::NotSupported("LevelIterator::Prev()"); + status_ = Status::NotSupported("ForwardLevelIterator::Prev()"); valid_ = false; } bool Valid() const override { return valid_; } void SeekToFirst() override { - SetFileIndex(0); + assert(file_iter_ != nullptr); + if (!status_.ok()) { + assert(!valid_); + return; + } file_iter_->SeekToFirst(); valid_ = file_iter_->Valid(); } void Seek(const Slice& internal_key) override { assert(file_iter_ != nullptr); + + // This deviates from the usual convention for InternalIterator::Seek() in + // that it doesn't discard pre-existing error status. That's because this + // Seek() is only supposed to be called immediately after SetFileIndex() + // (which discards pre-existing error status), and SetFileIndex() may set + // an error status, which we shouldn't discard. + if (!status_.ok()) { + assert(!valid_); + return; + } + file_iter_->Seek(internal_key); valid_ = file_iter_->Valid(); } - void SeekForPrev(const Slice& internal_key) override { - status_ = Status::NotSupported("LevelIterator::SeekForPrev()"); + void SeekForPrev(const Slice& /*internal_key*/) override { + status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()"); valid_ = false; } void Next() override { assert(valid_); file_iter_->Next(); for (;;) { - if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) { - valid_ = !file_iter_->status().IsIncomplete(); + valid_ = file_iter_->Valid(); + if (!file_iter_->status().ok()) { + assert(!valid_); + return; + } + if (valid_) { return; } if (file_index_ + 1 >= files_.size()) { @@ -121,6 +144,10 @@ class LevelIterator : public InternalIterator { return; } SetFileIndex(file_index_ + 1); + if (!status_.ok()) { + assert(!valid_); + return; + } file_iter_->SeekToFirst(); } } @@ -135,7 +162,7 @@ class LevelIterator : public InternalIterator { Status status() const override { if (!status_.ok()) { return status_; - } else if (file_iter_ && !file_iter_->status().ok()) { + } else if (file_iter_) { return file_iter_->status(); } return Status::OK(); @@ -165,6 +192,7 @@ class LevelIterator : public InternalIterator { Status status_; InternalIterator* file_iter_; PinnedIteratorsManager* pinned_iters_mgr_; + const SliceTransform* prefix_extractor_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -173,7 +201,7 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, : db_(db), read_options_(read_options), cfd_(cfd), - prefix_extractor_(cfd->ioptions()->prefix_extractor), + prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()), user_comparator_(cfd->user_comparator()), immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), sv_(current_sv), @@ -196,38 +224,62 @@ ForwardIterator::~ForwardIterator() { Cleanup(true); } -namespace { -// Used in PinnedIteratorsManager to release pinned SuperVersion -static void ReleaseSuperVersionFunc(void* sv) { - delete reinterpret_cast<SuperVersion*>(sv); -} -} // namespace - 
-void ForwardIterator::SVCleanup() { - if (sv_ != nullptr && sv_->Unref()) { +void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv, + bool background_purge_on_iterator_cleanup) { + if (sv->Unref()) { // Job id == 0 means that this is not our background process, but rather // user thread JobContext job_context(0); - db_->mutex_.Lock(); - sv_->Cleanup(); - db_->FindObsoleteFiles(&job_context, false, true); - if (read_options_.background_purge_on_iterator_cleanup) { - db_->ScheduleBgLogWriterClose(&job_context); - } - db_->mutex_.Unlock(); - if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { - pinned_iters_mgr_->PinPtr(sv_, &ReleaseSuperVersionFunc); - } else { - delete sv_; - } + db->mutex_.Lock(); + sv->Cleanup(); + db->FindObsoleteFiles(&job_context, false, true); + if (background_purge_on_iterator_cleanup) { + db->ScheduleBgLogWriterClose(&job_context); + } + db->mutex_.Unlock(); + delete sv; if (job_context.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles( - job_context, read_options_.background_purge_on_iterator_cleanup); + db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup); } job_context.Clean(); } } +namespace { +struct SVCleanupParams { + DBImpl* db; + SuperVersion* sv; + bool background_purge_on_iterator_cleanup; +}; +} + +// Used in PinnedIteratorsManager to release pinned SuperVersion +void ForwardIterator::DeferredSVCleanup(void* arg) { + auto d = reinterpret_cast<SVCleanupParams*>(arg); + ForwardIterator::SVCleanup( + d->db, d->sv, d->background_purge_on_iterator_cleanup); + delete d; +} + +void ForwardIterator::SVCleanup() { + if (sv_ == nullptr) { + return; + } + bool background_purge = + read_options_.background_purge_on_iterator_cleanup || + db_->immutable_db_options().avoid_unnecessary_blocking_io; + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + // pinned_iters_mgr_ tells us to make sure that all visited key-value slices + // are alive until pinned_iters_mgr_->ReleasePinnedData() is called. + // The slices may point into some memtables owned by sv_, so we need to keep + // sv_ referenced until pinned_iters_mgr_ unpins everything. 
+ auto p = new SVCleanupParams{db_, sv_, background_purge}; + pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup); + } else { + SVCleanup(db_, sv_, background_purge); + } +} + void ForwardIterator::Cleanup(bool release_sv) { if (mutable_iter_ != nullptr) { DeleteIterator(mutable_iter_, true /* is_arena */); @@ -277,9 +329,6 @@ bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const { } void ForwardIterator::Seek(const Slice& internal_key) { - if (IsOverUpperBound(internal_key)) { - valid_ = false; - } if (sv_ == nullptr) { RebuildIterators(true); } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) { @@ -361,14 +410,13 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!l0_iters_[i]->status().ok()) { immutable_status_ = l0_iters_[i]->status(); - } else if (l0_iters_[i]->Valid()) { - if (!IsOverUpperBound(l0_iters_[i]->key())) { - immutable_min_heap_.push(l0_iters_[i]); - } else { - has_iter_trimmed_for_upper_bound_ = true; - DeleteIterator(l0_iters_[i]); - l0_iters_[i] = nullptr; - } + } else if (l0_iters_[i]->Valid() && + !IsOverUpperBound(l0_iters_[i]->key())) { + immutable_min_heap_.push(l0_iters_[i]); + } else { + has_iter_trimmed_for_upper_bound_ = true; + DeleteIterator(l0_iters_[i]); + l0_iters_[i] = nullptr; } } @@ -395,15 +443,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!level_iters_[level - 1]->status().ok()) { immutable_status_ = level_iters_[level - 1]->status(); - } else if (level_iters_[level - 1]->Valid()) { - if (!IsOverUpperBound(level_iters_[level - 1]->key())) { - immutable_min_heap_.push(level_iters_[level - 1]); - } else { - // Nothing in this level is interesting. Remove. - has_iter_trimmed_for_upper_bound_ = true; - DeleteIterator(level_iters_[level - 1]); - level_iters_[level - 1] = nullptr; - } + } else if (level_iters_[level - 1]->Valid() && + !IsOverUpperBound(level_iters_[level - 1]->key())) { + immutable_min_heap_.push(level_iters_[level - 1]); + } else { + // Nothing in this level is interesting. Remove. + has_iter_trimmed_for_upper_bound_ = true; + DeleteIterator(level_iters_[level - 1]); + level_iters_[level - 1] = nullptr; } } } @@ -541,7 +588,7 @@ void ForwardIterator::UpdateChildrenPinnedItersMgr() { } // Set PinnedIteratorsManager for L1+ levels iterators. 
- for (LevelIterator* child_iter : level_iters_) { + for (ForwardLevelIterator* child_iter : level_iters_) { if (child_iter) { child_iter->SetPinnedItersMgr(pinned_iters_mgr_); } @@ -565,13 +612,14 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); } - RangeDelAggregator range_del_agg( - InternalKeyComparator(cfd_->internal_comparator()), {} /* snapshots */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { - std::unique_ptr<InternalIterator> range_del_iter( - sv_->mem->NewRangeTombstoneIterator(read_options_)); + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + sv_->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, &range_del_agg); @@ -585,13 +633,16 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { if ((read_options_.iterate_upper_bound != nullptr) && cfd_->internal_comparator().user_comparator()->Compare( l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) { - has_iter_trimmed_for_upper_bound_ = true; + // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator + // will never be interested in files with smallest key above + // iterate_upper_bound, since iterate_upper_bound can't be changed. l0_iters_.push_back(nullptr); continue; } l0_iters_.push_back(cfd_->table_cache()->NewIterator( - read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd, - read_options_.ignore_range_deletions ? nullptr : &range_del_agg)); + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, + read_options_.ignore_range_deletions ? nullptr : &range_del_agg, + sv_->mutable_cf_options.prefix_extractor.get())); } BuildLevelIterators(vstorage); current_ = nullptr; @@ -620,14 +671,15 @@ void ForwardIterator::RenewIterators() { mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); - RangeDelAggregator range_del_agg( - InternalKeyComparator(cfd_->internal_comparator()), {} /* snapshots */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { - std::unique_ptr<InternalIterator> range_del_iter( - svnew->mem->NewRangeTombstoneIterator(read_options_)); + std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter( + svnew->mem->NewRangeTombstoneIterator( + read_options_, sv_->current->version_set()->LastSequence())); range_del_agg.AddTombstones(std::move(range_del_iter)); - sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_, - &range_del_agg); + svnew->imm->AddRangeTombstoneIterators(read_options_, &arena_, + &range_del_agg); } const auto* vstorage = sv_->current->storage_info(); @@ -660,8 +712,9 @@ void ForwardIterator::RenewIterators() { } l0_iters_new.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - l0_files_new[inew]->fd, - read_options_.ignore_range_deletions ? nullptr : &range_del_agg)); + *l0_files_new[inew], + read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, + svnew->mutable_cf_options.prefix_extractor.get())); } for (auto* f : l0_iters_) { @@ -702,8 +755,9 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage) { has_iter_trimmed_for_upper_bound_ = true; } } else { - level_iters_.push_back( - new LevelIterator(cfd_, read_options_, level_files)); + level_iters_.push_back(new ForwardLevelIterator( + cfd_, read_options_, level_files, + sv_->mutable_cf_options.prefix_extractor.get())); } } } @@ -718,7 +772,8 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - l0_files[i]->fd, nullptr /* range_del_agg */); + *l0_files[i], nullptr /* range_del_agg */, + sv_->mutable_cf_options.prefix_extractor.get()); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } @@ -753,7 +808,7 @@ void ForwardIterator::UpdateCurrent() { current_ = mutable_iter_; } } - valid_ = (current_ != nullptr); + valid_ = current_ != nullptr && immutable_status_.ok(); if (!status_.ok()) { status_ = Status::OK(); } @@ -867,21 +922,13 @@ bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters, uint32_t ForwardIterator::FindFileInRange( const std::vector<FileMetaData*>& files, const Slice& internal_key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FileMetaData* f = files[mid]; - if (cfd_->internal_comparator().InternalKeyComparator::Compare( - f->largest.Encode(), internal_key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; + auto cmp = [&](const FileMetaData* f, const Slice& key) -> bool { + return cfd_->internal_comparator().InternalKeyComparator::Compare( + f->largest.Encode(), key) < 0; + }; + const auto &b = files.begin(); + return static_cast<uint32_t>(std::lower_bound(b + left, + b + right, internal_key, cmp) - b); } void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) { diff --git a/thirdparty/rocksdb/db/forward_iterator.h b/thirdparty/rocksdb/db/forward_iterator.h index d4f32cba9f..146588d961 100644 --- a/thirdparty/rocksdb/db/forward_iterator.h +++ b/thirdparty/rocksdb/db/forward_iterator.h @@ -23,7 +23,7 @@ class DBImpl; class Env; struct SuperVersion; class ColumnFamilyData; -class LevelIterator; +class ForwardLevelIterator; class VersionStorageInfo; struct FileMetaData; @@ -55,7 +55,7 @@ class ForwardIterator : public InternalIterator { ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr); virtual ~ForwardIterator(); - void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& /*target*/) override { status_ = Status::NotSupported("ForwardIterator::SeekForPrev()"); valid_ = false; } @@ -85,7 +85,14 @@ class ForwardIterator : public InternalIterator { private: void Cleanup(bool release_sv); + // Unreference and, if needed, clean up the current SuperVersion. This is + // either done immediately or deferred until this iterator is unpinned by + // PinnedIteratorsManager. 
diff --git a/thirdparty/rocksdb/db/forward_iterator.h b/thirdparty/rocksdb/db/forward_iterator.h
index d4f32cba9f..146588d961 100644
--- a/thirdparty/rocksdb/db/forward_iterator.h
+++ b/thirdparty/rocksdb/db/forward_iterator.h
@@ -23,7 +23,7 @@ class DBImpl;
 class Env;
 struct SuperVersion;
 class ColumnFamilyData;
-class LevelIterator;
+class ForwardLevelIterator;
 class VersionStorageInfo;
 struct FileMetaData;
@@ -55,7 +55,7 @@ class ForwardIterator : public InternalIterator {
       ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr);
   virtual ~ForwardIterator();
 
-  void SeekForPrev(const Slice& target) override {
+  void SeekForPrev(const Slice& /*target*/) override {
     status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
     valid_ = false;
   }
@@ -85,7 +85,14 @@ class ForwardIterator : public InternalIterator {
  private:
   void Cleanup(bool release_sv);
+  // Unreference and, if needed, clean up the current SuperVersion. This is
+  // either done immediately or deferred until this iterator is unpinned by
+  // PinnedIteratorsManager.
   void SVCleanup();
+  static void SVCleanup(
+      DBImpl* db, SuperVersion* sv, bool background_purge_on_iterator_cleanup);
+  static void DeferredSVCleanup(void* arg);
+
   void RebuildIterators(bool refresh_sv);
   void RenewIterators();
   void BuildLevelIterators(const VersionStorageInfo* vstorage);
@@ -119,7 +126,7 @@ class ForwardIterator : public InternalIterator {
   InternalIterator* mutable_iter_;
   std::vector<InternalIterator*> imm_iters_;
   std::vector<InternalIterator*> l0_iters_;
-  std::vector<LevelIterator*> level_iters_;
+  std::vector<ForwardLevelIterator*> level_iters_;
   InternalIterator* current_;
 
   bool valid_;
diff --git a/thirdparty/rocksdb/db/forward_iterator_bench.cc b/thirdparty/rocksdb/db/forward_iterator_bench.cc
index e9ae770cfa..113ded94b6 100644
--- a/thirdparty/rocksdb/db/forward_iterator_bench.cc
+++ b/thirdparty/rocksdb/db/forward_iterator_bench.cc
@@ -17,7 +17,6 @@ int main() {
 // Block forward_iterator_bench under MAC and Windows
 int main() { return 0; }
 #else
-#include <gflags/gflags.h>
 #include <semaphore.h>
 #include <atomic>
 #include <bitset>
@@ -30,11 +29,12 @@ int main() { return 0; }
 #include <random>
 #include <thread>
+#include "port/port.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/db.h"
 #include "rocksdb/status.h"
 #include "rocksdb/table.h"
-#include "port/port.h"
+#include "util/gflags_compat.h"
 #include "util/testharness.h"
 
 const int MAX_SHARDS = 100000;
@@ -319,11 +319,11 @@ struct StatsThread {
 };
 
 int main(int argc, char** argv) {
-  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
+  GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
 
   std::mt19937 rng{std::random_device()()};
   rocksdb::Status status;
-  std::string path = rocksdb::test::TmpDir() + "/forward_iterator_test";
+  std::string path = rocksdb::test::PerThreadDBPath("forward_iterator_test");
   fprintf(stderr, "db path is %s\n", path.c_str());
   rocksdb::Options options;
   options.create_if_missing = true;
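The new in_memory_stats_history files below add an iterator over periodically collected statistics snapshots. Its advancing rule is worth spelling out: AdvanceIteratorByTime finds the first snapshot whose timestamp falls in [start_time, end_time), and Next() re-runs the search from GetStatsTime() + 1 so the iterator can never loop on the same timestamp. A stand-alone model of that logic (a plain std::map standing in for the store behind DBImpl::FindStatsByTime; not the RocksDB implementation):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>

using StatsMap = std::map<std::string, uint64_t>;

// Stand-in for DBImpl::FindStatsByTime: locate the first snapshot whose
// timestamp falls in [start_time, end_time).
bool FindStatsByTime(const std::map<uint64_t, StatsMap>& history,
                     uint64_t start_time, uint64_t end_time, uint64_t* time,
                     StatsMap* stats) {
  auto it = history.lower_bound(start_time);
  if (it == history.end() || it->first >= end_time) return false;
  *time = it->first;
  *stats = it->second;
  return true;
}

int main() {
  std::map<uint64_t, StatsMap> history = {
      {100, {{"rocksdb.block.cache.miss", 4}}},
      {160, {{"rocksdb.block.cache.miss", 9}}}};
  uint64_t t;
  StatsMap m;
  // Iterate exactly as InMemoryStatsHistoryIterator does: advance with t + 1.
  for (uint64_t start = 0; FindStatsByTime(history, start, 200, &t, &m);
       start = t + 1) {
    std::cout << t << " -> " << m.begin()->second << "\n";
  }
}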
+
+#include "db/db_impl.h"
+#include "db/in_memory_stats_history.h"
+
+namespace rocksdb {
+
+InMemoryStatsHistoryIterator::~InMemoryStatsHistoryIterator() {}
+
+bool InMemoryStatsHistoryIterator::Valid() const { return valid_; }
+
+Status InMemoryStatsHistoryIterator::status() const { return status_; }
+
+void InMemoryStatsHistoryIterator::Next() {
+  // increment start_time by 1 to avoid infinite loop
+  AdvanceIteratorByTime(GetStatsTime() + 1, end_time_);
+}
+
+uint64_t InMemoryStatsHistoryIterator::GetStatsTime() const { return time_; }
+
+const std::map<std::string, uint64_t>&
+InMemoryStatsHistoryIterator::GetStatsMap() const {
+  return stats_map_;
+}
+
+// advance the iterator to the next time between [start_time, end_time)
+// if success, update time_ and stats_map_ with new_time and stats_map
+void InMemoryStatsHistoryIterator::AdvanceIteratorByTime(uint64_t start_time,
+                                                         uint64_t end_time) {
+  // try to find next entry in stats_history_ map
+  if (db_impl_ != nullptr) {
+    valid_ =
+        db_impl_->FindStatsByTime(start_time, end_time, &time_, &stats_map_);
+  } else {
+    valid_ = false;
+  }
+}
+
+}  // namespace rocksdb
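Downstream, this iterator is surfaced through the public stats-history API. The sketch below shows the intended consumption pattern; it assumes the RocksDB 6.x interface (DB::GetStatsHistory plus the stats_persist_period_sec DBOption) and a hypothetical /tmp path, so verify the signatures against the vendored headers before relying on it:

#include <cinttypes>
#include <cstdio>
#include <limits>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"
#include "rocksdb/stats_history.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();
  options.stats_persist_period_sec = 5;  // snapshot stats every 5 seconds

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/stats_history_demo", &db);
  if (!s.ok()) return 1;

  // ... run a workload long enough for a few snapshots to accumulate ...

  std::unique_ptr<rocksdb::StatsHistoryIterator> it;
  s = db->GetStatsHistory(0 /* start_time */,
                          std::numeric_limits<uint64_t>::max(), &it);
  for (; s.ok() && it->Valid(); it->Next()) {
    std::printf("t=%" PRIu64 ": %zu stats entries\n", it->GetStatsTime(),
                it->GetStatsMap().size());
  }
  delete db;
  return 0;
}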
+
+#pragma once
+
+#include "rocksdb/stats_history.h"
+
+namespace rocksdb {
+
+class InMemoryStatsHistoryIterator final : public StatsHistoryIterator {
+ public:
+  InMemoryStatsHistoryIterator(uint64_t start_time, uint64_t end_time,
+                               DBImpl* db_impl)
+      : start_time_(start_time),
+        end_time_(end_time),
+        valid_(true),
+        db_impl_(db_impl) {
+    AdvanceIteratorByTime(start_time_, end_time_);
+  }
+  ~InMemoryStatsHistoryIterator() override;
+  bool Valid() const override;
+  Status status() const override;
+
+  void Next() override;
+  uint64_t GetStatsTime() const override;
+
+  const std::map<std::string, uint64_t>& GetStatsMap() const override;
+
+ private:
+  // advance the iterator to the next stats history record with timestamp
+  // between [start_time, end_time)
+  void AdvanceIteratorByTime(uint64_t start_time, uint64_t end_time);
+
+  // No copying allowed
+  InMemoryStatsHistoryIterator(const InMemoryStatsHistoryIterator&) = delete;
+  void operator=(const InMemoryStatsHistoryIterator&) = delete;
+  InMemoryStatsHistoryIterator(InMemoryStatsHistoryIterator&&) = delete;
+  InMemoryStatsHistoryIterator& operator=(InMemoryStatsHistoryIterator&&) =
+      delete;
+
+  uint64_t time_;
+  uint64_t start_time_;
+  uint64_t end_time_;
+  std::map<std::string, uint64_t> stats_map_;
+  Status status_;
+  bool valid_;
+  DBImpl* db_impl_;
+};
+
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/internal_stats.cc b/thirdparty/rocksdb/db/internal_stats.cc
index e98bd98cf7..51e55f5839 100644
--- a/thirdparty/rocksdb/db/internal_stats.cc
+++ b/thirdparty/rocksdb/db/internal_stats.cc
@@ -18,9 +18,10 @@
 #include <string>
 #include <utility>
 #include <vector>
-#include "db/column_family.h"
 
+#include "db/column_family.h"
 #include "db/db_impl.h"
+#include "table/block_based_table_factory.h"
 #include "util/string_util.h"
 
 namespace rocksdb {
@@ -44,6 +45,8 @@ const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
     {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
     {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
     {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+    {LevelStatType::COMP_CPU_SEC,
+     LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
     {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
     {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
     {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
@@ -55,7 +58,8 @@ const double kMB = 1048576.0;
 const double kGB = kMB * 1024;
 const double kMicrosInSec = 1000000.0;
 
-void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+                           const std::string& group_by) {
   int written_size =
       snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
   auto hdr = [](LevelStatType t) {
@@ -63,15 +67,16 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) {
   };
   int line_size = snprintf(
       buf + written_size, len - written_size,
-      "Level %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+      "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
       // Note that we skip COMPACTED_FILES and merge it with Files column
-      hdr(LevelStatType::NUM_FILES), hdr(LevelStatType::SIZE_BYTES),
-      hdr(LevelStatType::SCORE), hdr(LevelStatType::READ_GB),
-      hdr(LevelStatType::RN_GB), hdr(LevelStatType::RNP1_GB),
-      hdr(LevelStatType::WRITE_GB), hdr(LevelStatType::W_NEW_GB),
-      hdr(LevelStatType::MOVED_GB), hdr(LevelStatType::WRITE_AMP),
-      hdr(LevelStatType::READ_MBPS), hdr(LevelStatType::WRITE_MBPS),
-      hdr(LevelStatType::COMP_SEC), hdr(LevelStatType::COMP_COUNT),
+      group_by.c_str(),
hdr(LevelStatType::NUM_FILES), + hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE), + hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB), + hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB), + hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB), + hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS), + hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC), + hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT), hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN), hdr(LevelStatType::KEY_DROP)); @@ -86,8 +91,7 @@ void PrepareLevelStats(std::map* level_stats, const InternalStats::CompactionStats& stats) { uint64_t bytes_read = stats.bytes_read_non_output_levels + stats.bytes_read_output_level; - int64_t bytes_new = - stats.bytes_written - stats.bytes_read_output_level; + int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level; double elapsed = (stats.micros + 1) / kMicrosInSec; (*level_stats)[LevelStatType::NUM_FILES] = num_files; @@ -106,6 +110,7 @@ void PrepareLevelStats(std::map* level_stats, (*level_stats)[LevelStatType::WRITE_MBPS] = stats.bytes_written / kMB / elapsed; (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec; + (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec; (*level_stats)[LevelStatType::COMP_COUNT] = stats.count; (*level_stats)[LevelStatType::AVG_SEC] = stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count; @@ -117,50 +122,52 @@ void PrepareLevelStats(std::map* level_stats, void PrintLevelStats(char* buf, size_t len, const std::string& name, const std::map& stat_value) { - snprintf(buf, len, - "%4s " /* Level */ - "%6d/%-3d " /* Files */ - "%8s " /* Size */ - "%5.1f " /* Score */ - "%8.1f " /* Read(GB) */ - "%7.1f " /* Rn(GB) */ - "%8.1f " /* Rnp1(GB) */ - "%9.1f " /* Write(GB) */ - "%8.1f " /* Wnew(GB) */ - "%9.1f " /* Moved(GB) */ - "%5.1f " /* W-Amp */ - "%8.1f " /* Rd(MB/s) */ - "%8.1f " /* Wr(MB/s) */ - "%9.0f " /* Comp(sec) */ - "%9d " /* Comp(cnt) */ - "%8.3f " /* Avg(sec) */ - "%7s " /* KeyIn */ - "%6s\n", /* KeyDrop */ - name.c_str(), - static_cast(stat_value.at(LevelStatType::NUM_FILES)), - static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), - BytesToHumanString( - static_cast(stat_value.at(LevelStatType::SIZE_BYTES))) - .c_str(), - stat_value.at(LevelStatType::SCORE), - stat_value.at(LevelStatType::READ_GB), - stat_value.at(LevelStatType::RN_GB), - stat_value.at(LevelStatType::RNP1_GB), - stat_value.at(LevelStatType::WRITE_GB), - stat_value.at(LevelStatType::W_NEW_GB), - stat_value.at(LevelStatType::MOVED_GB), - stat_value.at(LevelStatType::WRITE_AMP), - stat_value.at(LevelStatType::READ_MBPS), - stat_value.at(LevelStatType::WRITE_MBPS), - stat_value.at(LevelStatType::COMP_SEC), - static_cast(stat_value.at(LevelStatType::COMP_COUNT)), - stat_value.at(LevelStatType::AVG_SEC), - NumberToHumanString( - static_cast(stat_value.at(LevelStatType::KEY_IN))) - .c_str(), - NumberToHumanString(static_cast( - stat_value.at(LevelStatType::KEY_DROP))) - .c_str()); + snprintf( + buf, len, + "%4s " /* Level */ + "%6d/%-3d " /* Files */ + "%8s " /* Size */ + "%5.1f " /* Score */ + "%8.1f " /* Read(GB) */ + "%7.1f " /* Rn(GB) */ + "%8.1f " /* Rnp1(GB) */ + "%9.1f " /* Write(GB) */ + "%8.1f " /* Wnew(GB) */ + "%9.1f " /* Moved(GB) */ + "%5.1f " /* W-Amp */ + "%8.1f " /* Rd(MB/s) */ + "%8.1f " /* Wr(MB/s) */ + "%9.2f " /* Comp(sec) */ + "%17.2f " /* CompMergeCPU(sec) */ + "%9d " /* Comp(cnt) */ + "%8.3f " /* Avg(sec) */ + "%7s " /* KeyIn */ + 
"%6s\n", /* KeyDrop */ + name.c_str(), static_cast(stat_value.at(LevelStatType::NUM_FILES)), + static_cast(stat_value.at(LevelStatType::COMPACTED_FILES)), + BytesToHumanString( + static_cast(stat_value.at(LevelStatType::SIZE_BYTES))) + .c_str(), + stat_value.at(LevelStatType::SCORE), + stat_value.at(LevelStatType::READ_GB), + stat_value.at(LevelStatType::RN_GB), + stat_value.at(LevelStatType::RNP1_GB), + stat_value.at(LevelStatType::WRITE_GB), + stat_value.at(LevelStatType::W_NEW_GB), + stat_value.at(LevelStatType::MOVED_GB), + stat_value.at(LevelStatType::WRITE_AMP), + stat_value.at(LevelStatType::READ_MBPS), + stat_value.at(LevelStatType::WRITE_MBPS), + stat_value.at(LevelStatType::COMP_SEC), + stat_value.at(LevelStatType::COMP_CPU_SEC), + static_cast(stat_value.at(LevelStatType::COMP_COUNT)), + stat_value.at(LevelStatType::AVG_SEC), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_IN))) + .c_str(), + NumberToHumanString( + static_cast(stat_value.at(LevelStatType::KEY_DROP))) + .c_str()); } void PrintLevelStats(char* buf, size_t len, const std::string& name, @@ -208,31 +215,34 @@ static const std::string mem_table_flush_pending = "mem-table-flush-pending"; static const std::string compaction_pending = "compaction-pending"; static const std::string background_errors = "background-errors"; static const std::string cur_size_active_mem_table = - "cur-size-active-mem-table"; + "cur-size-active-mem-table"; static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables"; static const std::string size_all_mem_tables = "size-all-mem-tables"; static const std::string num_entries_active_mem_table = - "num-entries-active-mem-table"; + "num-entries-active-mem-table"; static const std::string num_entries_imm_mem_tables = - "num-entries-imm-mem-tables"; + "num-entries-imm-mem-tables"; static const std::string num_deletes_active_mem_table = - "num-deletes-active-mem-table"; + "num-deletes-active-mem-table"; static const std::string num_deletes_imm_mem_tables = - "num-deletes-imm-mem-tables"; + "num-deletes-imm-mem-tables"; static const std::string estimate_num_keys = "estimate-num-keys"; static const std::string estimate_table_readers_mem = - "estimate-table-readers-mem"; + "estimate-table-readers-mem"; static const std::string is_file_deletions_enabled = - "is-file-deletions-enabled"; + "is-file-deletions-enabled"; static const std::string num_snapshots = "num-snapshots"; static const std::string oldest_snapshot_time = "oldest-snapshot-time"; static const std::string num_live_versions = "num-live-versions"; static const std::string current_version_number = "current-super-version-number"; static const std::string estimate_live_data_size = "estimate-live-data-size"; -static const std::string min_log_number_to_keep = "min-log-number-to-keep"; -static const std::string base_level = "base-level"; +static const std::string min_log_number_to_keep_str = "min-log-number-to-keep"; +static const std::string min_obsolete_sst_number_to_keep_str = + "min-obsolete-sst-number-to-keep"; +static const std::string base_level_str = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; +static const std::string live_sst_files_size = "live-sst-files-size"; static const std::string estimate_pending_comp_bytes = "estimate-pending-compaction-bytes"; static const std::string aggregated_table_properties = @@ -245,11 +255,15 @@ static const std::string actual_delayed_write_rate = "actual-delayed-write-rate"; static const std::string is_write_stopped = 
"is-write-stopped"; static const std::string estimate_oldest_key_time = "estimate-oldest-key-time"; +static const std::string block_cache_capacity = "block-cache-capacity"; +static const std::string block_cache_usage = "block-cache-usage"; +static const std::string block_cache_pinned_usage = "block-cache-pinned-usage"; +static const std::string options_statistics = "options-statistics"; const std::string DB::Properties::kNumFilesAtLevelPrefix = - rocksdb_prefix + num_files_at_level_prefix; + rocksdb_prefix + num_files_at_level_prefix; const std::string DB::Properties::kCompressionRatioAtLevelPrefix = - rocksdb_prefix + compression_ratio_at_level_prefix; + rocksdb_prefix + compression_ratio_at_level_prefix; const std::string DB::Properties::kStats = rocksdb_prefix + allstats; const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; @@ -260,54 +274,58 @@ const std::string DB::Properties::kCFFileHistogram = const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; const std::string DB::Properties::kNumImmutableMemTable = - rocksdb_prefix + num_immutable_mem_table; + rocksdb_prefix + num_immutable_mem_table; const std::string DB::Properties::kNumImmutableMemTableFlushed = rocksdb_prefix + num_immutable_mem_table_flushed; const std::string DB::Properties::kMemTableFlushPending = - rocksdb_prefix + mem_table_flush_pending; + rocksdb_prefix + mem_table_flush_pending; const std::string DB::Properties::kCompactionPending = - rocksdb_prefix + compaction_pending; + rocksdb_prefix + compaction_pending; const std::string DB::Properties::kNumRunningCompactions = rocksdb_prefix + num_running_compactions; const std::string DB::Properties::kNumRunningFlushes = rocksdb_prefix + num_running_flushes; const std::string DB::Properties::kBackgroundErrors = - rocksdb_prefix + background_errors; + rocksdb_prefix + background_errors; const std::string DB::Properties::kCurSizeActiveMemTable = - rocksdb_prefix + cur_size_active_mem_table; + rocksdb_prefix + cur_size_active_mem_table; const std::string DB::Properties::kCurSizeAllMemTables = rocksdb_prefix + cur_size_all_mem_tables; const std::string DB::Properties::kSizeAllMemTables = rocksdb_prefix + size_all_mem_tables; const std::string DB::Properties::kNumEntriesActiveMemTable = - rocksdb_prefix + num_entries_active_mem_table; + rocksdb_prefix + num_entries_active_mem_table; const std::string DB::Properties::kNumEntriesImmMemTables = - rocksdb_prefix + num_entries_imm_mem_tables; + rocksdb_prefix + num_entries_imm_mem_tables; const std::string DB::Properties::kNumDeletesActiveMemTable = - rocksdb_prefix + num_deletes_active_mem_table; + rocksdb_prefix + num_deletes_active_mem_table; const std::string DB::Properties::kNumDeletesImmMemTables = - rocksdb_prefix + num_deletes_imm_mem_tables; + rocksdb_prefix + num_deletes_imm_mem_tables; const std::string DB::Properties::kEstimateNumKeys = - rocksdb_prefix + estimate_num_keys; + rocksdb_prefix + estimate_num_keys; const std::string DB::Properties::kEstimateTableReadersMem = - rocksdb_prefix + estimate_table_readers_mem; + rocksdb_prefix + estimate_table_readers_mem; const std::string DB::Properties::kIsFileDeletionsEnabled = - rocksdb_prefix + is_file_deletions_enabled; + rocksdb_prefix + is_file_deletions_enabled; const std::string DB::Properties::kNumSnapshots = - rocksdb_prefix + num_snapshots; + rocksdb_prefix + num_snapshots; const 
std::string DB::Properties::kOldestSnapshotTime = - rocksdb_prefix + oldest_snapshot_time; + rocksdb_prefix + oldest_snapshot_time; const std::string DB::Properties::kNumLiveVersions = - rocksdb_prefix + num_live_versions; + rocksdb_prefix + num_live_versions; const std::string DB::Properties::kCurrentSuperVersionNumber = rocksdb_prefix + current_version_number; const std::string DB::Properties::kEstimateLiveDataSize = - rocksdb_prefix + estimate_live_data_size; + rocksdb_prefix + estimate_live_data_size; const std::string DB::Properties::kMinLogNumberToKeep = - rocksdb_prefix + min_log_number_to_keep; + rocksdb_prefix + min_log_number_to_keep_str; +const std::string DB::Properties::kMinObsoleteSstNumberToKeep = + rocksdb_prefix + min_obsolete_sst_number_to_keep_str; const std::string DB::Properties::kTotalSstFilesSize = - rocksdb_prefix + total_sst_files_size; -const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level; + rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kLiveSstFilesSize = + rocksdb_prefix + live_sst_files_size; +const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str; const std::string DB::Properties::kEstimatePendingCompactionBytes = rocksdb_prefix + estimate_pending_comp_bytes; const std::string DB::Properties::kAggregatedTableProperties = @@ -320,107 +338,150 @@ const std::string DB::Properties::kIsWriteStopped = rocksdb_prefix + is_write_stopped; const std::string DB::Properties::kEstimateOldestKeyTime = rocksdb_prefix + estimate_oldest_key_time; +const std::string DB::Properties::kBlockCacheCapacity = + rocksdb_prefix + block_cache_capacity; +const std::string DB::Properties::kBlockCacheUsage = + rocksdb_prefix + block_cache_usage; +const std::string DB::Properties::kBlockCachePinnedUsage = + rocksdb_prefix + block_cache_pinned_usage; +const std::string DB::Properties::kOptionsStatistics = + rocksdb_prefix + options_statistics; const std::unordered_map InternalStats::ppt_name_to_info = { {DB::Properties::kNumFilesAtLevelPrefix, - {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr}}, + {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr, + nullptr}}, {DB::Properties::kCompressionRatioAtLevelPrefix, {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kLevelStats, - {false, &InternalStats::HandleLevelStats, nullptr, nullptr}}, + {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}}, {DB::Properties::kStats, - {false, &InternalStats::HandleStats, nullptr, nullptr}}, + {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}}, {DB::Properties::kCFStats, {false, &InternalStats::HandleCFStats, nullptr, - &InternalStats::HandleCFMapStats}}, + &InternalStats::HandleCFMapStats, nullptr}}, {DB::Properties::kCFStatsNoFileHistogram, - {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, + {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr, nullptr}}, {DB::Properties::kCFFileHistogram, - {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr}}, + {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, + nullptr}}, {DB::Properties::kDBStats, - {false, &InternalStats::HandleDBStats, nullptr, nullptr}}, + {false, &InternalStats::HandleDBStats, nullptr, nullptr, nullptr}}, {DB::Properties::kSSTables, - {false, &InternalStats::HandleSsTables, nullptr, nullptr}}, + {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}}, 
{DB::Properties::kAggregatedTableProperties, {false, &InternalStats::HandleAggregatedTableProperties, nullptr, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kAggregatedTablePropertiesAtLevel, {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, - nullptr, nullptr}}, + nullptr, nullptr, nullptr}}, {DB::Properties::kNumImmutableMemTable, - {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr}}, + {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr, + nullptr}}, {DB::Properties::kNumImmutableMemTableFlushed, {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kMemTableFlushPending, - {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr}}, + {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr, + nullptr}}, {DB::Properties::kCompactionPending, - {false, nullptr, &InternalStats::HandleCompactionPending, nullptr}}, + {false, nullptr, &InternalStats::HandleCompactionPending, nullptr, + nullptr}}, {DB::Properties::kBackgroundErrors, - {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr}}, + {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr, + nullptr}}, {DB::Properties::kCurSizeActiveMemTable, - {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, + {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr, nullptr}}, {DB::Properties::kCurSizeAllMemTables, - {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr}}, + {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr, + nullptr}}, {DB::Properties::kSizeAllMemTables, - {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr}}, + {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr, + nullptr}}, {DB::Properties::kNumEntriesActiveMemTable, {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kNumEntriesImmMemTables, - {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, + {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr, nullptr}}, {DB::Properties::kNumDeletesActiveMemTable, {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kNumDeletesImmMemTables, - {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, + {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr, nullptr}}, {DB::Properties::kEstimateNumKeys, - {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr}}, + {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr, + nullptr}}, {DB::Properties::kEstimateTableReadersMem, - {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, + {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr, nullptr}}, {DB::Properties::kIsFileDeletionsEnabled, - {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, + {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr, nullptr}}, {DB::Properties::kNumSnapshots, - {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr}}, + {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr, + nullptr}}, {DB::Properties::kOldestSnapshotTime, - {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr}}, + {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr, + nullptr}}, {DB::Properties::kNumLiveVersions, - {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr}}, + {false, nullptr, 
&InternalStats::HandleNumLiveVersions, nullptr, + nullptr}}, {DB::Properties::kCurrentSuperVersionNumber, {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kEstimateLiveDataSize, - {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr}}, + {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr, + nullptr}}, {DB::Properties::kMinLogNumberToKeep, - {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr}}, + {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr, + nullptr}}, + {DB::Properties::kMinObsoleteSstNumberToKeep, + {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep, + nullptr, nullptr}}, {DB::Properties::kBaseLevel, - {false, nullptr, &InternalStats::HandleBaseLevel, nullptr}}, + {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}}, {DB::Properties::kTotalSstFilesSize, - {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr}}, + {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr, + nullptr}}, + {DB::Properties::kLiveSstFilesSize, + {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr, + nullptr}}, {DB::Properties::kEstimatePendingCompactionBytes, {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, - nullptr}}, + nullptr, nullptr}}, {DB::Properties::kNumRunningFlushes, - {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr}}, + {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr, + nullptr}}, {DB::Properties::kNumRunningCompactions, - {false, nullptr, &InternalStats::HandleNumRunningCompactions, + {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr, nullptr}}, {DB::Properties::kActualDelayedWriteRate, - {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, + {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr, nullptr}}, {DB::Properties::kIsWriteStopped, - {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr}}, + {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr, + nullptr}}, {DB::Properties::kEstimateOldestKeyTime, - {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, + {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr, + nullptr}}, + {DB::Properties::kBlockCacheCapacity, + {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr, + nullptr}}, + {DB::Properties::kBlockCacheUsage, + {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr, nullptr}}, + {DB::Properties::kBlockCachePinnedUsage, + {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr, + nullptr}}, + {DB::Properties::kOptionsStatistics, + {false, nullptr, nullptr, nullptr, + &DBImpl::GetPropertyHandleOptionsStatistics}}, }; const DBPropertyInfo* GetPropertyInfo(const Slice& property) { @@ -442,8 +503,8 @@ bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, } bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info, - const Slice& property, - std::map* value) { + const Slice& /*property*/, + std::map* value) { assert(value != nullptr); assert(property_info.handle_map != nullptr); return (this->*(property_info.handle_map))(value); @@ -494,7 +555,7 @@ bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value, return true; } -bool InternalStats::HandleLevelStats(std::string* value, Slice suffix) { +bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) { char buf[1000]; const auto* vstorage = 
cfd_->current()->storage_info(); snprintf(buf, sizeof(buf), @@ -521,40 +582,42 @@ bool InternalStats::HandleStats(std::string* value, Slice suffix) { return true; } -bool InternalStats::HandleCFMapStats(std::map* cf_stats) { +bool InternalStats::HandleCFMapStats( + std::map* cf_stats) { DumpCFMapStats(cf_stats); return true; } -bool InternalStats::HandleCFStats(std::string* value, Slice suffix) { +bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) { DumpCFStats(value); return true; } bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value, - Slice suffix) { + Slice /*suffix*/) { DumpCFStatsNoFileHistogram(value); return true; } -bool InternalStats::HandleCFFileHistogram(std::string* value, Slice suffix) { +bool InternalStats::HandleCFFileHistogram(std::string* value, + Slice /*suffix*/) { DumpCFFileHistogram(value); return true; } -bool InternalStats::HandleDBStats(std::string* value, Slice suffix) { +bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { DumpDBStats(value); return true; } -bool InternalStats::HandleSsTables(std::string* value, Slice suffix) { +bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { auto* current = cfd_->current(); *value = current->DebugString(true, true); return true; } bool InternalStats::HandleAggregatedTableProperties(std::string* value, - Slice suffix) { + Slice /*suffix*/) { std::shared_ptr tp; auto s = cfd_->current()->GetAggregatedTableProperties(&tp); if (!s.ok()) { @@ -581,34 +644,34 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, return true; } -bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->imm()->NumNotFlushed(); return true; } bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value, - DBImpl* db, - Version* version) { + DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->imm()->NumFlushed(); return true; } -bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Return number of mem tables that are ready to flush (made immutable) *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); return true; } bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->num_running_flushes(); return true; } -bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // 1 if the system already determines at least one compaction is needed. // 0 otherwise, const auto* vstorage = cfd_->current()->storage_info(); @@ -617,70 +680,74 @@ bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, } bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->num_running_compactions_; return true; } -bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Accumulated number of errors in background flushes or compactions. 
*value = GetBackgroundErrorCount(); return true; } -bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Current size of the active memtable *value = cfd_->mem()->ApproximateMemoryUsage(); return true; } -bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Current size of the active memtable + immutable memtables *value = cfd_->mem()->ApproximateMemoryUsage() + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); return true; } -bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->mem()->ApproximateMemoryUsage() + cfd_->imm()->ApproximateMemoryUsage(); return true; } -bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entires in the active memtable *value = cfd_->mem()->num_entries(); return true; } -bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumEntries(); return true; } -bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entires in the active memtable *value = cfd_->mem()->num_deletes(); return true; } -bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumDeletes(); return true; } -bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { // Estimate number of entries in the column family: // Use estimated entries in tables + total entries in memtables. 
const auto* vstorage = cfd_->current()->storage_info(); @@ -696,77 +763,92 @@ bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, } bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->snapshots().count(); return true; } bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = static_cast(db->snapshots().GetOldestSnapshotTime()); return true; } -bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetNumLiveVersions(); return true; } -bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, + DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetSuperVersionNumber(); return true; } bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->IsFileDeletionsEnabled(); return true; } -bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { const auto* vstorage = cfd_->current()->storage_info(); *value = vstorage->base_level(); return true; } -bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, - Version* version) { +bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { *value = cfd_->GetTotalSstFilesSize(); return true; } +bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + *value = cfd_->GetLiveSstFilesSize(); + return true; +} + bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, - DBImpl* db, - Version* version) { + DBImpl* /*db*/, + Version* /*version*/) { const auto* vstorage = cfd_->current()->storage_info(); *value = vstorage->estimated_compaction_needed_bytes(); return true; } -bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, +bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, + DBImpl* /*db*/, Version* version) { *value = (version == nullptr) ? 
0 : version->GetMemoryUsageByTableReaders(); return true; } -bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, +bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/, Version* version) { - const auto* vstorage = cfd_->current()->storage_info(); + const auto* vstorage = version->storage_info(); *value = vstorage->EstimateLiveDataSize(); return true; } bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->MinLogNumberToKeep(); return true; } +bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value, + DBImpl* db, + Version* /*version*/) { + *value = db->MinObsoleteSstNumberToKeep(); + return true; +} + bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { const WriteController& wc = db->write_controller(); if (!wc.NeedsDelay()) { *value = 0; @@ -777,7 +859,7 @@ bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db, } bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db, - Version* version) { + Version* /*version*/) { *value = db->write_controller().IsStopped() ? 1 : 0; return true; } @@ -788,7 +870,8 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, // with allow_compaction = false. This is because we don't propagate // oldest_key_time on compaction. if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO || - cfd_->ioptions()->compaction_options_fifo.allow_compaction) { + cfd_->GetCurrentMutableCFOptions() + ->compaction_options_fifo.allow_compaction) { return false; } @@ -811,6 +894,58 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, return *value > 0 && *value < std::numeric_limits::max(); } +bool InternalStats::HandleBlockCacheStat(Cache** block_cache) { + assert(block_cache != nullptr); + auto* table_factory = cfd_->ioptions()->table_factory; + assert(table_factory != nullptr); + if (BlockBasedTableFactory::kName != table_factory->Name()) { + return false; + } + auto* table_options = + reinterpret_cast(table_factory->GetOptions()); + if (table_options == nullptr) { + return false; + } + *block_cache = table_options->block_cache.get(); + if (table_options->no_block_cache || *block_cache == nullptr) { + return false; + } + return true; +} + +bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache; + bool ok = HandleBlockCacheStat(&block_cache); + if (!ok) { + return false; + } + *value = static_cast(block_cache->GetCapacity()); + return true; +} + +bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache; + bool ok = HandleBlockCacheStat(&block_cache); + if (!ok) { + return false; + } + *value = static_cast(block_cache->GetUsage()); + return true; +} + +bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/, + Version* /*version*/) { + Cache* block_cache; + bool ok = HandleBlockCacheStat(&block_cache); + if (!ok) { + return false; + } + *value = static_cast(block_cache->GetPinnedUsage()); + return true; +} + void InternalStats::DumpDBStats(std::string* value) { char buf[1000]; // DB-level stats, only available from default column family @@ -862,8 +997,7 @@ void InternalStats::DumpDBStats(std::string* value) { value->append(buf); // Stall AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, 
true); - snprintf(buf, sizeof(buf), - "Cumulative stall: %s, %.1f percent\n", + snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n", human_micros, // 10000 = divide by 1M to get secs, then multiply by 100 for pct write_stall_micros / 10000.0 / std::max(seconds_up, 0.001)); @@ -874,43 +1008,40 @@ void InternalStats::DumpDBStats(std::string* value) { uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self; uint64_t interval_num_keys_written = num_keys_written - db_stats_snapshot_.num_keys_written; - snprintf(buf, sizeof(buf), - "Interval writes: %s writes, %s keys, %s commit groups, " - "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n", - NumberToHumanString( - interval_write_other + interval_write_self).c_str(), - NumberToHumanString(interval_num_keys_written).c_str(), - NumberToHumanString(interval_write_self).c_str(), - static_cast(interval_write_other + interval_write_self) / - (interval_write_self + 1), - (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, - (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB / - std::max(interval_seconds_up, 0.001)), - value->append(buf); + snprintf( + buf, sizeof(buf), + "Interval writes: %s writes, %s keys, %s commit groups, " + "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n", + NumberToHumanString(interval_write_other + interval_write_self).c_str(), + NumberToHumanString(interval_num_keys_written).c_str(), + NumberToHumanString(interval_write_self).c_str(), + static_cast(interval_write_other + interval_write_self) / + (interval_write_self + 1), + (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, + (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB / + std::max(interval_seconds_up, 0.001)), + value->append(buf); uint64_t interval_write_with_wal = write_with_wal - db_stats_snapshot_.write_with_wal; uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced; uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes; - snprintf(buf, sizeof(buf), - "Interval WAL: %s writes, %s syncs, " - "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", - NumberToHumanString(interval_write_with_wal).c_str(), - NumberToHumanString(interval_wal_synced).c_str(), - interval_write_with_wal / - static_cast(interval_wal_synced + 1), - interval_wal_bytes / kGB, - interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); + snprintf( + buf, sizeof(buf), + "Interval WAL: %s writes, %s syncs, " + "%.2f writes per sync, written: %.2f MB, %.2f MB/s\n", + NumberToHumanString(interval_write_with_wal).c_str(), + NumberToHumanString(interval_wal_synced).c_str(), + interval_write_with_wal / static_cast(interval_wal_synced + 1), + interval_wal_bytes / kGB, + interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001)); value->append(buf); // Stall - AppendHumanMicros( - write_stall_micros - db_stats_snapshot_.write_stall_micros, - human_micros, kHumanMicrosLen, true); - snprintf(buf, sizeof(buf), - "Interval stall: %s, %.1f percent\n", - human_micros, + AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros, + human_micros, kHumanMicrosLen, true); + snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros, // 10000 = divide by 1M to get secs, then multiply by 100 for pct (write_stall_micros - db_stats_snapshot_.write_stall_micros) / 10000.0 / std::max(interval_seconds_up, 0.001)); @@ -928,13 +1059,16 @@ void InternalStats::DumpDBStats(std::string* value) { } /** - * Dump Compaction Level stats to a map of 
stat name to value in double. - * The level in stat name is represented with a prefix "Lx" where "x" - * is the level number. A special level "Sum" represents the sum of a stat - * for all levels. + * Dump Compaction Level stats to a map of stat name with "compaction." prefix + * to value in double as string. The level in stat name is represented with + * a prefix "Lx" where "x" is the level number. A special level "Sum" + * represents the sum of a stat for all levels. + * The result also contains IO stall counters which keys start with "io_stalls." + * and values represent uint64 encoded as strings. */ -void InternalStats::DumpCFMapStats(std::map* cf_stats) { - CompactionStats compaction_stats_sum(0); +void InternalStats::DumpCFMapStats( + std::map* cf_stats) { + CompactionStats compaction_stats_sum; std::map> levels_stats; DumpCFMapStats(&levels_stats, &compaction_stats_sum); for (auto const& level_ent : levels_stats) { @@ -943,11 +1077,13 @@ void InternalStats::DumpCFMapStats(std::map* cf_stats) { for (auto const& stat_ent : level_ent.second) { auto stat_type = stat_ent.first; auto key_str = - level_str + "." + + "compaction." + level_str + "." + InternalStats::compaction_level_stats.at(stat_type).property_name; - (*cf_stats)[key_str] = stat_ent.second; + (*cf_stats)[key_str] = std::to_string(stat_ent.second); } } + + DumpCFMapStatsIOStalls(cf_stats); } void InternalStats::DumpCFMapStats( @@ -1018,6 +1154,52 @@ void InternalStats::DumpCFMapStats( (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level } +void InternalStats::DumpCFMapStatsByPriority( + std::map>* priorities_stats) { + for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) { + if (comp_stats_by_pri_[priority].micros > 0) { + std::map priority_stats; + PrepareLevelStats(&priority_stats, 0 /* num_files */, + 0 /* being_compacted */, 0 /* total_file_size */, + 0 /* compaction_score */, 0 /* w_amp */, + comp_stats_by_pri_[priority]); + (*priorities_stats)[static_cast(priority)] = priority_stats; + } + } +} + +void InternalStats::DumpCFMapStatsIOStalls( + std::map* cf_stats) { + (*cf_stats)["io_stalls.level0_slowdown"] = + std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]); + (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] = + std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]); + (*cf_stats)["io_stalls.level0_numfiles"] = + std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]); + (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] = + std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]); + (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] = + std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]); + (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] = + std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]); + (*cf_stats)["io_stalls.memtable_compaction"] = + std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]); + (*cf_stats)["io_stalls.memtable_slowdown"] = + std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]); + + uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + + cf_stats_count_[MEMTABLE_LIMIT_STOPS]; + + uint64_t total_slowdown = + cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + + cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; + + (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop); + (*cf_stats)["io_stalls.total_slowdown"] = 
std::to_string(total_slowdown); +} + void InternalStats::DumpCFStats(std::string* value) { DumpCFStatsNoFileHistogram(value); DumpCFFileHistogram(value); @@ -1026,12 +1208,12 @@ void InternalStats::DumpCFStats(std::string* value) { void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { char buf[2000]; // Per-ColumnFamily stats - PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName()); + PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level"); value->append(buf); // Print stats for each level std::map> levels_stats; - CompactionStats compaction_stats_sum(0); + CompactionStats compaction_stats_sum; DumpCFMapStats(&levels_stats, &compaction_stats_sum); for (int l = 0; l < number_levels_; ++l) { if (levels_stats.find(l) != levels_stats.end()) { @@ -1052,11 +1234,12 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL]; // Cumulative summary uint64_t total_stall_count = - cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL] + - cf_stats_count_[LEVEL0_NUM_FILES_TOTAL] + - cf_stats_count_[SOFT_PENDING_COMPACTION_BYTES_LIMIT] + - cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT] + - cf_stats_count_[MEMTABLE_COMPACTION] + cf_stats_count_[MEMTABLE_SLOWDOWN]; + cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + + cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + + cf_stats_count_[MEMTABLE_LIMIT_STOPS] + + cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; // Interval summary uint64_t interval_flush_ingest = flush_ingest - cf_stats_snapshot_.ingest_bytes_flush; @@ -1071,6 +1254,21 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats); value->append(buf); + PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority"); + value->append(buf); + std::map> priorities_stats; + DumpCFMapStatsByPriority(&priorities_stats); + for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) { + if (priorities_stats.find(static_cast(priority)) != + priorities_stats.end()) { + PrintLevelStats( + buf, sizeof(buf), + Env::PriorityToString(static_cast(priority)), + priorities_stats[static_cast(priority)]); + value->append(buf); + } + } + double seconds_up = (env_->NowMicros() - started_at_ + 1) / kMicrosInSec; double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up; snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n", @@ -1085,8 +1283,9 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { uint64_t interval_ingest_files_addfile = ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile; - snprintf(buf, sizeof(buf), "AddFile(Total Files): cumulative %" PRIu64 - ", interval %" PRIu64 "\n", + snprintf(buf, sizeof(buf), + "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64 + "\n", ingest_files_addfile, interval_ingest_files_addfile); value->append(buf); @@ -1145,31 +1344,32 @@ void InternalStats::DumpCFStatsNoFileHistogram(std::string* value) { cf_stats_snapshot_.compact_bytes_read = compact_bytes_read; cf_stats_snapshot_.compact_micros = compact_micros; - snprintf(buf, sizeof(buf), "Stalls(count): %" PRIu64 - " level0_slowdown, " - "%" PRIu64 - " level0_slowdown_with_compaction, " - "%" PRIu64 - " level0_numfiles, " - "%" PRIu64 - " level0_numfiles_with_compaction, " - "%" PRIu64 - " stop for pending_compaction_bytes, " - "%" PRIu64 
- " slowdown for pending_compaction_bytes, " - "%" PRIu64 - " memtable_compaction, " - "%" PRIu64 - " memtable_slowdown, " - "interval %" PRIu64 " total count\n", - cf_stats_count_[LEVEL0_SLOWDOWN_TOTAL], - cf_stats_count_[LEVEL0_SLOWDOWN_WITH_COMPACTION], - cf_stats_count_[LEVEL0_NUM_FILES_TOTAL], - cf_stats_count_[LEVEL0_NUM_FILES_WITH_COMPACTION], - cf_stats_count_[HARD_PENDING_COMPACTION_BYTES_LIMIT], - cf_stats_count_[SOFT_PENDING_COMPACTION_BYTES_LIMIT], - cf_stats_count_[MEMTABLE_COMPACTION], - cf_stats_count_[MEMTABLE_SLOWDOWN], + snprintf(buf, sizeof(buf), + "Stalls(count): %" PRIu64 + " level0_slowdown, " + "%" PRIu64 + " level0_slowdown_with_compaction, " + "%" PRIu64 + " level0_numfiles, " + "%" PRIu64 + " level0_numfiles_with_compaction, " + "%" PRIu64 + " stop for pending_compaction_bytes, " + "%" PRIu64 + " slowdown for pending_compaction_bytes, " + "%" PRIu64 + " memtable_compaction, " + "%" PRIu64 + " memtable_slowdown, " + "interval %" PRIu64 " total count\n", + cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS], + cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS], + cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS], + cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS], + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS], + cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS], + cf_stats_count_[MEMTABLE_LIMIT_STOPS], + cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS], total_stall_count - cf_stats_snapshot_.stall_count); value->append(buf); @@ -1203,7 +1403,9 @@ void InternalStats::DumpCFFileHistogram(std::string* value) { #else -const DBPropertyInfo* GetPropertyInfo(const Slice& property) { return nullptr; } +const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) { + return nullptr; +} #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/internal_stats.h b/thirdparty/rocksdb/db/internal_stats.h index a0b8a90271..20fb07f485 100644 --- a/thirdparty/rocksdb/db/internal_stats.h +++ b/thirdparty/rocksdb/db/internal_stats.h @@ -19,8 +19,8 @@ class ColumnFamilyData; namespace rocksdb { -class MemTableList; class DBImpl; +class MemTableList; // Config for retrieving a property's value. struct DBPropertyInfo { @@ -42,8 +42,13 @@ struct DBPropertyInfo { // holding db mutex, which is only supported for int properties. 
  bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
                                    Version* version);
-  bool (InternalStats::*handle_map)(
-      std::map<LevelStatType, double>* compaction_stats);
+
+  // @param props Map of general properties to populate
+  bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props);
+
+  // handle the string type properties rely on DBImpl methods
+  // @param value Value-result argument for storing the property's string value
+  bool (DBImpl::*handle_string_dbimpl)(std::string* value);
 };
 
 extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
@@ -66,6 +71,7 @@ enum class LevelStatType {
   READ_MBPS,
   WRITE_MBPS,
   COMP_SEC,
+  COMP_CPU_SEC,
   COMP_COUNT,
   AVG_SEC,
   KEY_IN,
@@ -85,14 +91,14 @@ class InternalStats {
   static const std::map<LevelStatType, LevelStat> compaction_level_stats;
 
   enum InternalCFStatsType {
-    LEVEL0_SLOWDOWN_TOTAL,
-    LEVEL0_SLOWDOWN_WITH_COMPACTION,
-    MEMTABLE_COMPACTION,
-    MEMTABLE_SLOWDOWN,
-    LEVEL0_NUM_FILES_TOTAL,
-    LEVEL0_NUM_FILES_WITH_COMPACTION,
-    SOFT_PENDING_COMPACTION_BYTES_LIMIT,
-    HARD_PENDING_COMPACTION_BYTES_LIMIT,
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
     WRITE_STALLS_ENUM_MAX,
     BYTES_FLUSHED,
     BYTES_INGESTED_ADD_FILE,
@@ -119,6 +125,7 @@ class InternalStats {
         cf_stats_value_{},
         cf_stats_count_{},
         comp_stats_(num_levels),
+        comp_stats_by_pri_(Env::Priority::TOTAL),
         file_read_latency_(num_levels),
         bg_error_count_(0),
         number_levels_(num_levels),
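The constructor change above sizes a second stats vector by thread priority (comp_stats_by_pri_), and AddCompactionStats later in this header feeds both the per-level and the per-priority slot from a single call. A minimal stand-alone sketch of that double bookkeeping (toy Stats type and a simplified priority enum; not the RocksDB struct):

#include <cstdint>
#include <iostream>
#include <vector>

enum Priority { BOTTOM = 0, LOW, HIGH, USER, TOTAL };

struct Stats {
  uint64_t micros;
  int count;
  void Add(const Stats& s) {
    micros += s.micros;
    count += s.count;
  }
};

struct StatsRegistry {
  std::vector<Stats> by_level;
  std::vector<Stats> by_pri;
  explicit StatsRegistry(int num_levels)
      : by_level(num_levels),  // one slot per LSM level (zero-initialized)
        by_pri(TOTAL) {}       // one slot per thread-pool priority
  void AddCompactionStats(int level, Priority pri, const Stats& s) {
    by_level[level].Add(s);  // existing per-level aggregation
    by_pri[pri].Add(s);      // new per-priority aggregation
  }
};

int main() {
  StatsRegistry reg(7);
  reg.AddCompactionStats(1, LOW, Stats{1200, 1});
  reg.AddCompactionStats(1, BOTTOM, Stats{800, 1});
  std::cout << reg.by_level[1].count << " compactions on L1, "
            << reg.by_pri[LOW].micros << " us in the LOW pool\n";
}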
     explicit CompactionStats(const CompactionStats& c)
         : micros(c.micros),
+          cpu_micros(c.cpu_micros),
           bytes_read_non_output_levels(c.bytes_read_non_output_levels),
           bytes_read_output_level(c.bytes_read_output_level),
           bytes_written(c.bytes_written),
           bytes_moved(c.bytes_moved),
           num_input_files_in_non_output_levels(
               c.num_input_files_in_non_output_levels),
-          num_input_files_in_output_level(
-              c.num_input_files_in_output_level),
+          num_input_files_in_output_level(c.num_input_files_in_output_level),
           num_output_files(c.num_output_files),
           num_input_records(c.num_input_records),
           num_dropped_records(c.num_dropped_records),
-          count(c.count) {}
+          count(c.count) {
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = c.counts[i];
+      }
+    }
     void Clear() {
       this->micros = 0;
+      this->cpu_micros = 0;
       this->bytes_read_non_output_levels = 0;
       this->bytes_read_output_level = 0;
       this->bytes_written = 0;
@@ -202,10 +250,15 @@ class InternalStats {
       this->num_input_records = 0;
       this->num_dropped_records = 0;
       this->count = 0;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] = 0;
+      }
     }
     void Add(const CompactionStats& c) {
       this->micros += c.micros;
+      this->cpu_micros += c.cpu_micros;
       this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
       this->bytes_read_output_level += c.bytes_read_output_level;
       this->bytes_written += c.bytes_written;
@@ -218,10 +271,15 @@ class InternalStats {
       this->num_input_records += c.num_input_records;
       this->num_dropped_records += c.num_dropped_records;
       this->count += c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] += c.counts[i];
+      }
     }
     void Subtract(const CompactionStats& c) {
       this->micros -= c.micros;
+      this->cpu_micros -= c.cpu_micros;
       this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
       this->bytes_read_output_level -= c.bytes_read_output_level;
       this->bytes_written -= c.bytes_written;
@@ -234,6 +292,10 @@ class InternalStats {
       this->num_input_records -= c.num_input_records;
       this->num_dropped_records -= c.num_dropped_records;
       this->count -= c.count;
+      int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+      for (int i = 0; i < num_of_reasons; i++) {
+        counts[i] -= c.counts[i];
+      }
     }
   };
@@ -257,8 +319,10 @@ class InternalStats {
     started_at_ = env_->NowMicros();
   }
-  void AddCompactionStats(int level, const CompactionStats& stats) {
+  void AddCompactionStats(int level, Env::Priority thread_pri,
+                          const CompactionStats& stats) {
     comp_stats_[level].Add(stats);
+    comp_stats_by_pri_[thread_pri].Add(stats);
   }
   void IncBytesMoved(int level, uint64_t amount) {
@@ -298,7 +362,7 @@ class InternalStats {
   bool GetMapProperty(const DBPropertyInfo& property_info,
                       const Slice& property,
-                      std::map<std::string, double>* value);
+                      std::map<std::string, std::string>* value);
   bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
                       DBImpl* db);
@@ -306,20 +370,29 @@ class InternalStats {
   bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
                                 Version* version, uint64_t* value);
+  const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+    return comp_stats_;
+  }
+
   // Store a mapping from the user-facing DB::Properties string to our
   // DBPropertyInfo struct used internally for retrieving properties.
   static const std::unordered_map<std::string, DBPropertyInfo> ppt_name_to_info;
  private:
   void DumpDBStats(std::string* value);
-  void DumpCFMapStats(std::map<std::string, double>* cf_stats);
+  void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
   void DumpCFMapStats(
       std::map<int, std::map<LevelStatType, double>>* level_stats,
       CompactionStats* compaction_stats_sum);
+  void DumpCFMapStatsByPriority(
+      std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+  void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
   void DumpCFStats(std::string* value);
   void DumpCFStatsNoFileHistogram(std::string* value);
   void DumpCFFileHistogram(std::string* value);
+  bool HandleBlockCacheStat(Cache** block_cache);
+
   // Per-DB stats
   std::atomic<uint64_t> db_stats_[INTERNAL_DB_STATS_ENUM_MAX];
   // Per-ColumnFamily stats
@@ -327,6 +400,7 @@ class InternalStats {
   uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
   // Per-ColumnFamily/level compaction stats
   std::vector<CompactionStats> comp_stats_;
+  std::vector<CompactionStats> comp_stats_by_pri_;
   std::vector<HistogramImpl> file_read_latency_;
   // Used to compute per-interval statistics
@@ -348,8 +422,7 @@ class InternalStats {
     uint64_t ingest_keys_addfile;    // Total number of keys ingested
     CFStatsSnapshot()
-        : comp_stats(0),
-          ingest_bytes_flush(0),
+        : ingest_bytes_flush(0),
           stall_count(0),
           compact_bytes_write(0),
           compact_bytes_read(0),
@@ -424,7 +497,7 @@ class InternalStats {
   bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
   bool HandleLevelStats(std::string* value, Slice suffix);
   bool HandleStats(std::string* value, Slice suffix);
-  bool HandleCFMapStats(std::map<std::string, double>* compaction_stats);
+  bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats);
   bool HandleCFStats(std::string* value, Slice suffix);
   bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
   bool HandleCFFileHistogram(std::string* value, Slice suffix);
@@ -465,6 +538,7 @@ class InternalStats {
                                 Version* version);
   bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
   bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
   bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
                                             Version* version);
   bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
@@ -472,12 +546,17 @@ class InternalStats {
   bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
                                   Version* version);
   bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+                                        Version* version);
   bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
                                     Version* version);
   bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
   bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
                                    Version* version);
-
+  bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+  bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+                                   Version* version);
   // Total number of background errors encountered. Every time a flush task
   // or compaction task fails, this counter is incremented. The failure can
   // be caused by any possible reason, including file system errors, out of
@@ -496,14 +575,14 @@
 class InternalStats {
  public:
   enum InternalCFStatsType {
-    LEVEL0_SLOWDOWN_TOTAL,
-    LEVEL0_SLOWDOWN_WITH_COMPACTION,
-    MEMTABLE_COMPACTION,
-    MEMTABLE_SLOWDOWN,
-    LEVEL0_NUM_FILES_TOTAL,
-    LEVEL0_NUM_FILES_WITH_COMPACTION,
-    SOFT_PENDING_COMPACTION_BYTES_LIMIT,
-    HARD_PENDING_COMPACTION_BYTES_LIMIT,
+    L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+    MEMTABLE_LIMIT_STOPS,
+    MEMTABLE_LIMIT_SLOWDOWNS,
+    L0_FILE_COUNT_LIMIT_STOPS,
+    LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+    PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+    PENDING_COMPACTION_BYTES_LIMIT_STOPS,
     WRITE_STALLS_ENUM_MAX,
     BYTES_FLUSHED,
     BYTES_INGESTED_ADD_FILE,
@@ -525,10 +604,11 @@ class InternalStats {
     INTERNAL_DB_STATS_ENUM_MAX,
   };
-  InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {}
+  InternalStats(int /*num_levels*/, Env* /*env*/, ColumnFamilyData* /*cfd*/) {}
   struct CompactionStats {
     uint64_t micros;
+    uint64_t cpu_micros;
     uint64_t bytes_read_non_output_levels;
     uint64_t bytes_read_output_level;
     uint64_t bytes_written;
@@ -540,48 +620,51 @@ class InternalStats {
     uint64_t num_dropped_records;
     int count;
-    explicit CompactionStats(int _count = 0) {}
+    explicit CompactionStats() {}
+
+    explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
-    explicit CompactionStats(const CompactionStats& c) {}
+    explicit CompactionStats(const CompactionStats& /*c*/) {}
-    void Add(const CompactionStats& c) {}
+    void Add(const CompactionStats& /*c*/) {}
-    void Subtract(const CompactionStats& c) {}
+    void Subtract(const CompactionStats& /*c*/) {}
   };
-  void AddCompactionStats(int level, const CompactionStats& stats) {}
+  void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+                          const CompactionStats& /*stats*/) {}
-  void IncBytesMoved(int level, uint64_t amount) {}
+  void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
-  void AddCFStats(InternalCFStatsType type, uint64_t value) {}
+  void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
-  void AddDBStats(InternalDBStatsType type, uint64_t value,
-                  bool concurrent = false) {}
+  void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+                  bool /*concurrent*/ = false) {}
-  HistogramImpl* GetFileReadHist(int level) { return nullptr; }
+  HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
   uint64_t GetBackgroundErrorCount() const { return 0; }
   uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
-  bool GetStringProperty(const DBPropertyInfo& property_info,
-                         const Slice& property, std::string* value) {
+  bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+                         const Slice& /*property*/, std::string* /*value*/) {
     return false;
   }
-  bool GetMapProperty(const DBPropertyInfo& property_info,
-                      const Slice& property,
-                      std::map<std::string, double>* value) {
+  bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+                      const Slice& /*property*/,
+                      std::map<std::string, std::string>* /*value*/) {
     return false;
   }
-  bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
-                      DBImpl* db) const {
+  bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+                      uint64_t* /*value*/, DBImpl* /*db*/) const {
    return false;
   }
-  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
-                                Version* version, uint64_t* value) const {
+  bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+                                Version* /*version*/,
+                                uint64_t* /*value*/) const {
     return false;
   }
 };
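
Aside: the ROCKSDB_LITE variant above keeps every public signature as an inert stub, so call sites compile unchanged in both builds. A tiny sketch of what that buys a caller (the env/cfd values are hypothetical; in a LITE build the call is a no-op):

    InternalStats stats(7 /* num_levels */, env, cfd);
    stats.AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
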
diff --git a/thirdparty/rocksdb/db/job_context.h b/thirdparty/rocksdb/db/job_context.h
index 950a3a667d..3978fad33c 100644
--- a/thirdparty/rocksdb/db/job_context.h
+++ b/thirdparty/rocksdb/db/job_context.h
@@ -13,27 +13,120 @@
 #include <vector>
 #include "db/log_writer.h"
+#include "db/column_family.h"
 namespace rocksdb {
 class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+  struct WriteStallNotification {
+    WriteStallInfo write_stall_info;
+    const ImmutableCFOptions* immutable_cf_options;
+  };
+
+  autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+  autovector<WriteStallNotification> write_stall_notifications;
+#endif
+  std::unique_ptr<SuperVersion>
+      new_superversion;  // if nullptr no new superversion
+
+  explicit SuperVersionContext(bool create_superversion = false)
+      : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+  explicit SuperVersionContext(SuperVersionContext&& other)
+      : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+        write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+        new_superversion(std::move(other.new_superversion)) {
+  }
+
+  void NewSuperVersion() {
+    new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+  }
+
+  inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    return !superversions_to_free.empty() ||
+           !write_stall_notifications.empty();
+#else
+    return !superversions_to_free.empty();
+#endif
+  }
+
+  void PushWriteStallNotification(
+      WriteStallCondition old_cond, WriteStallCondition new_cond,
+      const std::string& name, const ImmutableCFOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    WriteStallNotification notif;
+    notif.write_stall_info.cf_name = name;
+    notif.write_stall_info.condition.prev = old_cond;
+    notif.write_stall_info.condition.cur = new_cond;
+    notif.immutable_cf_options = ioptions;
+    write_stall_notifications.push_back(notif);
+#else
+    (void)old_cond;
+    (void)new_cond;
+    (void)name;
+    (void)ioptions;
+#endif  // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+  }
+
+  void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+    // notify listeners on changed write stall conditions
+    for (auto& notif : write_stall_notifications) {
+      for (auto& listener : notif.immutable_cf_options->listeners) {
+        listener->OnStallConditionsChanged(notif.write_stall_info);
+      }
+    }
+    write_stall_notifications.clear();
+#endif  // !ROCKSDB_LITE
+    // free superversions
+    for (auto s : superversions_to_free) {
+      delete s;
+    }
+    superversions_to_free.clear();
+  }
+
+  ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+    assert(write_stall_notifications.empty());
+#endif
+    assert(superversions_to_free.empty());
+  }
+};
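
Aside: SuperVersionContext is deliberately two-phase; work is queued while the DB mutex is held, and the slow parts (listener callbacks, frees) run later in Clean(). A sketch of the intended call pattern (the call sites shown are hypothetical, for shape only):

    SuperVersionContext sv_ctx(true /* create_superversion */);
    // ... under the DB mutex: install sv_ctx.new_superversion, queue the old
    // SuperVersion on sv_ctx.superversions_to_free, push stall notifications ...
    // ... after releasing the mutex:
    if (sv_ctx.HaveSomethingToDelete()) {
      sv_ctx.Clean();  // fires OnStallConditionsChanged, frees SuperVersions
    }
    // ~SuperVersionContext() asserts that both queues were drained
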
 struct JobContext {
   inline bool HaveSomethingToDelete() const {
     return full_scan_candidate_files.size() || sst_delete_files.size() ||
-           log_delete_files.size() || manifest_delete_files.size() ||
-           new_superversion != nullptr || superversions_to_free.size() > 0 ||
-           memtables_to_free.size() > 0 || logs_to_free.size() > 0;
+           log_delete_files.size() || manifest_delete_files.size();
+  }
+
+  inline bool HaveSomethingToClean() const {
+    bool sv_have_sth = false;
+    for (const auto& sv_ctx : superversion_contexts) {
+      if (sv_ctx.HaveSomethingToDelete()) {
+        sv_have_sth = true;
+        break;
+      }
+    }
+    return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+           sv_have_sth;
   }
   // Structure to store information for candidate files to delete.
   struct CandidateFileInfo {
     std::string file_name;
-    uint32_t path_id;
-    CandidateFileInfo(std::string name, uint32_t path)
-        : file_name(std::move(name)), path_id(path) {}
+    std::string file_path;
+    CandidateFileInfo(std::string name, std::string path)
+        : file_name(std::move(name)), file_path(std::move(path)) {}
     bool operator==(const CandidateFileInfo& other) const {
-      return file_name == other.file_name && path_id == other.path_id;
+      return file_name == other.file_name &&
+             file_path == other.file_path;
     }
   };
@@ -50,7 +143,7 @@ struct JobContext {
   std::vector<FileDescriptor> sst_live;
   // a list of sst files that we need to delete
-  std::vector<FileMetaData*> sst_delete_files;
+  std::vector<ObsoleteFileInfo> sst_delete_files;
   // a list of log files that we need to delete
   std::vector<uint64_t> log_delete_files;
@@ -65,12 +158,11 @@ struct JobContext {
   // a list of memtables to be free
   autovector<MemTable*> memtables_to_free;
-  autovector<SuperVersion*> superversions_to_free;
+  // contexts for installing superversions for multiple column families
+  std::vector<SuperVersionContext> superversion_contexts;
   autovector<log::Writer*> logs_to_free;
-  SuperVersion* new_superversion;  // if nullptr no new superversion
-
   // the current manifest_file_number, log_number and prev_log_number
   // that corresponds to the set of files in 'live'.
   uint64_t manifest_file_number;
@@ -83,13 +175,17 @@ struct JobContext {
   size_t num_alive_log_files = 0;
   uint64_t size_log_to_delete = 0;
+  // Snapshot taken before flush/compaction job.
+  std::unique_ptr<ManagedSnapshot> job_snapshot;
+
   explicit JobContext(int _job_id, bool create_superversion = false) {
     job_id = _job_id;
     manifest_file_number = 0;
     pending_manifest_file_number = 0;
     log_number = 0;
     prev_log_number = 0;
-    new_superversion = create_superversion ? new SuperVersion() : nullptr;
+    superversion_contexts.emplace_back(
+        SuperVersionContext(create_superversion));
   }
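
Aside: HaveSomethingToDelete() now covers only file deletions, while the new HaveSomethingToClean() covers in-memory cleanup, matching the Clean() contract documented below. A sketch of the resulting caller pattern (PurgeObsoleteFiles and mutex_ stand in for the surrounding DBImpl machinery and are shown only for shape):

    JobContext job_context(job_id, true /* create_superversion */);
    // ... collect obsolete files and superversions under the DB mutex ...
    mutex_.Unlock();
    if (job_context.HaveSomethingToDelete()) {
      PurgeObsoleteFiles(job_context);  // file system work
    }
    job_context.Clean();                // memtables, superversions, log writers
    mutex_.Lock();
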
   // For non-empty JobContext Clean() has to be called at least once before
@@ -97,31 +193,25 @@ struct JobContext {
   // unlocked DB mutex. Destructor doesn't call Clean() to avoid accidentally
   // doing potentially slow Clean() with locked DB mutex.
   void Clean() {
+    // free superversions
+    for (auto& sv_context : superversion_contexts) {
+      sv_context.Clean();
+    }
     // free pending memtables
     for (auto m : memtables_to_free) {
       delete m;
     }
-    // free superversions
-    for (auto s : superversions_to_free) {
-      delete s;
-    }
     for (auto l : logs_to_free) {
       delete l;
     }
-    // if new_superversion was not used, it will be non-nullptr and needs
-    // to be freed here
-    delete new_superversion;
     memtables_to_free.clear();
-    superversions_to_free.clear();
     logs_to_free.clear();
-    new_superversion = nullptr;
+    job_snapshot.reset();
   }
   ~JobContext() {
     assert(memtables_to_free.size() == 0);
-    assert(superversions_to_free.size() == 0);
-    assert(new_superversion == nullptr);
     assert(logs_to_free.size() == 0);
   }
 };
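
Aside: the listener tests in the next file exercise TablePropertiesCollector; for reference, a collector factory like the test's is wired into a DB through the options (standard RocksDB API, shown here with the test's class name):

    rocksdb::Options options;
    options.table_properties_collector_factories.emplace_back(
        std::make_shared<TestPropertiesCollectorFactory>());
    // Every SST file written from now on carries the collector's properties
    // (here, the {"0", "1"} entry inserted in Finish()).
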
diff --git a/thirdparty/rocksdb/db/listener_test.cc b/thirdparty/rocksdb/db/listener_test.cc
index 5b5f2266b3..60d02ed0ae 100644
--- a/thirdparty/rocksdb/db/listener_test.cc
+++ b/thirdparty/rocksdb/db/listener_test.cc
@@ -46,22 +46,20 @@ class EventListenerTest : public DBTestBase {
 };
 struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector {
-  virtual rocksdb::Status AddUserKey(const rocksdb::Slice& key,
-                                     const rocksdb::Slice& value,
-                                     rocksdb::EntryType type,
-                                     rocksdb::SequenceNumber seq,
-                                     uint64_t file_size) override {
+  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
+                             const rocksdb::Slice& /*value*/,
+                             rocksdb::EntryType /*type*/,
+                             rocksdb::SequenceNumber /*seq*/,
+                             uint64_t /*file_size*/) override {
     return Status::OK();
   }
-  virtual rocksdb::Status Finish(
+  rocksdb::Status Finish(
       rocksdb::UserCollectedProperties* properties) override {
     properties->insert({"0", "1"});
     return Status::OK();
   }
-  virtual const char* Name() const override {
-    return "TestTablePropertiesCollector";
-  }
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
   rocksdb::UserCollectedProperties GetReadableProperties() const override {
     rocksdb::UserCollectedProperties ret;
@@ -72,8 +70,8 @@ struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector {
 class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
  public:
-  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
-      TablePropertiesCollectorFactory::Context context) override {
+  TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context /*context*/) override {
     return new TestPropertiesCollector;
   }
   const char* Name() const override { return "TestTablePropertiesCollector"; }
@@ -260,7 +258,7 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
     ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
   }
-  // make sure call-back functions are called in the right order
+  // make sure callback functions are called in the right order
   for (size_t i = 0; i < cf_names.size(); ++i) {
     ASSERT_EQ(listener->flushed_dbs_[i], db_);
     ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
@@ -296,7 +294,7 @@ TEST_F(EventListenerTest, MultiCF) {
     ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
   }
-  // make sure call-back functions are called in the right order
+  // make sure callback functions are called in the right order
   for (size_t i = 0; i < cf_names.size(); i++) {
     ASSERT_EQ(listener->flushed_dbs_[i], db_);
     ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
@@ -417,7 +415,9 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
   for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
       ++i) {
     Put(1, ToString(i), std::string(10000, 'x'), WriteOptions());
-    db_->Flush(FlushOptions(), handles_[1]);
+    FlushOptions fo;
+    fo.allow_write_stall = true;
+    db_->Flush(fo, handles_[1]);
     db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
   }
   ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
@@ -425,7 +425,7 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
 class TestCompactionReasonListener : public EventListener {
  public:
-  void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+  void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
     std::lock_guard<std::mutex> lock(mutex_);
     compaction_reasons_.push_back(ci.compaction_reason);
   }
@@ -528,7 +528,7 @@ TEST_F(EventListenerTest, CompactionReasonUniversal) {
   ASSERT_GT(listener->compaction_reasons_.size(), 0);
   for (auto compaction_reason : listener->compaction_reasons_) {
-    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSortedRunNum);
+    ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
   }
   options.level0_file_num_compaction_trigger = 8;
@@ -601,7 +601,7 @@ class TableFileCreationListener : public EventListener {
     Status NewWritableFile(const std::string& fname,
                            std::unique_ptr<WritableFile>* result,
-                           const EnvOptions& options) {
+                           const EnvOptions& options) override {
       if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
         if (!status_.ok()) {
           return status_;
@@ -807,7 +807,8 @@ class BackgroundErrorListener : public EventListener {
  public:
   BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
-  void OnBackgroundError(BackgroundErrorReason reason, Status* bg_error) override {
+  void OnBackgroundError(BackgroundErrorReason /*reason*/,
+                         Status* bg_error) override {
     if (counter_ == 0) {
       // suppress the first error and disable write-dropping such that a retry
       // can succeed.
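
Aside: OnBackgroundError hands the listener a mutable Status, which is what lets the test's listener swallow the first failure. A trimmed sketch of that mechanism (the first_error_ member and the suppression policy are hypothetical, not from the test):

    void OnBackgroundError(rocksdb::BackgroundErrorReason /*reason*/,
                           rocksdb::Status* bg_error) override {
      // first_error_ is an assumed std::atomic<bool> member initialized to true
      if (first_error_.exchange(false)) {
        *bg_error = rocksdb::Status::OK();  // keep the DB writable; retry follows
      }
    }
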
@@ -879,10 +880,75 @@ TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
   ASSERT_EQ(1, listener->counter());
   // trigger flush so compaction is triggered again; this time it succeeds
+  // The previous failed compaction may get retried automatically, so we may
+  // be left with 0 or 1 files in level 1, depending on when the retry gets
+  // scheduled
   ASSERT_OK(Put("key0", "val"));
   ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
-  ASSERT_EQ(0, NumTableFilesAtLevel(0));
+  ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+  TestFileOperationListener() {
+    file_reads_.store(0);
+    file_reads_success_.store(0);
+    file_writes_.store(0);
+    file_writes_success_.store(0);
+  }
+
+  void OnFileReadFinish(const FileOperationInfo& info) override {
+    ++file_reads_;
+    if (info.status.ok()) {
+      ++file_reads_success_;
+    }
+    ReportDuration(info);
+  }
+
+  void OnFileWriteFinish(const FileOperationInfo& info) override {
+    ++file_writes_;
+    if (info.status.ok()) {
+      ++file_writes_success_;
+    }
+    ReportDuration(info);
+  }
+
+  bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+  std::atomic<size_t> file_reads_;
+  std::atomic<size_t> file_reads_success_;
+  std::atomic<size_t> file_writes_;
+  std::atomic<size_t> file_writes_success_;
+
+ private:
+  void ReportDuration(const FileOperationInfo& info) const {
+    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
+        info.finish_timestamp - info.start_timestamp);
+    ASSERT_GT(duration.count(), 0);
+  }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+  Options options;
+  options.env = CurrentOptions().env;
+  options.create_if_missing = true;
+
+  TestFileOperationListener* listener = new TestFileOperationListener();
+  options.listeners.emplace_back(listener);
+
+  DestroyAndReopen(options);
+  ASSERT_OK(Put("foo", "aaa"));
+  dbfull()->Flush(FlushOptions());
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_GE(listener->file_writes_.load(),
+            listener->file_writes_success_.load());
+  ASSERT_GT(listener->file_writes_.load(), 0);
+  Close();
+
+  Reopen(options);
+  ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+  ASSERT_GT(listener->file_reads_.load(), 0);
 }
 }  // namespace rocksdb
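
Aside: outside the test harness the new file-operation hooks are consumed the same way; returning true from ShouldBeNotifiedOnFileIO() is what opts a listener in. A short sketch, reusing the test's listener type purely for illustration:

    rocksdb::Options options;
    auto io_listener = std::make_shared<TestFileOperationListener>();
    options.listeners.emplace_back(io_listener);
    // ... open and use the DB ...
    size_t reads = io_listener->file_reads_.load();  // bumped per read, as above
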
diff --git a/thirdparty/rocksdb/db/log_format.h b/thirdparty/rocksdb/db/log_format.h
index be22201af0..5aeb5aa5fd 100644
--- a/thirdparty/rocksdb/db/log_format.h
+++ b/thirdparty/rocksdb/db/log_format.h
@@ -37,9 +37,9 @@ static const unsigned int kBlockSize = 32768;
 // Header is checksum (4 bytes), length (2 bytes), type (1 byte)
 static const int kHeaderSize = 4 + 2 + 1;
-// Recyclable header is checksum (4 bytes), type (1 byte), log number
-// (4 bytes), length (2 bytes).
-static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2;
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
 }  // namespace log
 }  // namespace rocksdb
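
Aside: both constants are numerically unchanged (7 and 11 bytes); only the documented field order was corrected to match what the reader code actually parses. A byte-offset view derived from the constants above:

    // legacy record:     | crc32c (4) | length (2) | type (1) | payload ...
    // recyclable record: | crc32c (4) | length (2) | type (1) | log_number (4) | payload ...

The checksum covers the bytes from the type field onward, which is why the reader below computes crc32c::Value(header + 6, length + header_size - 6).
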
diff --git a/thirdparty/rocksdb/db/log_reader.cc b/thirdparty/rocksdb/db/log_reader.cc
index cae5d8ea08..e734e9d6c8 100644
--- a/thirdparty/rocksdb/db/log_reader.cc
+++ b/thirdparty/rocksdb/db/log_reader.cc
@@ -14,6 +14,7 @@
 #include "util/coding.h"
 #include "util/crc32c.h"
 #include "util/file_reader_writer.h"
+#include "util/util.h"
 namespace rocksdb {
 namespace log {
@@ -22,8 +23,8 @@ Reader::Reporter::~Reporter() {
 }
 Reader::Reader(std::shared_ptr<Logger> info_log,
-               unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
-               bool checksum, uint64_t initial_offset, uint64_t log_num)
+               std::unique_ptr<SequentialFileReader>&& _file,
+               Reporter* reporter, bool checksum, uint64_t log_num)
     : info_log_(info_log),
       file_(std::move(_file)),
       reporter_(reporter),
@@ -35,7 +36,6 @@ Reader::Reader(std::shared_ptr<Logger> info_log,
       eof_offset_(0),
       last_record_offset_(0),
       end_of_buffer_offset_(0),
-      initial_offset_(initial_offset),
       log_number_(log_num),
       recycled_(false) {}
@@ -43,29 +43,6 @@ Reader::~Reader() {
   delete[] backing_store_;
 }
-bool Reader::SkipToInitialBlock() {
-  size_t initial_offset_in_block = initial_offset_ % kBlockSize;
-  uint64_t block_start_location = initial_offset_ - initial_offset_in_block;
-
-  // Don't search a block if we'd be in the trailer
-  if (initial_offset_in_block > kBlockSize - 6) {
-    block_start_location += kBlockSize;
-  }
-
-  end_of_buffer_offset_ = block_start_location;
-
-  // Skip to start of first block that can contain the initial record
-  if (block_start_location > 0) {
-    Status skip_status = file_->Skip(block_start_location);
-    if (!skip_status.ok()) {
-      ReportDrop(static_cast<size_t>(block_start_location), skip_status);
-      return false;
-    }
-  }
-
-  return true;
-}
-
 // For kAbsoluteConsistency, on clean shutdown we don't expect any error
 // in the log files. For other modes, we can ignore only incomplete records
 // in the last log file, which are presumably due to a write in progress
@@ -75,12 +52,6 @@
 // restrict the inconsistency to only the last log
 bool Reader::ReadRecord(Slice* record, std::string* scratch,
                         WALRecoveryMode wal_recovery_mode) {
-  if (last_record_offset_ < initial_offset_) {
-    if (!SkipToInitialBlock()) {
-      return false;
-    }
-  }
-
   scratch->clear();
   record->clear();
   bool in_fragmented_record = false;
@@ -151,7 +122,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           // in clean shutdown we don't expect any error in the log files
           ReportCorruption(drop_size, "truncated header");
         }
-      // fall-thru
+        FALLTHROUGH_INTENDED;
       case kEof:
         if (in_fragmented_record) {
@@ -181,7 +152,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
           }
           return false;
         }
-      // fall-thru
+        FALLTHROUGH_INTENDED;
      case kBadRecord:
         if (in_fragmented_record) {
@@ -234,13 +205,14 @@ void Reader::UnmarkEOF() {
   if (read_error_) {
     return;
   }
-  eof_ = false;
-
   if (eof_offset_ == 0) {
     return;
   }
+  UnmarkEOFInternal();
+}
+void Reader::UnmarkEOFInternal() {
   // If the EOF was in the middle of a block (a partial block was read) we have
   // to read the rest of the block as ReadPhysicalRecord can only read full
   // blocks and expects the file position indicator to be aligned to the start
@@ -298,8 +270,7 @@ void Reader::ReportCorruption(size_t bytes, const char* reason) {
 }
 void Reader::ReportDrop(size_t bytes, const Status& reason) {
-  if (reporter_ != nullptr &&
-      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+  if (reporter_ != nullptr) {
     reporter_->Corruption(bytes, reason);
   }
 }
@@ -316,7 +287,7 @@ bool Reader::ReadMore(size_t* drop_size, int *error) {
     read_error_ = true;
     *error = kEof;
     return false;
-  } else if (buffer_.size() < (size_t)kBlockSize) {
+  } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
     eof_ = true;
     eof_offset_ = buffer_.size();
   }
@@ -341,8 +312,11 @@ bool Reader::ReadMore(size_t* drop_size, int *error) {
 unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
   while (true) {
     // We need at least the minimum header size
-    if (buffer_.size() < (size_t)kHeaderSize) {
-      int r;
+    if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+      // the default value of r is meaningless because ReadMore will overwrite
+      // it if it returns false; in case it returns true, the return value will
+      // not be used anyway
+      int r = kEof;
       if (!ReadMore(drop_size, &r)) {
         return r;
       }
@@ -362,8 +336,8 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
       }
       header_size = kRecyclableHeaderSize;
       // We need enough for the larger header
-      if (buffer_.size() < (size_t)kRecyclableHeaderSize) {
-        int r;
+      if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+        int r = kEof;
        if (!ReadMore(drop_size, &r)) {
          return r;
        }
@@ -380,9 +354,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
       if (!eof_) {
         return kBadRecordLen;
       }
-      // If the end of the file has been reached without reading |length| bytes
-      // of payload, assume the writer died in the middle of writing the record.
-      // Don't report a corruption unless requested.
+      // If the end of the file has been reached without reading |length|
+      // bytes of payload, assume the writer died in the middle of writing the
+      // record. Don't report a corruption unless requested.
       if (*drop_size) {
         return kBadHeader;
       }
@@ -416,17 +390,234 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
     buffer_.remove_prefix(header_size + length);
-    // Skip physical record that started before initial_offset_
-    if (end_of_buffer_offset_ - buffer_.size() - header_size - length <
-        initial_offset_) {
-      result->clear();
-      return kBadRecord;
-    }
-
     *result = Slice(header + header_size, length);
     return type;
   }
 }
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+                                        WALRecoveryMode /*unused*/) {
+  assert(record != nullptr);
+  assert(scratch != nullptr);
+  record->clear();
+  scratch->clear();
+
+  uint64_t prospective_record_offset = 0;
+  uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+  size_t drop_size = 0;
+  unsigned int fragment_type_or_err = 0;  // Initialize to make compiler happy
+  Slice fragment;
+  while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+    switch (fragment_type_or_err) {
+      case kFullType:
+      case kRecyclableFullType:
+        if (in_fragmented_record_ && !fragments_.empty()) {
+          ReportCorruption(fragments_.size(), "partial record without end(1)");
+        }
+        fragments_.clear();
+        *record = fragment;
+        prospective_record_offset = physical_record_offset;
+        last_record_offset_ = prospective_record_offset;
+        in_fragmented_record_ = false;
+        return true;
+
+      case kFirstType:
+      case kRecyclableFirstType:
+        if (in_fragmented_record_ || !fragments_.empty()) {
+          ReportCorruption(fragments_.size(), "partial record without end(2)");
+        }
+        prospective_record_offset = physical_record_offset;
+        fragments_.assign(fragment.data(), fragment.size());
+        in_fragmented_record_ = true;
+        break;
+
+      case kMiddleType:
+      case kRecyclableMiddleType:
+        if (!in_fragmented_record_) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          fragments_.append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+      case kRecyclableLastType:
+        if (!in_fragmented_record_) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          fragments_.append(fragment.data(), fragment.size());
+          scratch->assign(fragments_.data(), fragments_.size());
+          fragments_.clear();
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          in_fragmented_record_ = false;
+          return true;
+        }
+        break;
+
+      case kBadHeader:
+      case kBadRecord:
+      case kEof:
+      case kOldRecord:
+        if (in_fragmented_record_) {
+          ReportCorruption(fragments_.size(), "error in middle of record");
+          in_fragmented_record_ = false;
+          fragments_.clear();
+        }
+        break;
+
+      case kBadRecordChecksum:
+        if (recycled_) {
+          fragments_.clear();
+          return false;
+        }
+        ReportCorruption(drop_size, "checksum mismatch");
+        if (in_fragmented_record_) {
+          ReportCorruption(fragments_.size(), "error in middle of record");
+          in_fragmented_record_ = false;
+          fragments_.clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u",
+                 fragment_type_or_err);
+        ReportCorruption(
+            fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+            buf);
+        in_fragmented_record_ = false;
+        fragments_.clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+  if (read_error_) {
+    return;
+  }
+  eof_ = false;
+  UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+  if (!eof_ && !read_error_) {
+    // Last read was a full read, so this is a trailer to skip
+    buffer_.clear();
+    Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+    end_of_buffer_offset_ += buffer_.size();
+    if (!status.ok()) {
+      buffer_.clear();
+      ReportDrop(kBlockSize, status);
+      read_error_ = true;
+      *error = kEof;
+      return false;
+    } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+      eof_ = true;
+      eof_offset_ = buffer_.size();
+      TEST_SYNC_POINT_CALLBACK(
+          "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+    }
+    return true;
+  } else if (!read_error_) {
+    UnmarkEOF();
+  }
+  if (!read_error_) {
+    return true;
+  }
+  *error = kEof;
+  *drop_size = buffer_.size();
+  if (buffer_.size() > 0) {
+    *error = kBadHeader;
+  }
+  buffer_.clear();
+  return false;
+}
+
+// return true if the caller should process the fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+    Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+  assert(fragment != nullptr);
+  assert(drop_size != nullptr);
+  assert(fragment_type_or_err != nullptr);
+
+  while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+  const char* header = buffer_.data();
+  const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+  const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+  const unsigned int type = header[6];
+  const uint32_t length = a | (b << 8);
+  int header_size = kHeaderSize;
+  if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+    if (end_of_buffer_offset_ - buffer_.size() == 0) {
+      recycled_ = true;
+    }
+    header_size = kRecyclableHeaderSize;
+    while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+      size_t old_size = buffer_.size();
+      int error = kEof;
+      if (!TryReadMore(drop_size, &error)) {
+        *fragment_type_or_err = error;
+        return false;
+      } else if (old_size == buffer_.size()) {
+        return false;
+      }
+    }
+    const uint32_t log_num = DecodeFixed32(header + 7);
+    if (log_num != log_number_) {
+      *fragment_type_or_err = kOldRecord;
+      return true;
+    }
+  }
+
+  while (header_size + length > buffer_.size()) {
+    size_t old_size = buffer_.size();
+    int error = kEof;
+    if (!TryReadMore(drop_size, &error)) {
+      *fragment_type_or_err = error;
+      return false;
+    } else if (old_size == buffer_.size()) {
+      return false;
+    }
+  }
+
+  if (type == kZeroType && length == 0) {
+    buffer_.clear();
+    *fragment_type_or_err = kBadRecord;
+    return true;
+  }
+
+  if (checksum_) {
+    uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+    uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+    if (actual_crc != expected_crc) {
+      *drop_size = buffer_.size();
+      buffer_.clear();
+      *fragment_type_or_err = kBadRecordChecksum;
+      return true;
+    }
+  }
+
+  buffer_.remove_prefix(header_size + length);
+
+  *fragment = Slice(header + header_size, length);
+  *fragment_type_or_err = type;
+  return true;
+}
+
 }  // namespace log
 }  // namespace rocksdb
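
Aside: the practical payoff of FragmentBufferedReader is a WAL-tailing loop where a false return means "not enough data yet" rather than a fatal stop. A minimal sketch (Consume and WaitForMoreData are hypothetical placeholders; hasReadError() is declared in the header diff that follows):

    std::string scratch;
    rocksdb::Slice record;
    for (;;) {
      if (reader.ReadRecord(&record, &scratch)) {
        Consume(record);      // got a complete (possibly reassembled) record
      } else if (reader.hasReadError()) {
        break;                // real I/O error; details went to the Reporter
      } else {
        WaitForMoreData();    // EOF hit mid-record; the reader keeps its state
      }
    }
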
diff --git a/thirdparty/rocksdb/db/log_reader.h b/thirdparty/rocksdb/db/log_reader.h
index c6a471cda4..058382b8a3 100644
--- a/thirdparty/rocksdb/db/log_reader.h
+++ b/thirdparty/rocksdb/db/log_reader.h
@@ -20,7 +20,6 @@ namespace rocksdb {
 class SequentialFileReader;
 class Logger;
-using std::unique_ptr;
 namespace log {
@@ -50,24 +49,21 @@ class Reader {
   // live while this Reader is in use.
   //
   // If "checksum" is true, verify checksums if available.
-  //
-  // The Reader will start reading at the first record located at physical
-  // position >= initial_offset within the file.
   Reader(std::shared_ptr<Logger> info_log,
-         unique_ptr<SequentialFileReader>&& file,
-         Reporter* reporter, bool checksum, uint64_t initial_offset,
-         uint64_t log_num);
+         // @lint-ignore TXT2 T25377293 Grandfathered in
+         std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+         bool checksum, uint64_t log_num);
-  ~Reader();
+  virtual ~Reader();
   // Read the next record into *record.  Returns true if read
   // successfully, false if we hit end of the input.  May use
   // "*scratch" as temporary storage.  The contents filled in *record
   // will only be valid until the next mutating operation on this
   // reader or the next mutation to *scratch.
-  bool ReadRecord(Slice* record, std::string* scratch,
-                  WALRecoveryMode wal_recovery_mode =
-                      WALRecoveryMode::kTolerateCorruptedTailRecords);
+  virtual bool ReadRecord(Slice* record, std::string* scratch,
+                          WALRecoveryMode wal_recovery_mode =
+                              WALRecoveryMode::kTolerateCorruptedTailRecords);
   // Returns the physical offset of the last record returned by ReadRecord.
   //
@@ -79,21 +75,28 @@ class Reader {
     return eof_;
   }
+  // returns true if the reader has encountered read error.
+  bool hasReadError() const { return read_error_; }
+
   // when we know more data has been written to the file. we can use this
   // function to force the reader to look again in the file.
   // Also aligns the file position indicator to the start of the next block
   // by reading the rest of the data from the EOF position to the end of the
   // block that was partially read.
-  void UnmarkEOF();
+  virtual void UnmarkEOF();
   SequentialFileReader* file() { return file_.get(); }
- private:
+  Reporter* GetReporter() const { return reporter_; }
+
+ protected:
   std::shared_ptr<Logger> info_log_;
-  const unique_ptr<SequentialFileReader> file_;
+  const std::unique_ptr<SequentialFileReader> file_;
   Reporter* const reporter_;
   bool const checksum_;
   char* const backing_store_;
+
+  // Internal state variables used for reading records
   Slice buffer_;
   bool eof_;         // Last Read() indicated EOF by returning < kBlockSize
   bool read_error_;  // Error occurred while reading from file
@@ -107,9 +110,6 @@ class Reader {
   // Offset of the first location past the end of buffer_.
   uint64_t end_of_buffer_offset_;
-  // Offset at which to start looking for the first record to return
-  uint64_t const initial_offset_;
-
   // which log number this is
   uint64_t const log_number_;
@@ -123,7 +123,6 @@ class Reader {
   // Currently there are three situations in which this happens:
   // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
   // * The record is a 0-length record (No drop is reported)
-  // * The record is below constructor's initial_offset (No drop is reported)
   kBadRecord = kMaxRecordType + 2,
   // Returned when we fail to read a valid header.
   kBadHeader = kMaxRecordType + 3,
@@ -135,26 +134,53 @@ class Reader {
   kBadRecordChecksum = kMaxRecordType + 6,
 };
-  // Skips all blocks that are completely before "initial_offset_".
-  //
-  // Returns true on success. Handles reporting.
-  bool SkipToInitialBlock();
-
   // Return type, or one of the preceding special values
   unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
   // Read some more
   bool ReadMore(size_t* drop_size, int *error);
+  void UnmarkEOFInternal();
+
   // Reports dropped bytes to the reporter.
   // buffer_ must be updated to remove the dropped bytes prior to invocation.
   void ReportCorruption(size_t bytes, const char* reason);
   void ReportDrop(size_t bytes, const Status& reason);
+ private:
   // No copying allowed
   Reader(const Reader&);
   void operator=(const Reader&);
 };
+class FragmentBufferedReader : public Reader {
+ public:
+  FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+                         // @lint-ignore TXT2 T25377293 Grandfathered in
+                         std::unique_ptr<SequentialFileReader>&& _file,
+                         Reporter* reporter, bool checksum, uint64_t log_num)
+      : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+        fragments_(),
+        in_fragmented_record_(false) {}
+  ~FragmentBufferedReader() override {}
+  bool ReadRecord(Slice* record, std::string* scratch,
+                  WALRecoveryMode wal_recovery_mode =
+                      WALRecoveryMode::kTolerateCorruptedTailRecords) override;
+  void UnmarkEOF() override;
+
+ private:
+  std::string fragments_;
+  bool in_fragmented_record_;
+
+  bool TryReadFragment(Slice* result, size_t* drop_size,
+                       unsigned int* fragment_type_or_err);
+
+  bool TryReadMore(size_t* drop_size, int* error);
+
+  // No copy allowed
+  FragmentBufferedReader(const FragmentBufferedReader&);
+  void operator=(const FragmentBufferedReader&);
+};
+
 }  // namespace log
 }  // namespace rocksdb
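
Aside: with ReadRecord()/UnmarkEOF() now virtual and the buffer state protected, Reader becomes an extension point; FragmentBufferedReader above is the in-tree subclass. A hypothetical second subclass, only to show the shape:

    class CountingReader : public rocksdb::log::Reader {
     public:
      using Reader::Reader;  // inherit the constructor
      bool ReadRecord(rocksdb::Slice* record, std::string* scratch,
                      rocksdb::WALRecoveryMode mode =
                          rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords)
          override {
        bool ok = Reader::ReadRecord(record, scratch, mode);
        if (ok) ++records_;  // count complete records as they are returned
        return ok;
      }
      uint64_t records() const { return records_; }

     private:
      uint64_t records_ = 0;
    };
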
diff --git a/thirdparty/rocksdb/db/log_test.cc b/thirdparty/rocksdb/db/log_test.cc
index 651a1d0eee..fd237b030e 100644
--- a/thirdparty/rocksdb/db/log_test.cc
+++ b/thirdparty/rocksdb/db/log_test.cc
@@ -43,7 +43,10 @@ static std::string RandomSkewedString(int i, Random* rnd) {
   return BigString(NumberString(i), rnd->Skewed(17));
 }
-class LogTest : public ::testing::TestWithParam<int> {
+// Param type is tuple<int, bool>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+class LogTest : public ::testing::TestWithParam<std::tuple<int, bool>> {
  private:
   class StringSource : public SequentialFile {
    public:
    Slice contents_;
    bool force_error_;
    size_t force_error_position_;
    bool force_eof_;
    size_t force_eof_position_;
    bool returned_partial_;
-    explicit StringSource(Slice& contents) :
-        contents_(contents),
-        force_error_(false),
-        force_error_position_(0),
-        force_eof_(false),
-        force_eof_position_(0),
-        returned_partial_(false) { }
-
-    virtual Status Read(size_t n, Slice* result, char* scratch) override {
-      EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+    bool fail_after_read_partial_;
+    explicit StringSource(Slice& contents, bool fail_after_read_partial)
+        : contents_(contents),
+          force_error_(false),
+          force_error_position_(0),
+          force_eof_(false),
+          force_eof_position_(0),
+          returned_partial_(false),
+          fail_after_read_partial_(fail_after_read_partial) {}
+
+    Status Read(size_t n, Slice* result, char* scratch) override {
+      if (fail_after_read_partial_) {
+        EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+      }
       if (force_error_) {
         if (force_error_position_ >= n) {
@@ -100,7 +107,7 @@ class LogTest : public ::testing::TestWithParam<int> {
       return Status::OK();
     }
-    virtual Status Skip(uint64_t n) override {
+    Status Skip(uint64_t n) override {
       if (n > contents_.size()) {
         contents_.clear();
         return Status::NotFound("in-memory file skipped past end");
@@ -118,7 +125,7 @@ class LogTest : public ::testing::TestWithParam<int> {
     std::string message_;
     ReportCollector() : dropped_bytes_(0) { }
-    virtual void Corruption(size_t bytes, const Status& status) override {
+    void Corruption(size_t bytes, const Status& status) override {
       dropped_bytes_ += bytes;
       message_.append(status.ToString());
     }
@@ -139,39 +146,39 @@ class LogTest : public ::testing::TestWithParam<int> {
   }
   void reset_source_contents() {
-    auto src = dynamic_cast<StringSource*>(reader_.file()->file());
+    auto src = dynamic_cast<StringSource*>(reader_->file()->file());
     assert(src);
     src->contents_ = dest_contents();
   }
   Slice reader_contents_;
-  unique_ptr<WritableFileWriter> dest_holder_;
-  unique_ptr<SequentialFileReader> source_holder_;
+  std::unique_ptr<WritableFileWriter> dest_holder_;
+  std::unique_ptr<SequentialFileReader> source_holder_;
   ReportCollector report_;
   Writer writer_;
-  Reader reader_;
+  std::unique_ptr<Reader> reader_;
-  // Record metadata for testing initial offset functionality
-  static size_t initial_offset_record_sizes_[];
-  uint64_t initial_offset_last_record_offsets_[4];
+ protected:
+  bool allow_retry_read_;
  public:
  LogTest()
      : reader_contents_(),
        dest_holder_(test::GetWritableFileWriter(
-            new test::StringSink(&reader_contents_))),
-        source_holder_(
-            test::GetSequentialFileReader(new StringSource(reader_contents_))),
-        writer_(std::move(dest_holder_), 123, GetParam()),
-        reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/,
-                0 /*initial_offset*/, 123) {
-    int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-    initial_offset_last_record_offsets_[0] = 0;
-    initial_offset_last_record_offsets_[1] = header_size + 10000;
-    initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000);
-    initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) +
-                                             (2 * log::kBlockSize - 1000) +
-                                             3 * header_size;
+            new test::StringSink(&reader_contents_), "" /* don't care */)),
+        source_holder_(test::GetSequentialFileReader(
+            new StringSource(reader_contents_, !std::get<1>(GetParam())),
+            "" /* file name */)),
+        writer_(std::move(dest_holder_), 123, std::get<0>(GetParam())),
+        allow_retry_read_(std::get<1>(GetParam())) {
+    if (allow_retry_read_) {
+      reader_.reset(new FragmentBufferedReader(
+          nullptr, std::move(source_holder_), &report_, true /* checksum */,
+          123 /* log_number */));
+    } else {
+      reader_.reset(new Reader(nullptr, std::move(source_holder_), &report_,
+                               true /* checksum */, 123 /* log_number */));
+    }
   }
   Slice* get_reader_contents() { return &reader_contents_; }
@@ -188,14 +195,16 @@ class LogTest : public ::testing::TestWithParam<int> {
                   WALRecoveryMode::kTolerateCorruptedTailRecords) {
     std::string scratch;
     Slice record;
-    if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) {
+    bool ret = false;
+    ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode);
+    if (ret) {
       return record.ToString();
     } else {
       return "EOF";
     }
   }
-  void IncrementByte(int offset, int delta) {
+  void IncrementByte(int offset, char delta) {
     dest_contents()[offset] += delta;
   }
@@ -220,7 +229,7 @@ class LogTest : public ::testing::TestWithParam<int> {
   }
   void ForceError(size_t position = 0) {
-    auto src =
dynamic_cast(reader_->file()->file()); src->returned_partial_ = false; - reader_.UnmarkEOF(); + reader_->UnmarkEOF(); } - bool IsEOF() { - return reader_.IsEOF(); - } + bool IsEOF() { return reader_->IsEOF(); } // Returns OK iff recorded error message contains "msg" std::string MatchError(const std::string& msg) const { @@ -257,53 +264,8 @@ class LogTest : public ::testing::TestWithParam { return "OK"; } } - - void WriteInitialOffsetLog() { - for (int i = 0; i < 4; i++) { - std::string record(initial_offset_record_sizes_[i], - static_cast('a' + i)); - Write(record); - } - } - - void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { - WriteInitialOffsetLog(); - unique_ptr file_reader( - test::GetSequentialFileReader(new StringSource(reader_contents_))); - unique_ptr offset_reader( - new Reader(NULL, std::move(file_reader), &report_, - true /*checksum*/, WrittenBytes() + offset_past_end, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); - } - - void CheckInitialOffsetRecord(uint64_t initial_offset, - int expected_record_offset) { - WriteInitialOffsetLog(); - unique_ptr file_reader( - test::GetSequentialFileReader(new StringSource(reader_contents_))); - unique_ptr offset_reader( - new Reader(NULL, std::move(file_reader), &report_, - true /*checksum*/, initial_offset, 123)); - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); - } - }; -size_t LogTest::initial_offset_record_sizes_[] = - {10000, // Two sizable records in first block - 10000, - 2 * log::kBlockSize - 1000, // Span three blocks - 1}; - TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } TEST_P(LogTest, ReadWrite) { @@ -341,7 +303,8 @@ TEST_P(LogTest, Fragmentation) { TEST_P(LogTest, MarginalTrailer) { // Make a trailer that is exactly the same length as an empty record. - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); @@ -355,7 +318,8 @@ TEST_P(LogTest, MarginalTrailer) { TEST_P(LogTest, MarginalTrailer2) { // Make a trailer that is exactly the same length as an empty record. - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); @@ -368,7 +332,8 @@ TEST_P(LogTest, MarginalTrailer2) { } TEST_P(LogTest, ShortTrailer) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize; const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); @@ -381,7 +346,8 @@ TEST_P(LogTest, ShortTrailer) { } TEST_P(LogTest, AlignedEof) { - int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + int header_size = + std::get<0>(GetParam()) ? 
   const int n = kBlockSize - 2 * header_size;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
@@ -355,7 +318,8 @@ TEST_P(LogTest, MarginalTrailer) {
 TEST_P(LogTest, MarginalTrailer2) {
   // Make a trailer that is exactly the same length as an empty record.
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
   const int n = kBlockSize - 2 * header_size;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
@@ -368,7 +332,8 @@ TEST_P(LogTest, MarginalTrailer2) {
 }
 TEST_P(LogTest, ShortTrailer) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
   const int n = kBlockSize - 2 * header_size + 4;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
@@ -381,7 +346,8 @@ TEST_P(LogTest, ShortTrailer) {
 }
 TEST_P(LogTest, AlignedEof) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  int header_size =
+      std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
   const int n = kBlockSize - 2 * header_size + 4;
   Write(BigString("foo", n));
   ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
@@ -432,6 +398,11 @@ TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
 }
 TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
   Write("foo");
   ShrinkSize(4);  // Drop all payload as well as a header byte
   ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
@@ -441,13 +412,20 @@ TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
 }
 TEST_P(LogTest, BadLength) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
   const int kPayloadSize = kBlockSize - header_size;
   Write(BigString("bar", kPayloadSize));
   Write("foo");
   // Least significant size byte is stored in header[4].
   IncrementByte(4, 1);
-  if (!GetParam()) {
+  if (!recyclable_log) {
     ASSERT_EQ("foo", Read());
     ASSERT_EQ(kBlockSize, DroppedBytes());
     ASSERT_EQ("OK", MatchError("bad record length"));
@@ -457,6 +435,12 @@ TEST_P(LogTest, BadLength) {
 }
 TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
   Write("foo");
   ShrinkSize(1);
   ASSERT_EQ("EOF", Read());
@@ -465,6 +449,12 @@ TEST_P(LogTest, BadLengthAtEndIsIgnored) {
 }
 TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then we should not raise an error when the
+    // record length specified in header is longer than data currently
+    // available. It's possible that the body of the record is not written yet.
+    return;
+  }
   Write("foo");
   ShrinkSize(1);
   ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
@@ -476,7 +466,8 @@ TEST_P(LogTest, ChecksumMismatch) {
   Write("foooooo");
   IncrementByte(0, 14);
   ASSERT_EQ("EOF", Read());
-  if (!GetParam()) {
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
     ASSERT_EQ(14U, DroppedBytes());
     ASSERT_EQ("OK", MatchError("checksum mismatch"));
   } else {
@@ -487,8 +478,10 @@ TEST_P(LogTest, ChecksumMismatch) {
 }
 TEST_P(LogTest, UnexpectedMiddleType) {
   Write("foo");
-  SetByte(6, GetParam() ? kRecyclableMiddleType : kMiddleType);
-  FixChecksum(0, 3, !!GetParam());
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
                                              : kMiddleType));
+  FixChecksum(0, 3, !!recyclable_log);
   ASSERT_EQ("EOF", Read());
   ASSERT_EQ(3U, DroppedBytes());
   ASSERT_EQ("OK", MatchError("missing start"));
@@ -496,8 +489,10 @@ TEST_P(LogTest, UnexpectedMiddleType) {
 }
 TEST_P(LogTest, UnexpectedLastType) {
   Write("foo");
-  SetByte(6, GetParam() ? kRecyclableLastType : kLastType);
-  FixChecksum(0, 3, !!GetParam());
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(6,
+          static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+  FixChecksum(0, 3, !!recyclable_log);
   ASSERT_EQ("EOF", Read());
   ASSERT_EQ(3U, DroppedBytes());
   ASSERT_EQ("OK", MatchError("missing start"));
@@ -506,8 +501,10 @@ TEST_P(LogTest, UnexpectedLastType) {
 TEST_P(LogTest, UnexpectedFullType) {
   Write("foo");
   Write("bar");
-  SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
-  FixChecksum(0, 3, !!GetParam());
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(
+      6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+  FixChecksum(0, 3, !!recyclable_log);
   ASSERT_EQ("bar", Read());
   ASSERT_EQ("EOF", Read());
   ASSERT_EQ(3U, DroppedBytes());
@@ -517,8 +514,10 @@ TEST_P(LogTest, UnexpectedFullType) {
 TEST_P(LogTest, UnexpectedFirstType) {
   Write("foo");
   Write(BigString("bar", 100000));
-  SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
-  FixChecksum(0, 3, !!GetParam());
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  SetByte(
+      6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+  FixChecksum(0, 3, !!recyclable_log);
   ASSERT_EQ(BigString("bar", 100000), Read());
   ASSERT_EQ("EOF", Read());
   ASSERT_EQ(3U, DroppedBytes());
@@ -535,6 +534,11 @@ TEST_P(LogTest, MissingLastIsIgnored) {
 }
 TEST_P(LogTest, MissingLastIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
   Write(BigString("bar", kBlockSize));
   // Remove the LAST block, including header.
   ShrinkSize(14);
@@ -553,6 +557,11 @@ TEST_P(LogTest, PartialLastIsIgnored) {
 }
 TEST_P(LogTest, PartialLastIsNotIgnored) {
+  if (allow_retry_read_) {
+    // If read retry is allowed, then truncated trailing record should not
+    // raise an error.
+    return;
+  }
   Write(BigString("bar", kBlockSize));
   // Cause a bad record length in the LAST block.
   ShrinkSize(1);
@@ -579,7 +588,8 @@ TEST_P(LogTest, ErrorJoinsRecords) {
     SetByte(offset, 'x');
   }
-  if (!GetParam()) {
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
     ASSERT_EQ("correct", Read());
     ASSERT_EQ("EOF", Read());
     size_t dropped = DroppedBytes();
@@ -590,59 +600,11 @@ TEST_P(LogTest, ErrorJoinsRecords) {
   }
 }
-TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
-
-TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
-
-TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
-
-TEST_P(LogTest, ReadSecondStart) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-  CheckInitialOffsetRecord(10000 + header_size, 1);
-}
-
-TEST_P(LogTest, ReadThirdOneOff) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-  CheckInitialOffsetRecord(10000 + header_size + 1, 2);
-}
-
-TEST_P(LogTest, ReadThirdStart) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-  CheckInitialOffsetRecord(20000 + 2 * header_size, 2);
-}
-
-TEST_P(LogTest, ReadFourthOneOff) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-  CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3);
-}
-
-TEST_P(LogTest, ReadFourthFirstBlockTrailer) {
-  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
-}
-
-TEST_P(LogTest, ReadFourthMiddleBlock) {
-  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
-}
-
-TEST_P(LogTest, ReadFourthLastBlock) {
-  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
-}
-
-TEST_P(LogTest, ReadFourthStart) {
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
-  CheckInitialOffsetRecord(
-      2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size,
-      3);
-}
-
-TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
-
-TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
-
 TEST_P(LogTest, ClearEofSingleBlock) {
   Write("foo");
   Write("bar");
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
   ForceEOF(3 + header_size + 2);
   ASSERT_EQ("foo", Read());
   UnmarkEOF();
@@ -657,7 +619,8 @@ TEST_P(LogTest, ClearEofSingleBlock) {
 TEST_P(LogTest, ClearEofMultiBlock) {
   size_t num_full_blocks = 5;
-  int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
   size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
   Write(BigString("foo", n));
   Write(BigString("bar", n));
@@ -706,7 +669,8 @@ TEST_P(LogTest, ClearEofError2) {
 }
 TEST_P(LogTest, Recycle) {
-  if (!GetParam()) {
+  bool recyclable_log = (std::get<0>(GetParam()) != 0);
+  if (!recyclable_log) {
     return;  // test is only valid for recycled logs
   }
   Write("foo");
@@ -717,8 +681,9 @@ TEST_P(LogTest, Recycle) {
   while (get_reader_contents()->size() < log::kBlockSize * 2) {
     Write("xxxxxxxxxxxxxxxx");
   }
-  unique_ptr<WritableFileWriter> dest_holder(test::GetWritableFileWriter(
-      new test::OverwritingStringSink(get_reader_contents())));
+  std::unique_ptr<WritableFileWriter> dest_holder(test::GetWritableFileWriter(
+      new test::OverwritingStringSink(get_reader_contents()),
+      "" /* don't care */));
   Writer recycle_writer(std::move(dest_holder), 123, true);
   recycle_writer.AddRecord(Slice("foooo"));
   recycle_writer.AddRecord(Slice("bar"));
@@ -728,7 +693,224 @@ TEST_P(LogTest, Recycle) {
   ASSERT_EQ("EOF", Read());
 }
-INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2));
+INSTANTIATE_TEST_CASE_P(bool, LogTest,
+                        ::testing::Values(std::make_tuple(0, false),
+                                          std::make_tuple(0, true),
+                                          std::make_tuple(1, false),
+                                          std::make_tuple(1, true)));
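
Aside: the RetriableLogTest cases below coordinate a writer thread and a reader thread with RocksDB's test-only sync points (util/sync_point.h). LoadDependency({{"A", "B"}}) makes any thread reaching point B block until some thread has passed point A, and SetCallBack attaches code to a named point. Reduced to a skeleton (point names hypothetical):

    rocksdb::SyncPoint::GetInstance()->LoadDependency(
        {{"Writer:AfterPart1", "Reader:Start"}});
    rocksdb::SyncPoint::GetInstance()->EnableProcessing();
    // writer thread: ... TEST_SYNC_POINT("Writer:AfterPart1"); ...
    // reader thread: TEST_SYNC_POINT("Reader:Start"); ...
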
&writable_file, env_options_); + } + if (s.ok()) { + writer_.reset(new WritableFileWriter(std::move(writable_file), log_file_, + env_options_)); + assert(writer_ != nullptr); + } + std::unique_ptr seq_file; + if (s.ok()) { + s = env_->NewSequentialFile(log_file_, &seq_file, env_options_); + } + if (s.ok()) { + reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_)); + assert(reader_ != nullptr); + log_reader_.reset(new FragmentBufferedReader( + nullptr, std::move(reader_), &report_, true /* checksum */, + 123 /* log_number */)); + assert(log_reader_ != nullptr); + } + return s; + } + + std::string contents() { + auto file = + dynamic_cast(log_writer_->file()->writable_file()); + assert(file != nullptr); + return file->contents_; + } + + void Encode(const std::string& msg) { log_writer_->AddRecord(Slice(msg)); } + + void Write(const Slice& data) { + writer_->Append(data); + writer_->Sync(true); + } + + bool TryRead(std::string* result) { + assert(result != nullptr); + result->clear(); + std::string scratch; + Slice record; + bool r = log_reader_->ReadRecord(&record, &scratch); + if (r) { + result->assign(record.data(), record.size()); + return true; + } else { + return false; + } + } +}; + +TEST_P(RetriableLogTest, TailLog_PartialHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size - 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); + ASSERT_TRUE(eof); +} + +TEST_P(RetriableLogTest, TailLog_FullHeader) { + ASSERT_OK(SetupTestEnv()); + std::vector remaining_bytes_in_last_record; + size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + bool eof = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"RetriableLogTest::TailLog:AfterPart1", + "RetriableLogTest::TailLog:BeforeReadRecord"}, + {"FragmentBufferedLogReader::TryReadMore:FirstEOF", + "RetriableLogTest::TailLog:BeforePart2"}}); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "FragmentBufferedLogReader::TryReadMore:FirstEOF", + [&](void* /*arg*/) { eof = true; }); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t delta = header_size + 1; + port::Thread log_writer_thread([&]() { + size_t old_sz = contents().size(); + Encode("foo"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1"); + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2"); + Write(Slice(part2)); + ASSERT_TRUE(eof); + }); + + std::string record; + port::Thread log_reader_thread([&]() { + TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord"); + while (!TryRead(&record)) { + } + }); + log_reader_thread.join(); + log_writer_thread.join(); + ASSERT_EQ("foo", record); +} + +TEST_P(RetriableLogTest, NonBlockingReadFullRecord) { + // Clear all sync point callbacks even if this test does not use sync point. + // It is necessary, otherwise the execute of this test may hit a sync point + // with which a callback is registered. The registered callback may access + // some dead variable, causing segfault. + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + ASSERT_OK(SetupTestEnv()); + size_t header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + size_t delta = header_size - 1; + size_t old_sz = contents().size(); + Encode("foo-bar"); + size_t new_sz = contents().size(); + std::string part1 = contents().substr(old_sz, delta); + std::string part2 = + contents().substr(old_sz + delta, new_sz - old_sz - delta); + Write(Slice(part1)); + std::string record; + ASSERT_FALSE(TryRead(&record)); + ASSERT_TRUE(record.empty()); + Write(Slice(part2)); + ASSERT_TRUE(TryRead(&record)); + ASSERT_EQ("foo-bar", record); +} + +INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2)); } // namespace log } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/log_writer.cc b/thirdparty/rocksdb/db/log_writer.cc index b02eec89dd..bc99931b9a 100644 --- a/thirdparty/rocksdb/db/log_writer.cc +++ b/thirdparty/rocksdb/db/log_writer.cc @@ -18,7 +18,7 @@ namespace rocksdb { namespace log { -Writer::Writer(unique_ptr&& dest, uint64_t log_number, +Writer::Writer(std::unique_ptr&& dest, uint64_t log_number, bool recycle_log_files, bool manual_flush) : dest_(std::move(dest)), block_offset_(0), @@ -57,8 +57,11 @@ Status Writer::AddRecord(const Slice& slice) { // Fill the trailer (literal below relies on kHeaderSize and // kRecyclableHeaderSize being <= 11) assert(header_size <= 11); - dest_->Append( - Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover)); + s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + static_cast(leftover))); + if (!s.ok()) { + break; + } } block_offset_ = 0; } @@ -89,6 +92,8 @@ Status Writer::AddRecord(const Slice& slice) { return s; } +bool Writer::TEST_BufferIsEmpty() { return dest_->TEST_BufferIsEmpty(); } + Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes diff --git a/thirdparty/rocksdb/db/log_writer.h b/thirdparty/rocksdb/db/log_writer.h index a3a879924e..3638beb7eb 100644 --- a/thirdparty/rocksdb/db/log_writer.h +++ b/thirdparty/rocksdb/db/log_writer.h @@ -20,8 +20,6 @@ namespace rocksdb { class WritableFileWriter; -using std::unique_ptr; - namespace log { /** @@ -49,7 +47,7 @@ namespace log { * |CRC (4B) | Size (2B) | Type (1B) | Payload | * +---------+-----------+-----------+--- ... ---+ * - * CRC = 32bit hash computed over the payload using CRC + * CRC = 32bit hash computed over the record type and payload using CRC * Size = Length of the payload data * Type = Type of record * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType ) @@ -72,8 +70,9 @@ class Writer { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. - explicit Writer(unique_ptr&& dest, uint64_t log_number, - bool recycle_log_files, bool manual_flush = false); + explicit Writer(std::unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files, + bool manual_flush = false); ~Writer(); Status AddRecord(const Slice& slice); @@ -85,8 +84,10 @@ class Writer { Status WriteBuffer(); + bool TEST_BufferIsEmpty(); + private: - unique_ptr dest_; + std::unique_ptr dest_; size_t block_offset_; // Current offset in block uint64_t log_number_; bool recycle_log_files_; diff --git a/thirdparty/rocksdb/db/logs_with_prep_tracker.cc b/thirdparty/rocksdb/db/logs_with_prep_tracker.cc new file mode 100644 index 0000000000..1082dc102a --- /dev/null +++ b/thirdparty/rocksdb/db/logs_with_prep_tracker.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "db/logs_with_prep_tracker.h" + +#include "port/likely.h" + +namespace rocksdb { +void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) { + assert(log != 0); + std::lock_guard lock(prepared_section_completed_mutex_); + auto it = prepared_section_completed_.find(log); + if (UNLIKELY(it == prepared_section_completed_.end())) { + prepared_section_completed_[log] = 1; + } else { + it->second += 1; + } +} + +void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) { + assert(log != 0); + std::lock_guard lock(logs_with_prep_mutex_); + + auto rit = logs_with_prep_.rbegin(); + bool updated = false; + // Most probably the last log is the one that is being marked for + // having a prepare section; so search from the end. + for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) { + if (rit->log == log) { + rit->cnt++; + updated = true; + break; + } + } + if (!updated) { + // We are either at the start, or at a position with rit->log < log + logs_with_prep_.insert(rit.base(), {log, 1}); + } +} + +uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() { + std::lock_guard lock(logs_with_prep_mutex_); + auto it = logs_with_prep_.begin(); + // start with the smallest log + for (; it != logs_with_prep_.end();) { + auto min_log = it->log; + { + std::lock_guard lock2(prepared_section_completed_mutex_); + auto completed_it = prepared_section_completed_.find(min_log); + if (completed_it == prepared_section_completed_.end() || + completed_it->second < it->cnt) { + return min_log; + } + assert(completed_it != prepared_section_completed_.end() && + completed_it->second == it->cnt); + prepared_section_completed_.erase(completed_it); + } + // erase from beginning in vector is not efficient but this function is not + // on the fast path. + it = logs_with_prep_.erase(it); + } + // no such log found + return 0; +} +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/logs_with_prep_tracker.h b/thirdparty/rocksdb/db/logs_with_prep_tracker.h new file mode 100644 index 0000000000..639d8f8069 --- /dev/null +++ b/thirdparty/rocksdb/db/logs_with_prep_tracker.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace rocksdb { + +// This class is used to track the log files with outstanding prepare entries. +class LogsWithPrepTracker { + public: + // Called when a transaction prepared in `log` has been committed or aborted. + void MarkLogAsHavingPrepSectionFlushed(uint64_t log); + // Called when a transaction is prepared in `log`. + void MarkLogAsContainingPrepSection(uint64_t log); + // Return the earliest log file with outstanding prepare entries. + uint64_t FindMinLogContainingOutstandingPrep(); + size_t TEST_PreparedSectionCompletedSize() { + return prepared_section_completed_.size(); + } + size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); } + + private: + // REQUIRES: logs_with_prep_mutex_ held + // + // sorted list of log numbers still containing prepared data. 
+ // this is used by FindObsoleteFiles to determine which + // flushed logs we must keep around because they still + // contain prepared data which has not been committed or rolled back + struct LogCnt { + uint64_t log; // the log number + uint64_t cnt; // number of prepared sections in the log + }; + std::vector logs_with_prep_; + std::mutex logs_with_prep_mutex_; + + // REQUIRES: prepared_section_completed_mutex_ held + // + // to be used in conjunction with logs_with_prep_. + // once a transaction with data in log L is committed or rolled back + // rather than updating logs_with_prep_ directly we keep track of that + // in prepared_section_completed_ which maps LOG -> instance_count. This helps + // avoiding contention between a commit thread and the prepare threads. + // + // when trying to determine the minimum log still active we first + // consult logs_with_prep_. while that root value maps to + // an equal value in prepared_section_completed_ we erase the log from + // both logs_with_prep_ and prepared_section_completed_. + std::unordered_map prepared_section_completed_; + std::mutex prepared_section_completed_mutex_; + +}; +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/malloc_stats.cc b/thirdparty/rocksdb/db/malloc_stats.cc index 7acca65123..bcee5c3fbf 100644 --- a/thirdparty/rocksdb/db/malloc_stats.cc +++ b/thirdparty/rocksdb/db/malloc_stats.cc @@ -13,10 +13,16 @@ #include #include +#include "port/jemalloc_helper.h" + + namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#include "jemalloc/jemalloc.h" + +#ifdef JEMALLOC_NO_RENAME +#define malloc_stats_print je_malloc_stats_print +#endif typedef struct { char* cur; @@ -34,19 +40,20 @@ static void GetJemallocStatus(void* mstat_arg, const char* status) { snprintf(mstat->cur, buf_size, "%s", status); mstat->cur += status_len; } -#endif // ROCKSDB_JEMALLOC - void DumpMallocStats(std::string* stats) { -#ifdef ROCKSDB_JEMALLOC + if (!HasJemalloc()) { + return; + } MallocStatus mstat; const unsigned int kMallocStatusLen = 1000000; std::unique_ptr buf{new char[kMallocStatusLen + 1]}; mstat.cur = buf.get(); mstat.end = buf.get() + kMallocStatusLen; - je_malloc_stats_print(GetJemallocStatus, &mstat, ""); + malloc_stats_print(GetJemallocStatus, &mstat, ""); stats->append(buf.get()); -#endif // ROCKSDB_JEMALLOC -} - } +#else +void DumpMallocStats(std::string*) {} +#endif // ROCKSDB_JEMALLOC +} // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/managed_iterator.cc b/thirdparty/rocksdb/db/managed_iterator.cc deleted file mode 100644 index c393eb5a6f..0000000000 --- a/thirdparty/rocksdb/db/managed_iterator.cc +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#ifndef ROCKSDB_LITE - -#include "db/managed_iterator.h" - -#include -#include -#include - -#include "db/column_family.h" -#include "db/db_impl.h" -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "rocksdb/env.h" -#include "rocksdb/slice.h" -#include "rocksdb/slice_transform.h" -#include "table/merging_iterator.h" - -namespace rocksdb { - -namespace { -// Helper class that locks a mutex on construction and unlocks the mutex when -// the destructor of the MutexLock object is invoked. -// -// Typical usage: -// -// void MyClass::MyMethod() { -// MILock l(&mu_); // mu_ is an instance variable -// ... 
some complex code, possibly with multiple return paths ... -// } - -class MILock { - public: - explicit MILock(std::mutex* mu, ManagedIterator* mi) : mu_(mu), mi_(mi) { - this->mu_->lock(); - } - ~MILock() { - this->mu_->unlock(); - } - ManagedIterator* GetManagedIterator() { return mi_; } - - private: - std::mutex* const mu_; - ManagedIterator* mi_; - // No copying allowed - MILock(const MILock&) = delete; - void operator=(const MILock&) = delete; -}; -} // anonymous namespace - -// -// Synchronization between modifiers, releasers, creators -// If iterator operation, wait till (!in_use), set in_use, do op, reset in_use -// if modifying mutable_iter, atomically exchange in_use: -// return if in_use set / otherwise set in use, -// atomically replace new iter with old , reset in use -// The releaser is the new operation and it holds a lock for a very short time -// The existing non-const iterator operations are supposed to be single -// threaded and hold the lock for the duration of the operation -// The existing const iterator operations use the cached key/values -// and don't do any locking. -ManagedIterator::ManagedIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd) - : db_(db), - read_options_(read_options), - cfd_(cfd), - svnum_(cfd->GetSuperVersionNumber()), - mutable_iter_(nullptr), - valid_(false), - snapshot_created_(false), - release_supported_(true) { - read_options_.managed = false; - if ((!read_options_.tailing) && (read_options_.snapshot == nullptr)) { - assert(nullptr != (read_options_.snapshot = db_->GetSnapshot())); - snapshot_created_ = true; - } - cfh_.SetCFD(cfd); - mutable_iter_ = unique_ptr(db->NewIterator(read_options_, &cfh_)); -} - -ManagedIterator::~ManagedIterator() { - Lock(); - if (snapshot_created_) { - db_->ReleaseSnapshot(read_options_.snapshot); - snapshot_created_ = false; - read_options_.snapshot = nullptr; - } - UnLock(); -} - -bool ManagedIterator::Valid() const { return valid_; } - -void ManagedIterator::SeekToLast() { - MILock l(&in_use_, this); - if (NeedToRebuild()) { - RebuildIterator(); - } - assert(mutable_iter_ != nullptr); - mutable_iter_->SeekToLast(); - if (mutable_iter_->status().ok()) { - UpdateCurrent(); - } -} - -void ManagedIterator::SeekToFirst() { - MILock l(&in_use_, this); - SeekInternal(Slice(), true); -} - -void ManagedIterator::Seek(const Slice& user_key) { - MILock l(&in_use_, this); - SeekInternal(user_key, false); -} - -void ManagedIterator::SeekForPrev(const Slice& user_key) { - MILock l(&in_use_, this); - if (NeedToRebuild()) { - RebuildIterator(); - } - assert(mutable_iter_ != nullptr); - mutable_iter_->SeekForPrev(user_key); - UpdateCurrent(); -} - -void ManagedIterator::SeekInternal(const Slice& user_key, bool seek_to_first) { - if (NeedToRebuild()) { - RebuildIterator(); - } - assert(mutable_iter_ != nullptr); - if (seek_to_first) { - mutable_iter_->SeekToFirst(); - } else { - mutable_iter_->Seek(user_key); - } - UpdateCurrent(); -} - -void ManagedIterator::Prev() { - if (!valid_) { - status_ = Status::InvalidArgument("Iterator value invalid"); - return; - } - MILock l(&in_use_, this); - if (NeedToRebuild()) { - std::string current_key = key().ToString(); - Slice old_key(current_key); - RebuildIterator(); - SeekInternal(old_key, false); - UpdateCurrent(); - if (!valid_) { - return; - } - if (key().compare(old_key) != 0) { - valid_ = false; - status_ = Status::Incomplete("Cannot do Prev now"); - return; - } - } - mutable_iter_->Prev(); - if (mutable_iter_->status().ok()) { - UpdateCurrent(); - 
status_ = Status::OK(); - } else { - status_ = mutable_iter_->status(); - } -} - -void ManagedIterator::Next() { - if (!valid_) { - status_ = Status::InvalidArgument("Iterator value invalid"); - return; - } - MILock l(&in_use_, this); - if (NeedToRebuild()) { - std::string current_key = key().ToString(); - Slice old_key(current_key.data(), cached_key_.Size()); - RebuildIterator(); - SeekInternal(old_key, false); - UpdateCurrent(); - if (!valid_) { - return; - } - if (key().compare(old_key) != 0) { - valid_ = false; - status_ = Status::Incomplete("Cannot do Next now"); - return; - } - } - mutable_iter_->Next(); - UpdateCurrent(); -} - -Slice ManagedIterator::key() const { - assert(valid_); - return cached_key_.GetUserKey(); -} - -Slice ManagedIterator::value() const { - assert(valid_); - return cached_value_.GetUserKey(); -} - -Status ManagedIterator::status() const { return status_; } - -void ManagedIterator::RebuildIterator() { - svnum_ = cfd_->GetSuperVersionNumber(); - mutable_iter_ = unique_ptr(db_->NewIterator(read_options_, &cfh_)); -} - -void ManagedIterator::UpdateCurrent() { - assert(mutable_iter_ != nullptr); - - valid_ = mutable_iter_->Valid(); - if (!valid_) { - status_ = mutable_iter_->status(); - return; - } - - status_ = Status::OK(); - cached_key_.SetUserKey(mutable_iter_->key()); - cached_value_.SetUserKey(mutable_iter_->value()); -} - -void ManagedIterator::ReleaseIter(bool only_old) { - if ((mutable_iter_ == nullptr) || (!release_supported_)) { - return; - } - if (svnum_ != cfd_->GetSuperVersionNumber() || !only_old) { - if (!TryLock()) { // Don't release iter if in use - return; - } - mutable_iter_ = nullptr; // in_use for a very short time - UnLock(); - } -} - -bool ManagedIterator::NeedToRebuild() { - if ((mutable_iter_ == nullptr) || (status_.IsIncomplete()) || - (!only_drop_old_ && (svnum_ != cfd_->GetSuperVersionNumber()))) { - return true; - } - return false; -} - -void ManagedIterator::Lock() { - in_use_.lock(); - return; -} - -bool ManagedIterator::TryLock() { return in_use_.try_lock(); } - -void ManagedIterator::UnLock() { - in_use_.unlock(); -} - -} // namespace rocksdb - -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/managed_iterator.h b/thirdparty/rocksdb/db/managed_iterator.h deleted file mode 100644 index 8e962f781a..0000000000 --- a/thirdparty/rocksdb/db/managed_iterator.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#pragma once - -#ifndef ROCKSDB_LITE - -#include -#include -#include -#include - -#include "db/column_family.h" -#include "rocksdb/db.h" -#include "rocksdb/iterator.h" -#include "rocksdb/options.h" -#include "util/arena.h" - -namespace rocksdb { - -class DBImpl; -struct SuperVersion; -class ColumnFamilyData; - -/** - * ManagedIterator is a special type of iterator that supports freeing the - * underlying iterator and still being able to access the current key/value - * pair. This is done by copying the key/value pair so that clients can - * continue to access the data without getting a SIGSEGV. - * The underlying iterator can be freed manually through the call to - * ReleaseIter or automatically (as needed on space pressure or age.) - * The iterator is recreated using the saved original arguments. 
- */
-class ManagedIterator : public Iterator {
- public:
-  ManagedIterator(DBImpl* db, const ReadOptions& read_options,
-                  ColumnFamilyData* cfd);
-  virtual ~ManagedIterator();
-
-  virtual void SeekToLast() override;
-  virtual void Prev() override;
-  virtual bool Valid() const override;
-  void SeekToFirst() override;
-  virtual void Seek(const Slice& target) override;
-  virtual void SeekForPrev(const Slice& target) override;
-  virtual void Next() override;
-  virtual Slice key() const override;
-  virtual Slice value() const override;
-  virtual Status status() const override;
-  void ReleaseIter(bool only_old);
-  void SetDropOld(bool only_old) {
-    only_drop_old_ = read_options_.tailing || only_old;
-  }
-
- private:
-  void RebuildIterator();
-  void UpdateCurrent();
-  void SeekInternal(const Slice& user_key, bool seek_to_first);
-  bool NeedToRebuild();
-  void Lock();
-  bool TryLock();
-  void UnLock();
-  DBImpl* const db_;
-  ReadOptions read_options_;
-  ColumnFamilyData* const cfd_;
-  ColumnFamilyHandleInternal cfh_;
-
-  uint64_t svnum_;
-  std::unique_ptr<Iterator> mutable_iter_;
-  // internal iterator status
-  Status status_;
-  bool valid_;
-
-  IterKey cached_key_;
-  IterKey cached_value_;
-
-  bool only_drop_old_ = true;
-  bool snapshot_created_;
-  bool release_supported_;
-  std::mutex in_use_;  // is managed iterator in use
-};
-
-}  // namespace rocksdb
-#endif  // !ROCKSDB_LITE
diff --git a/thirdparty/rocksdb/db/manual_compaction_test.cc b/thirdparty/rocksdb/db/manual_compaction_test.cc
index 039b9080ed..02732a5558 100644
--- a/thirdparty/rocksdb/db/manual_compaction_test.cc
+++ b/thirdparty/rocksdb/db/manual_compaction_test.cc
@@ -19,7 +19,11 @@ using namespace rocksdb;
 
 namespace {
 
-const int kNumKeys = 1100000;
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write each, every write will result in one SST file. We
+// reduced the write_buffer_size to 1K to get basically the same effect with
+// far fewer keys, which results in a shorter test runtime.
+const int kNumKeys = 1100;
 
 std::string Key1(int i) {
   char buf[100];
@@ -35,7 +40,7 @@ class ManualCompactionTest : public testing::Test {
  public:
   ManualCompactionTest() {
     // Get rid of any state from an old run.
-    dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
+    dbname_ = rocksdb::test::PerThreadDBPath("rocksdb_cbug_test");
     DestroyDB(dbname_, rocksdb::Options());
   }
@@ -46,15 +51,13 @@
 class DestroyAllCompactionFilter : public CompactionFilter {
  public:
   DestroyAllCompactionFilter() {}
 
-  virtual bool Filter(int level, const Slice& key, const Slice& existing_value,
-                      std::string* new_value,
-                      bool* value_changed) const override {
+  bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+              std::string* /*new_value*/,
+              bool* /*value_changed*/) const override {
     return existing_value.ToString() == "destroy";
   }
 
-  virtual const char* Name() const override {
-    return "DestroyAllCompactionFilter";
-  }
+  const char* Name() const override { return "DestroyAllCompactionFilter"; }
 };
 
 TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
@@ -99,6 +102,7 @@ TEST_F(ManualCompactionTest, Test) {
   // specific scenario.
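// A minimal standalone sketch of the effect the kNumKeys comment above
// describes (illustrative only; the DB path is an arbitrary assumption, not
// part of this patch): with a 1 KB write buffer, each single-key write fills
// the memtable almost immediately, so N writes produce on the order of N
// flushed L0 files, which is the condition the test relies on.
//
//   rocksdb::Options opts;
//   opts.create_if_missing = true;
//   opts.write_buffer_size = 1024;  // tiny memtable: flush after ~every write
//   rocksdb::DB* demo_db;
//   rocksdb::DB::Open(opts, "/tmp/small_write_buffer_demo", &demo_db);
//   for (int i = 0; i < kNumKeys; ++i) {
//     demo_db->Put(rocksdb::WriteOptions(), Key1(i), "value");
//   }
//   delete demo_db;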
rocksdb::DB* db; rocksdb::Options db_options; + db_options.write_buffer_size = 1024; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); diff --git a/thirdparty/rocksdb/db/memtable.cc b/thirdparty/rocksdb/db/memtable.cc index d51b261873..16b5c8ee0f 100644 --- a/thirdparty/rocksdb/db/memtable.cc +++ b/thirdparty/rocksdb/db/memtable.cc @@ -17,6 +17,8 @@ #include "db/merge_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/read_callback.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "port/port.h" @@ -33,8 +35,8 @@ #include "util/autovector.h" #include "util/coding.h" #include "util/memory_usage.h" -#include "util/murmurhash.h" #include "util/mutexlock.h" +#include "util/util.h" namespace rocksdb { @@ -48,6 +50,8 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( mutable_cf_options.memtable_prefix_bloom_size_ratio) * 8u), memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size), + memtable_whole_key_filtering( + mutable_cf_options.memtable_whole_key_filtering), inplace_update_support(ioptions.inplace_update_support), inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), inplace_callback(ioptions.inplace_callback), @@ -66,15 +70,16 @@ MemTable::MemTable(const InternalKeyComparator& cmp, refs_(0), kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), mem_tracker_(write_buffer_manager), - arena_( - moptions_.arena_block_size, - (write_buffer_manager != nullptr && write_buffer_manager->enabled()) - ? &mem_tracker_ - : nullptr, - mutable_cf_options.memtable_huge_page_size), + arena_(moptions_.arena_block_size, + (write_buffer_manager != nullptr && + (write_buffer_manager->enabled() || + write_buffer_manager->cost_to_cache())) + ? &mem_tracker_ + : nullptr, + mutable_cf_options.memtable_huge_page_size), table_(ioptions.memtable_factory->CreateMemTableRep( - comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log, - column_family_id)), + comparator_, &arena_, mutable_cf_options.prefix_extractor.get(), + ioptions.info_log, column_family_id)), range_del_table_(SkipListFactory().CreateMemTableRep( comparator_, &arena_, nullptr /* transform */, ioptions.info_log, column_family_id)), @@ -94,21 +99,24 @@ MemTable::MemTable(const InternalKeyComparator& cmp, locks_(moptions_.inplace_update_support ? 
moptions_.inplace_update_num_locks : 0), - prefix_extractor_(ioptions.prefix_extractor), + prefix_extractor_(mutable_cf_options.prefix_extractor.get()), flush_state_(FLUSH_NOT_REQUESTED), env_(ioptions.env), insert_with_hint_prefix_extractor_( ioptions.memtable_insert_with_hint_prefix_extractor), - oldest_key_time_(std::numeric_limits::max()) { + oldest_key_time_(std::numeric_limits::max()), + atomic_flush_seqno_(kMaxSequenceNumber) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); - if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) { - prefix_bloom_.reset(new DynamicBloom( - &arena_, moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality, - 6 /* hard coded 6 probes */, nullptr, moptions_.memtable_huge_page_size, - ioptions.info_log)); + // use bloom_filter_ for both whole key and prefix bloom filter + if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) && + moptions_.memtable_prefix_bloom_bits > 0) { + bloom_filter_.reset( + new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits, + ioptions.bloom_locality, 6 /* hard coded 6 probes */, + moptions_.memtable_huge_page_size, ioptions.info_log)); } } @@ -224,15 +232,23 @@ int MemTable::KeyComparator::operator()(const char* prefix_len_key1, // Internal keys are encoded as length-prefixed strings. Slice k1 = GetLengthPrefixedSlice(prefix_len_key1); Slice k2 = GetLengthPrefixedSlice(prefix_len_key2); - return comparator.Compare(k1, k2); + return comparator.CompareKeySeq(k1, k2); } int MemTable::KeyComparator::operator()(const char* prefix_len_key, - const Slice& key) + const KeyComparator::DecodedType& key) const { // Internal keys are encoded as length-prefixed strings. Slice a = GetLengthPrefixedSlice(prefix_len_key); - return comparator.Compare(a, key); + return comparator.CompareKeySeq(a, key); +} + +void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { +#ifndef ROCKSDB_LITE + throw std::runtime_error("concurrent insert not supported"); +#else + abort(); +#endif } Slice MemTableRep::UserKey(const char* key) const { @@ -269,19 +285,18 @@ class MemTableIterator : public InternalIterator { if (use_range_del_table) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { - bloom_ = mem.prefix_bloom_.get(); + bloom_ = mem.bloom_filter_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { iter_ = mem.table_->GetIterator(arena); } } - ~MemTableIterator() { + ~MemTableIterator() override { #ifndef NDEBUG // Assert that the MemTableIterator is never deleted while // Pinning is Enabled. 
- assert(!pinned_iters_mgr_ || - (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled()); #endif if (arena_mode_) { iter_->~Iterator(); @@ -291,18 +306,18 @@ class MemTableIterator : public InternalIterator { } #ifndef NDEBUG - virtual void SetPinnedItersMgr( - PinnedIteratorsManager* pinned_iters_mgr) override { + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; #endif - virtual bool Valid() const override { return valid_; } - virtual void Seek(const Slice& k) override { + bool Valid() const override { return valid_; } + void Seek(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); - if (bloom_ != nullptr) { + if (bloom_) { + // iterator should only use prefix bloom filter if (!bloom_->MayContain( prefix_extractor_->Transform(ExtractUserKey(k)))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); @@ -315,10 +330,10 @@ class MemTableIterator : public InternalIterator { iter_->Seek(k, nullptr); valid_ = iter_->Valid(); } - virtual void SeekForPrev(const Slice& k) override { + void SeekForPrev(const Slice& k) override { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); - if (bloom_ != nullptr) { + if (bloom_) { if (!bloom_->MayContain( prefix_extractor_->Transform(ExtractUserKey(k)))) { PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); @@ -337,44 +352,44 @@ class MemTableIterator : public InternalIterator { Prev(); } } - virtual void SeekToFirst() override { + void SeekToFirst() override { iter_->SeekToFirst(); valid_ = iter_->Valid(); } - virtual void SeekToLast() override { + void SeekToLast() override { iter_->SeekToLast(); valid_ = iter_->Valid(); } - virtual void Next() override { + void Next() override { PERF_COUNTER_ADD(next_on_memtable_count, 1); assert(Valid()); iter_->Next(); valid_ = iter_->Valid(); } - virtual void Prev() override { + void Prev() override { PERF_COUNTER_ADD(prev_on_memtable_count, 1); assert(Valid()); iter_->Prev(); valid_ = iter_->Valid(); } - virtual Slice key() const override { + Slice key() const override { assert(Valid()); return GetLengthPrefixedSlice(iter_->key()); } - virtual Slice value() const override { + Slice value() const override { assert(Valid()); Slice key_slice = GetLengthPrefixedSlice(iter_->key()); return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); } - virtual Status status() const override { return Status::OK(); } + Status status() const override { return Status::OK(); } - virtual bool IsKeyPinned() const override { + bool IsKeyPinned() const override { // memtable data is always pinned return true; } - virtual bool IsValuePinned() const override { + bool IsValuePinned() const override { // memtable value is always pinned, except if we allow inplace update. 
return value_pinned_; } @@ -400,18 +415,29 @@ InternalIterator* MemTable::NewIterator(const ReadOptions& read_options, return new (mem) MemTableIterator(*this, read_options, arena); } -InternalIterator* MemTable::NewRangeTombstoneIterator( - const ReadOptions& read_options) { - if (read_options.ignore_range_deletions || is_range_del_table_empty_) { +FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq) { + if (read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed)) { + return nullptr; + } + auto* unfragmented_iter = new MemTableIterator( + *this, read_options, nullptr /* arena */, true /* use_range_del_table */); + if (unfragmented_iter == nullptr) { return nullptr; } - return new MemTableIterator(*this, read_options, nullptr /* arena */, - true /* use_range_del_table */); + auto fragmented_tombstone_list = + std::make_shared( + std::unique_ptr(unfragmented_iter), + comparator_.comparator); + + auto* fragmented_iter = new FragmentedRangeTombstoneIterator( + fragmented_tombstone_list, comparator_.comparator, read_seq); + return fragmented_iter; } port::RWMutex* MemTable::GetLock(const Slice& key) { - static murmur_hash hash; - return &locks_[hash(key) % locks_.size()]; + return &locks_[static_cast(GetSliceNPHash64(key)) % locks_.size()]; } MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, @@ -435,7 +461,7 @@ MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey, return {entry_count * (data_size / n), entry_count}; } -void MemTable::Add(SequenceNumber s, ValueType type, +bool MemTable::Add(SequenceNumber s, ValueType type, const Slice& key, /* user key */ const Slice& value, bool allow_concurrent, MemTablePostProcessInfo* post_process_info) { @@ -470,9 +496,15 @@ void MemTable::Add(SequenceNumber s, ValueType type, if (insert_with_hint_prefix_extractor_ != nullptr && insert_with_hint_prefix_extractor_->InDomain(key_slice)) { Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice); - table->InsertWithHint(handle, &insert_hints_[prefix]); + bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]); + if (UNLIKELY(!res)) { + return res; + } } else { - table->Insert(handle); + bool res = table->InsertKey(handle); + if (UNLIKELY(!res)) { + return res; + } } // this is a bit ugly, but is the way to avoid locked instructions @@ -486,13 +518,15 @@ void MemTable::Add(SequenceNumber s, ValueType type, std::memory_order_relaxed); } - if (prefix_bloom_) { - assert(prefix_extractor_); - prefix_bloom_->Add(prefix_extractor_->Transform(key)); + if (bloom_filter_ && prefix_extractor_) { + bloom_filter_->Add(prefix_extractor_->Transform(key)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->Add(key); } // The first sequence number inserted into the memtable - assert(first_seqno_ == 0 || s > first_seqno_); + assert(first_seqno_ == 0 || s >= first_seqno_); if (first_seqno_ == 0) { first_seqno_.store(s, std::memory_order_relaxed); @@ -505,7 +539,10 @@ void MemTable::Add(SequenceNumber s, ValueType type, assert(post_process_info == nullptr); UpdateFlushState(); } else { - table->InsertConcurrently(handle); + bool res = table->InsertKeyConcurrently(handle); + if (UNLIKELY(!res)) { + return res; + } assert(post_process_info != nullptr); post_process_info->num_entries++; @@ -514,9 +551,11 @@ void MemTable::Add(SequenceNumber s, ValueType type, post_process_info->num_deletes++; } - if 
(prefix_bloom_) { - assert(prefix_extractor_); - prefix_bloom_->AddConcurrently(prefix_extractor_->Transform(key)); + if (bloom_filter_ && prefix_extractor_) { + bloom_filter_->AddConcurrently(prefix_extractor_->Transform(key)); + } + if (bloom_filter_ && moptions_.memtable_whole_key_filtering) { + bloom_filter_->AddConcurrently(key); } // atomically update first_seqno_ and earliest_seqno_. @@ -531,10 +570,11 @@ void MemTable::Add(SequenceNumber s, ValueType type, !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { } } - if (is_range_del_table_empty_ && type == kTypeRangeDeletion) { - is_range_del_table_empty_ = false; + if (type == kTypeRangeDeletion) { + is_range_del_table_empty_.store(false, std::memory_order_relaxed); } UpdateOldestKeyTime(); + return true; } // Callback from MemTable::Get() @@ -550,23 +590,32 @@ struct Saver { const MergeOperator* merge_operator; // the merge operations encountered; MergeContext* merge_context; - RangeDelAggregator* range_del_agg; + SequenceNumber max_covering_tombstone_seq; MemTable* mem; Logger* logger; Statistics* statistics; bool inplace_update_support; Env* env_; + ReadCallback* callback_; bool* is_blob_index; + + bool CheckCallback(SequenceNumber _seq) { + if (callback_) { + return callback_->IsVisible(_seq); + } + return true; + } }; } // namespace static bool SaveValue(void* arg, const char* entry) { Saver* s = reinterpret_cast(arg); + assert(s != nullptr); MergeContext* merge_context = s->merge_context; - RangeDelAggregator* range_del_agg = s->range_del_agg; + SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq; const MergeOperator* merge_operator = s->merge_operator; - assert(s != nullptr && merge_context != nullptr && range_del_agg != nullptr); + assert(merge_context != nullptr); // entry format is: // klength varint32 @@ -584,10 +633,17 @@ static bool SaveValue(void* arg, const char* entry) { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - UnPackSequenceAndType(tag, &s->seq, &type); + SequenceNumber seq; + UnPackSequenceAndType(tag, &seq, &type); + // If the value is not in the snapshot, skip it + if (!s->CheckCallback(seq)) { + return true; // to continue to the next seq + } + + s->seq = seq; if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && - range_del_agg->ShouldDelete(Slice(key_ptr, key_length))) { + max_covering_tombstone_seq > seq) { type = kTypeRangeDeletion; } switch (type) { @@ -605,7 +661,7 @@ static bool SaveValue(void* arg, const char* entry) { *(s->found_final_value) = true; return false; } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeValue: { if (s->inplace_update_support) { s->mem->GetLock(s->key->user_key())->ReadLock(); @@ -613,10 +669,12 @@ static bool SaveValue(void* arg, const char* entry) { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->status) = Status::OK(); if (*(s->merge_in_progress)) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), &v, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } } else if (s->value != nullptr) { s->value->assign(v.data(), v.size()); } @@ -633,10 +691,12 @@ static bool SaveValue(void* arg, const char* entry) { case 
kTypeSingleDeletion: case kTypeRangeDeletion: { if (*(s->merge_in_progress)) { - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), nullptr, - merge_context->GetOperands(), s->value, s->logger, s->statistics, - s->env_, nullptr /* result_operand */, true); + if (s->value != nullptr) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, s->logger, + s->statistics, s->env_, nullptr /* result_operand */, true); + } } else { *(s->status) = Status::NotFound(); } @@ -658,6 +718,14 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); + if (merge_operator->ShouldMerge(merge_context->GetOperandsDirectionBackward())) { + *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), nullptr, + merge_context->GetOperands(), s->value, s->logger, s->statistics, + s->env_, nullptr /* result_operand */, true); + *(s->found_final_value) = true; + return false; + } return true; } default: @@ -672,8 +740,9 @@ static bool SaveValue(void* arg, const char* entry) { bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts, bool* is_blob_index) { + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback, bool* is_blob_index) { // The sequence number is updated synchronously in version_set.h if (IsEmpty()) { // Avoiding recording stats for speed. @@ -681,27 +750,36 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, } PERF_TIMER_GUARD(get_from_memtable_time); - std::unique_ptr range_del_iter( - NewRangeTombstoneIterator(read_opts)); - Status status = range_del_agg->AddTombstones(std::move(range_del_iter)); - if (!status.ok()) { - *s = status; - return false; + std::unique_ptr range_del_iter( + NewRangeTombstoneIterator(read_opts, + GetInternalKeySeqno(key.internal_key()))); + if (range_del_iter != nullptr) { + *max_covering_tombstone_seq = + std::max(*max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key())); } Slice user_key = key.user_key(); bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); - bool const may_contain = - nullptr == prefix_bloom_ - ? 
false - : prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key)); - if (prefix_bloom_ && !may_contain) { + bool may_contain = true; + if (bloom_filter_) { + // when both memtable_whole_key_filtering and prefix_extractor_ are set, + // only do whole key filtering for Get() to save CPU + if (moptions_.memtable_whole_key_filtering) { + may_contain = bloom_filter_->MayContain(user_key); + } else { + assert(prefix_extractor_); + may_contain = + bloom_filter_->MayContain(prefix_extractor_->Transform(user_key)); + } + } + if (bloom_filter_ && !may_contain) { // iter is null if prefix bloom says the key does not exist PERF_COUNTER_ADD(bloom_memtable_miss_count, 1); *seq = kMaxSequenceNumber; } else { - if (prefix_bloom_) { + if (bloom_filter_) { PERF_COUNTER_ADD(bloom_memtable_hit_count, 1); } Saver saver; @@ -713,12 +791,13 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.seq = kMaxSequenceNumber; saver.mem = this; saver.merge_context = merge_context; - saver.range_del_agg = range_del_agg; + saver.max_covering_tombstone_seq = *max_covering_tombstone_seq; saver.merge_operator = moptions_.merge_operator; saver.logger = moptions_.info_log; saver.inplace_update_support = moptions_.inplace_update_support; saver.statistics = moptions_.statistics; saver.env_ = env_; + saver.callback_ = callback; saver.is_blob_index = is_blob_index; table_->Get(key, &saver, SaveValue); @@ -761,8 +840,9 @@ void MemTable::Update(SequenceNumber seq, // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); ValueType type; - SequenceNumber unused; - UnPackSequenceAndType(tag, &unused, &type); + SequenceNumber existing_seq; + UnPackSequenceAndType(tag, &existing_seq, &type); + assert(existing_seq != seq); if (type == kTypeValue) { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); uint32_t prev_size = static_cast(prev_value.size()); @@ -777,6 +857,7 @@ void MemTable::Update(SequenceNumber seq, assert((unsigned)((p + value.size()) - entry) == (unsigned)(VarintLength(key_length) + key_length + VarintLength(value.size()) + value.size())); + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); return; } } @@ -784,7 +865,10 @@ void MemTable::Update(SequenceNumber seq, } // key doesn't exist - Add(seq, kTypeValue, key, value); + bool add_res __attribute__((__unused__)); + add_res = Add(seq, kTypeValue, key, value); + // We already checked unused != seq above. In that case, Add should not fail. 
+  assert(add_res);
 }
 
 bool MemTable::UpdateCallback(SequenceNumber seq,
diff --git a/thirdparty/rocksdb/db/memtable.h b/thirdparty/rocksdb/db/memtable.h
index 4f63818eee..709e2061e5 100644
--- a/thirdparty/rocksdb/db/memtable.h
+++ b/thirdparty/rocksdb/db/memtable.h
@@ -16,7 +16,8 @@
 #include <unordered_map>
 #include <vector>
 #include "db/dbformat.h"
-#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
 #include "db/version_edit.h"
 #include "monitoring/instrumented_mutex.h"
 #include "options/cf_options.h"
@@ -33,7 +34,6 @@ namespace rocksdb {
 class Mutex;
 class MemTableIterator;
 class MergeContext;
-class InternalIterator;
 
 struct ImmutableMemTableOptions {
   explicit ImmutableMemTableOptions(const ImmutableCFOptions& ioptions,
@@ -41,6 +41,7 @@ struct ImmutableMemTableOptions {
   size_t arena_block_size;
   uint32_t memtable_prefix_bloom_bits;
   size_t memtable_huge_page_size;
+  bool memtable_whole_key_filtering;
   bool inplace_update_support;
   size_t inplace_update_num_locks;
   UpdateStatus (*inplace_callback)(char* existing_value,
@@ -63,7 +64,7 @@ struct MemTablePostProcessInfo {
 };
 
 // Note: Many of the methods in this class have comments indicating that
-// external synchromization is required as these methods are not thread-safe.
+// external synchronization is required as these methods are not thread-safe.
 // It is up to higher layers of code to decide how to prevent concurrent
 // invokation of these methods. This is usually done by acquiring either
 // the db mutex or the single writer thread.
@@ -83,7 +84,7 @@ class MemTable {
     virtual int operator()(const char* prefix_len_key1,
                            const char* prefix_len_key2) const override;
     virtual int operator()(const char* prefix_len_key,
-                           const Slice& key) const override;
+                           const DecodedType& key) const override;
   };
 
   // MemTables are reference counted. The initial reference count
@@ -158,7 +159,8 @@ class MemTable {
   // those allocated in arena.
   InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
 
-  InternalIterator* NewRangeTombstoneIterator(const ReadOptions& read_options);
+  FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+      const ReadOptions& read_options, SequenceNumber read_seq);
 
   // Add an entry into memtable that maps key to value at the
   // specified sequence number and with the specified type.
@@ -166,7 +168,10 @@
   //
   // REQUIRES: if allow_concurrent = false, external synchronization to prevent
   // simultaneous operations on the same MemTable.
-  void Add(SequenceNumber seq, ValueType type, const Slice& key,
+  //
+  // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and
+  // the <key, seq> already exists.
+  bool Add(SequenceNumber seq, ValueType type, const Slice& key,
            const Slice& value, bool allow_concurrent = false,
            MemTablePostProcessInfo* post_process_info = nullptr);
 
@@ -184,16 +189,19 @@
   // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
   // status returned indicates a corruption or other unexpected error.
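  // A call sketch for the Get() overloads declared below (illustrative only;
  // `mem` and `ukey` are assumed to be an existing MemTable and user key, and
  // kMaxSequenceNumber is used to read the latest visible version):
  //
  //   LookupKey lkey(ukey, kMaxSequenceNumber);
  //   std::string value;
  //   Status s;
  //   MergeContext merge_context;
  //   SequenceNumber max_covering_tombstone_seq = 0;
  //   bool found = mem.Get(lkey, &value, &s, &merge_context,
  //                        &max_covering_tombstone_seq, ReadOptions());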
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, const ReadOptions& read_opts, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - const ReadOptions& read_opts, bool* is_blob_index = nullptr) { + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts, - is_blob_index); + return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, + read_opts, callback, is_blob_index); } // Attempts to update the new_value inplace, else does normal Add @@ -258,12 +266,16 @@ class MemTable { return num_deletes_.load(std::memory_order_relaxed); } + uint64_t get_data_size() const { + return data_size_.load(std::memory_order_relaxed); + } + // Dynamically change the memtable's capacity. If set below the current usage, // the next key added will trigger a flush. Can only increase size when // memtable prefix bloom is disabled, since we can't easily allocate more // space. void UpdateWriteBufferSize(size_t new_write_buffer_size) { - if (prefix_bloom_ == nullptr || + if (bloom_filter_ == nullptr || new_write_buffer_size < write_buffer_size_) { write_buffer_size_.store(new_write_buffer_size, std::memory_order_relaxed); @@ -332,6 +344,14 @@ class MemTable { mem_tracker_.DoneAllocating(); } + // Notify the underlying storage that all data it contained has been + // persisted. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable. + void MarkFlushed() { + table_->MarkFlushed(); + } + // return true if the current MemTableRep supports merge operator. bool IsMergeOperatorSupported() const { return table_->IsMergeOperatorSupported(); @@ -366,6 +386,21 @@ class MemTable { return oldest_key_time_.load(std::memory_order_relaxed); } + // REQUIRES: db_mutex held. + void SetID(uint64_t id) { id_ = id; } + + uint64_t GetID() const { return id_; } + + void SetFlushCompleted(bool completed) { flush_completed_ = completed; } + + uint64_t GetFileNumber() const { return file_number_; } + + void SetFileNumber(uint64_t file_num) { file_number_ = file_num; } + + void SetFlushInProgress(bool in_progress) { + flush_in_progress_ = in_progress; + } + private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; @@ -379,9 +414,9 @@ class MemTable { const size_t kArenaBlockSize; AllocTracker mem_tracker_; ConcurrentArena arena_; - unique_ptr table_; - unique_ptr range_del_table_; - bool is_range_del_table_empty_; + std::unique_ptr table_; + std::unique_ptr range_del_table_; + std::atomic_bool is_range_del_table_empty_; // Total data size of all data inserted std::atomic data_size_; @@ -420,7 +455,7 @@ class MemTable { std::vector locks_; const SliceTransform* const prefix_extractor_; - std::unique_ptr prefix_bloom_; + std::unique_ptr bloom_filter_; std::atomic flush_state_; @@ -435,6 +470,15 @@ class MemTable { // Timestamp of oldest key std::atomic oldest_key_time_; + // Memtable id to track flush. 
+ uint64_t id_ = 0; + + // Sequence number of the atomic flush that is responsible for this memtable. + // The sequence number of atomic flush is a seq, such that no writes with + // sequence numbers greater than or equal to seq are flushed, while all + // writes with sequence number smaller than seq are flushed. + SequenceNumber atomic_flush_seqno_; + // Returns a heuristic flush decision bool ShouldFlushNow() const; diff --git a/thirdparty/rocksdb/db/memtable_list.cc b/thirdparty/rocksdb/db/memtable_list.cc index 5921a50b35..5abe59b363 100644 --- a/thirdparty/rocksdb/db/memtable_list.cc +++ b/thirdparty/rocksdb/db/memtable_list.cc @@ -11,8 +11,11 @@ #include #include +#include #include +#include "db/db_impl.h" #include "db/memtable.h" +#include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" #include "monitoring/thread_status_util.h" #include "rocksdb/db.h" @@ -40,7 +43,6 @@ void MemTableListVersion::UnrefMemTable(autovector* to_delete, to_delete->push_back(m); assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage()); *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage(); - } else { } } @@ -103,42 +105,49 @@ int MemTableList::NumFlushed() const { // Operands stores the list of merge operations to apply, so far. bool MemTableListVersion::Get(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, - bool* is_blob_index) { - return GetFromList(&memlist_, key, value, s, merge_context, range_del_agg, - seq, read_opts, is_blob_index); + ReadCallback* callback, bool* is_blob_index) { + return GetFromList(&memlist_, key, value, s, merge_context, + max_covering_tombstone_seq, seq, read_opts, callback, + is_blob_index); } bool MemTableListVersion::GetFromHistory( const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, + MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { return GetFromList(&memlist_history_, key, value, s, merge_context, - range_del_agg, seq, read_opts, is_blob_index); + max_covering_tombstone_seq, seq, read_opts, + nullptr /*read_callback*/, is_blob_index); } bool MemTableListVersion::GetFromList( std::list* list, const LookupKey& key, std::string* value, - Status* s, MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index) { + Status* s, MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback, bool* is_blob_index) { *seq = kMaxSequenceNumber; for (auto& memtable : *list) { SequenceNumber current_seq = kMaxSequenceNumber; - bool done = memtable->Get(key, value, s, merge_context, range_del_agg, - ¤t_seq, read_opts, is_blob_index); + bool done = + memtable->Get(key, value, s, merge_context, max_covering_tombstone_seq, + ¤t_seq, read_opts, callback, is_blob_index); if (*seq == kMaxSequenceNumber) { // Store the most recent sequence number of any operation on this key. // Since we only care about the most recent change, we only need to // return the first operation found when searching memtables in // reverse-chronological order. + // current_seq would be equal to kMaxSequenceNumber if the value was to be + // skipped. 
This allows seq to be assigned again when the next value is + // read. *seq = current_seq; } if (done) { - assert(*seq != kMaxSequenceNumber); + assert(*seq != kMaxSequenceNumber || s->IsNotFound()); return true; } if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { @@ -149,28 +158,15 @@ bool MemTableListVersion::GetFromList( } Status MemTableListVersion::AddRangeTombstoneIterators( - const ReadOptions& read_opts, Arena* arena, + const ReadOptions& read_opts, Arena* /*arena*/, RangeDelAggregator* range_del_agg) { assert(range_del_agg != nullptr); for (auto& m : memlist_) { - std::unique_ptr range_del_iter( - m->NewRangeTombstoneIterator(read_opts)); - Status s = range_del_agg->AddTombstones(std::move(range_del_iter)); - if (!s.ok()) { - return s; - } - } - return Status::OK(); -} - -Status MemTableListVersion::AddRangeTombstoneIterators( - const ReadOptions& read_opts, - std::vector* range_del_iters) { - for (auto& m : memlist_) { - auto* range_del_iter = m->NewRangeTombstoneIterator(read_opts); - if (range_del_iter != nullptr) { - range_del_iters->push_back(range_del_iter); - } + // Using kMaxSequenceNumber is OK because these are immutable memtables. + std::unique_ptr range_del_iter( + m->NewRangeTombstoneIterator(read_opts, + kMaxSequenceNumber /* read_seq */)); + range_del_agg->AddTombstones(std::move(range_del_iter)); } return Status::OK(); } @@ -243,6 +239,7 @@ void MemTableListVersion::Remove(MemTable* m, assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable memlist_.remove(m); + m->MarkFlushed(); if (max_write_buffer_number_to_maintain_ > 0) { memlist_history_.push_front(m); TrimHistory(to_delete); @@ -266,7 +263,7 @@ void MemTableListVersion::TrimHistory(autovector* to_delete) { // Returns true if there is at least one memtable on which flush has // not yet started. bool MemTableList::IsFlushPending() const { - if ((flush_requested_ && num_flush_not_started_ >= 1) || + if ((flush_requested_ && num_flush_not_started_ > 0) || (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { assert(imm_flush_needed.load(std::memory_order_relaxed)); return true; @@ -275,12 +272,16 @@ bool MemTableList::IsFlushPending() const { } // Returns the memtables that need to be flushed. -void MemTableList::PickMemtablesToFlush(autovector* ret) { +void MemTableList::PickMemtablesToFlush(const uint64_t* max_memtable_id, + autovector* ret) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH); const auto& memlist = current_->memlist_; for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { MemTable* m = *it; + if (max_memtable_id != nullptr && m->GetID() > *max_memtable_id) { + break; + } if (!m->flush_in_progress_) { assert(!m->flush_completed_); num_flush_not_started_--; @@ -295,7 +296,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t file_number) { + uint64_t /*file_number*/) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_ROLLBACK); assert(!mems.empty()); @@ -314,17 +315,21 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, imm_flush_needed.store(true, std::memory_order_release); } -// Record a successful flush in the manifest file -Status MemTableList::InstallMemtableFlushResults( +// Try record a successful flush in the manifest file. It might just return +// Status::OK letting a concurrent flush to do actual the recording.. 

-// Record a successful flush in the manifest file
-Status MemTableList::InstallMemtableFlushResults(
+// Try to record a successful flush in the manifest file. It might just return
+// Status::OK, letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
     ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-    const autovector<MemTable*>& mems, VersionSet* vset, InstrumentedMutex* mu,
-    uint64_t file_number, autovector<MemTable*>* to_delete,
-    Directory* db_directory, LogBuffer* log_buffer) {
+    const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+    VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+    autovector<MemTable*>* to_delete, Directory* db_directory,
+    LogBuffer* log_buffer) {
   AutoThreadOperationStageUpdater stage_updater(
       ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
   mu->AssertHeld();

-  // flush was successful
+  // Flush was successful. Record the status on the memtable object. Either
+  // this call or a call by a concurrent flush thread will read the status and
+  // write it to the manifest.
   for (size_t i = 0; i < mems.size(); ++i) {
     // All the edits are associated with the first memtable of this batch.
     assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
@@ -336,7 +341,7 @@ Status MemTableList::InstallMemtableFlushResults(
   // if some other thread is already committing, then return
   Status s;
   if (commit_in_progress_) {
-    TEST_SYNC_POINT("MemTableList::InstallMemtableFlushResults:InProgress");
+    TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
     return s;
   }

@@ -347,15 +352,21 @@ Status MemTableList::InstallMemtableFlushResults(
   // while the current thread is writing manifest where mutex is released.
   while (s.ok()) {
     auto& memlist = current_->memlist_;
+    // The back is the oldest; if flush_completed_ is not set on it, it means
+    // that we were assigned a more recent memtable. The memtables' flushes
+    // must be recorded in the manifest in order. A concurrent flush thread,
+    // which is assigned to flush the oldest memtable, will later wake up and
+    // do all the pending writes to the manifest, in order.
     if (memlist.empty() || !memlist.back()->flush_completed_) {
       break;
     }
     // scan all memtables from the earliest, and commit those
-    // (in that order) that have finished flushing. Memetables
+    // (in that order) that have finished flushing. Memtables
     // are always committed in the order that they were created.
     uint64_t batch_file_number = 0;
     size_t batch_count = 0;
     autovector<VersionEdit*> edit_list;
+    autovector<MemTable*> memtables_to_flush;
     // enumerate from the last (earliest) element to see how many batches
     // finished
     for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
       MemTable* m = *it;
@@ -368,11 +379,21 @@ Status MemTableList::InstallMemtableFlushResults(
             "[%s] Level-0 commit table #%" PRIu64 " started",
             cfd->GetName().c_str(), m->file_number_);
         edit_list.push_back(&m->edit_);
+        memtables_to_flush.push_back(m);
       }
       batch_count++;
     }

+    // TODO(myabandeh): Not sure how batch_count could be 0 here.
     if (batch_count > 0) {
+      if (vset->db_options()->allow_2pc) {
+        assert(edit_list.size() > 0);
+        // We piggyback the earliest log file to keep onto the manifest entry
+        // for the last file flushed.
+        edit_list.back()->SetMinLogNumberToKeep(PrecomputeMinLogNumberToKeep(
+            vset, *cfd, edit_list, memtables_to_flush, prep_tracker));
+      }
+
       // this can release and reacquire the mutex.
       s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
                             db_directory);
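The while loop above only ever commits a prefix of completed flushes, strictly in creation order. A simplified sketch of the core scan (the real code additionally groups edits by file number and handles 2PC bookkeeping):

    #include <cstddef>
    #include <list>

    struct Table {
      bool flush_completed = false;
    };

    // Count how many tables, starting from the oldest, have finished
    // flushing; only that prefix may be committed, preserving creation order.
    size_t CommittableBatch(const std::list<Table>& memlist) {
      size_t batch = 0;
      for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
        if (!it->flush_completed) {
          break;  // a gap: all newer tables must wait for this one
        }
        ++batch;
      }
      return batch;
    }
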
- if (s.ok()) { // commit new state + + // commit new state only if the column family is NOT dropped. + // The reason is as follows (refer to + // ColumnFamilyTest.FlushAndDropRaceCondition). + // If the column family is dropped, then according to LogAndApply, its + // corresponding flush operation is NOT written to the MANIFEST. This + // means the DB is not aware of the L0 files generated from the flush. + // By committing the new state, we remove the memtable from the memtable + // list. Creating an iterator on this column family will not be able to + // read full data since the memtable is removed, and the DB is not aware + // of the L0 files, causing MergingIterator unable to build child + // iterators. RocksDB contract requires that the iterator can be created + // on a dropped column family, and we must be able to + // read full data as long as column family handle is not deleted, even if + // the column family is dropped. + if (s.ok() && !cfd->IsDropped()) { // commit new state while (batch_count-- > 0) { MemTable* m = current_->memlist_.back(); ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64 @@ -463,13 +499,21 @@ void MemTableList::InstallNewVersion() { } } -uint64_t MemTableList::GetMinLogContainingPrepSection() { +uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( + const autovector& memtables_to_flush) { uint64_t min_log = 0; for (auto& m : current_->memlist_) { - // this mem has been flushed it no longer - // needs to hold on the its prep section - if (m->flush_completed_) { + // Assume the list is very short, we can live with O(m*n). We can optimize + // if the performance has some problem. + bool should_skip = false; + for (MemTable* m_to_flush : memtables_to_flush) { + if (m == m_to_flush) { + should_skip = true; + break; + } + } + if (should_skip) { continue; } @@ -483,4 +527,109 @@ uint64_t MemTableList::GetMinLogContainingPrepSection() { return min_log; } +// Commit a successful atomic flush in the manifest file. +Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_metas, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); + mu->AssertHeld(); + + size_t num = mems_list.size(); + assert(cfds.size() == num); + if (imm_lists != nullptr) { + assert(imm_lists->size() == num); + } + for (size_t k = 0; k != num; ++k) { +#ifndef NDEBUG + const auto* imm = + (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + if (!mems_list[k]->empty()) { + assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID()); + } +#endif + assert(nullptr != file_metas[k]); + for (size_t i = 0; i != mems_list[k]->size(); ++i) { + assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); + (*mems_list[k])[i]->SetFlushCompleted(true); + (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); + } + } + + Status s; + + autovector> edit_lists; + uint32_t num_entries = 0; + for (const auto mems : mems_list) { + assert(mems != nullptr); + autovector edits; + assert(!mems->empty()); + edits.emplace_back((*mems)[0]->GetEdits()); + ++num_entries; + edit_lists.emplace_back(edits); + } + // Mark the version edits as an atomic group if the number of version edits + // exceeds 1. 
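The comment above introduces the atomic-group marking; the diff resumes with the actual implementation below. As a standalone sketch of the countdown idea, each edit records how many edits follow it, so the last one carries 0 and recovery can tell when the group is complete (Edit is a stand-in, not RocksDB's VersionEdit):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Edit {
      bool in_atomic_group = false;
      uint32_t remaining = 0;
      void MarkAtomicGroup(uint32_t remaining_entries) {
        in_atomic_group = true;
        remaining = remaining_entries;
      }
    };

    // Tag each edit with the number of edits that follow it in the group.
    void TagAtomicGroup(std::vector<Edit*>& edits) {
      uint32_t num_entries = static_cast<uint32_t>(edits.size());
      for (Edit* e : edits) {
        e->MarkAtomicGroup(--num_entries);
      }
      assert(num_entries == 0);  // the last edit carries 0
    }
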
+ if (cfds.size() > 1) { + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + } + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, + db_directory); + + for (size_t k = 0; k != cfds.size(); ++k) { + auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + imm->InstallNewVersion(); + } + + if (s.ok() || s.IsShutdownInProgress()) { + for (size_t i = 0; i != cfds.size(); ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + assert(m->GetFileNumber() > 0); + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + imm->current_->Remove(m, to_delete); + } + } + } else { + for (size_t i = 0; i != cfds.size(); ++i) { + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->GetEdits()->Clear(); + m->SetFileNumber(0); + imm->num_flush_not_started_++; + } + imm->imm_flush_needed.store(true, std::memory_order_release); + } + } + + return s; +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/memtable_list.h b/thirdparty/rocksdb/db/memtable_list.h index 69038af500..b56ad4932c 100644 --- a/thirdparty/rocksdb/db/memtable_list.h +++ b/thirdparty/rocksdb/db/memtable_list.h @@ -5,13 +5,15 @@ // #pragma once -#include +#include +#include #include -#include #include -#include +#include +#include #include "db/dbformat.h" +#include "db/logs_with_prep_tracker.h" #include "db/memtable.h" #include "db/range_del_aggregator.h" #include "monitoring/instrumented_mutex.h" @@ -29,6 +31,7 @@ class ColumnFamilyData; class InternalKeyComparator; class InstrumentedMutex; class MergeIteratorBuilder; +class MemTableList; // keeps a list of immutable memtables in a vector. the list is immutable // if refcount is bigger than one. It is used as a state for Get() and @@ -53,16 +56,19 @@ class MemTableListVersion { // will be stored in *seq on success (regardless of whether true/false is // returned). Otherwise, *seq will be set to kMaxSequenceNumber. 
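The failure branch above resets per-memtable flush state so the work can be retried by a later flush. What rolling back one memtable amounts to can be sketched with stand-in fields:

    #include <cstdint>

    struct MemT {
      bool flush_completed = true;
      bool flush_in_progress = true;
      uint64_t file_number = 42;
    };

    // Undo the bookkeeping for one memtable whose flush result could not be
    // installed, so a later flush can pick it up again.
    void RollbackOne(MemT* m, int* num_flush_not_started) {
      m->flush_completed = false;
      m->flush_in_progress = false;
      m->file_number = 0;        // the SST was never installed in the manifest
      ++*num_flush_not_started;  // visible to flush picking again
    }
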
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - SequenceNumber* seq, const ReadOptions& read_opts, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - const ReadOptions& read_opts, bool* is_blob_index = nullptr) { + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, + const ReadOptions& read_opts, ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr) { SequenceNumber seq; - return Get(key, value, s, merge_context, range_del_agg, &seq, read_opts, - is_blob_index); + return Get(key, value, s, merge_context, max_covering_tombstone_seq, &seq, + read_opts, callback, is_blob_index); } // Similar to Get(), but searches the Memtable history of memtables that @@ -71,24 +77,22 @@ class MemTableListVersion { // writes that are also present in the SST files. bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts, + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr); bool GetFromHistory(const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, + SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts, bool* is_blob_index = nullptr) { SequenceNumber seq; - return GetFromHistory(key, value, s, merge_context, range_del_agg, &seq, - read_opts, is_blob_index); + return GetFromHistory(key, value, s, merge_context, + max_covering_tombstone_seq, &seq, read_opts, + is_blob_index); } Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, RangeDelAggregator* range_del_agg); - Status AddRangeTombstoneIterators( - const ReadOptions& read_opts, - std::vector* range_del_iters); void AddIterators(const ReadOptions& options, std::vector* iterator_list, @@ -111,6 +115,18 @@ class MemTableListVersion { SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const; private: + friend class MemTableList; + + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, InstrumentedMutex* mu, + const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); + // REQUIRE: m is an immutable memtable void Add(MemTable* m, autovector* to_delete); // REQUIRE: m is an immutable memtable @@ -120,15 +136,15 @@ class MemTableListVersion { bool GetFromList(std::list* list, const LookupKey& key, std::string* value, Status* s, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, SequenceNumber* seq, - const ReadOptions& read_opts, bool* is_blob_index = nullptr); + SequenceNumber* max_covering_tombstone_seq, + SequenceNumber* seq, const ReadOptions& read_opts, + ReadCallback* callback = nullptr, + bool* is_blob_index = nullptr); void AddMemTable(MemTable* m); void UnrefMemTable(autovector* to_delete, MemTable* m); - friend class MemTableList; - // Immutable MemTables that have not yet been flushed. 
   std::list<MemTable*> memlist_;

@@ -196,19 +212,22 @@ class MemTableList {
   // Returns the earliest memtables that need to be flushed. The returned
   // memtables are guaranteed to be in ascending order of creation time.
-  void PickMemtablesToFlush(autovector<MemTable*>* mems);
+  void PickMemtablesToFlush(const uint64_t* max_memtable_id,
+                            autovector<MemTable*>* mems);

   // Reset status of the given memtable list back to pending state so that
   // they can get picked up again on the next round of flush.
   void RollbackMemtableFlush(const autovector<MemTable*>& mems,
                              uint64_t file_number);

-  // Commit a successful flush in the manifest file
-  Status InstallMemtableFlushResults(
+  // Try to commit a successful flush in the manifest file. It might just
+  // return Status::OK, letting a concurrent flush do the actual recording.
+  Status TryInstallMemtableFlushResults(
       ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
-      const autovector<MemTable*>& m, VersionSet* vset, InstrumentedMutex* mu,
-      uint64_t file_number, autovector<MemTable*>* to_delete,
-      Directory* db_directory, LogBuffer* log_buffer);
+      const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+      VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+      autovector<MemTable*>* to_delete, Directory* db_directory,
+      LogBuffer* log_buffer);

   // New memtables are inserted at the front of the list.
   // Takes ownership of the reference held on *m by the caller of Add().
@@ -239,9 +258,53 @@ class MemTableList {

   size_t* current_memory_usage() { return &current_memory_usage_; }

-  uint64_t GetMinLogContainingPrepSection();
+  // Returns the min log containing the prep section after memtables listed
+  // in `memtables_to_flush` are flushed and their status is persisted in the
+  // manifest.
+  uint64_t PrecomputeMinLogContainingPrepSection(
+      const autovector<MemTable*>& memtables_to_flush);
+
+  uint64_t GetEarliestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return std::numeric_limits<uint64_t>::max();
+    }
+    return memlist.back()->GetID();
+  }
+
+  uint64_t GetLatestMemTableID() const {
+    auto& memlist = current_->memlist_;
+    if (memlist.empty()) {
+      return 0;
+    }
+    return memlist.front()->GetID();
+  }
+
+  void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+    const auto& memlist = current_->memlist_;
+    // Scan the memtable list from new to old
+    for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+      MemTable* mem = *it;
+      if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+        mem->atomic_flush_seqno_ = seq;
+      } else {
+        // Earlier memtables must have been assigned an atomic flush seq; no
+        // need to continue the scan.
+        break;
+      }
+    }
+  }

 private:
+  friend Status InstallMemtableAtomicFlushResults(
+      const autovector<MemTableList*>* imm_lists,
+      const autovector<ColumnFamilyData*>& cfds,
+      const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+      const autovector<const autovector<MemTable*>*>& mems_list,
+      VersionSet* vset, InstrumentedMutex* mu,
+      const autovector<FileMetaData*>& file_meta,
+      autovector<MemTable*>* to_delete, Directory* db_directory,
+      LogBuffer* log_buffer);
+
   // DB mutex held
   void InstallNewVersion();

@@ -255,11 +318,26 @@ class MemTableList {
   // committing in progress
   bool commit_in_progress_;

-  // Requested a flush of all memtables to storage
+  // Requested a flush of memtables to storage. It's possible to request that
+  // a subset of memtables be flushed.
   bool flush_requested_;

   // The current memory usage.
   size_t current_memory_usage_;
 };

+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds.
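AssignAtomicFlushSeq above relies on an invariant: flush sequence numbers are assigned newest-first, so the first already-assigned table proves all older ones were assigned earlier and the scan may stop. A stand-alone sketch, assuming kMaxSeq marks "unassigned" (stand-in types, not RocksDB's):

    #include <cstdint>
    #include <list>

    constexpr uint64_t kMaxSeq = UINT64_MAX;

    struct MemT {
      uint64_t atomic_flush_seqno = kMaxSeq;  // kMaxSeq: not yet assigned
    };

    // Scan newest-to-oldest; stop at the first table that already has a seq.
    void AssignAtomicFlushSeq(std::list<MemT>& memlist, uint64_t seq) {
      for (MemT& m : memlist) {  // front of the list is the newest table
        if (m.atomic_flush_seqno != kMaxSeq) {
          break;  // older tables were necessarily assigned earlier
        }
        m.atomic_flush_seqno = seq;
      }
    }
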
There are unit tests that +// installs flush results for external immutable memtable lists other than the +// cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case, +// imm_lists parameter is not nullptr. +extern Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/memtable_list_test.cc b/thirdparty/rocksdb/db/memtable_list_test.cc index 30e5166637..a14c13b893 100644 --- a/thirdparty/rocksdb/db/memtable_list_test.cc +++ b/thirdparty/rocksdb/db/memtable_list_test.cc @@ -8,7 +8,6 @@ #include #include #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "db/version_set.h" #include "db/write_controller.h" #include "rocksdb/db.h" @@ -25,9 +24,13 @@ class MemTableListTest : public testing::Test { std::string dbname; DB* db; Options options; + std::vector handles; + std::atomic file_number; - MemTableListTest() : db(nullptr) { - dbname = test::TmpDir() + "/memtable_list_test"; + MemTableListTest() : db(nullptr), file_number(1) { + dbname = test::PerThreadDBPath("memtable_list_test"); + options.create_if_missing = true; + DestroyDB(dbname, options); } // Create a test db if not yet created @@ -35,19 +38,49 @@ class MemTableListTest : public testing::Test { if (db == nullptr) { options.create_if_missing = true; DestroyDB(dbname, options); - Status s = DB::Open(options, dbname, &db); + // Open DB only with default column family + ColumnFamilyOptions cf_options; + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options); + Status s = DB::Open(options, dbname, cf_descs, &handles, &db); EXPECT_OK(s); + + ColumnFamilyOptions cf_opt1, cf_opt2; + cf_opt1.cf_paths.emplace_back(dbname + "_one_1", + std::numeric_limits::max()); + cf_opt2.cf_paths.emplace_back(dbname + "_two_1", + std::numeric_limits::max()); + int sz = static_cast(handles.size()); + handles.resize(sz + 2); + s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]); + EXPECT_OK(s); + s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]); + EXPECT_OK(s); + + cf_descs.emplace_back("one", cf_options); + cf_descs.emplace_back("two", cf_options); } } - ~MemTableListTest() { + ~MemTableListTest() override { if (db) { + std::vector cf_descs(handles.size()); + for (int i = 0; i != static_cast(handles.size()); ++i) { + handles[i]->GetDescriptor(&cf_descs[i]); + } + for (auto h : handles) { + if (h) { + db->DestroyColumnFamilyHandle(h); + } + } + handles.clear(); delete db; - DestroyDB(dbname, options); + db = nullptr; + DestroyDB(dbname, options, cf_descs); } } - // Calls MemTableList::InstallMemtableFlushResults() and sets up all + // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all // structures needed to call this function. 
Status Mock_InstallMemtableFlushResults( MemTableList* list, const MutableCFOptions& mutable_cf_options, @@ -56,36 +89,95 @@ class MemTableListTest : public testing::Test { test::NullLogger logger; LogBuffer log_buffer(DEBUG_LEVEL, &logger); + CreateDB(); // Create a mock VersionSet DBOptions db_options; ImmutableDBOptions immutable_db_options(db_options); EnvOptions env_options; - shared_ptr table_cache(NewLRUCache(50000, 16)); + std::shared_ptr table_cache(NewLRUCache(50000, 16)); WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); WriteController write_controller(10000000u); + VersionSet versions(dbname, &immutable_db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + + EXPECT_OK(versions.Recover(cf_descs, false)); + + // Create mock default ColumnFamilyData + auto column_family_set = versions.GetColumnFamilySet(); + LogsWithPrepTracker dummy_prep_tracker; + auto cfd = column_family_set->GetDefault(); + EXPECT_TRUE(nullptr != cfd); + uint64_t file_num = file_number.fetch_add(1); + // Create dummy mutex. + InstrumentedMutex mutex; + InstrumentedMutexLock l(&mutex); + return list->TryInstallMemtableFlushResults( + cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, + file_num, to_delete, nullptr, &log_buffer); + } + + // Calls MemTableList::InstallMemtableFlushResults() and sets up all + // structures needed to call this function. + Status Mock_InstallMemtableAtomicFlushResults( + autovector& lists, const autovector& cf_ids, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + autovector* to_delete) { + // Create a mock Logger + test::NullLogger logger; + LogBuffer log_buffer(DEBUG_LEVEL, &logger); + CreateDB(); + // Create a mock VersionSet + DBOptions db_options; + ImmutableDBOptions immutable_db_options(db_options); + EnvOptions env_options; + std::shared_ptr table_cache(NewLRUCache(50000, 16)); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + WriteController write_controller(10000000u); + VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + EXPECT_OK(versions.Recover(cf_descs, false)); // Create mock default ColumnFamilyData - ColumnFamilyOptions cf_options; - std::vector column_families; - column_families.emplace_back(kDefaultColumnFamilyName, cf_options); - EXPECT_OK(versions.Recover(column_families, false)); auto column_family_set = versions.GetColumnFamilySet(); - auto cfd = column_family_set->GetColumnFamily(0); - EXPECT_TRUE(cfd != nullptr); - // Create dummy mutex. 
+ LogsWithPrepTracker dummy_prep_tracker; + autovector cfds; + for (int i = 0; i != static_cast(cf_ids.size()); ++i) { + cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i])); + EXPECT_NE(nullptr, cfds[i]); + } + std::vector file_metas; + file_metas.reserve(cf_ids.size()); + for (size_t i = 0; i != cf_ids.size(); ++i) { + FileMetaData meta; + uint64_t file_num = file_number.fetch_add(1); + meta.fd = FileDescriptor(file_num, 0, 0); + file_metas.emplace_back(meta); + } + autovector file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); - - return list->InstallMemtableFlushResults(cfd, mutable_cf_options, m, - &versions, &mutex, 1, to_delete, - nullptr, &log_buffer); + return InstallMemtableAtomicFlushResults( + &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, + file_meta_ptrs, to_delete, nullptr, &log_buffer); } }; @@ -98,7 +190,7 @@ TEST_F(MemTableListTest, Empty) { ASSERT_FALSE(list.IsFlushPending()); autovector mems; - list.PickMemtablesToFlush(&mems); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &mems); ASSERT_EQ(0, mems.size()); autovector to_delete; @@ -118,12 +210,12 @@ TEST_F(MemTableListTest, GetTest) { Status s; MergeContext merge_context; InternalKeyComparator ikey_cmp(options.comparator); - RangeDelAggregator range_del_agg(ikey_cmp, {} /* snapshots */); + SequenceNumber max_covering_tombstone_seq = 0; autovector to_delete; LookupKey lkey("key1", seq); bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable @@ -146,19 +238,19 @@ TEST_F(MemTableListTest, GetTest) { // Fetch the newly written keys merge_context.Clear(); found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value1"); merge_context.Clear(); found = mem->Get(LookupKey("key1", 2), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -184,25 +276,28 @@ TEST_F(MemTableListTest, GetTest) { // Fetch keys via MemTableList merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = list.current()->Get(LookupKey("key1", saved_seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + &merge_context, &max_covering_tombstone_seq, + ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value1", value); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.3"); 
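The assertions above read the same keys at different sequence numbers: a lookup at seq sees "key1" as deleted while a lookup at the saved, earlier seq still sees "value1". The underlying visibility rule can be sketched independently of RocksDB types:

    #include <cstdint>
    #include <map>
    #include <optional>
    #include <string>

    // A write is visible to a read at read_seq only if it was made at a
    // sequence number <= read_seq; the newest such write wins.
    std::optional<std::string> ReadAtSeq(
        const std::multimap<uint64_t, std::string>& versions,  // seq -> value
        uint64_t read_seq) {
      std::optional<std::string> result;
      for (const auto& [seq, value] : versions) {  // ascending by seq
        if (seq > read_seq) break;
        result = value;  // last one <= read_seq is the newest visible write
      }
      return result;
    }
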
merge_context.Clear(); found = list.current()->Get(LookupKey("key2", 1), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); ASSERT_EQ(2, list.NumNotFlushed()); @@ -225,12 +320,12 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { Status s; MergeContext merge_context; InternalKeyComparator ikey_cmp(options.comparator); - RangeDelAggregator range_del_agg(ikey_cmp, {} /* snapshots */); + SequenceNumber max_covering_tombstone_seq = 0; autovector to_delete; LookupKey lkey("key1", seq); bool found = list.current()->Get(lkey, &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Create a MemTable @@ -252,13 +347,13 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Fetch the newly written keys merge_context.Clear(); found = mem->Get(LookupKey("key1", seq), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); // MemTable found out that this key is *not* found (at this sequence#) ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); found = mem->Get(LookupKey("key2", seq), &value, &s, &merge_context, - &range_del_agg, ReadOptions()); + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ(value, "value2.2"); @@ -268,24 +363,27 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Fetch keys via MemTableList merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(s.ok() && found); ASSERT_EQ("value2.2", value); // Flush this memtable from the list. // (It will then be a part of the memtable history). 
autovector to_flush; - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); - s = Mock_InstallMemtableFlushResults(&list, MutableCFOptions(options), - to_flush, &to_delete); + MutableCFOptions mutable_cf_options(options); + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); ASSERT_OK(s); ASSERT_EQ(0, list.NumNotFlushed()); ASSERT_EQ(1, list.NumFlushed()); @@ -293,26 +391,28 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify keys are no longer in MemTableList merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify keys are present in history merge_context.Clear(); - found = list.current()->GetFromHistory(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, - ReadOptions()); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = list.current()->GetFromHistory(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, - ReadOptions()); + found = list.current()->GetFromHistory( + LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value2.2", value); @@ -330,12 +430,12 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { ASSERT_EQ(0, to_delete.size()); to_flush.clear(); - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(1, to_flush.size()); // Flush second memtable - s = Mock_InstallMemtableFlushResults(&list, MutableCFOptions(options), - to_flush, &to_delete); + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); ASSERT_OK(s); ASSERT_EQ(0, list.NumNotFlushed()); ASSERT_EQ(2, list.NumFlushed()); @@ -353,38 +453,42 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { // Verify keys are no longer in MemTableList merge_context.Clear(); - found = list.current()->Get(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); merge_context.Clear(); - found = list.current()->Get(LookupKey("key3", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key3", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Verify that the second memtable's keys are in the history merge_context.Clear(); - found = 
list.current()->GetFromHistory(LookupKey("key1", seq), &value, &s, - &merge_context, &range_del_agg, - ReadOptions()); + found = list.current()->GetFromHistory( + LookupKey("key1", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found && s.IsNotFound()); merge_context.Clear(); - found = list.current()->GetFromHistory(LookupKey("key3", seq), &value, &s, - &merge_context, &range_del_agg, - ReadOptions()); + found = list.current()->GetFromHistory( + LookupKey("key3", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_TRUE(found); ASSERT_EQ("value3", value); // Verify that key2 from the first memtable is no longer in the history merge_context.Clear(); - found = list.current()->Get(LookupKey("key2", seq), &value, &s, - &merge_context, &range_del_agg, ReadOptions()); + found = + list.current()->Get(LookupKey("key2", seq), &value, &s, &merge_context, + &max_covering_tombstone_seq, ReadOptions()); ASSERT_FALSE(found); // Cleanup @@ -396,7 +500,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) { } TEST_F(MemTableListTest, FlushPendingTest) { - const int num_tables = 5; + const int num_tables = 6; SequenceNumber seq = 1; Status s; @@ -414,11 +518,13 @@ TEST_F(MemTableListTest, FlushPendingTest) { max_write_buffer_number_to_maintain); // Create some MemTables + uint64_t memtable_id = 0; std::vector tables; MutableCFOptions mutable_cf_options(options); for (int i = 0; i < num_tables; i++) { MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb, kMaxSequenceNumber, 0 /* column_family_id */); + mem->SetID(memtable_id++); mem->Ref(); std::string value; @@ -437,7 +543,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); autovector to_flush; - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); // Request a flush even though there is nothing to flush @@ -446,7 +552,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Attempt to 'flush' to clear request for flush - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(0, to_flush.size()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); @@ -470,7 +576,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(2, to_flush.size()); ASSERT_EQ(2, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -491,7 +597,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); ASSERT_EQ(3, to_flush.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -499,7 +605,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush2; - list.PickMemtablesToFlush(&to_flush2); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); ASSERT_EQ(0, to_flush2.size()); ASSERT_EQ(3, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -517,7 +623,7 @@ TEST_F(MemTableListTest, 
FlushPendingTest) { ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); // Pick tables to flush again - list.PickMemtablesToFlush(&to_flush2); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush2); ASSERT_EQ(1, to_flush2.size()); ASSERT_EQ(4, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); @@ -538,7 +644,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(0, to_delete.size()); // Pick tables to flush - list.PickMemtablesToFlush(&to_flush); + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush); // Should pick 4 of 5 since 1 table has been picked in to_flush2 ASSERT_EQ(4, to_flush.size()); ASSERT_EQ(5, list.NumNotFlushed()); @@ -547,20 +653,21 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Pick tables to flush again autovector to_flush3; + list.PickMemtablesToFlush(nullptr /* memtable_id */, &to_flush3); ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Flush the 4 memtables that were picked in to_flush - s = Mock_InstallMemtableFlushResults(&list, MutableCFOptions(options), - to_flush, &to_delete); + s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, + &to_delete); ASSERT_OK(s); // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains // tables[3]. // Current implementation will only commit memtables in the order they were - // created. So InstallMemtableFlushResults will install the first 3 tables + // created. So TryInstallMemtableFlushResults will install the first 3 tables // in to_flush and stop when it encounters a table not yet flushed. ASSERT_EQ(2, list.NumNotFlushed()); int num_in_history = std::min(3, max_write_buffer_number_to_maintain); @@ -574,7 +681,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Flush the 1 memtable that was picked in to_flush2 s = MemTableListTest::Mock_InstallMemtableFlushResults( - &list, MutableCFOptions(options), to_flush2, &to_delete); + &list, mutable_cf_options, to_flush2, &to_delete); ASSERT_OK(s); // This will actually install 2 tables. The 1 we told it to flush, and also @@ -585,7 +692,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size()); for (const auto& m : to_delete) { - // Refcount should be 0 after calling InstallMemtableFlushResults. + // Refcount should be 0 after calling TryInstallMemtableFlushResults. // Verify this, by Ref'ing then UnRef'ing: m->Ref(); ASSERT_EQ(m, m->Unref()); @@ -593,12 +700,41 @@ TEST_F(MemTableListTest, FlushPendingTest) { } to_delete.clear(); + // Add another table + list.Add(tables[5], &to_delete); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_EQ(5, list.GetLatestMemTableID()); + memtable_id = 4; + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 4. Therefore, no table will be selected in this case. + autovector to_flush4; + list.FlushRequested(); + ASSERT_TRUE(list.HasFlushRequested()); + list.PickMemtablesToFlush(&memtable_id, &to_flush4); + ASSERT_TRUE(to_flush4.empty()); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + ASSERT_FALSE(list.HasFlushRequested()); + + // Pick tables to flush. The tables to pick must have ID smaller than or + // equal to 5. Therefore, only tables[5] will be selected. 
+ memtable_id = 5; + list.FlushRequested(); + list.PickMemtablesToFlush(&memtable_id, &to_flush4); + ASSERT_EQ(1, static_cast(to_flush4.size())); + ASSERT_EQ(1, list.NumNotFlushed()); + ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); + ASSERT_FALSE(list.IsFlushPending()); + to_delete.clear(); + list.current()->Unref(&to_delete); - int to_delete_size = std::min(5, max_write_buffer_number_to_maintain); + int to_delete_size = + std::min(num_tables, max_write_buffer_number_to_maintain); ASSERT_EQ(to_delete_size, to_delete.size()); for (const auto& m : to_delete) { - // Refcount should be 0 after calling InstallMemtableFlushResults. + // Refcount should be 0 after calling TryInstallMemtableFlushResults. // Verify this, by Ref'ing then UnRef'ing: m->Ref(); ASSERT_EQ(m, m->Unref()); @@ -607,6 +743,157 @@ TEST_F(MemTableListTest, FlushPendingTest) { to_delete.clear(); } +TEST_F(MemTableListTest, EmptyAtomicFlusTest) { + autovector lists; + autovector cf_ids; + autovector options_list; + autovector*> to_flush; + autovector to_delete; + Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list, + to_flush, &to_delete); + ASSERT_OK(s); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(MemTableListTest, AtomicFlusTest) { + const int num_cfs = 3; + const int num_tables_per_cf = 2; + SequenceNumber seq = 1; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + ImmutableCFOptions ioptions(options); + InternalKeyComparator cmp(BytewiseComparator()); + WriteBufferManager wb(options.db_write_buffer_size); + + // Create MemTableLists + int min_write_buffer_number_to_merge = 3; + int max_write_buffer_number_to_maintain = 7; + autovector lists; + for (int i = 0; i != num_cfs; ++i) { + lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain)); + } + + autovector cf_ids; + std::vector> tables(num_cfs); + autovector mutable_cf_options_list; + uint32_t cf_id = 0; + for (auto& elem : tables) { + mutable_cf_options_list.emplace_back(new MutableCFOptions(options)); + uint64_t memtable_id = 0; + for (int i = 0; i != num_tables_per_cf; ++i) { + MemTable* mem = + new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb, + kMaxSequenceNumber, cf_id); + mem->SetID(memtable_id++); + mem->Ref(); + + std::string value; + + mem->Add(++seq, kTypeValue, "key1", ToString(i)); + mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); + mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); + mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); + mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); + + elem.push_back(mem); + } + cf_ids.push_back(cf_id++); + } + + std::vector> flush_candidates(num_cfs); + + // Nothing to flush + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; + ASSERT_FALSE(list->IsFlushPending()); + ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); + list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); + ASSERT_EQ(0, flush_candidates[i].size()); + } + // Request flush even though there is nothing to flush + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; + list->FlushRequested(); + ASSERT_FALSE(list->IsFlushPending()); + ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); + } + autovector to_delete; + // Add tables to the immutable memtalbe lists associated with column families + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { + 
lists[i]->Add(tables[i][j], &to_delete); + } + ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed()); + ASSERT_TRUE(lists[i]->IsFlushPending()); + ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); + } + std::vector flush_memtable_ids = {1, 1, 0}; + // +----+ + // list[0]: |0 1| + // list[1]: |0 1| + // | +--+ + // list[2]: |0| 1 + // +-+ + // Pick memtables to flush + for (auto i = 0; i != num_cfs; ++i) { + flush_candidates[i].clear(); + lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], + &flush_candidates[i]); + ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, + static_cast(flush_candidates[i].size())); + } + autovector tmp_lists; + autovector tmp_cf_ids; + autovector tmp_options_list; + autovector*> to_flush; + for (auto i = 0; i != num_cfs; ++i) { + if (!flush_candidates[i].empty()) { + to_flush.push_back(&flush_candidates[i]); + tmp_lists.push_back(lists[i]); + tmp_cf_ids.push_back(i); + tmp_options_list.push_back(mutable_cf_options_list[i]); + } + } + Status s = Mock_InstallMemtableAtomicFlushResults( + tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete); + ASSERT_OK(s); + + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { + if (static_cast(j) <= flush_memtable_ids[i]) { + ASSERT_LT(0, tables[i][j]->GetFileNumber()); + } + } + ASSERT_EQ( + static_cast(num_tables_per_cf) - flush_candidates[i].size(), + lists[i]->NumNotFlushed()); + } + + to_delete.clear(); + for (auto list : lists) { + list->current()->Unref(&to_delete); + delete list; + } + for (auto& mutable_cf_options : mutable_cf_options_list) { + if (mutable_cf_options != nullptr) { + delete mutable_cf_options; + mutable_cf_options = nullptr; + } + } + // All memtables in tables array must have been flushed, thus ready to be + // deleted. + ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size()); + for (const auto& m : to_delete) { + // Refcount should be 0 after calling InstallMemtableFlushResults. + // Verify this by Ref'ing and then Unref'ing. + m->Ref(); + ASSERT_EQ(m, m->Unref()); + delete m; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/merge_context.h b/thirdparty/rocksdb/db/merge_context.h index 5e75e09973..fd06441f7c 100644 --- a/thirdparty/rocksdb/db/merge_context.h +++ b/thirdparty/rocksdb/db/merge_context.h @@ -74,8 +74,14 @@ class MergeContext { return (*operand_list_)[index]; } - // Return all the operands. 
+ // Same as GetOperandsDirectionForward const std::vector& GetOperands() { + return GetOperandsDirectionForward(); + } + + // Return all the operands in the order as they were merged (passed to + // FullMerge or FullMergeV2) + const std::vector& GetOperandsDirectionForward() { if (!operand_list_) { return empty_operand_list; } @@ -84,6 +90,17 @@ class MergeContext { return *operand_list_; } + // Return all the operands in the reversed order relative to how they were + // merged (passed to FullMerge or FullMergeV2) + const std::vector& GetOperandsDirectionBackward() { + if (!operand_list_) { + return empty_operand_list; + } + + SetDirectionBackward(); + return *operand_list_; + } + private: void Initialize() { if (!operand_list_) { diff --git a/thirdparty/rocksdb/db/merge_helper.cc b/thirdparty/rocksdb/db/merge_helper.cc index 55f8254cf0..b5ae924ffc 100644 --- a/thirdparty/rocksdb/db/merge_helper.cc +++ b/thirdparty/rocksdb/db/merge_helper.cc @@ -5,15 +5,16 @@ #include "db/merge_helper.h" -#include #include #include "db/dbformat.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" +#include "port/likely.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" +#include "table/format.h" #include "table/internal_iterator.h" namespace rocksdb { @@ -22,7 +23,8 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, const MergeOperator* user_merge_operator, const CompactionFilter* compaction_filter, Logger* logger, bool assert_valid_internal_key, - SequenceNumber latest_snapshot, int level, + SequenceNumber latest_snapshot, + const SnapshotChecker* snapshot_checker, int level, Statistics* stats, const std::atomic* shutting_down) : env_(env), @@ -34,6 +36,7 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, assert_valid_internal_key_(assert_valid_internal_key), allow_single_operand_(false), latest_snapshot_(latest_snapshot), + snapshot_checker_(snapshot_checker), level_(level), keys_(), filter_timer_(env_), @@ -61,8 +64,8 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, } if (update_num_ops_stats) { - MeasureTime(statistics, READ_NUM_MERGE_OPERANDS, - static_cast(operands.size())); + RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS, + static_cast(operands.size())); } bool success; @@ -107,8 +110,11 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, // keys_ stores the list of keys encountered while merging. // operands_ stores the list of merge operands encountered while merging. // keys_[i] corresponds to operands_[i] for each i. +// +// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator +// and just pass the StripeRep corresponding to the stripe being merged. 
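GetOperandsDirectionForward and GetOperandsDirectionBackward above expose one stored operand list in either order. A sketch of the lazy-reverse idea, assuming the stored order starts out forward (a stand-in class, not RocksDB's MergeContext):

    #include <algorithm>
    #include <string>
    #include <vector>

    // Operands are stored once; the list is reversed lazily when the caller
    // asks for the opposite direction, so the common single-direction case
    // costs nothing extra.
    class OperandList {
     public:
      void Add(std::string op) { ops_.push_back(std::move(op)); }
      const std::vector<std::string>& Forward() {
        SetForward(true);
        return ops_;
      }
      const std::vector<std::string>& Backward() {
        SetForward(false);
        return ops_;
      }

     private:
      void SetForward(bool forward) {
        if (forward != forward_) {
          std::reverse(ops_.begin(), ops_.end());
          forward_ = forward;
        }
      }
      std::vector<std::string> ops_;
      bool forward_ = true;  // assumed initial storage order
    };
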
Status MergeHelper::MergeUntil(InternalIterator* iter, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, const bool at_bottom) { // Get a copy of the internal key, before it's invalidated by iter->Next() @@ -132,7 +138,11 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // orig_ikey is backed by original_key if keys_.empty() // orig_ikey is backed by keys_.back() if !keys_.empty() ParsedInternalKey orig_ikey; - ParseInternalKey(original_key, &orig_ikey); + bool succ = ParseInternalKey(original_key, &orig_ikey); + assert(succ); + if (!succ) { + return Status::Corruption("Cannot parse key in MergeUntil"); + } Status s; bool hit_the_next_user_key = false; @@ -158,8 +168,13 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // hit a different user key, stop right here hit_the_next_user_key = true; break; - } else if (stop_before && ikey.sequence <= stop_before) { - // hit an entry that's visible by the previous snapshot, can't touch that + } else if (stop_before > 0 && ikey.sequence <= stop_before && + LIKELY(snapshot_checker_ == nullptr || + snapshot_checker_->CheckInSnapshot(ikey.sequence, + stop_before) != + SnapshotCheckerResult::kNotInSnapshot)) { + // hit an entry that's possibly visible by the previous snapshot, can't + // touch that break; } @@ -186,7 +201,15 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // want. Also if we're in compaction and it's a put, it would be nice to // run compaction filter on it. const Slice val = iter->value(); - const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr; + const Slice* val_ptr; + if (kTypeValue == ikey.type && + (range_del_agg == nullptr || + !range_del_agg->ShouldDelete( + ikey, RangeDelPositioningMode::kForwardTraversal))) { + val_ptr = &val; + } else { + val_ptr = nullptr; + } std::string merge_result; s = TimedFullMerge(user_merge_operator_, ikey.user_key, val_ptr, merge_context_.GetOperands(), &merge_result, logger_, @@ -231,8 +254,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil && range_del_agg != nullptr && range_del_agg->ShouldDelete( - iter->key(), - RangeDelAggregator::RangePositioningMode::kForwardTraversal)) { + iter->key(), RangeDelPositioningMode::kForwardTraversal)) { filter = CompactionFilter::Decision::kRemove; } if (filter == CompactionFilter::Decision::kKeep || @@ -272,22 +294,24 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, return Status::OK(); } - // We are sure we have seen this key's entire history if we are at the - // last level and exhausted all internal keys of this user key. - // NOTE: !iter->Valid() does not necessarily mean we hit the - // beginning of a user key, as versions of a user key might be - // split into multiple files (even files on the same level) - // and some files might not be included in the compaction/merge. + // We are sure we have seen this key's entire history if: + // at_bottom == true (this does not necessarily mean it is the bottommost + // layer, but rather that we are confident the key does not appear on any of + // the lower layers, at_bottom == false doesn't mean it does appear, just + // that we can't be sure, see Compaction::IsBottommostLevel for details) + // AND + // we have either encountered another key or end of key history on this + // layer. // - // There are also cases where we have seen the root of history of this - // key without being sure of it. 
Then, we simply miss the opportunity + // When these conditions are true we are able to merge all the keys + // using full merge. + // + // For these cases we are not sure about, we simply miss the opportunity // to combine the keys. Since VersionSet::SetupOtherInputs() always makes // sure that all merge-operands on the same level get compacted together, // this will simply lead to these merge operands moving to the next level. - // - // So, we only perform the following logic (to merge all operands together - // without a Put/Delete) if we are certain that we have seen the end of key. - bool surely_seen_the_beginning = hit_the_next_user_key && at_bottom; + bool surely_seen_the_beginning = + (hit_the_next_user_key || !iter->Valid()) && at_bottom; if (surely_seen_the_beginning) { // do a final merge with nullptr as the existing value and say // bye to the merge type (it's now converted to a Put) @@ -367,7 +391,7 @@ CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, if (compaction_filter_ == nullptr) { return CompactionFilter::Decision::kKeep; } - if (stats_ != nullptr) { + if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) { filter_timer_.Start(); } compaction_filter_value_.clear(); diff --git a/thirdparty/rocksdb/db/merge_helper.h b/thirdparty/rocksdb/db/merge_helper.h index b9ef12a4cf..670cba5983 100644 --- a/thirdparty/rocksdb/db/merge_helper.h +++ b/thirdparty/rocksdb/db/merge_helper.h @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef MERGE_HELPER_H -#define MERGE_HELPER_H +#pragma once #include #include @@ -13,6 +12,7 @@ #include "db/dbformat.h" #include "db/merge_context.h" #include "db/range_del_aggregator.h" +#include "db/snapshot_checker.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" @@ -25,7 +25,6 @@ class Iterator; class Logger; class MergeOperator; class Statistics; -class InternalIterator; class MergeHelper { public: @@ -33,7 +32,8 @@ class MergeHelper { const MergeOperator* user_merge_operator, const CompactionFilter* compaction_filter, Logger* logger, bool assert_valid_internal_key, SequenceNumber latest_snapshot, - int level = 0, Statistics* stats = nullptr, + const SnapshotChecker* snapshot_checker = nullptr, int level = 0, + Statistics* stats = nullptr, const std::atomic* shutting_down = nullptr); // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. @@ -78,7 +78,7 @@ class MergeHelper { // // REQUIRED: The first key in the input is not corrupted. Status MergeUntil(InternalIterator* iter, - RangeDelAggregator* range_del_agg = nullptr, + CompactionRangeDelAggregator* range_del_agg = nullptr, const SequenceNumber stop_before = 0, const bool at_bottom = false); @@ -145,6 +145,7 @@ class MergeHelper { bool assert_valid_internal_key_; // enforce no internal key corruption? 
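The snapshot_checker_ test in MergeUntil above decides whether an entry might still be visible to the previous snapshot, in which case the merge scan must stop before it. The predicate can be sketched in isolation; Checker and SnapshotCheckResult are stand-ins for the SnapshotChecker types referenced above:

    #include <cstdint>

    enum class SnapshotCheckResult { kInSnapshot, kNotInSnapshot };

    // Stand-in for the snapshot checker: decides whether a write with the
    // given sequence number is visible in the given snapshot.
    struct Checker {
      virtual SnapshotCheckResult CheckInSnapshot(uint64_t seq,
                                                  uint64_t snapshot) const = 0;
      virtual ~Checker() = default;
    };

    // Stop before an entry that may be visible to the earlier snapshot; with
    // no checker, any seq <= stop_before is conservatively treated as visible.
    bool StopsMergeScan(uint64_t entry_seq, uint64_t stop_before,
                        const Checker* checker) {
      if (stop_before == 0 || entry_seq > stop_before) {
        return false;
      }
      return checker == nullptr ||
             checker->CheckInSnapshot(entry_seq, stop_before) !=
                 SnapshotCheckResult::kNotInSnapshot;
    }
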
bool allow_single_operand_; SequenceNumber latest_snapshot_; + const SnapshotChecker* const snapshot_checker_; int level_; // the scratch area that holds the result of MergeUntil @@ -191,5 +192,3 @@ class MergeOutputIterator { }; } // namespace rocksdb - -#endif diff --git a/thirdparty/rocksdb/db/merge_helper_test.cc b/thirdparty/rocksdb/db/merge_helper_test.cc index dc43db0d10..b61092ee57 100644 --- a/thirdparty/rocksdb/db/merge_helper_test.cc +++ b/thirdparty/rocksdb/db/merge_helper_test.cc @@ -20,7 +20,7 @@ class MergeHelperTest : public testing::Test { public: MergeHelperTest() { env_ = Env::Default(); } - ~MergeHelperTest() = default; + ~MergeHelperTest() override = default; Status Run(SequenceNumber stop_before, bool at_bottom, SequenceNumber latest_snapshot = 0) { @@ -130,7 +130,7 @@ TEST_F(MergeHelperTest, SingleOperand) { AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U)); - ASSERT_TRUE(Run(31, true).IsMergeInProgress()); + ASSERT_TRUE(Run(31, false).IsMergeInProgress()); ASSERT_FALSE(iter_->Valid()); ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]); ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]); diff --git a/thirdparty/rocksdb/db/merge_test.cc b/thirdparty/rocksdb/db/merge_test.cc index b6582b7a59..3bd4b9a600 100644 --- a/thirdparty/rocksdb/db/merge_test.cc +++ b/thirdparty/rocksdb/db/merge_test.cc @@ -20,15 +20,17 @@ #include "utilities/merge_operators.h" #include "util/testharness.h" -using namespace rocksdb; +namespace rocksdb { + +bool use_compression; + +class MergeTest : public testing::Test {}; -namespace { size_t num_merge_operator_calls; void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; } size_t num_partial_merge_calls; void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; } -} class CountMergeOperator : public AssociativeMergeOperator { public: @@ -36,11 +38,8 @@ class CountMergeOperator : public AssociativeMergeOperator { mergeOperator_ = MergeOperators::CreateUInt64AddOperator(); } - virtual bool Merge(const Slice& key, - const Slice* existing_value, - const Slice& value, - std::string* new_value, - Logger* logger) const override { + bool Merge(const Slice& key, const Slice* existing_value, const Slice& value, + std::string* new_value, Logger* logger) const override { assert(new_value->empty()); ++num_merge_operator_calls; if (existing_value == nullptr) { @@ -56,25 +55,22 @@ class CountMergeOperator : public AssociativeMergeOperator { logger); } - virtual bool PartialMergeMulti(const Slice& key, - const std::deque& operand_list, - std::string* new_value, - Logger* logger) const override { + bool PartialMergeMulti(const Slice& key, + const std::deque& operand_list, + std::string* new_value, + Logger* logger) const override { assert(new_value->empty()); ++num_partial_merge_calls; return mergeOperator_->PartialMergeMulti(key, operand_list, new_value, logger); } - virtual const char* Name() const override { - return "UInt64AddOperator"; - } + const char* Name() const override { return "UInt64AddOperator"; } private: std::shared_ptr mergeOperator_; }; -namespace { std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, const size_t max_successive_merges = 0) { DB* db; @@ -87,7 +83,6 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, // DBWithTTL is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE if (ttl) { - std::cout << "Opening database with TTL\n"; DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options, dbname, &db_with_ttl); db = db_with_ttl; @@ -104,7 +99,6 @@ 
std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, } return std::shared_ptr(db); } -} // namespace // Imagine we are maintaining a set of uint64 counters. // Each counter has a distinct name. And we would like @@ -231,7 +225,7 @@ class MergeBasedCounters : public Counters { } // mapped to a rocksdb Merge operation - virtual bool add(const std::string& key, uint64_t value) override { + bool add(const std::string& key, uint64_t value) override { char encoded[sizeof(uint64_t)]; EncodeFixed64(encoded, value); Slice slice(encoded, sizeof(uint64_t)); @@ -246,12 +240,11 @@ class MergeBasedCounters : public Counters { } }; -namespace { void dumpDb(DB* db) { - auto it = unique_ptr(db->NewIterator(ReadOptions())); + auto it = std::unique_ptr(db->NewIterator(ReadOptions())); for (it->SeekToFirst(); it->Valid(); it->Next()) { - uint64_t value = DecodeFixed64(it->value().data()); - std::cout << it->key().ToString() << ": " << value << std::endl; + //uint64_t value = DecodeFixed64(it->value().data()); + //std::cout << it->key().ToString() << ": " << value << std::endl; } assert(it->status().ok()); // Check for any errors found during the scan } @@ -281,8 +274,6 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { dumpDb(db); - std::cout << "1\n"; - // 1+...+49 = ? uint64_t sum = 0; for (int i = 1; i < 50; i++) { @@ -291,17 +282,12 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } assert(counters.assert_get("b") == sum); - std::cout << "2\n"; dumpDb(db); - std::cout << "3\n"; - if (test_compaction) { db->Flush(o); - std::cout << "Compaction started ...\n"; db->CompactRange(CompactRangeOptions(), nullptr, nullptr); - std::cout << "Compaction ended\n"; dumpDb(db); @@ -411,44 +397,35 @@ void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, static_cast((num_merges % (max_num_merges + 1)))); } -void runTest(int argc, const std::string& dbname, const bool use_ttl = false) { - bool compact = false; - if (argc > 1) { - compact = true; - std::cout << "Turn on Compaction\n"; - } +void runTest(const std::string& dbname, const bool use_ttl = false) { { auto db = OpenDb(dbname, use_ttl); { - std::cout << "Test read-modify-write counters... \n"; Counters counters(db, 0); testCounters(counters, db.get(), true); } { - std::cout << "Test merge-based counters... \n"; MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), compact); + testCounters(counters, db.get(), use_compression); } } DestroyDB(dbname, Options()); { - std::cout << "Test merge in memtable... \n"; size_t max_merge = 5; auto db = OpenDb(dbname, use_ttl, max_merge); MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), compact); + testCounters(counters, db.get(), use_compression); testSuccessiveMerge(counters, max_merge, max_merge * 2); testSingleBatchSuccessiveMerge(db.get(), 5, 7); DestroyDB(dbname, Options()); } { - std::cout << "Test Partial-Merge\n"; size_t max_merge = 100; // Min merge is hard-coded to 2. 
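// Illustrative sketch (assumed example, not part of the patch): the counters
// driven by runTest() map add() onto DB::Merge() with a uint64-add operator,
// so increments are applied lazily instead of read-modify-write. A minimal
// standalone version of that pattern; the operator class and the DB path are
// hypothetical:
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/merge_operator.h"

class Uint64AddOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/, const rocksdb::Slice* existing,
             const rocksdb::Slice& value, std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    uint64_t base = 0, operand = 0;
    if (existing != nullptr) {
      std::memcpy(&base, existing->data(), sizeof(base));
    }
    std::memcpy(&operand, value.data(), sizeof(operand));
    const uint64_t sum = base + operand;
    new_value->assign(reinterpret_cast<const char*>(&sum), sizeof(sum));
    return true;  // merge never fails for well-formed operands
  }
  const char* Name() const override { return "Uint64AddOperator"; }
};

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.merge_operator = std::make_shared<Uint64AddOperator>();
  rocksdb::DB* db = nullptr;
  rocksdb::DB::Open(options, "/tmp/merge_counter_example", &db);

  const uint64_t one = 1;
  rocksdb::Slice operand(reinterpret_cast<const char*>(&one), sizeof(one));
  db->Merge(rocksdb::WriteOptions(), "counter", operand);  // counter == 1
  db->Merge(rocksdb::WriteOptions(), "counter", operand);  // counter == 2

  std::string value;
  db->Get(rocksdb::ReadOptions(), "counter", &value);  // 8-byte encoded sum
  delete db;
  return 0;
}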
uint32_t min_merge = 2; @@ -468,7 +445,6 @@ void runTest(int argc, const std::string& dbname, const bool use_ttl = false) { } { - std::cout << "Test merge-operator not set after reopen\n"; { auto db = OpenDb(dbname); MergeBasedCounters counters(db, 0); @@ -502,16 +478,27 @@ void runTest(int argc, const std::string& dbname, const bool use_ttl = false) { } */ } -} // namespace -int main(int argc, char *argv[]) { - //TODO: Make this test like a general rocksdb unit-test - rocksdb::port::InstallStackTraceHandler(); - runTest(argc, test::TmpDir() + "/merge_testdb"); -// DBWithTTL is not supported in ROCKSDB_LITE +TEST_F(MergeTest, MergeDbTest) { + runTest(test::PerThreadDBPath("merge_testdb")); +} + #ifndef ROCKSDB_LITE - runTest(argc, test::TmpDir() + "/merge_testdbttl", true); // Run test on TTL database +TEST_F(MergeTest, MergeDbTtlTest) { + runTest(test::PerThreadDBPath("merge_testdbttl"), + true); // Run test on TTL database +} #endif // !ROCKSDB_LITE - printf("Passed all tests!\n"); - return 0; + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::use_compression = false; + if (argc > 1) { + rocksdb::use_compression = true; + } + + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); } diff --git a/thirdparty/rocksdb/db/obsolete_files_test.cc b/thirdparty/rocksdb/db/obsolete_files_test.cc new file mode 100644 index 0000000000..52175a07b7 --- /dev/null +++ b/thirdparty/rocksdb/db/obsolete_files_test.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include "db/db_impl.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/transaction_log.h" +#include "util/filename.h" +#include "util/string_util.h" +#include "util/sync_point.h" +#include "util/testharness.h" +#include "util/testutil.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::flush; + +namespace rocksdb { + +class ObsoleteFilesTest : public testing::Test { + public: + std::string dbname_; + Options options_; + DB* db_; + Env* env_; + int numlevels_; + + ObsoleteFilesTest() { + db_ = nullptr; + env_ = Env::Default(); + // Trigger compaction when the number of level 0 files reaches 2. 
+ options_.level0_file_num_compaction_trigger = 2; + options_.disable_auto_compactions = false; + options_.delete_obsolete_files_period_micros = 0; // always do full purge + options_.enable_thread_tracking = true; + options_.write_buffer_size = 1024*1024*1000; + options_.target_file_size_base = 1024*1024*1000; + options_.max_bytes_for_level_base = 1024*1024*1000; + options_.WAL_ttl_seconds = 300; // Used to test log files + options_.WAL_size_limit_MB = 1024; // Used to test log files + dbname_ = test::PerThreadDBPath("obsolete_files_test"); + options_.wal_dir = dbname_ + "/wal_files"; + + // clean up all the files that might have been there before + std::vector old_files; + env_->GetChildren(dbname_, &old_files); + for (auto file : old_files) { + env_->DeleteFile(dbname_ + "/" + file); + } + env_->GetChildren(options_.wal_dir, &old_files); + for (auto file : old_files) { + env_->DeleteFile(options_.wal_dir + "/" + file); + } + + DestroyDB(dbname_, options_); + numlevels_ = 7; + EXPECT_OK(ReopenDB(true)); + } + + Status ReopenDB(bool create) { + delete db_; + if (create) { + DestroyDB(dbname_, options_); + } + db_ = nullptr; + options_.create_if_missing = create; + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + db_ = nullptr; + } + + void AddKeys(int numkeys, int startkey) { + WriteOptions options; + options.sync = false; + for (int i = startkey; i < (numkeys + startkey) ; i++) { + std::string temp = ToString(i); + Slice key(temp); + Slice value(temp); + ASSERT_OK(db_->Put(options, key, value)); + } + } + + int numKeysInLevels( + std::vector &metadata, + std::vector *keysperlevel = nullptr) { + + if (keysperlevel != nullptr) { + keysperlevel->resize(numlevels_); + } + + int numKeys = 0; + for (size_t i = 0; i < metadata.size(); i++) { + int startkey = atoi(metadata[i].smallestkey.c_str()); + int endkey = atoi(metadata[i].largestkey.c_str()); + int numkeysinfile = (endkey - startkey + 1); + numKeys += numkeysinfile; + if (keysperlevel != nullptr) { + (*keysperlevel)[(int)metadata[i].level] += numkeysinfile; + } + fprintf(stderr, "level %d name %s smallest %s largest %s\n", + metadata[i].level, metadata[i].name.c_str(), + metadata[i].smallestkey.c_str(), + metadata[i].largestkey.c_str()); + } + return numKeys; + } + + void createLevel0Files(int numFiles, int numKeysPerFile) { + int startKey = 0; + DBImpl* dbi = reinterpret_cast(db_); + for (int i = 0; i < numFiles; i++) { + AddKeys(numKeysPerFile, startKey); + startKey += numKeysPerFile; + ASSERT_OK(dbi->TEST_FlushMemTable()); + ASSERT_OK(dbi->TEST_WaitForFlushMemTable()); + } + } + + void CheckFileTypeCounts(std::string& dir, + int required_log, + int required_sst, + int required_manifest) { + std::vector filenames; + env_->GetChildren(dir, &filenames); + + int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0; + for (auto file : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(file, &number, &type)) { + log_cnt += (type == kLogFile); + sst_cnt += (type == kTableFile); + manifest_cnt += (type == kDescriptorFile); + } + } + ASSERT_EQ(required_log, log_cnt); + ASSERT_EQ(required_sst, sst_cnt); + ASSERT_EQ(required_manifest, manifest_cnt); + } +}; + +TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) { + SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles", + "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"}, + {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles", + "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"}, + }); + 
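// Illustrative sketch (assumed example, not part of the patch): each
// LoadDependency pair {"A", "B"} above makes TEST_SYNC_POINT("B") block until
// point "A" has been reached, serializing the user thread against the
// background compaction. A rough condition-variable analogue of that
// happens-before mechanism (not the real SyncPoint implementation):
#include <condition_variable>
#include <mutex>
#include <set>
#include <string>
#include <thread>

std::mutex mu;
std::condition_variable cv;
std::set<std::string> reached;

void Reach(const std::string& point) {
  std::lock_guard<std::mutex> lock(mu);
  reached.insert(point);
  cv.notify_all();
}

void WaitFor(const std::string& predecessor) {
  std::unique_lock<std::mutex> lock(mu);
  cv.wait(lock, [&] { return reached.count(predecessor) > 0; });
}

int main() {
  std::thread background([] {
    Reach("FoundObsoleteFiles");  // compaction found files to purge
    WaitFor("UserThread:2");      // wait until the user thread catches up
    Reach("PurgedObsoleteFiles");
  });
  WaitFor("FoundObsoleteFiles");  // like TEST_SYNC_POINT("...:1")
  Reach("UserThread:2");
  WaitFor("PurgedObsoleteFiles");
  background.join();
  return 0;
}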
SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) { + Status* p_status = reinterpret_cast(arg); + ASSERT_OK(*p_status); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) { + std::vector* files_grabbed_for_purge_ptr = + reinterpret_cast*>(arg); + ASSERT_TRUE(files_grabbed_for_purge_ptr->empty()); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + createLevel0Files(2, 50000); + CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + + DBImpl* dbi = reinterpret_cast(db_); + port::Thread user_thread([&]() { + JobContext jobCxt(0); + TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"); + dbi->TEST_LockMutex(); + dbi->FindObsoleteFiles(&jobCxt, + true /* force=true */, false /* no_full_scan=false */); + dbi->TEST_UnlockMutex(); + TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"); + dbi->PurgeObsoleteFiles(jobCxt); + jobCxt.Clean(); + }); + + user_thread.join(); + + CloseDB(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { + std::vector optsfiles_nums; + std::vector optsfiles_keep; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:1", [&](void* arg) { + optsfiles_nums.push_back(*reinterpret_cast(arg)); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::PurgeObsoleteFiles:CheckOptionsFiles:2", [&](void* arg) { + optsfiles_keep.push_back(*reinterpret_cast(arg)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + createLevel0Files(2, 50000); + CheckFileTypeCounts(options_.wal_dir, 1, 0, 0); + + DBImpl* dbi = static_cast(db_); + ASSERT_OK(dbi->DisableFileDeletions()); + for (int i = 0; i != 4; ++i) { + if (i % 2) { + ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), + {{"paranoid_file_checks", "false"}})); + } else { + ASSERT_OK(dbi->SetOptions(dbi->DefaultColumnFamily(), + {{"paranoid_file_checks", "true"}})); + } + } + ASSERT_OK(dbi->EnableFileDeletions(true /* force */)); + ASSERT_EQ(optsfiles_nums.size(), optsfiles_keep.size()); + + CloseDB(); + + std::vector files; + int opts_file_count = 0; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t file_num; + Slice dummy_info_log_name_prefix; + FileType type; + WalFileType log_type; + if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type, + &log_type) && + type == kOptionsFile) { + opts_file_count++; + } + } + ASSERT_EQ(2, opts_file_count); + SyncPoint::GetInstance()->DisableProcessing(); +} + +} //namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/db/options_file_test.cc b/thirdparty/rocksdb/db/options_file_test.cc index fc62840eb4..0a9a34ff0b 100644 --- a/thirdparty/rocksdb/db/options_file_test.cc +++ b/thirdparty/rocksdb/db/options_file_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { class OptionsFileTest : public testing::Test { public: - OptionsFileTest() : dbname_(test::TmpDir() + "/options_file_test") {} + OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {} std::string dbname_; }; @@ -112,7 +112,7 @@ int main(int argc, char** argv) { #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** 
/*argv*/) { printf("Skipped as Options file is not supported in RocksDBLite.\n"); return 0; } diff --git a/thirdparty/rocksdb/db/perf_context_test.cc b/thirdparty/rocksdb/db/perf_context_test.cc index d06843a830..b7efec182a 100644 --- a/thirdparty/rocksdb/db/perf_context_test.cc +++ b/thirdparty/rocksdb/db/perf_context_test.cc @@ -10,6 +10,7 @@ #include "monitoring/histogram.h" #include "monitoring/instrumented_mutex.h" +#include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" #include "port/port.h" #include "rocksdb/db.h" @@ -30,7 +31,7 @@ int FLAGS_min_write_buffer_number_to_merge = 7; bool FLAGS_verbose = false; // Path to the database on file system -const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test"; +const std::string kDbName = rocksdb::test::PerThreadDBPath("perf_context_test"); namespace rocksdb { @@ -227,6 +228,9 @@ void ProfileQueries(bool enabled_time = false) { HistogramImpl hist_write_pre_post; HistogramImpl hist_write_wal_time; HistogramImpl hist_write_memtable_time; + HistogramImpl hist_write_delay_time; + HistogramImpl hist_write_thread_wait_nanos; + HistogramImpl hist_write_scheduling_time; uint64_t total_db_mutex_nanos = 0; @@ -270,9 +274,15 @@ void ProfileQueries(bool enabled_time = false) { ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); #endif } - hist_write_pre_post.Add(get_perf_context()->write_pre_and_post_process_time); + hist_write_pre_post.Add( + get_perf_context()->write_pre_and_post_process_time); hist_write_wal_time.Add(get_perf_context()->write_wal_time); hist_write_memtable_time.Add(get_perf_context()->write_memtable_time); + hist_write_delay_time.Add(get_perf_context()->write_delay_time); + hist_write_thread_wait_nanos.Add( + get_perf_context()->write_thread_wait_nanos); + hist_write_scheduling_time.Add( + get_perf_context()->write_scheduling_flushes_compactions_time); hist_put.Add(get_perf_context()->user_key_comparison_count); total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos; } @@ -320,6 +330,11 @@ void ProfileQueries(bool enabled_time = false) { << hist_write_wal_time.ToString() << "\n" << " Writing Mem Table time: \n" << hist_write_memtable_time.ToString() << "\n" + << " Write Delay: \n" << hist_write_delay_time.ToString() << "\n" + << " Waiting for Batch time: \n" + << hist_write_thread_wait_nanos.ToString() << "\n" + << " Scheduling Flushes and Compactions Time: \n" + << hist_write_scheduling_time.ToString() << "\n" << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n"; std::cout << "Get(): Time to get snapshot: \n" @@ -359,6 +374,14 @@ void ProfileQueries(bool enabled_time = false) { ASSERT_GT(hist_mget_files.Average(), 0); ASSERT_GT(hist_mget_post_process.Average(), 0); ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + + EXPECT_GT(hist_write_pre_post.Average(), 0); + EXPECT_GT(hist_write_wal_time.Average(), 0); + EXPECT_GT(hist_write_memtable_time.Average(), 0); + EXPECT_EQ(hist_write_delay_time.Average(), 0); + EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0); + EXPECT_GT(hist_write_scheduling_time.Average(), 0); + #ifndef NDEBUG ASSERT_GT(total_db_mutex_nanos, 2000U); #endif @@ -447,7 +470,7 @@ void ProfileQueries(bool enabled_time = false) { ASSERT_GT(hist_num_memtable_checked.Average(), 0); // In read-only mode Get(), no super version operation is needed ASSERT_EQ(hist_get_post_process.Average(), 0); - ASSERT_EQ(hist_get_snapshot.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); ASSERT_GT(hist_mget.Average(), 0); 
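// Illustrative sketch (assumed example, not part of the patch): the
// histograms above are fed from the thread-local perf context, which the new
// hunks extend with write_delay_time, write_thread_wait_nanos and
// write_scheduling_flushes_compactions_time. The basic measurement pattern,
// using only the public API (the DB path is hypothetical):
#include <iostream>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB::Open(options, "/tmp/perf_context_example", &db);

  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  rocksdb::get_perf_context()->Reset();  // counters accumulate until Reset()

  db->Put(rocksdb::WriteOptions(), "k", "v");
  std::string value;
  db->Get(rocksdb::ReadOptions(), "k", &value);

  std::cout << "write_wal_time: "
            << rocksdb::get_perf_context()->write_wal_time << "\n"
            << "user_key_comparison_count: "
            << rocksdb::get_perf_context()->user_key_comparison_count << "\n";
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
  delete db;
  return 0;
}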
ASSERT_GT(hist_mget_snapshot.Average(), 0); @@ -557,18 +580,18 @@ TEST_F(PerfContextTest, SeekKeyComparison) { TEST_F(PerfContextTest, DBMutexLockCounter) { int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; - for (PerfLevel perf_level : + for (PerfLevel perf_level_test : {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { for (int c = 0; c < 2; ++c) { InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); mutex.Lock(); rocksdb::port::Thread child_thread([&] { - SetPerfLevel(perf_level); + SetPerfLevel(perf_level_test); get_perf_context()->Reset(); ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); mutex.Lock(); mutex.Unlock(); - if (perf_level == PerfLevel::kEnableTimeExceptForMutex || + if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex || stats_code[c] != DB_MUTEX_WAIT_MICROS) { ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0); } else { @@ -664,7 +687,258 @@ TEST_F(PerfContextTest, MergeOperatorTime) { delete db; } + +TEST_F(PerfContextTest, CopyAndMove) { + // Assignment operator + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_assign; + perf_context_assign = *get_perf_context(); + ASSERT_EQ( + 1, + (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, + (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful); + perf_context_assign.ClearPerLevelPerfContext(); + perf_context_assign.Reset(); + } + // Copy constructor + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_copy(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + } + // Move constructor + { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + ASSERT_EQ( + 1, + (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + PerfContext perf_context_move = std::move(*get_perf_context()); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); + ASSERT_EQ( + 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful); + perf_context_move.ClearPerLevelPerfContext(); + perf_context_move.Reset(); + } +} + +TEST_F(PerfContextTest, PerfContextDisableEnable) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + get_perf_context()->DisablePerLevelPerfContext(); + PerfContext perf_context_copy(*get_perf_context()); + 
ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0] + .bloom_filter_full_positive); + // this was set when per level perf context is disabled, should not be copied + ASSERT_NE( + 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count); + perf_context_copy.ClearPerLevelPerfContext(); + perf_context_copy.Reset(); + get_perf_context()->ClearPerLevelPerfContext(); + get_perf_context()->Reset(); +} + +TEST_F(PerfContextTest, PerfContextByLevelGetSet) { + get_perf_context()->Reset(); + get_perf_context()->EnablePerLevelPerfContext(); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3); + PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1); + ASSERT_EQ( + 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + ASSERT_EQ( + 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful); + ASSERT_EQ( + 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] + .bloom_filter_full_positive); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2] + .bloom_filter_full_true_positive); + ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0] + .block_cache_hit_count); + ASSERT_EQ(5, (*(get_perf_context()->level_to_perf_context))[2] + .block_cache_hit_count); + ASSERT_EQ(2, (*(get_perf_context()->level_to_perf_context))[3] + .block_cache_miss_count); + ASSERT_EQ(4, (*(get_perf_context()->level_to_perf_context))[1] + .block_cache_miss_count); + std::string zero_excluded = get_perf_context()->ToString(true); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7")); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_full_positive = 1@level0")); + ASSERT_NE(std::string::npos, + zero_excluded.find("bloom_filter_full_true_positive = 1@level2")); + ASSERT_NE(std::string::npos, + zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2")); + ASSERT_NE(std::string::npos, + zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3")); +} + +TEST_F(PerfContextTest, CPUTimer) { + DestroyDB(kDbName, Options()); + auto db = OpenDb(); + WriteOptions write_options; + ReadOptions read_options; + SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); + + std::string max_str = "0"; + for (int i = 0; i < FLAGS_total_keys; ++i) { + std::string i_str = ToString(i); + std::string key = "k" + i_str; + std::string value = "v" + i_str; + max_str = max_str > i_str ? 
max_str : i_str; + + db->Put(write_options, key, value); + } + std::string last_key = "k" + max_str; + std::string last_value = "v" + max_str; + + { + // Get + get_perf_context()->Reset(); + std::string value; + ASSERT_OK(db->Get(read_options, "k0", &value)); + ASSERT_EQ(value, "v0"); + + if (FLAGS_verbose) { + std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos + << "ns\n"; + } + + // Iter + std::unique_ptr iter(db->NewIterator(read_options)); + + // Seek + get_perf_context()->Reset(); + iter->Seek(last_key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Seek CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekForPrev + get_perf_context()->Reset(); + iter->SeekForPrev(last_key); + ASSERT_TRUE(iter->Valid()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekForPrev CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekToLast + get_perf_context()->Reset(); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(last_value, iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekToLast CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // SeekToFirst + get_perf_context()->Reset(); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter SeekToFirst CPU time nanos: " + << get_perf_context()->iter_seek_cpu_nanos << "ns\n"; + } + + // Next + get_perf_context()->Reset(); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v1", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Next CPU time nanos: " + << get_perf_context()->iter_next_cpu_nanos << "ns\n"; + } + + // Prev + get_perf_context()->Reset(); + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v0", iter->value().ToString()); + + if (FLAGS_verbose) { + std::cout << "Iter Prev CPU time nanos: " + << get_perf_context()->iter_prev_cpu_nanos << "ns\n"; + } + + // monotonically increasing + get_perf_context()->Reset(); + auto count = get_perf_context()->iter_seek_cpu_nanos; + for (int i = 0; i < FLAGS_total_keys; ++i) { + iter->Seek("k" + ToString(i)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("v" + ToString(i), iter->value().ToString()); + auto next_count = get_perf_context()->iter_seek_cpu_nanos; + ASSERT_GT(next_count, count); + count = next_count; + } + + // iterator creation/destruction; multiple iterators + { + std::unique_ptr iter2(db->NewIterator(read_options)); + ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); + iter2->Seek(last_key); + ASSERT_TRUE(iter2->Valid()); + ASSERT_EQ(last_value, iter2->value().ToString()); + ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count); + count = get_perf_context()->iter_seek_cpu_nanos; + } + ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); + } } +} // namespace rocksdb int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/thirdparty/rocksdb/db/plain_table_db_test.cc b/thirdparty/rocksdb/db/plain_table_db_test.cc index 0b60332e53..2dd0cff0b4 100644 --- a/thirdparty/rocksdb/db/plain_table_db_test.cc +++ b/thirdparty/rocksdb/db/plain_table_db_test.cc @@ -50,10 +50,11 @@ TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) { test::StringSource* string_source = new test::StringSource(contents, 0, false); - unique_ptr file_reader( + std::unique_ptr file_reader( 
test::GetRandomAccessFileReader(string_source)); - unique_ptr file_info(new PlainTableReaderFileInfo( - std::move(file_reader), EnvOptions(), kLength)); + std::unique_ptr file_info( + new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(), + kLength)); { PlainTableFileReader reader(file_info.get()); @@ -108,14 +109,14 @@ class PlainTableDBTest : public testing::Test, public: PlainTableDBTest() : env_(Env::Default()) {} - ~PlainTableDBTest() { + ~PlainTableDBTest() override { delete db_; EXPECT_OK(DestroyDB(dbname_, Options())); } void SetUp() override { mmap_mode_ = GetParam(); - dbname_ = test::TmpDir() + "/plain_table_db_test"; + dbname_ = test::PerThreadDBPath("plain_table_db_test"); EXPECT_OK(DestroyDB(dbname_, Options())); db_ = nullptr; Reopen(); @@ -157,6 +158,8 @@ class PlainTableDBTest : public testing::Test, db_ = nullptr; } + bool mmap_mode() const { return mmap_mode_; } + void DestroyAndReopen(Options* options = nullptr) { //Destroy using last options Destroy(&last_options_); @@ -173,6 +176,12 @@ class PlainTableDBTest : public testing::Test, return DB::Open(*options, dbname_, db); } + Status ReopenForReadOnly(Options* options) { + delete db_; + db_ = nullptr; + return DB::OpenForReadOnly(*options, dbname_, &db_); + } + Status TryReopen(Options* options = nullptr) { delete db_; db_ = nullptr; @@ -260,13 +269,15 @@ class TestPlainTableReader : public PlainTableReader { int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, - unique_ptr&& file, + std::unique_ptr&& file, const ImmutableCFOptions& ioptions, + const SliceTransform* prefix_extractor, bool* expect_bloom_not_match, bool store_index_in_file, uint32_t column_family_id, const std::string& column_family_name) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, - encoding_type, file_size, table_properties), + encoding_type, file_size, table_properties, + prefix_extractor), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataIfNeeded(); EXPECT_TRUE(s.ok()); @@ -292,10 +303,10 @@ class TestPlainTableReader : public PlainTableReader { } } - virtual ~TestPlainTableReader() {} + ~TestPlainTableReader() override {} private: - virtual bool MatchBloom(uint32_t hash) const override { + bool MatchBloom(uint32_t hash) const override { bool ret = PlainTableReader::MatchBloom(hash); if (*expect_bloom_not_match_) { EXPECT_TRUE(!ret); @@ -325,27 +336,29 @@ class TestPlainTableFactory : public PlainTableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const override { + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props); + table_reader_options.ioptions, &props, + true /* compression_type_missing */); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, table_reader_options.ioptions, - BloomBlockBuilder::kBloomBlock, &bloom_block_handle); + BloomBlockBuilder::kBloomBlock, &bloom_block_handle, + /* compression_type_missing */ true); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, table_reader_options.ioptions, 
PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_handle); + &index_block_handle, /* compression_type_missing */ true); EXPECT_TRUE(s.ok()); } @@ -360,7 +373,8 @@ class TestPlainTableFactory : public PlainTableFactory { table_reader_options.env_options, table_reader_options.internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), table_reader_options.ioptions, expect_bloom_not_match_, + std::move(file), table_reader_options.ioptions, + table_reader_options.prefix_extractor, expect_bloom_not_match_, store_index_in_file_, column_family_id_, column_family_name_)); *table = std::move(new_reader); @@ -541,6 +555,50 @@ TEST_P(PlainTableDBTest, Flush2) { } } +TEST_P(PlainTableDBTest, Immortal) { + for (EncodingType encoding_type : {kPlain, kPrefix}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_open_files = -1; + // Set only one bucket to force bucket conflict. + // Test index interval for the same prefix to be 1, 2 and 4 + PlainTableOptions plain_table_options; + plain_table_options.hash_table_ratio = 0.75; + plain_table_options.index_sparseness = 16; + plain_table_options.user_key_len = kPlainTableVariableLength; + plain_table_options.bloom_bits_per_key = 10; + plain_table_options.encoding_type = encoding_type; + options.table_factory.reset(NewPlainTableFactory(plain_table_options)); + + DestroyAndReopen(&options); + ASSERT_OK(Put("0000000000000bar", "b")); + ASSERT_OK(Put("1000000000000foo", "v1")); + dbfull()->TEST_FlushMemTable(); + + int copied = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ(2, copied); + copied = 0; + + Close(); + ASSERT_OK(ReopenForReadOnly(&options)); + + ASSERT_EQ("b", Get("0000000000000bar")); + ASSERT_EQ("v1", Get("1000000000000foo")); + ASSERT_EQ("NOT_FOUND", Get("1000000000000bar")); + if (mmap_mode()) { + ASSERT_EQ(0, copied); + } else { + ASSERT_EQ(2, copied); + } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } +} + TEST_P(PlainTableDBTest, Iterator) { for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024; huge_page_tlb_size += 2 * 1024 * 1024) { @@ -1170,7 +1228,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/db/pre_release_callback.h b/thirdparty/rocksdb/db/pre_release_callback.h new file mode 100644 index 0000000000..f91ef1b27a --- /dev/null +++ b/thirdparty/rocksdb/db/pre_release_callback.h @@ -0,0 +1,34 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +namespace rocksdb { + +class DB; + +class PreReleaseCallback { + public: + virtual ~PreReleaseCallback() {} + + // Will be called while on the write thread after the write to the WAL and + // before the write to memtable. 
This is useful if any operation needs to be + // done before the write gets visible to the readers, or if we want to reduce + // the overhead of locking by updating something sequentially while we are on + // the write thread. If the callback fails, this function returns a non-OK + // status, the sequence number will not be released, and same status will be + // propagated to all the writers in the write group. + // seq is the sequence number that is used for this write and will be + // released. + // is_mem_disabled is currently used for debugging purposes to assert that + // the callback is done from the right write queue. + // If non-zero, log_number indicates the WAL log to which we wrote. + virtual Status Callback(SequenceNumber seq, bool is_mem_disabled, + uint64_t log_number) = 0; +}; + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/prefix_test.cc b/thirdparty/rocksdb/db/prefix_test.cc index a4ed201dad..ac854cb3db 100644 --- a/thirdparty/rocksdb/db/prefix_test.cc +++ b/thirdparty/rocksdb/db/prefix_test.cc @@ -17,7 +17,6 @@ int main() { #include #include -#include #include "db/db_impl.h" #include "monitoring/histogram.h" #include "rocksdb/comparator.h" @@ -27,14 +26,15 @@ int main() { #include "rocksdb/perf_context.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "util/coding.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/testharness.h" #include "utilities/merge_operators.h" -#include "util/coding.h" -using GFLAGS::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_bool(trigger_deadlock, false, "issue delete in range scan to trigger PrefixHashMap deadlock"); @@ -53,7 +53,7 @@ DEFINE_int32(value_size, 40, ""); DEFINE_bool(enable_print, false, "Print options generated to console."); // Path to the database on file system -const std::string kDbName = rocksdb::test::TmpDir() + "/prefix_test"; +const std::string kDbName = rocksdb::test::PerThreadDBPath("prefix_test"); namespace rocksdb { @@ -83,7 +83,7 @@ class TestKeyComparator : public Comparator { // Compare needs to be aware of the possibility of a and/or b is // prefix only - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { const TestKey kkey_a = SliceToTestKey(a); const TestKey kkey_b = SliceToTestKey(b); const TestKey *key_a = &kkey_a; @@ -122,14 +122,12 @@ class TestKeyComparator : public Comparator { return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0; } - virtual const char* Name() const override { - return "TestKeyComparator"; - } + const char* Name() const override { return "TestKeyComparator"; } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override {} + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const override {} - virtual void FindShortSuccessor(std::string* key) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} }; namespace { @@ -195,23 +193,23 @@ class SamePrefixTransform : public SliceTransform { explicit SamePrefixTransform(const Slice& prefix) : prefix_(prefix), name_("rocksdb.SamePrefix." 
+ prefix.ToString()) {} - virtual const char* Name() const override { return name_.c_str(); } + const char* Name() const override { return name_.c_str(); } - virtual Slice Transform(const Slice& src) const override { + Slice Transform(const Slice& src) const override { assert(InDomain(src)); return prefix_; } - virtual bool InDomain(const Slice& src) const override { + bool InDomain(const Slice& src) const override { if (src.size() >= prefix_.size()) { return Slice(src.data(), prefix_.size()) == prefix_; } return false; } - virtual bool InRange(const Slice& dst) const override { - return dst == prefix_; - } + bool InRange(const Slice& dst) const override { return dst == prefix_; } + + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } }; } // namespace @@ -279,9 +277,8 @@ class PrefixTest : public testing::Test { PrefixTest() : option_config_(kBegin) { options.comparator = new TestKeyComparator(); } - ~PrefixTest() { - delete options.comparator; - } + ~PrefixTest() override { delete options.comparator; } + protected: enum OptionConfig { kBegin, @@ -879,8 +876,6 @@ TEST_F(PrefixTest, PrefixSeekModePrev3) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); ParseCommandLineFlags(&argc, &argv, true); - std::cout << kDbName << "\n"; - return RUN_ALL_TESTS(); } @@ -889,7 +884,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as HashSkipList and HashLinkList are not supported in " "ROCKSDB_LITE\n"); diff --git a/thirdparty/rocksdb/db/range_del_aggregator.cc b/thirdparty/rocksdb/db/range_del_aggregator.cc index c83f5a88cd..68216fc92f 100644 --- a/thirdparty/rocksdb/db/range_del_aggregator.cc +++ b/thirdparty/rocksdb/db/range_del_aggregator.cc @@ -1,520 +1,484 @@ -// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#include "db/range_del_aggregator.h" -#include +#include "db/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "include/rocksdb/comparator.h" +#include "include/rocksdb/types.h" +#include "table/internal_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/table_builder.h" +#include "util/heap.h" +#include "util/kv_map.h" +#include "util/vector_iterator.h" namespace rocksdb { -RangeDelAggregator::RangeDelAggregator( - const InternalKeyComparator& icmp, - const std::vector& snapshots, - bool collapse_deletions /* = true */) - : upper_bound_(kMaxSequenceNumber), +TruncatedRangeDelIterator::TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest) + : iter_(std::move(iter)), icmp_(icmp), - collapse_deletions_(collapse_deletions) { - InitRep(snapshots); + smallest_ikey_(smallest), + largest_ikey_(largest) { + if (smallest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_smallest = pinned_bounds_.back(); + if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { + assert(false); + } + smallest_ = &parsed_smallest; + } + if (largest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_largest = pinned_bounds_.back(); + if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { + assert(false); + } + if (parsed_largest.type == kTypeRangeDeletion && + parsed_largest.sequence == kMaxSequenceNumber) { + // The file boundary has been artificially extended by a range tombstone. + // We do not need to adjust largest to properly truncate range + // tombstones that extend past the boundary. + } else if (parsed_largest.sequence == 0) { + // The largest key in the sstable has a sequence number of 0. Since we + // guarantee that no internal keys with the same user key and sequence + // number can exist in a DB, we know that the largest key in this sstable + // cannot exist as the smallest key in the next sstable. This further + // implies that no range tombstone in this sstable covers largest; + // otherwise, the file boundary would have been artificially extended. + // + // Therefore, we will never truncate a range tombstone at largest, so we + // can leave it unchanged. + } else { + // The same user key may straddle two sstable boundaries. To ensure that + // the truncated end key can cover the largest key in this sstable, reduce + // its sequence number by 1. 
+ parsed_largest.sequence -= 1; + } + largest_ = &parsed_largest; + } } -RangeDelAggregator::RangeDelAggregator(const InternalKeyComparator& icmp, - SequenceNumber snapshot, - bool collapse_deletions /* = false */) - : upper_bound_(snapshot), - icmp_(icmp), - collapse_deletions_(collapse_deletions) {} - -void RangeDelAggregator::InitRep(const std::vector& snapshots) { - assert(rep_ == nullptr); - rep_.reset(new Rep()); - for (auto snapshot : snapshots) { - rep_->stripe_map_.emplace( - snapshot, - PositionalTombstoneMap(TombstoneMap( - stl_wrappers::LessOfComparator(icmp_.user_comparator())))); - } - // Data newer than any snapshot falls in this catch-all stripe - rep_->stripe_map_.emplace( - kMaxSequenceNumber, - PositionalTombstoneMap(TombstoneMap( - stl_wrappers::LessOfComparator(icmp_.user_comparator())))); - rep_->pinned_iters_mgr_.StartPinning(); +bool TruncatedRangeDelIterator::Valid() const { + return iter_->Valid() && + (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) && + (largest_ == nullptr || + icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0); } -bool RangeDelAggregator::ShouldDelete( - const Slice& internal_key, RangeDelAggregator::RangePositioningMode mode) { - if (rep_ == nullptr) { - return false; +void TruncatedRangeDelIterator::Next() { iter_->TopNext(); } + +void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); } + +void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); } + +// NOTE: target is a user key +void TruncatedRangeDelIterator::Seek(const Slice& target) { + if (largest_ != nullptr && + icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber, + kTypeRangeDeletion)) <= 0) { + iter_->Invalidate(); + return; } - ParsedInternalKey parsed; - if (!ParseInternalKey(internal_key, &parsed)) { - assert(false); + if (smallest_ != nullptr && + icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { + iter_->Seek(smallest_->user_key); + return; } - return ShouldDelete(parsed, mode); + iter_->Seek(target); } -bool RangeDelAggregator::ShouldDelete( - const ParsedInternalKey& parsed, - RangeDelAggregator::RangePositioningMode mode) { - assert(IsValueType(parsed.type)); - if (rep_ == nullptr) { - return false; - } - auto& positional_tombstone_map = GetPositionalTombstoneMap(parsed.sequence); - const auto& tombstone_map = positional_tombstone_map.raw_map; - if (tombstone_map.empty()) { - return false; +// NOTE: target is a user key +void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { + if (smallest_ != nullptr && + icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion), + *smallest_) < 0) { + iter_->Invalidate(); + return; } - auto& tombstone_map_iter = positional_tombstone_map.iter; - if (tombstone_map_iter == tombstone_map.end() && - (mode == kForwardTraversal || mode == kBackwardTraversal)) { - // invalid (e.g., if AddTombstones() changed the deletions), so need to - // reseek - mode = kBinarySearch; + if (largest_ != nullptr && + icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { + iter_->SeekForPrev(largest_->user_key); + return; } - switch (mode) { - case kFullScan: - assert(!collapse_deletions_); - // The maintained state (PositionalTombstoneMap::iter) isn't useful when - // we linear scan from the beginning each time, but we maintain it anyways - // for consistency. 
- tombstone_map_iter = tombstone_map.begin(); - while (tombstone_map_iter != tombstone_map.end()) { - const auto& tombstone = tombstone_map_iter->second; - if (icmp_.user_comparator()->Compare(parsed.user_key, - tombstone.start_key_) < 0) { - break; - } - if (parsed.sequence < tombstone.seq_ && - icmp_.user_comparator()->Compare(parsed.user_key, - tombstone.end_key_) < 0) { - return true; - } - ++tombstone_map_iter; - } - return false; - case kForwardTraversal: - assert(collapse_deletions_ && tombstone_map_iter != tombstone_map.end()); - if (tombstone_map_iter == tombstone_map.begin() && - icmp_.user_comparator()->Compare(parsed.user_key, - tombstone_map_iter->first) < 0) { - // before start of deletion intervals - return false; - } - while (std::next(tombstone_map_iter) != tombstone_map.end() && - icmp_.user_comparator()->Compare( - std::next(tombstone_map_iter)->first, parsed.user_key) <= 0) { - ++tombstone_map_iter; - } - break; - case kBackwardTraversal: - assert(collapse_deletions_ && tombstone_map_iter != tombstone_map.end()); - while (tombstone_map_iter != tombstone_map.begin() && - icmp_.user_comparator()->Compare(parsed.user_key, - tombstone_map_iter->first) < 0) { - --tombstone_map_iter; - } - if (tombstone_map_iter == tombstone_map.begin() && - icmp_.user_comparator()->Compare(parsed.user_key, - tombstone_map_iter->first) < 0) { - // before start of deletion intervals - return false; - } - break; - case kBinarySearch: - assert(collapse_deletions_); - tombstone_map_iter = - tombstone_map.upper_bound(parsed.user_key); - if (tombstone_map_iter == tombstone_map.begin()) { - // before start of deletion intervals - return false; - } - --tombstone_map_iter; - break; + iter_->SeekForPrev(target); +} + +void TruncatedRangeDelIterator::SeekToFirst() { + if (smallest_ != nullptr) { + iter_->Seek(smallest_->user_key); + return; } - assert(mode != kFullScan); - assert(tombstone_map_iter != tombstone_map.end() && - icmp_.user_comparator()->Compare(tombstone_map_iter->first, - parsed.user_key) <= 0); - assert(std::next(tombstone_map_iter) == tombstone_map.end() || - icmp_.user_comparator()->Compare( - parsed.user_key, std::next(tombstone_map_iter)->first) < 0); - return parsed.sequence < tombstone_map_iter->second.seq_; + iter_->SeekToTopFirst(); } -bool RangeDelAggregator::ShouldAddTombstones( - bool bottommost_level /* = false */) { - // TODO(andrewkr): can we just open a file and throw it away if it ends up - // empty after AddToBuilder()? This function doesn't take into subcompaction - // boundaries so isn't completely accurate. - if (rep_ == nullptr) { - return false; +void TruncatedRangeDelIterator::SeekToLast() { + if (largest_ != nullptr) { + iter_->SeekForPrev(largest_->user_key); + return; } - auto stripe_map_iter = rep_->stripe_map_.begin(); - assert(stripe_map_iter != rep_->stripe_map_.end()); - if (bottommost_level) { - // For the bottommost level, keys covered by tombstones in the first - // (oldest) stripe have been compacted away, so the tombstones are obsolete. 
- ++stripe_map_iter; + iter_->SeekToTopLast(); +} + +std::map> +TruncatedRangeDelIterator::SplitBySnapshot( + const std::vector& snapshots) { + using FragmentedIterPair = + std::pair>; + + auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); + std::map> + split_truncated_iters; + std::for_each( + split_untruncated_iters.begin(), split_untruncated_iters.end(), + [&](FragmentedIterPair& iter_pair) { + std::unique_ptr truncated_iter( + new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_, + smallest_ikey_, largest_ikey_)); + split_truncated_iters.emplace(iter_pair.first, + std::move(truncated_iter)); + }); + return split_truncated_iters; +} + +ForwardRangeDelIterator::ForwardRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(EndKeyMinComparator(icmp)), + inactive_iters_(StartKeyMinComparator(icmp)) {} + +bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that end before parsed. + while (!active_iters_.empty() && + icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Next(); + } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } - while (stripe_map_iter != rep_->stripe_map_.end()) { - if (!stripe_map_iter->second.raw_map.empty()) { - return true; + + // Move inactive iterators that start before parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) { + iter->Next(); } - ++stripe_map_iter; + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } - return false; + + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; } -Status RangeDelAggregator::AddTombstones( - std::unique_ptr input) { - if (input == nullptr) { - return Status::OK(); +void ForwardRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); +} + +ReverseRangeDelIterator::ReverseRangeDelIterator( + const InternalKeyComparator* icmp) + : icmp_(icmp), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(StartKeyMaxComparator(icmp)), + inactive_iters_(EndKeyMaxComparator(icmp)) {} + +bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + // Move active iterators that start after parsed. + while (!active_iters_.empty() && + icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Prev(); + } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } - input->SeekToFirst(); - bool first_iter = true; - while (input->Valid()) { - if (first_iter) { - if (rep_ == nullptr) { - InitRep({upper_bound_}); - } else { - InvalidateTombstoneMapPositions(); - } - first_iter = false; - } - ParsedInternalKey parsed_key; - if (!ParseInternalKey(input->key(), &parsed_key)) { - return Status::Corruption("Unable to parse range tombstone InternalKey"); + + // Move inactive iterators that end after parsed. 
+ while (!inactive_iters_.empty() && + icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) { + iter->Prev(); } - RangeTombstone tombstone(parsed_key, input->value()); - AddTombstone(std::move(tombstone)); - input->Next(); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } - if (!first_iter) { - rep_->pinned_iters_mgr_.PinIterator(input.release(), false /* arena */); - } - return Status::OK(); + + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; } -void RangeDelAggregator::InvalidateTombstoneMapPositions() { - if (rep_ == nullptr) { - return; - } - for (auto stripe_map_iter = rep_->stripe_map_.begin(); - stripe_map_iter != rep_->stripe_map_.end(); ++stripe_map_iter) { - stripe_map_iter->second.iter = stripe_map_iter->second.raw_map.end(); - } +void ReverseRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); } -Status RangeDelAggregator::AddTombstone(RangeTombstone tombstone) { - auto& positional_tombstone_map = GetPositionalTombstoneMap(tombstone.seq_); - auto& tombstone_map = positional_tombstone_map.raw_map; - if (collapse_deletions_) { - // In collapsed mode, we only fill the seq_ field in the TombstoneMap's - // values. The end_key is unneeded because we assume the tombstone extends - // until the next tombstone starts. For gaps between real tombstones and - // for the last real tombstone, we denote end keys by inserting fake - // tombstones with sequence number zero. - std::vector new_range_dels{ - tombstone, RangeTombstone(tombstone.end_key_, Slice(), 0)}; - auto new_range_dels_iter = new_range_dels.begin(); - // Position at the first overlapping existing tombstone; if none exists, - // insert until we find an existing one overlapping a new point - const Slice* tombstone_map_begin = nullptr; - if (!tombstone_map.empty()) { - tombstone_map_begin = &tombstone_map.begin()->first; - } - auto last_range_dels_iter = new_range_dels_iter; - while (new_range_dels_iter != new_range_dels.end() && - (tombstone_map_begin == nullptr || - icmp_.user_comparator()->Compare(new_range_dels_iter->start_key_, - *tombstone_map_begin) < 0)) { - tombstone_map.emplace( - new_range_dels_iter->start_key_, - RangeTombstone(Slice(), Slice(), new_range_dels_iter->seq_)); - last_range_dels_iter = new_range_dels_iter; - ++new_range_dels_iter; - } - if (new_range_dels_iter == new_range_dels.end()) { - return Status::OK(); - } - // above loop advances one too far - new_range_dels_iter = last_range_dels_iter; - auto tombstone_map_iter = - tombstone_map.upper_bound(new_range_dels_iter->start_key_); - // if nothing overlapped we would've already inserted all the new points - // and returned early - assert(tombstone_map_iter != tombstone_map.begin()); - tombstone_map_iter--; - - // untermed_seq is non-kMaxSequenceNumber when we covered an existing point - // but haven't seen its corresponding endpoint. It's used for (1) deciding - // whether to forcibly insert the new interval's endpoint; and (2) possibly - // raising the seqnum for the to-be-inserted element (we insert the max - // seqnum between the next new interval and the unterminated interval). 
- SequenceNumber untermed_seq = kMaxSequenceNumber; - while (tombstone_map_iter != tombstone_map.end() && - new_range_dels_iter != new_range_dels.end()) { - const Slice *tombstone_map_iter_end = nullptr, - *new_range_dels_iter_end = nullptr; - if (tombstone_map_iter != tombstone_map.end()) { - auto next_tombstone_map_iter = std::next(tombstone_map_iter); - if (next_tombstone_map_iter != tombstone_map.end()) { - tombstone_map_iter_end = &next_tombstone_map_iter->first; - } - } - if (new_range_dels_iter != new_range_dels.end()) { - auto next_new_range_dels_iter = std::next(new_range_dels_iter); - if (next_new_range_dels_iter != new_range_dels.end()) { - new_range_dels_iter_end = &next_new_range_dels_iter->start_key_; - } - } +bool RangeDelAggregator::StripeRep::ShouldDelete( + const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + if (!InStripe(parsed.sequence) || IsEmpty()) { + return false; + } + switch (mode) { + case RangeDelPositioningMode::kForwardTraversal: + InvalidateReverseIter(); - // our positions in existing/new tombstone collections should always - // overlap. The non-overlapping cases are handled above and below this - // loop. - assert(new_range_dels_iter_end == nullptr || - icmp_.user_comparator()->Compare(tombstone_map_iter->first, - *new_range_dels_iter_end) < 0); - assert(tombstone_map_iter_end == nullptr || - icmp_.user_comparator()->Compare(new_range_dels_iter->start_key_, - *tombstone_map_iter_end) < 0); - - int new_to_old_start_cmp = icmp_.user_comparator()->Compare( - new_range_dels_iter->start_key_, tombstone_map_iter->first); - // nullptr end means extends infinitely rightwards, set new_to_old_end_cmp - // accordingly so we can use common code paths later. - int new_to_old_end_cmp; - if (new_range_dels_iter_end == nullptr && - tombstone_map_iter_end == nullptr) { - new_to_old_end_cmp = 0; - } else if (new_range_dels_iter_end == nullptr) { - new_to_old_end_cmp = 1; - } else if (tombstone_map_iter_end == nullptr) { - new_to_old_end_cmp = -1; - } else { - new_to_old_end_cmp = icmp_.user_comparator()->Compare( - *new_range_dels_iter_end, *tombstone_map_iter_end); + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); + it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { + auto& iter = *it; + forward_iter_.AddNewIter(iter.get(), parsed); } - if (new_to_old_start_cmp < 0) { - // the existing one's left endpoint comes after, so raise/delete it if - // it's covered. - if (tombstone_map_iter->second.seq_ < new_range_dels_iter->seq_) { - untermed_seq = tombstone_map_iter->second.seq_; - if (tombstone_map_iter != tombstone_map.begin() && - std::prev(tombstone_map_iter)->second.seq_ == - new_range_dels_iter->seq_) { - tombstone_map_iter = tombstone_map.erase(tombstone_map_iter); - --tombstone_map_iter; - } else { - tombstone_map_iter->second.seq_ = new_range_dels_iter->seq_; - } - } - } else if (new_to_old_start_cmp > 0) { - if (untermed_seq != kMaxSequenceNumber || - tombstone_map_iter->second.seq_ < new_range_dels_iter->seq_) { - auto seq = tombstone_map_iter->second.seq_; - // need to adjust this element if not intended to span beyond the new - // element (i.e., was_tombstone_map_iter_raised == true), or if it - // can be raised - tombstone_map_iter = tombstone_map.emplace( - new_range_dels_iter->start_key_, - RangeTombstone( - Slice(), Slice(), - std::max( - untermed_seq == kMaxSequenceNumber ? 
0 : untermed_seq,
-                                         new_range_dels_iter->seq_)));
-        untermed_seq = seq;
-      }
-    } else {
-      // their left endpoints coincide, so raise the existing one if needed
-      if (tombstone_map_iter->second.seq_ < new_range_dels_iter->seq_) {
-        untermed_seq = tombstone_map_iter->second.seq_;
-        tombstone_map_iter->second.seq_ = new_range_dels_iter->seq_;
-      }
+      return forward_iter_.ShouldDelete(parsed);
+    case RangeDelPositioningMode::kBackwardTraversal:
+      InvalidateForwardIter();
+
+      // Pick up previously unseen iterators.
+      for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+           it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+        auto& iter = *it;
+        reverse_iter_.AddNewIter(iter.get(), parsed);
       }
-      // advance whichever one ends earlier, or both if their right endpoints
-      // coincide
-      if (new_to_old_end_cmp < 0) {
-        ++new_range_dels_iter;
-      } else if (new_to_old_end_cmp > 0) {
-        ++tombstone_map_iter;
-        untermed_seq = kMaxSequenceNumber;
-      } else {
-        ++new_range_dels_iter;
-        ++tombstone_map_iter;
-        untermed_seq = kMaxSequenceNumber;
+      return reverse_iter_.ShouldDelete(parsed);
+    default:
+      assert(false);
+      return false;
+  }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+                                                      const Slice& end) {
+  Invalidate();
+
+  // Set the internal start/end keys so that:
+  // - if start_ikey has the same user key and sequence number as the
+  //   current end key, start_ikey will be considered greater; and
+  // - if end_ikey has the same user key and sequence number as the current
+  //   start key, end_ikey will be considered greater.
+  ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+                               static_cast<ValueType>(0));
+  ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+  for (auto& iter : iters_) {
+    bool checked_candidate_tombstones = false;
+    for (iter->SeekForPrev(start);
+         iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+         iter->Next()) {
+      checked_candidate_tombstones = true;
+      if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
      }
    }
-    while (new_range_dels_iter != new_range_dels.end()) {
-      tombstone_map.emplace(
-          new_range_dels_iter->start_key_,
-          RangeTombstone(Slice(), Slice(), new_range_dels_iter->seq_));
-      ++new_range_dels_iter;
+
+    if (!checked_candidate_tombstones) {
+      // Do an additional check for when the end of the range is the begin
+      // key of a tombstone, which we missed earlier since SeekForPrev'ing
+      // to the start was invalid.
+      iter->SeekForPrev(end);
+      if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+          icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+        return true;
+      }
     }
-  } else {
-    auto start_key = tombstone.start_key_;
-    tombstone_map.emplace(start_key, std::move(tombstone));
   }
-  return Status::OK();
+  return false;
 }

-RangeDelAggregator::PositionalTombstoneMap&
-RangeDelAggregator::GetPositionalTombstoneMap(SequenceNumber seq) {
-  assert(rep_ != nullptr);
-  // The stripe includes seqnum for the snapshot above and excludes seqnum for
-  // the snapshot below.
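IsRangeOverlapped above relies on internal-key ordering: user keys sort ascending, and among equal user keys, larger sequence numbers sort earlier. That is why start_ikey carries kMaxSequenceNumber and end_ikey carries zero, which is what the comment above is arranging at tombstone boundaries. A sketch of that ordering with simplified stand-in types (FakeInternalKey and CompareInternal are illustrative, not the RocksDB comparator):

#include <cstdint>
#include <string>

// Illustrative internal-key ordering: ascending user key; for equal user
// keys, a higher sequence number compares *smaller* (sorts earlier).
struct FakeInternalKey {
  std::string user_key;
  uint64_t seq;
};

int CompareInternal(const FakeInternalKey& a, const FakeInternalKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key ? -1 : 1;
  if (a.seq != b.seq) return a.seq > b.seq ? -1 : 1;  // descending seqnum
  return 0;
}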
-  StripeMap::iterator iter;
-  if (seq > 0) {
-    // upper_bound() checks strict inequality so need to subtract one
-    iter = rep_->stripe_map_.upper_bound(seq - 1);
-  } else {
-    iter = rep_->stripe_map_.begin();
+void ReadRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
+    return;
   }
-  // catch-all stripe justifies this assertion in either of above cases
-  assert(iter != rep_->stripe_map_.end());
-  return iter->second;
+  rep_.AddTombstones(
+      std::unique_ptr<TruncatedRangeDelIterator>(new TruncatedRangeDelIterator(
+          std::move(input_iter), icmp_, smallest, largest)));
+}
+
+bool ReadRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+                                          RangeDelPositioningMode mode) {
+  return rep_.ShouldDelete(parsed, mode);
 }

-// TODO(andrewkr): We should implement an iterator over range tombstones in our
-// map. It'd enable compaction to open tables on-demand, i.e., only once range
-// tombstones are known to be available, without the code duplication we have
-// in ShouldAddTombstones(). It'll also allow us to move the table-modifying
-// code into more coherent places: CompactionJob and BuildTable().
-void RangeDelAggregator::AddToBuilder(
-    TableBuilder* builder, const Slice* lower_bound, const Slice* upper_bound,
-    FileMetaData* meta,
-    CompactionIterationStats* range_del_out_stats /* = nullptr */,
-    bool bottommost_level /* = false */) {
-  if (rep_ == nullptr) {
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+                                               const Slice& end) {
+  InvalidateRangeDelMapPositions();
+  return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+    const InternalKey* smallest, const InternalKey* largest) {
+  if (input_iter == nullptr || input_iter->empty()) {
     return;
   }
-  auto stripe_map_iter = rep_->stripe_map_.begin();
-  assert(stripe_map_iter != rep_->stripe_map_.end());
-  if (bottommost_level) {
-    // TODO(andrewkr): these are counted for each compaction output file, so
-    // lots of double-counting.
-    if (!stripe_map_iter->second.raw_map.empty()) {
-      range_del_out_stats->num_range_del_drop_obsolete +=
-          static_cast<int64_t>(stripe_map_iter->second.raw_map.size()) -
-          (collapse_deletions_ ? 1 : 0);
-      range_del_out_stats->num_record_drop_obsolete +=
-          static_cast<int64_t>(stripe_map_iter->second.raw_map.size()) -
-          (collapse_deletions_ ? 1 : 0);
+  assert(input_iter->lower_bound() == 0);
+  assert(input_iter->upper_bound() == kMaxSequenceNumber);
+  parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+      std::move(input_iter), icmp_, smallest, largest));
+
+  auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+  for (auto& split_iter : split_iters) {
+    auto it = reps_.find(split_iter.first);
+    if (it == reps_.end()) {
+      bool inserted;
+      SequenceNumber upper_bound = split_iter.second->upper_bound();
+      SequenceNumber lower_bound = split_iter.second->lower_bound();
+      std::tie(it, inserted) = reps_.emplace(
+          split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+      assert(inserted);
     }
-    // For the bottommost level, keys covered by tombstones in the first
-    // (oldest) stripe have been compacted away, so the tombstones are obsolete.
-    ++stripe_map_iter;
+    assert(it != reps_.end());
+    it->second.AddTombstones(std::move(split_iter.second));
   }
+}

-  // Note the order in which tombstones are stored is insignificant since we
-  // insert them into a std::map on the read path.
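SplitBySnapshot above partitions each truncated tombstone iterator across the snapshot list, so each StripeRep only ever sees tombstones whose sequence numbers fall in its band. A self-contained sketch of the band arithmetic, assuming snapshots sorted ascending (MakeStripes is an illustrative helper, not part of this patch):

#include <cstdint>
#include <utility>
#include <vector>

using SeqNo = uint64_t;
constexpr SeqNo kMaxSeq = UINT64_MAX;

// Snapshots s1 < s2 < ... partition the seqnum space into inclusive bands
// [0, s1], [s1 + 1, s2], ..., [sk + 1, kMaxSeq].
std::vector<std::pair<SeqNo, SeqNo>> MakeStripes(
    const std::vector<SeqNo>& snapshots) {
  std::vector<std::pair<SeqNo, SeqNo>> stripes;
  SeqNo lower = 0;
  for (SeqNo snap : snapshots) {
    stripes.emplace_back(lower, snap);
    lower = snap + 1;
  }
  stripes.emplace_back(lower, kMaxSeq);
  return stripes;
}

For snapshots {9, 19} this yields [0, 9], [10, 19], and [20, kMaxSeq], matching the stripe annotations in the CompactionAggregatorWithSnapshots test later in this diff.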
- while (stripe_map_iter != rep_->stripe_map_.end()) { - bool first_added = false; - for (auto tombstone_map_iter = stripe_map_iter->second.raw_map.begin(); - tombstone_map_iter != stripe_map_iter->second.raw_map.end(); - ++tombstone_map_iter) { - RangeTombstone tombstone; - if (collapse_deletions_) { - auto next_tombstone_map_iter = std::next(tombstone_map_iter); - if (next_tombstone_map_iter == stripe_map_iter->second.raw_map.end() || - tombstone_map_iter->second.seq_ == 0) { - // it's a sentinel tombstone - continue; - } - tombstone.start_key_ = tombstone_map_iter->first; - tombstone.end_key_ = next_tombstone_map_iter->first; - tombstone.seq_ = tombstone_map_iter->second.seq_; - } else { - tombstone = tombstone_map_iter->second; - } - if (upper_bound != nullptr && - icmp_.user_comparator()->Compare(*upper_bound, - tombstone.start_key_) <= 0) { - // Tombstones starting at upper_bound or later only need to be included - // in the next table. Break because subsequent tombstones will start - // even later. - break; - } - if (lower_bound != nullptr && - icmp_.user_comparator()->Compare(tombstone.end_key_, - *lower_bound) <= 0) { - // Tombstones ending before or at lower_bound only need to be included - // in the prev table. Continue because subsequent tombstones may still - // overlap [lower_bound, upper_bound). - continue; - } +bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { + auto it = reps_.lower_bound(parsed.sequence); + if (it == reps_.end()) { + return false; + } + return it->second.ShouldDelete(parsed, mode); +} - auto ikey_and_end_key = tombstone.Serialize(); - builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); - if (!first_added) { - first_added = true; - InternalKey smallest_candidate = std::move(ikey_and_end_key.first); - if (lower_bound != nullptr && - icmp_.user_comparator()->Compare(smallest_candidate.user_key(), - *lower_bound) <= 0) { - // Pretend the smallest key has the same user key as lower_bound - // (the max key in the previous table or subcompaction) in order for - // files to appear key-space partitioned. - // - // Choose lowest seqnum so this file's smallest internal key comes - // after the previous file's/subcompaction's largest. The fake seqnum - // is OK because the read path's file-picking code only considers user - // key. 
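Because reps_ is keyed by each stripe's inclusive upper bound, the lookup in CompactionRangeDelAggregator::ShouldDelete above is a single std::map::lower_bound: the first entry whose key is >= parsed.sequence is exactly the stripe containing that sequence number. A standalone sketch of the same lookup (SeqNo, Stripe, and FindStripe are illustrative names):

#include <cstdint>
#include <map>

using SeqNo = uint64_t;
struct Stripe {
  SeqNo lower, upper;  // inclusive band
};

// reps is keyed by each stripe's inclusive upper bound.
const Stripe* FindStripe(const std::map<SeqNo, Stripe>& reps, SeqNo s) {
  auto it = reps.lower_bound(s);         // first upper bound >= s
  if (it == reps.end()) return nullptr;  // s lies above every stripe
  return &it->second;
}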
- smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); - } - if (meta->smallest.size() == 0 || - icmp_.Compare(smallest_candidate, meta->smallest) < 0) { - meta->smallest = std::move(smallest_candidate); - } +namespace { + +class TruncatedRangeDelMergingIter : public InternalIterator { + public: + TruncatedRangeDelMergingIter( + const InternalKeyComparator* icmp, const Slice* lower_bound, + const Slice* upper_bound, bool upper_bound_inclusive, + const std::vector>& children) + : icmp_(icmp), + lower_bound_(lower_bound), + upper_bound_(upper_bound), + upper_bound_inclusive_(upper_bound_inclusive), + heap_(StartKeyMinComparator(icmp)) { + for (auto& child : children) { + if (child != nullptr) { + assert(child->lower_bound() == 0); + assert(child->upper_bound() == kMaxSequenceNumber); + children_.push_back(child.get()); } - InternalKey largest_candidate = tombstone.SerializeEndKey(); - if (upper_bound != nullptr && - icmp_.user_comparator()->Compare(*upper_bound, - largest_candidate.user_key()) <= 0) { - // Pretend the largest key has the same user key as upper_bound (the - // min key in the following table or subcompaction) in order for files - // to appear key-space partitioned. - // - // Choose highest seqnum so this file's largest internal key comes - // before the next file's/subcompaction's smallest. The fake seqnum is - // OK because the read path's file-picking code only considers the user - // key portion. - // - // Note Seek() also creates InternalKey with (user_key, - // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of - // kTypeRangeDeletion (0xF), so the range tombstone comes before the - // Seek() key in InternalKey's ordering. So Seek() will look in the - // next file for the user key. - largest_candidate = InternalKey(*upper_bound, kMaxSequenceNumber, - kTypeRangeDeletion); + } + } + + bool Valid() const override { + return !heap_.empty() && BeforeEndKey(heap_.top()); + } + Status status() const override { return Status::OK(); } + + void SeekToFirst() override { + heap_.clear(); + for (auto& child : children_) { + if (lower_bound_ != nullptr) { + child->Seek(*lower_bound_); + } else { + child->SeekToFirst(); } - if (meta->largest.size() == 0 || - icmp_.Compare(meta->largest, largest_candidate) < 0) { - meta->largest = std::move(largest_candidate); + if (child->Valid()) { + heap_.push(child); } - meta->smallest_seqno = std::min(meta->smallest_seqno, tombstone.seq_); - meta->largest_seqno = std::max(meta->largest_seqno, tombstone.seq_); } - ++stripe_map_iter; } -} -bool RangeDelAggregator::IsEmpty() { - if (rep_ == nullptr) { - return true; + void Next() override { + auto* top = heap_.top(); + top->InternalNext(); + if (top->Valid()) { + heap_.replace_top(top); + } else { + heap_.pop(); + } } - for (auto stripe_map_iter = rep_->stripe_map_.begin(); - stripe_map_iter != rep_->stripe_map_.end(); ++stripe_map_iter) { - if (!stripe_map_iter->second.raw_map.empty()) { - return false; + + Slice key() const override { + auto* top = heap_.top(); + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion); + return cur_start_key_.Encode(); + } + + Slice value() const override { + auto* top = heap_.top(); + assert(top->end_key().sequence == kMaxSequenceNumber); + return top->end_key().user_key; + } + + // Unused InternalIterator methods + void Prev() override { assert(false); } + void Seek(const Slice& /* target */) override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() 
override { assert(false); }
+
+ private:
+  bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+    if (upper_bound_ == nullptr) {
+      return true;
     }
+    int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key,
+                                                *upper_bound_);
+    return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
   }
-  return true;
+
+  const InternalKeyComparator* icmp_;
+  const Slice* lower_bound_;
+  const Slice* upper_bound_;
+  bool upper_bound_inclusive_;
+  BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+  std::vector<TruncatedRangeDelIterator*> children_;
+
+  mutable InternalKey cur_start_key_;
+};
+
+}  // namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+                                          const Slice* upper_bound,
+                                          bool upper_bound_inclusive) {
+  InvalidateRangeDelMapPositions();
+  std::unique_ptr<TruncatedRangeDelMergingIter> merging_iter(
+      new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound,
+                                       upper_bound_inclusive, parent_iters_));
+
+  auto fragmented_tombstone_list =
+      std::make_shared<FragmentedRangeTombstoneList>(
+          std::move(merging_iter), *icmp_, true /* for_compaction */,
+          *snapshots_);
+
+  return std::unique_ptr<FragmentedRangeTombstoneIterator>(
+      new FragmentedRangeTombstoneIterator(
+          fragmented_tombstone_list, *icmp_,
+          kMaxSequenceNumber /* upper_bound */));
+}

 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/range_del_aggregator.h b/thirdparty/rocksdb/db/range_del_aggregator.h
index 9d4b8ca168..712ae45839 100644
--- a/thirdparty/rocksdb/db/range_del_aggregator.h
+++ b/thirdparty/rocksdb/db/range_del_aggregator.h
@@ -1,161 +1,431 @@
-// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
 // This source code is licensed under both the GPLv2 (found in the
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).

 #pragma once

+#include <algorithm>
+#include <iterator>
+#include <list>
 #include <map>
+#include <set>
 #include <string>
 #include <vector>

 #include "db/compaction_iteration_stats.h"
 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
 #include "db/version_edit.h"
 #include "include/rocksdb/comparator.h"
 #include "include/rocksdb/types.h"
 #include "table/internal_iterator.h"
 #include "table/scoped_arena_iterator.h"
 #include "table/table_builder.h"
+#include "util/heap.h"
 #include "util/kv_map.h"

 namespace rocksdb {

-// A RangeDelAggregator aggregates range deletion tombstones as they are
-// encountered in memtables/SST files. It provides methods that check whether a
-// key is covered by range tombstones or write the relevant tombstones to a new
-// SST file.
-class RangeDelAggregator {
+class TruncatedRangeDelIterator {
+ public:
+  TruncatedRangeDelIterator(
+      std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+      const InternalKeyComparator* icmp, const InternalKey* smallest,
+      const InternalKey* largest);
+
+  bool Valid() const;
+
+  void Next();
+  void Prev();
+
+  void InternalNext();
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+  // target (a user key). If no such tombstone exists, the position will be at
+  // the earliest tombstone that ends after target.
+  void Seek(const Slice& target);
+
+  // Seeks to the tombstone with the highest visible sequence number that covers
+  // target (a user key). If no such tombstone exists, the position will be at
+  // the latest tombstone that starts before target.
+  void SeekForPrev(const Slice& target);
+
+  void SeekToFirst();
+  void SeekToLast();
+
+  ParsedInternalKey start_key() const {
+    return (smallest_ == nullptr ||
+            icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+               ? iter_->parsed_start_key()
+               : *smallest_;
+  }
+
+  ParsedInternalKey end_key() const {
+    return (largest_ == nullptr ||
+            icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+               ? iter_->parsed_end_key()
+               : *largest_;
+  }
+
+  SequenceNumber seq() const { return iter_->seq(); }
+
+  std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+  SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+  std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+  const InternalKeyComparator* icmp_;
+  const ParsedInternalKey* smallest_ = nullptr;
+  const ParsedInternalKey* largest_ = nullptr;
+  std::list<ParsedInternalKey> pinned_bounds_;
+
+  const InternalKey* smallest_ikey_;
+  const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return a->seq() > b->seq();
+  }
+};
+
+struct StartKeyMinComparator {
+  explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+  bool operator()(const TruncatedRangeDelIterator* a,
+                  const TruncatedRangeDelIterator* b) const {
+    return icmp->Compare(a->start_key(), b->start_key()) > 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
+
+class ForwardRangeDelIterator {
  public:
-  // @param snapshots These are used to organize the tombstones into snapshot
-  //    stripes, which is the seqnum range between consecutive snapshots,
-  //    including the higher snapshot and excluding the lower one. Currently,
-  //    this is used by ShouldDelete() to prevent deletion of keys that are
-  //    covered by range tombstones in other snapshot stripes. This constructor
-  //    is used for writes (flush/compaction). All DB snapshots are provided
-  //    such that no keys are removed that are uncovered according to any DB
-  //    snapshot.
-  // Note this overload does not lazily initialize Rep.
-  RangeDelAggregator(const InternalKeyComparator& icmp,
-                     const std::vector<SequenceNumber>& snapshots,
-                     bool collapse_deletions = true);
-
-  // @param upper_bound Similar to snapshots above, except with a single
-  //    snapshot, which allows us to store the snapshot on the stack and defer
-  //    initialization of heap-allocating members (in Rep) until the first range
-  //    deletion is encountered. This constructor is used in case of reads (get/
-  //    iterator), for which only the user snapshot (upper_bound) is provided
-  //    such that the seqnum space is divided into two stripes. Only the older
-  //    stripe will be used by ShouldDelete().
-  RangeDelAggregator(const InternalKeyComparator& icmp,
-                     SequenceNumber upper_bound,
-                     bool collapse_deletions = false);
-
-  // We maintain position in the tombstone map across calls to ShouldDelete. The
-  // caller may wish to specify a mode to optimize positioning the iterator
-  // during the next call to ShouldDelete. The non-kFullScan modes are only
-  // available when deletion collapsing is enabled.
-  //
-  // For example, if we invoke Next() on an iterator, kForwardTraversal should
-  // be specified to advance one-by-one through deletions until one is found
-  // with its interval containing the key. This will typically be faster than
-  // doing a full binary search (kBinarySearch).
- enum RangePositioningMode { - kFullScan, // used iff collapse_deletions_ == false - kForwardTraversal, - kBackwardTraversal, - kBinarySearch, + explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp); + + bool ShouldDelete(const ParsedInternalKey& parsed); + void Invalidate(); + + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->Seek(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + + private: + using ActiveSeqSet = + std::multiset; + + struct EndKeyMinComparator { + explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const ActiveSeqSet::const_iterator& a, + const ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0; + } + + const InternalKeyComparator* icmp; }; - // Returns whether the key should be deleted, which is the case when it is - // covered by a range tombstone residing in the same snapshot stripe. - // @param mode If collapse_deletions_ is true, this dictates how we will find - // the deletion whose interval contains this key. Otherwise, its - // value must be kFullScan indicating linear scan from beginning.. - bool ShouldDelete(const ParsedInternalKey& parsed, - RangePositioningMode mode = kFullScan); - bool ShouldDelete(const Slice& internal_key, - RangePositioningMode mode = kFullScan); - bool ShouldAddTombstones(bool bottommost_level = false); - - // Adds tombstones to the tombstone aggregation structure maintained by this - // object. - // @return non-OK status if any of the tombstone keys are corrupted. - Status AddTombstones(std::unique_ptr input); - - // Resets iterators maintained across calls to ShouldDelete(). This may be - // called when the tombstones change, or the owner may call explicitly, e.g., - // if it's an iterator that just seeked to an arbitrary position. The effect - // of invalidation is that the following call to ShouldDelete() will binary - // search for its tombstone. - void InvalidateTombstoneMapPositions(); - - // Writes tombstones covering a range to a table builder. - // @param extend_before_min_key If true, the range of tombstones to be added - // to the TableBuilder starts from the beginning of the key-range; - // otherwise, it starts from meta->smallest. - // @param lower_bound/upper_bound Any range deletion with [start_key, end_key) - // that overlaps the target range [*lower_bound, *upper_bound) is added to - // the builder. If lower_bound is nullptr, the target range extends - // infinitely to the left. If upper_bound is nullptr, the target range - // extends infinitely to the right. If both are nullptr, the target range - // extends infinitely in both directions, i.e., all range deletions are - // added to the builder. - // @param meta The file's metadata. We modify the begin and end keys according - // to the range tombstones added to this file such that the read path does - // not miss range tombstones that cover gaps before/after/between files in - // a level. lower_bound/upper_bound above constrain how far file boundaries - // can be extended. - // @param bottommost_level If true, we will filter out any tombstones - // belonging to the oldest snapshot stripe, because all keys potentially - // covered by this tombstone are guaranteed to have been deleted by - // compaction. 
- void AddToBuilder(TableBuilder* builder, const Slice* lower_bound, - const Slice* upper_bound, FileMetaData* meta, - CompactionIterationStats* range_del_out_stats = nullptr, - bool bottommost_level = false); - bool IsEmpty(); + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. + return; + } + int cmp = icmp_->Compare(parsed, iter->start_key()); + if (cmp < 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } + + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; +}; + +class ReverseRangeDelIterator { + public: + explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp); + + bool ShouldDelete(const ParsedInternalKey& parsed); + void Invalidate(); + + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->SeekForPrev(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } private: - // Maps tombstone user start key -> tombstone object - typedef std::multimap - TombstoneMap; - // Also maintains position in TombstoneMap last seen by ShouldDelete(). The - // end iterator indicates invalidation (e.g., if AddTombstones() changes the - // underlying map). End iterator cannot be invalidated. - struct PositionalTombstoneMap { - explicit PositionalTombstoneMap(TombstoneMap _raw_map) - : raw_map(std::move(_raw_map)), iter(raw_map.end()) {} - PositionalTombstoneMap(const PositionalTombstoneMap&) = delete; - PositionalTombstoneMap(PositionalTombstoneMap&& other) - : raw_map(std::move(other.raw_map)), iter(raw_map.end()) {} - - TombstoneMap raw_map; - TombstoneMap::const_iterator iter; + using ActiveSeqSet = + std::multiset; + + struct EndKeyMaxComparator { + explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return icmp->Compare(a->end_key(), b->end_key()) < 0; + } + + const InternalKeyComparator* icmp; }; + struct StartKeyMaxComparator { + explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} - // Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e., - // their seqnums are greater than the next smaller snapshot's seqnum. - typedef std::map StripeMap; + bool operator()(const ActiveSeqSet::const_iterator& a, + const ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0; + } - struct Rep { - StripeMap stripe_map_; - PinnedIteratorsManager pinned_iters_mgr_; + const InternalKeyComparator* icmp; }; - // Initializes rep_ lazily. 
This aggregator object is constructed for every - // read, so expensive members should only be created when necessary, i.e., - // once the first range deletion is encountered. - void InitRep(const std::vector& snapshots); - - PositionalTombstoneMap& GetPositionalTombstoneMap(SequenceNumber seq); - Status AddTombstone(RangeTombstone tombstone); - - SequenceNumber upper_bound_; - std::unique_ptr rep_; - const InternalKeyComparator& icmp_; - // collapse range deletions so they're binary searchable - const bool collapse_deletions_; + + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. + } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } + + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; +}; + +enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal }; +class RangeDelAggregator { + public: + explicit RangeDelAggregator(const InternalKeyComparator* icmp) + : icmp_(icmp) {} + virtual ~RangeDelAggregator() {} + + virtual void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) = 0; + + bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { + ParsedInternalKey parsed; + if (!ParseInternalKey(key, &parsed)) { + return false; + } + return ShouldDelete(parsed, mode); + } + virtual bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) = 0; + + virtual void InvalidateRangeDelMapPositions() = 0; + + virtual bool IsEmpty() const = 0; + + bool AddFile(uint64_t file_number) { + return files_seen_.insert(file_number).second; + } + + protected: + class StripeRep { + public: + StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound) + : icmp_(icmp), + forward_iter_(icmp), + reverse_iter_(icmp), + upper_bound_(upper_bound), + lower_bound_(lower_bound) {} + + void AddTombstones(std::unique_ptr input_iter) { + iters_.push_back(std::move(input_iter)); + } + + bool IsEmpty() const { return iters_.empty(); } + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode); + + void Invalidate() { + InvalidateForwardIter(); + InvalidateReverseIter(); + } + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + private: + bool InStripe(SequenceNumber seq) const { + return lower_bound_ <= seq && seq <= upper_bound_; + } + + void InvalidateForwardIter() { forward_iter_.Invalidate(); } + + void InvalidateReverseIter() { reverse_iter_.Invalidate(); } + + const InternalKeyComparator* icmp_; + std::vector> iters_; + ForwardRangeDelIterator forward_iter_; + ReverseRangeDelIterator reverse_iter_; + SequenceNumber upper_bound_; + 
SequenceNumber lower_bound_; + }; + + const InternalKeyComparator* icmp_; + + private: + std::set files_seen_; +}; + +class ReadRangeDelAggregator : public RangeDelAggregator { + public: + ReadRangeDelAggregator(const InternalKeyComparator* icmp, + SequenceNumber upper_bound) + : RangeDelAggregator(icmp), + rep_(icmp, upper_bound, 0 /* lower_bound */) {} + ~ReadRangeDelAggregator() override {} + + using RangeDelAggregator::ShouldDelete; + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); } + + bool IsEmpty() const override { return rep_.IsEmpty(); } + + private: + StripeRep rep_; +}; + +class CompactionRangeDelAggregator : public RangeDelAggregator { + public: + CompactionRangeDelAggregator(const InternalKeyComparator* icmp, + const std::vector& snapshots) + : RangeDelAggregator(icmp), snapshots_(&snapshots) {} + ~CompactionRangeDelAggregator() override {} + + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + using RangeDelAggregator::ShouldDelete; + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { + for (auto& rep : reps_) { + rep.second.Invalidate(); + } + } + + bool IsEmpty() const override { + for (const auto& rep : reps_) { + if (!rep.second.IsEmpty()) { + return false; + } + } + return true; + } + + // Creates an iterator over all the range tombstones in the aggregator, for + // use in compaction. Nullptr arguments indicate that the iterator range is + // unbounded. + // NOTE: the boundaries are used for optimization purposes to reduce the + // number of tombstones that are passed to the fragmenter; they do not + // guarantee that the resulting iterator only contains range tombstones that + // cover keys in the provided range. If required, these bounds must be + // enforced during iteration. + std::unique_ptr NewIterator( + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, + bool upper_bound_inclusive = false); + + private: + std::vector> parent_iters_; + std::map reps_; + + const std::vector* snapshots_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/range_del_aggregator_bench.cc b/thirdparty/rocksdb/db/range_del_aggregator_bench.cc new file mode 100644 index 0000000000..34b2f7e5db --- /dev/null +++ b/thirdparty/rocksdb/db/range_del_aggregator_bench.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
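The NOTE on NewIterator above says the lower_bound/upper_bound arguments only prune what reaches the fragmenter; a consumer that needs strict containment must clamp each tombstone during iteration. A sketch of that enforcement under simplified stand-in types (Span and ClampToRange are illustrative, not part of the patch):

#include <string>

struct Span {
  std::string start, end;  // [start, end)
};

// Returns false if the tombstone lies entirely outside [lo, hi); otherwise
// truncates it in place to the output range.
bool ClampToRange(Span* t, const std::string& lo, const std::string& hi) {
  if (t->end <= lo || t->start >= hi) return false;
  if (t->start < lo) t->start = lo;
  if (t->end > hi) t->end = hi;
  return true;
}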
+ +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include +#include +#include +#include +#include + +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/random.h" +#include "util/stop_watch.h" +#include "util/testutil.h" + +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created"); + +DEFINE_int32(num_runs, 1000, "number of test runs"); + +DEFINE_int32(tombstone_start_upper_bound, 1000, + "exclusive upper bound on range tombstone start keys"); + +DEFINE_int32(should_delete_upper_bound, 1000, + "exclusive upper bound on keys passed to ShouldDelete"); + +DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width"); + +DEFINE_double(tombstone_width_stddev, 0.0, + "standard deviation of range tombstone width"); + +DEFINE_int32(seed, 0, "random number generator seed"); + +DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); + +DEFINE_int32(add_tombstones_per_run, 1, + "number of AddTombstones calls per run"); + +namespace { + +struct Stats { + uint64_t time_add_tombstones = 0; + uint64_t time_first_should_delete = 0; + uint64_t time_rest_should_delete = 0; +}; + +std::ostream& operator<<(std::ostream& os, const Stats& s) { + std::ios fmt_holder(nullptr); + fmt_holder.copyfmt(os); + + os << std::left; + os << std::setw(25) << "AddTombstones: " + << s.time_add_tombstones / + (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3) + << " us\n"; + os << std::setw(25) << "ShouldDelete (first): " + << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n"; + if (FLAGS_should_deletes_per_run > 1) { + os << std::setw(25) << "ShouldDelete (rest): " + << s.time_rest_should_delete / + ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3) + << " us\n"; + } + + os.copyfmt(fmt_holder); + return os; +} + +auto icmp = rocksdb::InternalKeyComparator(rocksdb::BytewiseComparator()); + +} // anonymous namespace + +namespace rocksdb { + +namespace { + +// A wrapper around RangeTombstones and the underlying data of its start and end +// keys. 
+struct PersistentRangeTombstone { + std::string start_key; + std::string end_key; + RangeTombstone tombstone; + + PersistentRangeTombstone(std::string start, std::string end, + SequenceNumber seq) + : start_key(std::move(start)), end_key(std::move(end)) { + tombstone = RangeTombstone(start_key, end_key, seq); + } + + PersistentRangeTombstone() = default; + + PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; } + + PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) { + start_key = t.start_key; + end_key = t.end_key; + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } + + PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = t; } + + PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) { + start_key = std::move(t.start_key); + end_key = std::move(t.end_key); + tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_); + + return *this; + } +}; + +struct TombstoneStartKeyComparator { + explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {} + + bool operator()(const RangeTombstone& a, const RangeTombstone& b) const { + return cmp->Compare(a.start_key_, b.start_key_) < 0; + } + + const Comparator* cmp; +}; + +std::unique_ptr MakeRangeDelIterator( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.tombstone.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new test::VectorIterator(keys, values)); +} + +// convert long to a big-endian slice key +static std::string Key(int64_t val) { + std::string little_endian_key; + std::string big_endian_key; + PutFixed64(&little_endian_key, val); + assert(little_endian_key.size() == sizeof(val)); + big_endian_key.resize(sizeof(val)); + for (size_t i = 0; i < sizeof(val); ++i) { + big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i]; + } + return big_endian_key; +} + +} // anonymous namespace + +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + Stats stats; + rocksdb::Random64 rnd(FLAGS_seed); + std::default_random_engine random_gen(FLAGS_seed); + std::normal_distribution normal_dist(FLAGS_tombstone_width_mean, + FLAGS_tombstone_width_stddev); + std::vector > + all_persistent_range_tombstones(FLAGS_add_tombstones_per_run); + for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) { + all_persistent_range_tombstones[i] = + std::vector( + FLAGS_num_range_tombstones); + } + auto mode = rocksdb::RangeDelPositioningMode::kForwardTraversal; + + for (int i = 0; i < FLAGS_num_runs; i++) { + rocksdb::ReadRangeDelAggregator range_del_agg( + &icmp, rocksdb::kMaxSequenceNumber /* upper_bound */); + + std::vector > + fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run); + + for (auto& persistent_range_tombstones : all_persistent_range_tombstones) { + // TODO(abhimadan): consider whether creating the range tombstones right + // before AddTombstones is artificially warming the cache compared to + // real workloads. 
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) { + uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound); + uint64_t end = static_cast( + std::round(start + std::max(1.0, normal_dist(random_gen)))); + persistent_range_tombstones[j] = rocksdb::PersistentRangeTombstone( + rocksdb::Key(start), rocksdb::Key(end), j); + } + + auto range_del_iter = + rocksdb::MakeRangeDelIterator(persistent_range_tombstones); + fragmented_range_tombstone_lists.emplace_back( + new rocksdb::FragmentedRangeTombstoneList( + rocksdb::MakeRangeDelIterator(persistent_range_tombstones), + icmp)); + std::unique_ptr + fragmented_range_del_iter( + new rocksdb::FragmentedRangeTombstoneIterator( + fragmented_range_tombstone_lists.back().get(), icmp, + rocksdb::kMaxSequenceNumber)); + + rocksdb::StopWatchNano stop_watch_add_tombstones(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); + stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); + } + + rocksdb::ParsedInternalKey parsed_key; + parsed_key.sequence = FLAGS_num_range_tombstones / 2; + parsed_key.type = rocksdb::kTypeValue; + + uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound - + FLAGS_should_deletes_per_run + 1); + + for (int j = 0; j < FLAGS_should_deletes_per_run; j++) { + std::string key_string = rocksdb::Key(first_key + j); + parsed_key.user_key = key_string; + + rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.ShouldDelete(parsed_key, mode); + uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); + + if (j == 0) { + stats.time_first_should_delete += call_time; + } else { + stats.time_rest_should_delete += call_time; + } + } + } + + std::cout << "=========================\n" + << "Results:\n" + << "=========================\n" + << stats; + + return 0; +} + +#endif // GFLAGS diff --git a/thirdparty/rocksdb/db/range_del_aggregator_test.cc b/thirdparty/rocksdb/db/range_del_aggregator_test.cc index 39029bd2a2..28c8129ecb 100644 --- a/thirdparty/rocksdb/db/range_del_aggregator_test.cc +++ b/thirdparty/rocksdb/db/range_del_aggregator_test.cc @@ -1,13 +1,17 @@ -// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
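The benchmark's Key() helper above byte-swaps PutFixed64's little-endian output because BytewiseComparator orders keys lexicographically, and only a big-endian layout makes byte order agree with numeric order. An equivalent self-contained version (BigEndianKey is an illustrative name):

#include <cstddef>
#include <cstdint>
#include <string>

std::string BigEndianKey(uint64_t val) {
  std::string key(sizeof(val), '\0');
  for (std::size_t i = 0; i < sizeof(val); ++i) {
    // Most significant byte first, so lexicographic order == numeric order.
    key[i] = static_cast<char>((val >> (8 * (sizeof(val) - 1 - i))) & 0xff);
  }
  return key;
}

For example, BigEndianKey(1) < BigEndianKey(255) < BigEndianKey(256) holds under bytewise comparison, which a little-endian encoding would violate.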
-#include +#include "db/range_del_aggregator.h" + +#include +#include +#include #include "db/db_test_util.h" -#include "db/range_del_aggregator.h" -#include "rocksdb/comparator.h" +#include "db/dbformat.h" +#include "db/range_tombstone_fragmenter.h" #include "util/testutil.h" namespace rocksdb { @@ -16,137 +20,685 @@ class RangeDelAggregatorTest : public testing::Test {}; namespace { -struct ExpectedPoint { - Slice begin; +static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); + +std::unique_ptr MakeRangeDelIter( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new test::VectorIterator(keys, values)); +} + +std::vector> +MakeFragmentedTombstoneLists( + const std::vector>& range_dels_list) { + std::vector> fragment_lists; + for (const auto& range_dels : range_dels_list) { + auto range_del_iter = MakeRangeDelIter(range_dels); + fragment_lists.emplace_back(new FragmentedRangeTombstoneList( + std::move(range_del_iter), bytewise_icmp)); + } + return fragment_lists; +} + +struct TruncatedIterScanTestCase { + ParsedInternalKey start; + ParsedInternalKey end; SequenceNumber seq; }; -enum Direction { - kForward, - kReverse, +struct TruncatedIterSeekTestCase { + Slice target; + ParsedInternalKey start; + ParsedInternalKey end; + SequenceNumber seq; + bool invalid; }; -void VerifyRangeDels(const std::vector& range_dels, - const std::vector& expected_points) { - // Test same result regardless of which order the range deletions are added. - for (Direction dir : {kForward, kReverse}) { - auto icmp = InternalKeyComparator(BytewiseComparator()); - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, true); - std::vector keys, values; - for (const auto& range_del : range_dels) { - auto key_and_value = range_del.Serialize(); - keys.push_back(key_and_value.first.Encode().ToString()); - values.push_back(key_and_value.second.ToString()); - } - if (dir == kReverse) { - std::reverse(keys.begin(), keys.end()); - std::reverse(values.begin(), values.end()); +struct ShouldDeleteTestCase { + ParsedInternalKey lookup_key; + bool result; +}; + +struct IsRangeOverlappedTestCase { + Slice start; + Slice end; + bool result; +}; + +ParsedInternalKey UncutEndpoint(const Slice& s) { + return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion); +} + +ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) { + return ParsedInternalKey(key, seq, kTypeValue); +} + +void VerifyIterator( + TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, + const std::vector& expected_range_dels) { + // Test forward iteration. + iter->SeekToFirst(); + for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end)); + EXPECT_EQ(expected_range_dels[i].seq, iter->seq()); + } + EXPECT_FALSE(iter->Valid()); + + // Test reverse iteration. 
+ iter->SeekToLast(); + std::vector reverse_expected_range_dels( + expected_range_dels.rbegin(), expected_range_dels.rend()); + for (size_t i = 0; i < reverse_expected_range_dels.size(); + i++, iter->Prev()) { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), + reverse_expected_range_dels[i].start)); + EXPECT_EQ( + 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end)); + EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq()); + } + EXPECT_FALSE(iter->Valid()); +} + +void VerifySeek(TruncatedRangeDelIterator* iter, + const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->Seek(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); } - std::unique_ptr range_del_iter( - new test::VectorIterator(keys, values)); - range_del_agg.AddTombstones(std::move(range_del_iter)); - - for (const auto expected_point : expected_points) { - ParsedInternalKey parsed_key; - parsed_key.user_key = expected_point.begin; - parsed_key.sequence = expected_point.seq; - parsed_key.type = kTypeValue; - ASSERT_FALSE(range_del_agg.ShouldDelete( - parsed_key, - RangeDelAggregator::RangePositioningMode::kForwardTraversal)); - if (parsed_key.sequence > 0) { - --parsed_key.sequence; - ASSERT_TRUE(range_del_agg.ShouldDelete( - parsed_key, - RangeDelAggregator::RangePositioningMode::kForwardTraversal)); - } + } +} + +void VerifySeekForPrev( + TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->SeekForPrev(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); } } } -} // anonymous namespace +void VerifyShouldDelete(RangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal)); + } + for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) { + const auto& test_case = *it; + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal)); + } +} + +void VerifyIsRangeOverlapped( + ReadRangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ(test_case.result, + range_del_agg->IsRangeOverlapped(test_case.start, test_case.end)); + } +} -TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}); } +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); -TEST_F(RangeDelAggregatorTest, SameStartAndEnd) { - VerifyRangeDels({{"a", "a", 5}}, {{" ", 0}, {"a", 0}, {"b", 0}}); + // Test FragmentedRangeTombstoneIterator interface. 
+ EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); } -TEST_F(RangeDelAggregatorTest, Single) { - VerifyRangeDels({{"a", "b", 10}}, {{" ", 0}, {"a", 10}, {"b", 0}}); +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); } -TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) { - VerifyRangeDels({{"a", "c", 10}, {"b", "d", 5}}, - {{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}}); +} // namespace + +TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) { + auto range_del_iter = MakeRangeDelIter({}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + iter.SeekToFirst(); + ASSERT_FALSE(iter.Valid()); + + iter.SeekToLast(); + ASSERT_FALSE(iter.Valid()); } -TEST_F(RangeDelAggregatorTest, OverlapAboveRight) { - VerifyRangeDels({{"a", "c", 5}, {"b", "d", 10}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}); +TEST_F(RangeDelAggregatorTest, UntruncatedIter) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); } -TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) { - VerifyRangeDels({{"a", "d", 5}, {"b", "c", 10}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}}); +TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + 9 /* snapshot */)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", 
UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); } -TEST_F(RangeDelAggregatorTest, OverlapFully) { - VerifyRangeDels({{"a", "d", 10}, {"b", "c", 5}}, - {{" ", 0}, {"a", 10}, {"d", 0}}); +TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("d", 7, kTypeValue); + InternalKey largest("m", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("d", 7), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), InternalValue("m", 8), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", InternalValue("d", 7), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), InternalValue("m", 8), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); } -TEST_F(RangeDelAggregatorTest, OverlapPoint) { - VerifyRangeDels({{"a", "b", 5}, {"b", "c", 10}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}}); +TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("f", 7, kTypeValue); + InternalKey largest("i", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); } -TEST_F(RangeDelAggregatorTest, SameStartKey) { - VerifyRangeDels({{"a", "c", 5}, {"a", "b", 10}}, - {{" ", 0}, {"a", 10}, 
{"b", 5}, {"c", 0}}); +TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + range_del_agg.AddTombstones(std::move(input_iter)); + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", false}}); } -TEST_F(RangeDelAggregatorTest, SameEndKey) { - VerifyRangeDels({{"a", "d", 5}, {"b", "d", 10}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}); +TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); } -TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { - VerifyRangeDels( - {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}, - {{" ", 0}, {"a", 5}, {"b", 0}, {"c", 10}, {"d", 0}, {"e", 15}, {"f", 0}}); +TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("a", 9), true}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), false}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); } -// Note the Cover* tests also test cases where tombstones are inserted under a -// larger one when VerifyRangeDels() runs them in reverse -TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { - 
 
-// Note the Cover* tests also test cases where tombstones are inserted under a
-// larger one when VerifyRangeDels() runs them in reverse
-TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) {
-  VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}},
-      {{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}});
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+  for (size_t i = 0; i < fragment_lists.size(); i++) {
+    const auto& fragment_list = fragment_lists[i];
+    const auto& bounds = iter_bounds[i];
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(),
+                                             bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+                                &bounds.second);
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true},
+                                      {InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true},
+                                      {InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
 }
 
-TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) {
-  VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}},
-      {{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}});
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+  std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+      {InternalKey("a", 4, kTypeValue),
+       InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("m", 20, kTypeValue),
+       InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+      {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+  ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+  auto add_iter_to_agg = [&](size_t i) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+                                             bytewise_icmp,
+                                             19 /* snapshot */));
+    range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+                                &iter_bounds[i].second);
+  };
+
+  add_iter_to_agg(0);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+                                      {InternalValue("a", 9), false},
+                                      {InternalValue("a", 4), true}});
+
+  add_iter_to_agg(1);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+                                      {InternalValue("m", 9), true}});
+
+  add_iter_to_agg(2);
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+                                      {InternalValue("x", 9), false},
+                                      {InternalValue("x", 5), true},
+                                      {InternalValue("z", 9), false}});
+
+  VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+                                           {"_", "a", true},
+                                           {"a", "n", true},
+                                           {"l", "x", true},
+                                           {"w", "z", true},
+                                           {"zzz", "zz", false},
+                                           {"zz", "zzz", false}});
 }
 
-TEST_F(RangeDelAggregatorTest, CoverMultipleFully) {
-  VerifyRangeDels(
-      {{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}},
-      {{" ", 0}, {"a", 20}, {"h", 0}});
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots;
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(),
+                                             bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+                                      {InternalValue("b", 19), false},
+                                      {InternalValue("b", 9), true},
+                                      {InternalValue("d", 9), true},
+                                      {InternalValue("e", 7), true},
+                                      {InternalValue("g", 7), false},
+                                      {InternalValue("h", 24), true},
+                                      {InternalValue("i", 24), false},
+                                      {InternalValue("ii", 14), true},
+                                      {InternalValue("j", 14), false}});
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
 }
 
-TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) {
-  VerifyRangeDels(
-      {{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}},
-      {{" ", 0},
-       {"a", 5},
-       {"b", 15},
-       {"d", 10},
-       {"e", 20},
-       {"g", 5},
-       {"h", 0}});
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(),
+                                             bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  VerifyShouldDelete(
+      &range_del_agg,
+      {
+          {InternalValue("a", 19), false},  // [10, 19]
+          {InternalValue("a", 9), false},   // [0, 9]
+          {InternalValue("b", 9), false},   // [0, 9]
+          {InternalValue("d", 9), false},   // [0, 9]
+          {InternalValue("d", 7), true},    // [0, 9]
+          {InternalValue("e", 7), true},    // [0, 9]
+          {InternalValue("g", 7), false},   // [0, 9]
+          {InternalValue("h", 24), true},   // [20, kMaxSequenceNumber]
+          {InternalValue("i", 24), false},  // [20, kMaxSequenceNumber]
+          {InternalValue("ii", 14), true},  // [10, 19]
+          {InternalValue("j", 14), false}   // [10, 19]
+      });
+
+  auto range_del_compaction_iter = range_del_agg.NewIterator();
+  VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+                                                              {"a", "b", 10},
+                                                              {"b", "c", 10},
+                                                              {"c", "e", 10},
+                                                              {"c", "e", 8},
+                                                              {"e", "g", 8},
+                                                              {"h", "i", 25},
+                                                              {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+  auto fragment_lists = MakeFragmentedTombstoneLists(
+      {{{"a", "e", 10}, {"c", "g", 8}},
+       {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+  std::vector<SequenceNumber> snapshots{9, 19};
+  CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+  for (const auto& fragment_list : fragment_lists) {
+    std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+        new FragmentedRangeTombstoneIterator(fragment_list.get(),
+                                             bytewise_icmp,
+                                             kMaxSequenceNumber));
+    range_del_agg.AddTombstones(std::move(input_iter));
+  }
+
+  Slice start("_");
+  Slice end("__");
+}
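The inline stripe comments in CompactionAggregatorWithSnapshots are the key to reading those expectations: with snapshots {9, 19}, the sequence space is cut into the stripes [0, 9], [10, 19], and [20, kMaxSequenceNumber], and a point key may only be dropped by a covering tombstone whose seqnum falls in the same stripe. Two concrete cases from the test above:

// With snapshots {9, 19}:
//   InternalValue("b", 9) -> stripe [0, 9]; the only tombstone over "b"
//                            has seqnum 10 (stripe [10, 19]) -> kept.
//   InternalValue("d", 7) -> stripe [0, 9]; covered by {"c", "g", 8},
//                            also in [0, 9] -> dropped.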
"i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("p"); + Slice end("q"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels( + range_del_compaction_iter2.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}}); +} + +TEST_F(RangeDelAggregatorTest, + CompactionAggregatorBoundedIteratorExtraFragments) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "d", 10}, {"c", "g", 8}}, + {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc b/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc new file mode 100644 index 0000000000..f9d9f2feb4 --- /dev/null +++ b/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc @@ -0,0 +1,438 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. 
diff --git a/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc b/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 0000000000..f9d9f2feb4
--- /dev/null
+++ b/thirdparty/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,438 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <functional>
+#include <set>
+
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace rocksdb {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  if (unfragmented_tombstones == nullptr) {
+    return;
+  }
+  bool is_sorted = true;
+  int num_tombstones = 0;
+  InternalKey pinned_last_start_key;
+  Slice last_start_key;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next(), num_tombstones++) {
+    if (num_tombstones > 0 &&
+        icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+      is_sorted = false;
+      break;
+    }
+    if (unfragmented_tombstones->IsKeyPinned()) {
+      last_start_key = unfragmented_tombstones->key();
+    } else {
+      pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+      last_start_key = pinned_last_start_key.Encode();
+    }
+  }
+  if (is_sorted) {
+    FragmentTombstones(std::move(unfragmented_tombstones), icmp,
+                       for_compaction, snapshots);
+    return;
+  }
+
+  // Sort the tombstones before fragmenting them.
+  std::vector<std::string> keys, values;
+  keys.reserve(num_tombstones);
+  values.reserve(num_tombstones);
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    keys.emplace_back(unfragmented_tombstones->key().data(),
+                      unfragmented_tombstones->key().size());
+    values.emplace_back(unfragmented_tombstones->value().data(),
+                        unfragmented_tombstones->value().size());
+  }
+  // VectorIterator implicitly sorts by key during construction.
+  auto iter = std::unique_ptr<VectorIterator>(
+      new VectorIterator(std::move(keys), std::move(values), &icmp));
+  FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+    std::unique_ptr<InternalIterator> unfragmented_tombstones,
+    const InternalKeyComparator& icmp, bool for_compaction,
+    const std::vector<SequenceNumber>& snapshots) {
+  Slice cur_start_key(nullptr, 0);
+  auto cmp = ParsedInternalKeyComparator(&icmp);
+
+  // Stores the end keys and sequence numbers of range tombstones with a start
+  // key less than or equal to cur_start_key. Provides an ordering by end key
+  // for use in flush_current_tombstones.
+  std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+  // Given the next start key in unfragmented_tombstones,
+  // flush_current_tombstones writes every tombstone fragment that starts
+  // and ends with a key before next_start_key, and starts with a key greater
+  // than or equal to cur_start_key.
+  auto flush_current_tombstones = [&](const Slice& next_start_key) {
+    auto it = cur_end_keys.begin();
+    bool reached_next_start_key = false;
+    for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+      Slice cur_end_key = it->user_key;
+      if (icmp.user_comparator()->Compare(cur_start_key, cur_end_key) == 0) {
+        // Empty tombstone.
+        continue;
+      }
+      if (icmp.user_comparator()->Compare(next_start_key, cur_end_key) <= 0) {
+        // All of the end keys in [it, cur_end_keys.end()) are after
+        // next_start_key, so the tombstones they represent can be used in
+        // fragments that start with keys greater than or equal to
+        // next_start_key.
+        // However, the end keys we already passed will not be used in any
+        // more tombstone fragments.
+        //
+        // Remove the fully fragmented tombstones and stop iteration after a
+        // final round of flushing to preserve the tombstones we can create
+        // more fragments from.
+        reached_next_start_key = true;
+        cur_end_keys.erase(cur_end_keys.begin(), it);
+        cur_end_key = next_start_key;
+      }
+
+      // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+      // should not overlap with the last-flushed tombstone fragment.
+      assert(tombstones_.empty() ||
+             icmp.user_comparator()->Compare(tombstones_.back().end_key,
+                                             cur_start_key) <= 0);
+
+      // Sort the sequence numbers of the tombstones being fragmented in
+      // descending order, and then flush them in that order.
+      autovector<SequenceNumber> seqnums_to_flush;
+      for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+        seqnums_to_flush.push_back(flush_it->sequence);
+      }
+      std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+                std::greater<SequenceNumber>());
+
+      size_t start_idx = tombstone_seqs_.size();
+      size_t end_idx = start_idx + seqnums_to_flush.size();
+
+      if (for_compaction) {
+        // Drop all tombstone seqnums that are not preserved by a snapshot.
+        SequenceNumber next_snapshot = kMaxSequenceNumber;
+        for (auto seq : seqnums_to_flush) {
+          if (seq <= next_snapshot) {
+            // This seqnum is visible by a lower snapshot.
+            tombstone_seqs_.push_back(seq);
+            seq_set_.insert(seq);
+            auto upper_bound_it =
+                std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+            if (upper_bound_it == snapshots.begin()) {
+              // This seqnum is the topmost one visible by the earliest
+              // snapshot. None of the seqnums below it will be visible, so we
+              // can skip them.
+              break;
+            }
+            next_snapshot = *std::prev(upper_bound_it);
+          }
+        }
+        end_idx = tombstone_seqs_.size();
+      } else {
+        // The fragmentation is being done for reads, so preserve all seqnums.
+        tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+                               seqnums_to_flush.end());
+        seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end());
+      }
+
+      assert(start_idx < end_idx);
+      tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx);
+
+      cur_start_key = cur_end_key;
+    }
+    if (!reached_next_start_key) {
+      // There is a gap between the last flushed tombstone fragment and
+      // the next tombstone's start key. Remove all the end keys in
+      // the working set, since we have fully fragmented their corresponding
+      // tombstones.
+      cur_end_keys.clear();
+    }
+    cur_start_key = next_start_key;
+  };
+
+  pinned_iters_mgr_.StartPinning();
+
+  bool no_tombstones = true;
+  for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+       unfragmented_tombstones->Next()) {
+    const Slice& ikey = unfragmented_tombstones->key();
+    Slice tombstone_start_key = ExtractUserKey(ikey);
+    SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+    if (!unfragmented_tombstones->IsKeyPinned()) {
+      pinned_slices_.emplace_back(tombstone_start_key.data(),
+                                  tombstone_start_key.size());
+      tombstone_start_key = pinned_slices_.back();
+    }
+    no_tombstones = false;
+
+    Slice tombstone_end_key = unfragmented_tombstones->value();
+    if (!unfragmented_tombstones->IsValuePinned()) {
+      pinned_slices_.emplace_back(tombstone_end_key.data(),
+                                  tombstone_end_key.size());
+      tombstone_end_key = pinned_slices_.back();
+    }
+    if (!cur_end_keys.empty() && icmp.user_comparator()->Compare(
+                                     cur_start_key, tombstone_start_key) != 0) {
+      // The start key has changed. Flush all tombstones that start before
+      // this new start key.
+      flush_current_tombstones(tombstone_start_key);
+    }
+    cur_start_key = tombstone_start_key;
+
+    cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+  }
+  if (!cur_end_keys.empty()) {
+    ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+    flush_current_tombstones(last_end_key.user_key);
+  }
+
+  if (!no_tombstones) {
+    pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+                                  false /* arena */);
+  }
+}
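To make the snapshot-dropping branch above concrete, here is a trace under the assumption of one stack with seqnums {12, 10, 8, 3} (already sorted descending) and snapshots = {9}:

// seq = 12: 12 <= kMaxSequenceNumber -> keep; lower_bound({9}, 12) == end(),
//           so next_snapshot = 9 (a seqnum <= 9 is still needed).
// seq = 10: 10 > 9 -> drop (shadowed by 12 for every snapshot).
// seq = 8:  8 <= 9 -> keep; lower_bound({9}, 8) == begin() -> break.
// seq = 3:  never examined -- 8 already serves the earliest snapshot.
// Result: only {12, 8} are flushed for this fragment.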
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+                                                 SequenceNumber upper) const {
+  auto seq_it = seq_set_.lower_bound(lower);
+  return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const FragmentedRangeTombstoneList* tombstones,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_(tombstones),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+    const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+    const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+    SequenceNumber _lower_bound)
+    : tombstone_start_cmp_(icmp.user_comparator()),
+      tombstone_end_cmp_(icmp.user_comparator()),
+      icmp_(&icmp),
+      ucmp_(icmp.user_comparator()),
+      tombstones_ref_(tombstones),
+      tombstones_(tombstones_ref_.get()),
+      upper_bound_(_upper_bound),
+      lower_bound_(_lower_bound) {
+  assert(tombstones_ != nullptr);
+  Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+  pos_ = tombstones_->begin();
+  seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = tombstones_->begin();
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+  pos_ = std::prev(tombstones_->end());
+  seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::prev(tombstones_->end());
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekToCoveringTombstone(target);
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  SeekForPrevToCoveringTombstone(target);
+  ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+    const Slice& target) {
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_end_cmp_);
+  if (pos_ == tombstones_->end()) {
+    // All tombstones end before target.
+    seq_pos_ = tombstones_->seq_end();
+    return;
+  }
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+    const Slice& target) {
+  if (tombstones_->empty()) {
+    Invalidate();
+    return;
+  }
+  pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+                          tombstone_start_cmp_);
+  if (pos_ == tombstones_->begin()) {
+    // All tombstones start after target.
+    Invalidate();
+    return;
+  }
+  --pos_;
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    ++pos_;
+    if (pos_ == tombstones_->end()) {
+      Invalidate();
+      return;
+    }
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+  }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+  while (pos_ != tombstones_->end() &&
+         (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+          *seq_pos_ < lower_bound_)) {
+    if (pos_ == tombstones_->begin()) {
+      Invalidate();
+      return;
+    }
+    --pos_;
+    seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                                tombstones_->seq_iter(pos_->seq_end_idx),
+                                upper_bound_, std::greater<SequenceNumber>());
+  }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+  ++seq_pos_;
+  if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+    ++pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+  ++pos_;
+  if (pos_ == tombstones_->end()) {
+    return;
+  }
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+  if (seq_pos_ == tombstones_->seq_begin()) {
+    Invalidate();
+    return;
+  }
+  --seq_pos_;
+  if (pos_ == tombstones_->end() ||
+      seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+    --pos_;
+  }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+  if (pos_ == tombstones_->begin()) {
+    Invalidate();
+    return;
+  }
+  --pos_;
+  seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+                              tombstones_->seq_iter(pos_->seq_end_idx),
+                              upper_bound_, std::greater<SequenceNumber>());
+  ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+  return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+    const Slice& user_key) {
+  SeekToCoveringTombstone(user_key);
+  return ValidPos() && ucmp_->Compare(start_key(), user_key) <= 0 ? seq() : 0;
+}
seq() : 0; +} + +std::map> +FragmentedRangeTombstoneIterator::SplitBySnapshot( + const std::vector& snapshots) { + std::map> + splits; + SequenceNumber lower = 0; + SequenceNumber upper; + for (size_t i = 0; i <= snapshots.size(); i++) { + if (i >= snapshots.size()) { + upper = kMaxSequenceNumber; + } else { + upper = snapshots[i]; + } + if (tombstones_->ContainsRange(lower, upper)) { + splits.emplace(upper, std::unique_ptr( + new FragmentedRangeTombstoneIterator( + tombstones_, *icmp_, upper, lower))); + } + lower = upper + 1; + } + return splits; +} + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/range_tombstone_fragmenter.h b/thirdparty/rocksdb/db/range_tombstone_fragmenter.h new file mode 100644 index 0000000000..a0b77b6777 --- /dev/null +++ b/thirdparty/rocksdb/db/range_tombstone_fragmenter.h @@ -0,0 +1,254 @@ +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "rocksdb/status.h" +#include "table/internal_iterator.h" + +namespace rocksdb { + +struct FragmentedRangeTombstoneList { + public: + // A compact representation of a "stack" of range tombstone fragments, which + // start and end at the same user keys but have different sequence numbers. + // The members seq_start_idx and seq_end_idx are intended to be parameters to + // seq_iter(). + struct RangeTombstoneStack { + RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx, + size_t end_idx) + : start_key(start), + end_key(end), + seq_start_idx(start_idx), + seq_end_idx(end_idx) {} + + Slice start_key; + Slice end_key; + size_t seq_start_idx; + size_t seq_end_idx; + }; + FragmentedRangeTombstoneList( + std::unique_ptr unfragmented_tombstones, + const InternalKeyComparator& icmp, bool for_compaction = false, + const std::vector& snapshots = {}); + + std::vector::const_iterator begin() const { + return tombstones_.begin(); + } + + std::vector::const_iterator end() const { + return tombstones_.end(); + } + + std::vector::const_iterator seq_iter(size_t idx) const { + return std::next(tombstone_seqs_.begin(), idx); + } + + std::vector::const_iterator seq_begin() const { + return tombstone_seqs_.begin(); + } + + std::vector::const_iterator seq_end() const { + return tombstone_seqs_.end(); + } + + bool empty() const { return tombstones_.empty(); } + + // Returns true if the stored tombstones contain with one with a sequence + // number in [lower, upper]. + bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; + + private: + // Given an ordered range tombstone iterator unfragmented_tombstones, + // "fragment" the tombstones into non-overlapping pieces, and store them in + // tombstones_ and tombstone_seqs_. + void FragmentTombstones( + std::unique_ptr unfragmented_tombstones, + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots); + + std::vector tombstones_; + std::vector tombstone_seqs_; + std::set seq_set_; + std::list pinned_slices_; + PinnedIteratorsManager pinned_iters_mgr_; +}; + +// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del +// meta block into an iterator over non-overlapping tombstone fragments. 
diff --git a/thirdparty/rocksdb/db/range_tombstone_fragmenter.h b/thirdparty/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 0000000000..a0b77b6777
--- /dev/null
+++ b/thirdparty/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,254 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace rocksdb {
+
+struct FragmentedRangeTombstoneList {
+ public:
+  // A compact representation of a "stack" of range tombstone fragments, which
+  // start and end at the same user keys but have different sequence numbers.
+  // The members seq_start_idx and seq_end_idx are intended to be parameters to
+  // seq_iter().
+  struct RangeTombstoneStack {
+    RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+                        size_t end_idx)
+        : start_key(start),
+          end_key(end),
+          seq_start_idx(start_idx),
+          seq_end_idx(end_idx) {}
+
+    Slice start_key;
+    Slice end_key;
+    size_t seq_start_idx;
+    size_t seq_end_idx;
+  };
+  FragmentedRangeTombstoneList(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction = false,
+      const std::vector<SequenceNumber>& snapshots = {});
+
+  std::vector<RangeTombstoneStack>::const_iterator begin() const {
+    return tombstones_.begin();
+  }
+
+  std::vector<RangeTombstoneStack>::const_iterator end() const {
+    return tombstones_.end();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+    return std::next(tombstone_seqs_.begin(), idx);
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_begin() const {
+    return tombstone_seqs_.begin();
+  }
+
+  std::vector<SequenceNumber>::const_iterator seq_end() const {
+    return tombstone_seqs_.end();
+  }
+
+  bool empty() const { return tombstones_.empty(); }
+
+  // Returns true if the stored tombstones contain one with a sequence
+  // number in [lower, upper].
+  bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const;
+
+ private:
+  // Given an ordered range tombstone iterator unfragmented_tombstones,
+  // "fragment" the tombstones into non-overlapping pieces, and store them in
+  // tombstones_ and tombstone_seqs_.
+  void FragmentTombstones(
+      std::unique_ptr<InternalIterator> unfragmented_tombstones,
+      const InternalKeyComparator& icmp, bool for_compaction,
+      const std::vector<SequenceNumber>& snapshots);
+
+  std::vector<RangeTombstoneStack> tombstones_;
+  std::vector<SequenceNumber> tombstone_seqs_;
+  std::set<SequenceNumber> seq_set_;
+  std::list<std::string> pinned_slices_;
+  PinnedIteratorsManager pinned_iters_mgr_;
+};
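How the two parallel vectors fit together, using the [a, e) @ 10 / [c, g) @ 8 example from earlier:

// tombstones_     = { {a, c, 0, 1}, {c, e, 1, 3}, {e, g, 3, 4} }
// tombstone_seqs_ = { 10,  10, 8,  8 }   // descending within each stack
//
// seq_iter(seq_start_idx) .. seq_iter(seq_end_idx) spans one stack's seqnums,
// e.g. the [c, e) stack covers indices [1, 3) -> {10, 8}.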
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the
+// RangeDelAggregator tombstone collapsing is always O(n log n).
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+  FragmentedRangeTombstoneIterator(
+      const FragmentedRangeTombstoneList* tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      SequenceNumber lower_bound = 0);
+  FragmentedRangeTombstoneIterator(
+      const std::shared_ptr<const FragmentedRangeTombstoneList>& tombstones,
+      const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+      SequenceNumber lower_bound = 0);
+
+  void SeekToFirst() override;
+  void SeekToLast() override;
+
+  void SeekToTopFirst();
+  void SeekToTopLast();
+
+  // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+  // seeking should behave. This is OK because they are not currently used, but
+  // eventually FragmentedRangeTombstoneIterator should no longer implement
+  // InternalIterator.
+  //
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+  // the snapshot that ends after target.
+  void Seek(const Slice& target) override;
+  // Seeks to the range tombstone that covers target at a seqnum in the
+  // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+  // snapshot that starts before target.
+  void SeekForPrev(const Slice& target) override;
+
+  void Next() override;
+  void Prev() override;
+
+  void TopNext();
+  void TopPrev();
+
+  bool Valid() const override;
+  Slice key() const override {
+    MaybePinKey();
+    return current_start_key_.Encode();
+  }
+  Slice value() const override { return pos_->end_key; }
+  bool IsKeyPinned() const override { return false; }
+  bool IsValuePinned() const override { return true; }
+  Status status() const override { return Status::OK(); }
+
+  bool empty() const { return tombstones_->empty(); }
+  void Invalidate() {
+    pos_ = tombstones_->end();
+    seq_pos_ = tombstones_->seq_end();
+  }
+
+  RangeTombstone Tombstone() const {
+    return RangeTombstone(start_key(), end_key(), seq());
+  }
+  Slice start_key() const { return pos_->start_key; }
+  Slice end_key() const { return pos_->end_key; }
+  SequenceNumber seq() const { return *seq_pos_; }
+  ParsedInternalKey parsed_start_key() const {
+    return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+  ParsedInternalKey parsed_end_key() const {
+    return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+                             kTypeRangeDeletion);
+  }
+
+  SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+  // Splits the iterator into n+1 iterators (where n is the number of
+  // snapshots), each providing a view over a "stripe" of sequence numbers. The
+  // iterators are keyed by the upper bound of their ranges (the provided
+  // snapshots + kMaxSequenceNumber).
+  //
+  // NOTE: the iterators in the returned map are no longer valid if their
+  // parent iterator is deleted, since they do not modify the refcount of the
+  // underlying tombstone list. Therefore, this map should be deleted before
+  // the parent iterator.
+  std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+  SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+  SequenceNumber upper_bound() const { return upper_bound_; }
+  SequenceNumber lower_bound() const { return lower_bound_; }
+
+ private:
+  using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+  struct RangeTombstoneStackStartComparator {
+    explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->Compare(a.start_key, b.start_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->Compare(a.start_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->Compare(a, b.start_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  struct RangeTombstoneStackEndComparator {
+    explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+    bool operator()(const RangeTombstoneStack& a,
+                    const RangeTombstoneStack& b) const {
+      return cmp->Compare(a.end_key, b.end_key) < 0;
+    }
+
+    bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+      return cmp->Compare(a.end_key, b) < 0;
+    }
+
+    bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+      return cmp->Compare(a, b.end_key) < 0;
+    }
+
+    const Comparator* cmp;
+  };
+
+  void MaybePinKey() const {
+    if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+        (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+      current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+      pinned_pos_ = pos_;
+      pinned_seq_pos_ = seq_pos_;
+    }
+  }
+
+  void SeekToCoveringTombstone(const Slice& key);
+  void SeekForPrevToCoveringTombstone(const Slice& key);
+  void ScanForwardToVisibleTombstone();
+  void ScanBackwardToVisibleTombstone();
+  bool ValidPos() const {
+    return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+  }
+
+  const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+  const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+  const InternalKeyComparator* icmp_;
+  const Comparator* ucmp_;
+  std::shared_ptr<const FragmentedRangeTombstoneList> tombstones_ref_;
+  const FragmentedRangeTombstoneList* tombstones_;
+  SequenceNumber upper_bound_;
+  SequenceNumber lower_bound_;
+  std::vector<RangeTombstoneStack>::const_iterator pos_;
+  std::vector<SequenceNumber>::const_iterator seq_pos_;
+  mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+  mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+  mutable InternalKey current_start_key_;
+};
+
+}  // namespace rocksdb
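Before the tests, a minimal end-to-end sketch of the header's intended use (MakeRangeDelIter is the helper defined just below; error handling omitted):

auto raw = MakeRangeDelIter({{"a", "e", 10}});  // unfragmented tombstones
FragmentedRangeTombstoneList list(std::move(raw), bytewise_icmp);
FragmentedRangeTombstoneIterator it(&list, bytewise_icmp,
                                    kMaxSequenceNumber /* upper_bound */);
SequenceNumber s = it.MaxCoveringTombstoneSeqnum("b");  // 10: "b" in [a, e)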
+ +#include "db/range_tombstone_fragmenter.h" + +#include "db/db_test_util.h" +#include "rocksdb/comparator.h" +#include "util/testutil.h" + +namespace rocksdb { + +class RangeTombstoneFragmenterTest : public testing::Test {}; + +namespace { + +static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); + +std::unique_ptr MakeRangeDelIter( + const std::vector& range_dels) { + std::vector keys, values; + for (const auto& range_del : range_dels) { + auto key_and_value = range_del.Serialize(); + keys.push_back(key_and_value.first.Encode().ToString()); + values.push_back(key_and_value.second.ToString()); + } + return std::unique_ptr( + new test::VectorIterator(keys, values)); +} + +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); + + // Test FragmentedRangeTombstoneIterator interface. + EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); +} + +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + +void VerifyVisibleTombstones( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToTopFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + +struct SeekTestCase { + Slice seek_target; + RangeTombstone expected_position; + bool out_of_range; +}; + +void VerifySeek(FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + iter->Seek(testcase.seek_target); + if (testcase.out_of_range) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(testcase.expected_position, iter); + } + } +} + +void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + iter->SeekForPrev(testcase.seek_target); + if (testcase.out_of_range) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(testcase.expected_position, iter); + } + } +} + +struct MaxCoveringTombstoneSeqnumTestCase { + Slice user_key; + SequenceNumber result; +}; + +void VerifyMaxCoveringTombstoneSeqnum( + FragmentedRangeTombstoneIterator* iter, + const std::vector& cases) { + for (const auto& testcase : cases) { + EXPECT_EQ(testcase.result, + iter->MaxCoveringTombstoneSeqnum(testcase.user_key)); + } +} + +} // anonymous namespace + +TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) { + auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}}); + 
VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) { + auto range_del_iter = MakeRangeDelIter( + {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels( + &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) { + auto range_del_iter = + MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, + {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) { + auto range_del_iter = MakeRangeDelIter({{"a", "c", 30}, + {"a", "g", 20}, + {"a", "e", 10}, + {"a", "g", 7}, + {"a", "c", 3}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 30}, + {"a", "c", 20}, + {"a", "c", 10}, + {"a", "c", 7}, + {"a", "c", 3}, + {"c", "e", 20}, + {"c", "e", 10}, + {"c", "e", 7}, + {"e", "g", 20}, + {"e", "g", 7}}); + VerifyMaxCoveringTombstoneSeqnum(&iter, + {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + 
{"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp, + 7 /* upper_bound */); + FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp, + 5 /* upper_bound */); + FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) { + VerifyFragmentedRangeDels(iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + } + + ASSERT_EQ(0, iter1.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound()); + VerifyVisibleTombstones(&iter1, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter2.lower_bound()); + ASSERT_EQ(9, iter2.upper_bound()); + VerifyVisibleTombstones(&iter2, {{"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter3.lower_bound()); + ASSERT_EQ(7, iter3.upper_bound()); + VerifyVisibleTombstones(&iter3, {{"c", "e", 6}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter4.lower_bound()); + ASSERT_EQ(5, iter4.upper_bound()); + VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}}); + + ASSERT_EQ(0, iter5.lower_bound()); + ASSERT_EQ(3, iter5.upper_bound()); + VerifyVisibleTombstones(&iter5, {{"j", "l", 2}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(9, iter.upper_bound()); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"c", "e", 6}, + {"e", "g", 8}, + {"e", "g", 6}, + {"g", "i", 6}, + {"j", "l", 4}, + {"j", "l", 2}, + {"l", "n", 4}}); + VerifyMaxCoveringTombstoneSeqnum( + &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {} /* snapshots */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + 
kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, + OverlapAndRepeatedStartKeyForCompactionWithSnapshot) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {20, 9} /* upper_bounds */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({} /* snapshots */); + ASSERT_EQ(1, split_iters.size()); + + auto* split_iter = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(0, split_iter->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound()); + VerifyVisibleTombstones(split_iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */); + ASSERT_EQ(5, split_iters.size()); + + auto* split_iter1 = split_iters[3].get(); + ASSERT_EQ(0, split_iter1->lower_bound()); + ASSERT_EQ(3, split_iter1->upper_bound()); + VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}}); + + auto* split_iter2 = split_iters[5].get(); + ASSERT_EQ(4, split_iter2->lower_bound()); + ASSERT_EQ(5, split_iter2->upper_bound()); + VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}}); + + auto* split_iter3 = split_iters[7].get(); + ASSERT_EQ(6, split_iter3->lower_bound()); + ASSERT_EQ(7, split_iter3->upper_bound()); + VerifyVisibleTombstones(split_iter3, + {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}}); + + auto* split_iter4 = split_iters[9].get(); + ASSERT_EQ(8, split_iter4->lower_bound()); + ASSERT_EQ(9, split_iter4->upper_bound()); + VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}}); + + auto* split_iter5 = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(10, split_iter5->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound()); + VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { + // Same tombstones as OverlapAndRepeatedStartKey. 
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"a", {"j", "l", 2}}, + {"e", {"j", "l", 2}}, + {"l", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */}, + {"e", {}, true /* out of range */}, + {"l", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekCovered) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + VerifySeekForPrev( + &iter1, + {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"b", {"j", "l", 2}}, + {"f", {"j", "l", 2}}, + {"m", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */}, + {"f", {}, true /* out of range */}, + {"m", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) { + // Same tombstones as OverlapAndRepeatedStartKey. + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"c", "g", 8}, + {"c", "i", 6}, + {"j", "n", 4}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + VerifySeek(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"j", "l", 4}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}}, + {"g", {"g", "i", 6}}, + {"i", {"g", "i", 6}}, + {"n", {"l", "n", 4}}}); + + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); + VerifySeek(&iter2, {{"c", {"j", "l", 2}}, + {"g", {"j", "l", 2}}, + {"i", {"j", "l", 2}}, + {"n", {}, true /* out of range */}}); + VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */}, + {"g", {}, true /* out of range */}, + {"i", {}, true /* out of range */}, + {"n", {"j", "l", 2}}}); +} + +TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) { + // Same tombstones as OverlapAndRepeatedStartKey. 
+  auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+                                          {"c", "g", 8},
+                                          {"c", "i", 6},
+                                          {"j", "n", 4},
+                                          {"j", "l", 2}});
+
+  FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+                                             bytewise_icmp);
+
+  FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+                                        kMaxSequenceNumber);
+  VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+  VerifySeekForPrev(&iter,
+                    {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/thirdparty/rocksdb/db/read_callback.h b/thirdparty/rocksdb/db/read_callback.h
new file mode 100644
index 0000000000..52573be19d
--- /dev/null
+++ b/thirdparty/rocksdb/db/read_callback.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
+class ReadCallback {
+ public:
+  ReadCallback(SequenceNumber last_visible_seq)
+      : max_visible_seq_(last_visible_seq) {}
+  ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+      : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+  virtual ~ReadCallback() {}
+
+  // Will be called to see if the seq number is visible; if not it moves on to
+  // the next seq number.
+  virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
+  inline bool IsVisible(SequenceNumber seq) {
+    assert(min_uncommitted_ > 0);
+    assert(min_uncommitted_ >= kMinUnCommittedSeq);
+    if (seq < min_uncommitted_) {  // handles seq == 0 as well
+      assert(seq <= max_visible_seq_);
+      return true;
+    } else if (max_visible_seq_ < seq) {
+      assert(seq != 0);
+      return false;
+    } else {
+      assert(seq != 0);  // already handled in the first if-then clause
+      return IsVisibleFullCheck(seq);
+    }
+  }
+
+  inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+  virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+  // Refer to DBIter::CanReseekToSkip
+  virtual bool CanReseekToSkip() { return true; }
+
+ protected:
+  // The max visible seq, it is usually the snapshot but could be larger if
+  // transaction has its own writes written to db.
+  SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+  // Any seq less than min_uncommitted_ is committed.
+  const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+}  // namespace rocksdb
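ReadCallback is abstract; IsVisible() handles the cheap cases and only falls through to IsVisibleFullCheck() for seqnums in [min_uncommitted_, max_visible_seq_]. A minimal hypothetical subclass, just to show the contract (not part of this diff):

// Hypothetical example subclass -- illustrative only.
class FixedSnapshotReadCallback : public rocksdb::ReadCallback {
 public:
  explicit FixedSnapshotReadCallback(rocksdb::SequenceNumber snapshot)
      : ReadCallback(snapshot) {}
  // Invoked only for seqnums that survive IsVisible()'s fast paths.
  bool IsVisibleFullCheck(rocksdb::SequenceNumber seq) override {
    return seq <= max_visible_seq();
  }
};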
diff --git a/thirdparty/rocksdb/db/repair.cc b/thirdparty/rocksdb/db/repair.cc
index 9ed326032c..7b9409a229 100644
--- a/thirdparty/rocksdb/db/repair.cc
+++ b/thirdparty/rocksdb/db/repair.cc
@@ -99,12 +99,14 @@ class Repairer {
         env_(db_options.env),
         env_options_(),
         db_options_(SanitizeOptions(dbname_, db_options)),
-        immutable_db_options_(db_options_),
+        immutable_db_options_(ImmutableDBOptions(db_options_)),
         icmp_(default_cf_opts.comparator),
-        default_cf_opts_(default_cf_opts),
+        default_cf_opts_(
+            SanitizeOptions(immutable_db_options_, default_cf_opts)),
         default_cf_iopts_(
-            ImmutableCFOptions(immutable_db_options_, default_cf_opts)),
-        unknown_cf_opts_(unknown_cf_opts),
+            ImmutableCFOptions(immutable_db_options_, default_cf_opts_)),
+        unknown_cf_opts_(
+            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
         create_unknown_cfs_(create_unknown_cfs),
         raw_table_cache_(
             // TableCache can be small since we expect each table to be opened
@@ -116,7 +118,8 @@ class Repairer {
         wc_(db_options_.delayed_write_rate),
         vset_(dbname_, &immutable_db_options_, env_options_,
               raw_table_cache_.get(), &wb_, &wc_),
-        next_file_number_(1) {
+        next_file_number_(1),
+        db_lock_(nullptr) {
     for (const auto& cfd : column_families) {
       cf_name_to_opts_[cfd.name] = cfd.options;
     }
@@ -161,11 +164,18 @@ class Repairer {
   }
 
   ~Repairer() {
+    if (db_lock_ != nullptr) {
+      env_->UnlockFile(db_lock_);
+    }
     delete table_cache_;
   }
 
   Status Run() {
-    Status status = FindFiles();
+    Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+    if (!status.ok()) {
+      return status;
+    }
+    status = FindFiles();
     if (status.ok()) {
       // Discard older manifests and start a fresh one
       for (size_t i = 0; i < manifests_.size(); i++) {
@@ -203,7 +213,7 @@ class Repairer {
       ROCKS_LOG_WARN(db_options_.info_log,
                      "**** Repaired rocksdb %s; "
                      "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
-                     "bytes. "
+                     " bytes. "
                      "Some data may have been lost. "
                      "****",
                      dbname_.c_str(), tables_.size(), bytes);
@@ -243,6 +253,9 @@ class Repairer {
   std::vector<uint64_t> logs_;
   std::vector<TableInfo> tables_;
   uint64_t next_file_number_;
+  // Lock over the persistent DB state. Non-nullptr iff successfully
+  // acquired.
+  FileLock* db_lock_;
 
   Status FindFiles() {
     std::vector<std::string> filenames;
@@ -254,14 +267,21 @@ class Repairer {
     }
 
     // search wal_dir if user uses a customized wal_dir
-    if (!db_options_.wal_dir.empty() &&
-        db_options_.wal_dir != dbname_) {
-      to_search_paths.push_back(db_options_.wal_dir);
+    bool same = false;
+    Status status = env_->AreFilesSame(db_options_.wal_dir, dbname_, &same);
+    if (status.IsNotSupported()) {
+      same = db_options_.wal_dir == dbname_;
+      status = Status::OK();
+    } else if (!status.ok()) {
+      return status;
+    }
+
+    if (!same) {
+      to_search_paths.push_back(db_options_.wal_dir);
    }
 
     for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
-      Status status =
-          env_->GetChildren(to_search_paths[path_id], &filenames);
+      status = env_->GetChildren(to_search_paths[path_id], &filenames);
       if (!status.ok()) {
        return status;
      }
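The Run()/~Repairer() changes above establish a simple lock discipline: acquire the single-process DB lock before scanning files, and release it exactly once on destruction. Reduced to a skeleton (names as in the hunk; `FindFilesAndRepair` is a hypothetical stand-in for the repair body):

// Sketch of the pattern introduced above.
FileLock* db_lock = nullptr;                     // owned by Repairer
Status s = env->LockFile(LockFileName(dbname), &db_lock);
if (s.ok()) {
  s = FindFilesAndRepair();                      // hypothetical body
  env->UnlockFile(db_lock);                      // done in ~Repairer()
}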
ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", lognum, static_cast(bytes), s.ToString().c_str()); @@ -325,14 +345,14 @@ class Repairer { // Open the log file std::string logname = LogFileName(db_options_.wal_dir, log); - unique_ptr lfile; + std::unique_ptr lfile; Status status = env_->NewSequentialFile( logname, &lfile, env_->OptimizeForLogRead(env_options_)); if (!status.ok()) { return status; } - unique_ptr lfile_reader( - new SequentialFileReader(std::move(lfile))); + std::unique_ptr lfile_reader( + new SequentialFileReader(std::move(lfile), logname)); // Create the log reader. LogReporter reporter; @@ -344,7 +364,7 @@ class Repairer { // propagating bad information (like overly large sequence // numbers). log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter, - true /*enable checksum*/, 0 /*initial_offset*/, log); + true /*enable checksum*/, log); // Initialize per-column family memtables for (auto* cfd : *vset_.GetColumnFamilySet()) { @@ -390,23 +410,30 @@ class Repairer { ro.total_order_seek = true; Arena arena; ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); - EnvOptions optimized_env_options = - env_->OptimizeForCompactionTableWrite(env_options_, immutable_db_options_); - int64_t _current_time = 0; status = env_->GetCurrentTime(&_current_time); // ignore error const uint64_t current_time = static_cast(_current_time); - + SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); + + auto write_hint = cfd->CalculateSSTWriteHint(0); + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } status = BuildTable( dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - optimized_env_options, table_cache_, iter.get(), - std::unique_ptr(mem->NewRangeTombstoneIterator(ro)), + env_options_, table_cache_, iter.get(), std::move(range_del_iters), &meta, cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), - {}, kMaxSequenceNumber, kNoCompression, CompressionOptions(), false, + {}, kMaxSequenceNumber, snapshot_checker, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), false, nullptr /* internal_stats */, TableFileCreationReason::kRecovery, nullptr /* event_logger */, 0 /* job_id */, Env::IO_HIGH, - nullptr /* table_properties */, -1 /* level */, current_time); + nullptr /* table_properties */, -1 /* level */, current_time, + write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, meta.fd.GetNumber(), @@ -491,8 +518,9 @@ class Repairer { } if (status.ok()) { InternalIterator* iter = table_cache_->NewIterator( - ReadOptions(), env_options_, cfd->internal_comparator(), t->meta.fd, - nullptr /* range_del_agg */); + ReadOptions(), env_options_, cfd->internal_comparator(), t->meta, + nullptr /* range_del_agg */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -541,7 +569,8 @@ class Repairer { max_sequence = tables_[i].max_sequence; } } - vset_.SetLastToBeWrittenSequence(max_sequence); + vset_.SetLastAllocatedSequence(max_sequence); + vset_.SetLastPublishedSequence(max_sequence); vset_.SetLastSequence(max_sequence); for (const auto& cf_id_and_tables : cf_id_to_tables) { @@ -560,6 +589,8 @@ class Repairer { table->meta.largest, table->min_sequence, table->max_sequence, 
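One detail of the BuildTable change above that is easy to miss in diff form: the new signature takes a vector of fragmented range-tombstone iterators instead of a single raw iterator, and NewRangeTombstoneIterator() may legitimately return null when the memtable holds no range deletions. Restated as plain code with the template arguments spelled out (this mirrors the patch; it adds no behavior):

// Collect zero or one fragmented range-del iterators from the memtable;
// the null check is required, and BuildTable accepts an empty vector.
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> range_del_iters;
auto* range_del_iter = mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
  range_del_iters.emplace_back(range_del_iter);
}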
table->meta.marked_for_compaction); } + assert(next_file_number_ > 0); + vset_.MarkFileNumberUsed(next_file_number_ - 1); mutex_.Lock(); Status status = vset_.LogAndApply( cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_, @@ -611,11 +642,13 @@ Status GetDefaultCFOptions( } // anonymous namespace Status RepairDB(const std::string& dbname, const DBOptions& db_options, - const std::vector& column_families) { + const std::vector& column_families + ) { ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); if (status.ok()) { - Repairer repairer(dbname, db_options, column_families, default_cf_opts, + Repairer repairer(dbname, db_options, column_families, + default_cf_opts, ColumnFamilyOptions() /* unknown_cf_opts */, false /* create_unknown_cfs */); status = repairer.Run(); @@ -629,7 +662,8 @@ Status RepairDB(const std::string& dbname, const DBOptions& db_options, ColumnFamilyOptions default_cf_opts; Status status = GetDefaultCFOptions(column_families, &default_cf_opts); if (status.ok()) { - Repairer repairer(dbname, db_options, column_families, default_cf_opts, + Repairer repairer(dbname, db_options, + column_families, default_cf_opts, unknown_cf_opts, true /* create_unknown_cfs */); status = repairer.Run(); } @@ -639,7 +673,8 @@ Status RepairDB(const std::string& dbname, const DBOptions& db_options, Status RepairDB(const std::string& dbname, const Options& options) { DBOptions db_options(options); ColumnFamilyOptions cf_options(options); - Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */, + Repairer repairer(dbname, db_options, + {}, cf_options /* default_cf_opts */, cf_options /* unknown_cf_opts */, true /* create_unknown_cfs */); return repairer.Run(); diff --git a/thirdparty/rocksdb/db/repair_test.cc b/thirdparty/rocksdb/db/repair_test.cc index b267c6d168..3422532da4 100644 --- a/thirdparty/rocksdb/db/repair_test.cc +++ b/thirdparty/rocksdb/db/repair_test.cc @@ -74,7 +74,7 @@ TEST_F(RepairTest, CorruptManifest) { Close(); ASSERT_OK(env_->FileExists(manifest_path)); - CreateFile(env_, manifest_path, "blah"); + CreateFile(env_, manifest_path, "blah", false /* use_fsync */); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); Reopen(CurrentOptions()); @@ -108,6 +108,23 @@ TEST_F(RepairTest, IncompleteManifest) { ASSERT_EQ(Get("key2"), "val2"); } +TEST_F(RepairTest, PostRepairSstFileNumbering) { + // Verify after a DB is repaired, new files will be assigned higher numbers + // than old files. + Put("key", "val"); + Flush(); + Put("key2", "val2"); + Flush(); + uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + Close(); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + + Reopen(CurrentOptions()); + uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo(); + ASSERT_GE(post_repair_file_num, pre_repair_file_num); +} + TEST_F(RepairTest, LostSst) { // Delete one of the SST files but preserve the manifest that refers to it, // then verify the DB is still usable for the intact SST. 
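Since all three RepairDB() overloads are touched above, a minimal usage sketch may help reviewers (the path is a placeholder; this overload repairs the default column family and recreates unknown ones):

#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  rocksdb::Status s = rocksdb::RepairDB("/tmp/example_db", options);
  if (!s.ok()) {
    std::fprintf(stderr, "RepairDB failed: %s\n", s.ToString().c_str());
    return 1;
  }
  return 0;
}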
@@ -136,7 +153,7 @@ TEST_F(RepairTest, CorruptSst) { Flush(); auto sst_path = GetFirstSstPath(); ASSERT_FALSE(sst_path.empty()); - CreateFile(env_, sst_path, "blah"); + CreateFile(env_, sst_path, "blah", false /* use_fsync */); Close(); ASSERT_OK(RepairDB(dbname_, CurrentOptions())); @@ -296,6 +313,7 @@ TEST_F(RepairTest, RepairColumnFamilyOptions) { ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name); } + Close(); // Also check comparator when it's provided via "unknown" CF options ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}}, @@ -309,6 +327,25 @@ TEST_F(RepairTest, RepairColumnFamilyOptions) { } } +TEST_F(RepairTest, DbNameContainsTrailingSlash) { + { + bool tmp; + if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) { + fprintf(stderr, + "skipping RepairTest.DbNameContainsTrailingSlash due to " + "unsupported Env::AreFilesSame\n"); + return; + } + } + + Put("key", "val"); + Flush(); + Close(); + + ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions())); + Reopen(CurrentOptions()); + ASSERT_EQ(Get("key"), "val"); +} #endif // ROCKSDB_LITE } // namespace rocksdb @@ -320,7 +357,7 @@ int main(int argc, char** argv) { #else #include <stdio.h> -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/db/snapshot_checker.h b/thirdparty/rocksdb/db/snapshot_checker.h new file mode 100644 index 0000000000..4d29b83c4f --- /dev/null +++ b/thirdparty/rocksdb/db/snapshot_checker.h @@ -0,0 +1,61 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include "rocksdb/types.h" + +namespace rocksdb { + +enum class SnapshotCheckerResult : int { + kInSnapshot = 0, + kNotInSnapshot = 1, + // In case snapshot is released and the checker has no clue whether + // the given sequence is visible to the snapshot. + kSnapshotReleased = 2, +}; + +// Callback class that controls GC of duplicate keys in flush/compaction. +class SnapshotChecker { + public: + virtual ~SnapshotChecker() {} + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0; +}; + +class DisableGCSnapshotChecker : public SnapshotChecker { + public: + virtual ~DisableGCSnapshotChecker() {} + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber /*sequence*/, + SequenceNumber /*snapshot_sequence*/) const override { + // By returning kNotInSnapshot, we prevent all the values from being GCed + return SnapshotCheckerResult::kNotInSnapshot; + } + static DisableGCSnapshotChecker* Instance() { return &instance_; } + + protected: + static DisableGCSnapshotChecker instance_; + explicit DisableGCSnapshotChecker() {} +}; + +class WritePreparedTxnDB; + +// Callback class created by WritePreparedTxnDB to check if a key +// is visible to a snapshot.
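The three-valued SnapshotCheckerResult above lets a checker answer "unknown" after a snapshot has been released instead of forcing a yes/no. For illustration, the smallest possible checker implementing the interface (a hypothetical class, the aggressive opposite of DisableGCSnapshotChecker):

// Treats every write as committed before any snapshot, i.e. the most
// GC-friendly answer possible. Real checkers consult transaction state.
class AlwaysInSnapshotChecker : public rocksdb::SnapshotChecker {
 public:
  rocksdb::SnapshotCheckerResult CheckInSnapshot(
      rocksdb::SequenceNumber /*sequence*/,
      rocksdb::SequenceNumber /*snapshot_sequence*/) const override {
    return rocksdb::SnapshotCheckerResult::kInSnapshot;
  }
};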
+class WritePreparedSnapshotChecker : public SnapshotChecker { + public: + explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db); + virtual ~WritePreparedSnapshotChecker() {} + + virtual SnapshotCheckerResult CheckInSnapshot( + SequenceNumber sequence, SequenceNumber snapshot_sequence) const override; + + private: +#ifndef ROCKSDB_LITE + const WritePreparedTxnDB* const txn_db_; +#endif // !ROCKSDB_LITE +}; + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/db/snapshot_impl.h b/thirdparty/rocksdb/db/snapshot_impl.h index 7dc405931c..f2610fd18b 100644 --- a/thirdparty/rocksdb/db/snapshot_impl.h +++ b/thirdparty/rocksdb/db/snapshot_impl.h @@ -21,6 +21,10 @@ class SnapshotList; class SnapshotImpl : public Snapshot { public: SequenceNumber number_; // const after creation + // It indicates the smallest uncommitted data at the time the snapshot was + // taken. This is currently used by WritePrepared transactions to limit the + // scope of queries to IsInSnapshot. + SequenceNumber min_uncommitted_ = kMinUnCommittedSeq; virtual SequenceNumber GetSequenceNumber() const override { return number_; } @@ -45,15 +49,22 @@ class SnapshotList { list_.prev_ = &list_; list_.next_ = &list_; list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + // Set all the variables to make UBSAN happy. + list_.list_ = nullptr; + list_.unix_time_ = 0; + list_.is_write_conflict_boundary_ = false; count_ = 0; } + // No copy-construct. + SnapshotList(const SnapshotList&) = delete; + bool empty() const { return list_.next_ == &list_; } SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - const SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, - uint64_t unix_time, bool is_write_conflict_boundary) { + SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time, + bool is_write_conflict_boundary) { s->number_ = seq; s->unix_time_ = unix_time; s->is_write_conflict_boundary_ = is_write_conflict_boundary; @@ -75,7 +86,7 @@ class SnapshotList { } // retrieve all snapshot numbers up until max_seq. They are sorted in - // ascending order. + // ascending order (with no duplicates). std::vector<SequenceNumber> GetAll( SequenceNumber* oldest_write_conflict_snapshot = nullptr, const SequenceNumber& max_seq = kMaxSequenceNumber) const { @@ -93,7 +104,10 @@ class SnapshotList { if (s->next_->number_ > max_seq) { break; } - ret.push_back(s->next_->number_); + // Avoid duplicates + if (ret.empty() || ret.back() != s->next_->number_) { + ret.push_back(s->next_->number_); + } if (oldest_write_conflict_snapshot != nullptr && *oldest_write_conflict_snapshot == kMaxSequenceNumber && @@ -108,22 +122,6 @@ class SnapshotList { return ret; } - // Whether there is an active snapshot in range [lower_bound, upper_bound).
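The deduplication added to GetAll() above matters because two snapshots taken with no intervening write share the same sequence number, and downstream consumers now assume a strictly increasing vector. The guard in isolation, as a self-contained example (not patch code):

#include <cstdint>
#include <vector>

int main() {
  std::vector<uint64_t> snapshots = {5, 5, 9};  // two snapshots at seq 5
  std::vector<uint64_t> ret;
  for (uint64_t seq : snapshots) {
    if (ret.empty() || ret.back() != seq) {  // the patch's duplicate guard
      ret.push_back(seq);
    }
  }
  // ret is now {5, 9}
  return 0;
}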
- bool HasSnapshotInRange(SequenceNumber lower_bound, - SequenceNumber upper_bound) { - if (empty()) { - return false; - } - const SnapshotImpl* s = &list_; - while (s->next_ != &list_) { - if (s->next_->number_ >= lower_bound) { - return s->next_->number_ < upper_bound; - } - s = s->next_; - } - return false; - } - // get the sequence number of the most recent snapshot SequenceNumber GetNewest() { if (empty()) { diff --git a/thirdparty/rocksdb/db/table_cache.cc b/thirdparty/rocksdb/db/table_cache.cc index b4d5cc1bb7..764c05bfa4 100644 --- a/thirdparty/rocksdb/db/table_cache.cc +++ b/thirdparty/rocksdb/db/table_cache.cc @@ -10,6 +10,7 @@ #include "db/table_cache.h" #include "db/dbformat.h" +#include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" #include "util/filename.h" @@ -30,7 +31,7 @@ namespace rocksdb { namespace { template -static void DeleteEntry(const Slice& key, void* value) { +static void DeleteEntry(const Slice& /*key*/, void* value) { T* typed_value = reinterpret_cast(value); delete typed_value; } @@ -43,6 +44,8 @@ static void UnrefEntry(void* arg1, void* arg2) { static void DeleteTableReader(void* arg1, void* arg2) { TableReader* table_reader = reinterpret_cast(arg1); + Statistics* stats = reinterpret_cast(arg2); + RecordTick(stats, NO_FILE_CLOSES); delete table_reader; } @@ -65,7 +68,10 @@ void AppendVarint64(IterKey* key, uint64_t v) { TableCache::TableCache(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, Cache* const cache) - : ioptions_(ioptions), env_options_(env_options), cache_(cache) { + : ioptions_(ioptions), + env_options_(env_options), + cache_(cache), + immortal_tables_(false) { if (ioptions_.row_cache) { // If the same cache is shared by multiple instances, we need to // disambiguate its entries. @@ -88,17 +94,21 @@ Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, size_t readahead, bool record_read_stats, - HistogramImpl* file_read_hist, unique_ptr* table_reader, - bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, - bool for_compaction) { + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, bool skip_filters, int level, + bool prefetch_index_and_filter_in_cache, bool for_compaction) { std::string fname = - TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); - unique_ptr file; + TableFileName(ioptions_.cf_paths, fd.GetNumber(), fd.GetPathId()); + std::unique_ptr file; Status s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (readahead > 0) { + if (readahead > 0 && !env_options.use_mmap_reads) { + // Not compatible with mmap files since ReadaheadRandomAccessFile requires + // its wrapped file's Read() to copy data into the provided scratch + // buffer, which mmap files don't use. + // TODO(ajkr): try madvise for mmap files in place of buffered readahead. file = NewReadaheadRandomAccessFile(std::move(file), readahead); } if (!sequential_mode && ioptions_.advise_random_on_open) { @@ -109,10 +119,12 @@ Status TableCache::GetTableReader( new RandomAccessFileReader( std::move(file), fname, ioptions_.env, record_read_stats ? 
ioptions_.statistics : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter, for_compaction)); + file_read_hist, ioptions_.rate_limiter, for_compaction, + ioptions_.listeners)); s = ioptions_.table_factory->NewTableReader( - TableReaderOptions(ioptions_, env_options, internal_comparator, - skip_filters, level), + TableReaderOptions(ioptions_, prefix_extractor, env_options, + internal_comparator, skip_filters, immortal_tables_, + level, fd.largest_seqno), std::move(file_reader), fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -130,11 +142,12 @@ void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) { Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, + const SliceTransform* prefix_extractor, const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache) { - PERF_TIMER_GUARD(find_table_nanos); + PERF_TIMER_GUARD_WITH_ENV(find_table_nanos, ioptions_.env); Status s; uint64_t number = fd.GetNumber(); Slice key = GetSliceForFileNumber(&number); @@ -146,11 +159,12 @@ Status TableCache::FindTable(const EnvOptions& env_options, if (no_io) { // Don't do IO and return a not-found status return Status::Incomplete("Table not found in table_cache, no_io is set"); } - unique_ptr table_reader; + std::unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, false /* sequential mode */, 0 /* readahead */, record_read_stats, file_read_hist, &table_reader, - skip_filters, level, prefetch_index_and_filter_in_cache); + prefix_extractor, skip_filters, level, + prefetch_index_and_filter_in_cache); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); @@ -170,65 +184,75 @@ Status TableCache::FindTable(const EnvOptions& env_options, InternalIterator* TableCache::NewIterator( const ReadOptions& options, const EnvOptions& env_options, - const InternalKeyComparator& icomparator, const FileDescriptor& fd, - RangeDelAggregator* range_del_agg, TableReader** table_reader_ptr, - HistogramImpl* file_read_hist, bool for_compaction, Arena* arena, - bool skip_filters, int level) { + const InternalKeyComparator& icomparator, const FileMetaData& file_meta, + RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, + TableReader** table_reader_ptr, HistogramImpl* file_read_hist, + bool for_compaction, Arena* arena, bool skip_filters, int level, + const InternalKey* smallest_compaction_key, + const InternalKey* largest_compaction_key) { PERF_TIMER_GUARD(new_table_iterator_nanos); Status s; bool create_new_table_reader = false; TableReader* table_reader = nullptr; Cache::Handle* handle = nullptr; - if (s.ok()) { - if (table_reader_ptr != nullptr) { - *table_reader_ptr = nullptr; - } - size_t readahead = 0; - if (for_compaction) { + if (table_reader_ptr != nullptr) { + *table_reader_ptr = nullptr; + } + size_t readahead = 0; + if (for_compaction) { #ifndef NDEBUG - bool use_direct_reads_for_compaction = env_options.use_direct_reads; - TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", - &use_direct_reads_for_compaction); + bool use_direct_reads_for_compaction = env_options.use_direct_reads; + TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction", + &use_direct_reads_for_compaction); #endif // !NDEBUG - if 
(ioptions_.new_table_reader_for_compaction_inputs) { - readahead = ioptions_.compaction_readahead_size; - create_new_table_reader = true; - } - } else { - readahead = options.readahead_size; - create_new_table_reader = readahead > 0; + if (ioptions_.new_table_reader_for_compaction_inputs) { + // get compaction_readahead_size from env_options allows us to set the + // value dynamically + readahead = env_options.compaction_readahead_size; + create_new_table_reader = true; } + } else { + readahead = options.readahead_size; + create_new_table_reader = readahead > 0; + } - if (create_new_table_reader) { - unique_ptr table_reader_unique_ptr; - s = GetTableReader( - env_options, icomparator, fd, true /* sequential_mode */, readahead, - !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, - false /* skip_filters */, level, - true /* prefetch_index_and_filter_in_cache */, for_compaction); + auto& fd = file_meta.fd; + if (create_new_table_reader) { + std::unique_ptr table_reader_unique_ptr; + s = GetTableReader( + env_options, icomparator, fd, true /* sequential_mode */, readahead, + !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr, + prefix_extractor, false /* skip_filters */, level, + true /* prefetch_index_and_filter_in_cache */, for_compaction); + if (s.ok()) { + table_reader = table_reader_unique_ptr.release(); + } + } else { + table_reader = fd.table_reader; + if (table_reader == nullptr) { + s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + !for_compaction /* record read_stats */, file_read_hist, + skip_filters, level); if (s.ok()) { - table_reader = table_reader_unique_ptr.release(); - } - } else { - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, - options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record read_stats */, file_read_hist, - skip_filters, level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } + table_reader = GetTableReaderFromHandle(handle); } } } InternalIterator* result = nullptr; if (s.ok()) { - result = table_reader->NewIterator(options, arena, skip_filters); + if (options.table_filter && + !options.table_filter(*table_reader->GetTableProperties())) { + result = NewEmptyInternalIterator(arena); + } else { + result = table_reader->NewIterator(options, prefix_extractor, arena, + skip_filters, for_compaction); + } if (create_new_table_reader) { assert(handle == nullptr); - result->RegisterCleanup(&DeleteTableReader, table_reader, nullptr); + result->RegisterCleanup(&DeleteTableReader, table_reader, + ioptions_.statistics); } else if (handle != nullptr) { result->RegisterCleanup(&UnrefEntry, cache_, handle); handle = nullptr; // prevent from releasing below @@ -242,13 +266,25 @@ InternalIterator* TableCache::NewIterator( } } if (s.ok() && range_del_agg != nullptr && !options.ignore_range_deletions) { - std::unique_ptr range_del_iter( - table_reader->NewRangeTombstoneIterator(options)); - if (range_del_iter != nullptr) { - s = range_del_iter->status(); - } - if (s.ok()) { - s = range_del_agg->AddTombstones(std::move(range_del_iter)); + if (range_del_agg->AddFile(fd.GetNumber())) { + std::unique_ptr range_del_iter( + static_cast( + table_reader->NewRangeTombstoneIterator(options))); + if (range_del_iter != nullptr) { + s = range_del_iter->status(); + } + if (s.ok()) { + const InternalKey* smallest = &file_meta.smallest; + const InternalKey* largest = 
&file_meta.largest; + if (smallest_compaction_key != nullptr) { + smallest = smallest_compaction_key; + } + if (largest_compaction_key != nullptr) { + largest = largest_compaction_key; + } + range_del_agg->AddTombstones(std::move(range_del_iter), smallest, + largest); + } } } @@ -257,54 +293,19 @@ InternalIterator* TableCache::NewIterator( } if (!s.ok()) { assert(result == nullptr); - result = NewErrorInternalIterator(s, arena); - } - return result; -} - -InternalIterator* TableCache::NewRangeTombstoneIterator( - const ReadOptions& options, const EnvOptions& env_options, - const InternalKeyComparator& icomparator, const FileDescriptor& fd, - HistogramImpl* file_read_hist, bool skip_filters, int level) { - Status s; - TableReader* table_reader = nullptr; - Cache::Handle* handle = nullptr; - table_reader = fd.table_reader; - if (table_reader == nullptr) { - s = FindTable(env_options, icomparator, fd, &handle, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record read_stats */, file_read_hist, skip_filters, - level); - if (s.ok()) { - table_reader = GetTableReaderFromHandle(handle); - } - } - InternalIterator* result = nullptr; - if (s.ok()) { - result = table_reader->NewRangeTombstoneIterator(options); - if (result != nullptr) { - if (handle != nullptr) { - result->RegisterCleanup(&UnrefEntry, cache_, handle); - } - } - } - if (result == nullptr && handle != nullptr) { - // the range deletion block didn't exist, or there was a failure between - // getting handle and getting iterator. - ReleaseHandle(handle); - } - if (!s.ok()) { - assert(result == nullptr); - result = NewErrorInternalIterator(s); + result = NewErrorInternalIterator(s, arena); } return result; } Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, const Slice& k, - GetContext* get_context, HistogramImpl* file_read_hist, - bool skip_filters, int level) { + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const SliceTransform* prefix_extractor, + HistogramImpl* file_read_hist, bool skip_filters, + int level) { + auto& fd = file_meta.fd; std::string* row_cache_entry = nullptr; bool done = false; #ifndef ROCKSDB_LITE @@ -368,29 +369,29 @@ Status TableCache::Get(const ReadOptions& options, Cache::Handle* handle = nullptr; if (!done && s.ok()) { if (t == nullptr) { - s = FindTable(env_options_, internal_comparator, fd, &handle, - options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, - level); + s = FindTable( + env_options_, internal_comparator, fd, &handle, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + true /* record_read_stats */, file_read_hist, skip_filters, level); if (s.ok()) { t = GetTableReaderFromHandle(handle); } } - if (s.ok() && get_context->range_del_agg() != nullptr && + SequenceNumber* max_covering_tombstone_seq = + get_context->max_covering_tombstone_seq(); + if (s.ok() && max_covering_tombstone_seq != nullptr && !options.ignore_range_deletions) { - std::unique_ptr range_del_iter( + std::unique_ptr range_del_iter( t->NewRangeTombstoneIterator(options)); if (range_del_iter != nullptr) { - s = range_del_iter->status(); - } - if (s.ok()) { - s = get_context->range_del_agg()->AddTombstones( - std::move(range_del_iter)); + *max_covering_tombstone_seq = std::max( + *max_covering_tombstone_seq, + range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k))); } } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // 
nullptr if no cache. - s = t->Get(options, k, get_context, skip_filters); + s = t->Get(options, k, get_context, prefix_extractor, skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set @@ -420,7 +421,8 @@ Status TableCache::Get(const ReadOptions& options, Status TableCache::GetTableProperties( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, - std::shared_ptr* properties, bool no_io) { + std::shared_ptr* properties, + const SliceTransform* prefix_extractor, bool no_io) { Status s; auto table_reader = fd.table_reader; // table already been pre-loaded? @@ -431,7 +433,8 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io); + s = FindTable(env_options, internal_comparator, fd, &table_handle, + prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -444,8 +447,8 @@ Status TableCache::GetTableProperties( size_t TableCache::GetMemoryUsageByTableReader( const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd) { + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, + const SliceTransform* prefix_extractor) { Status s; auto table_reader = fd.table_reader; // table already been pre-loaded? @@ -454,7 +457,8 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s = FindTable(env_options, internal_comparator, fd, &table_handle, true); + s = FindTable(env_options, internal_comparator, fd, &table_handle, + prefix_extractor, true); if (!s.ok()) { return 0; } diff --git a/thirdparty/rocksdb/db/table_cache.h b/thirdparty/rocksdb/db/table_cache.h index 8b65bafa3e..180ebc6bde 100644 --- a/thirdparty/rocksdb/db/table_cache.h +++ b/thirdparty/rocksdb/db/table_cache.h @@ -31,7 +31,6 @@ class Arena; struct FileDescriptor; class GetContext; class HistogramImpl; -class InternalIterator; class TableCache { public: @@ -53,16 +52,13 @@ class TableCache { InternalIterator* NewIterator( const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, RangeDelAggregator* range_del_agg, + const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, + const SliceTransform* prefix_extractor = nullptr, TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false, int level = -1); - - InternalIterator* NewRangeTombstoneIterator( - const ReadOptions& options, const EnvOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, HistogramImpl* file_read_hist, - bool skip_filters, int level); + Arena* arena = nullptr, bool skip_filters = false, int level = -1, + const InternalKey* smallest_compaction_key = nullptr, + const InternalKey* largest_compaction_key = nullptr); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until @@ -74,9 +70,11 @@ class TableCache { // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, const Slice& k, - GetContext* get_context, HistogramImpl* 
file_read_hist = nullptr, - bool skip_filters = false, int level = -1); + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const SliceTransform* prefix_extractor = nullptr, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); @@ -91,6 +89,7 @@ class TableCache { Status FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, Cache::Handle**, + const SliceTransform* prefix_extractor = nullptr, const bool no_io = false, bool record_read_stats = true, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, @@ -109,6 +108,7 @@ class TableCache { const InternalKeyComparator& internal_comparator, const FileDescriptor& file_meta, std::shared_ptr<const TableProperties>* properties, + const SliceTransform* prefix_extractor = nullptr, bool no_io = false); // Return total memory usage of the table reader of the file. @@ -116,15 +116,26 @@ class TableCache { size_t GetMemoryUsageByTableReader( const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd); + const FileDescriptor& fd, + const SliceTransform* prefix_extractor = nullptr); // Release the handle from a cache void ReleaseHandle(Cache::Handle* handle); + Cache* get_cache() const { return cache_; } + // Capacity of the backing Cache that indicates infinite TableCache capacity. // For example when max_open_files is -1 we set the backing Cache to this. static const int kInfiniteCapacity = 0x400000; + // The tables opened with this TableCache will be immortal, i.e., their + // lifetime is as long as that of the DB. + void SetTablesAreImmortal() { + if (cache_->GetCapacity() >= kInfiniteCapacity) { + immortal_tables_ = true; + } + } + private: // Build a table reader Status GetTableReader(const EnvOptions& env_options, @@ -132,7 +143,8 @@ class TableCache { const FileDescriptor& fd, bool sequential_mode, size_t readahead, bool record_read_stats, HistogramImpl* file_read_hist, - unique_ptr<TableReader>* table_reader, + std::unique_ptr<TableReader>* table_reader, + const SliceTransform* prefix_extractor = nullptr, bool skip_filters = false, int level = -1, bool prefetch_index_and_filter_in_cache = true, bool for_compaction = false); @@ -141,6 +153,7 @@ class TableCache { const EnvOptions& env_options_; Cache* const cache_; std::string row_cache_id_; + bool immortal_tables_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/table_properties_collector.cc b/thirdparty/rocksdb/db/table_properties_collector.cc index a1f4dba97b..4dbcd4cc41 100644 --- a/thirdparty/rocksdb/db/table_properties_collector.cc +++ b/thirdparty/rocksdb/db/table_properties_collector.cc @@ -11,71 +11,10 @@ namespace rocksdb { -Status InternalKeyPropertiesCollector::InternalAdd(const Slice& key, - const Slice& value, - uint64_t file_size) { - ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - return Status::InvalidArgument("Invalid internal key"); - } - - // Note: We count both, deletions and single deletions here.
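A note on SetTablesAreImmortal() above: the capacity check makes the call safe to issue unconditionally, because immortality is only sound when the backing cache can never evict, and kInfiniteCapacity is the sentinel capacity used when max_open_files is -1. A sketch of a call site under that assumption (the surrounding wiring is hypothetical):

// table_cache is a TableCache owned by the column family; the call is a
// no-op unless the backing cache capacity is at least kInfiniteCapacity.
table_cache->SetTablesAreImmortal();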
- if (ikey.type == ValueType::kTypeDeletion || - ikey.type == ValueType::kTypeSingleDeletion) { - ++deleted_keys_; - } else if (ikey.type == ValueType::kTypeMerge) { - ++merge_operands_; - } - - return Status::OK(); -} - -Status InternalKeyPropertiesCollector::Finish( - UserCollectedProperties* properties) { - assert(properties); - assert(properties->find( - InternalKeyTablePropertiesNames::kDeletedKeys) == properties->end()); - assert(properties->find(InternalKeyTablePropertiesNames::kMergeOperands) == - properties->end()); - - std::string val_deleted_keys; - PutVarint64(&val_deleted_keys, deleted_keys_); - properties->insert( - {InternalKeyTablePropertiesNames::kDeletedKeys, val_deleted_keys}); - - std::string val_merge_operands; - PutVarint64(&val_merge_operands, merge_operands_); - properties->insert( - {InternalKeyTablePropertiesNames::kMergeOperands, val_merge_operands}); - - return Status::OK(); -} - -UserCollectedProperties -InternalKeyPropertiesCollector::GetReadableProperties() const { - return {{"kDeletedKeys", ToString(deleted_keys_)}, - {"kMergeOperands", ToString(merge_operands_)}}; -} - namespace { -EntryType GetEntryType(ValueType value_type) { - switch (value_type) { - case kTypeValue: - return kEntryPut; - case kTypeDeletion: - return kEntryDelete; - case kTypeSingleDeletion: - return kEntrySingleDelete; - case kTypeMerge: - return kEntryMerge; - default: - return kEntryOther; - } -} - uint64_t GetUint64Property(const UserCollectedProperties& props, - const std::string property_name, + const std::string& property_name, bool* property_present) { auto pos = props.find(property_name); if (pos == props.end()) { @@ -102,6 +41,13 @@ Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key, ikey.sequence, file_size); } +void UserKeyTablePropertiesCollector::BlockAdd( + uint64_t bLockRawBytes, uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow) { + return collector_->BlockAdd(bLockRawBytes, blockCompressedBytesFast, + blockCompressedBytesSlow); +} + Status UserKeyTablePropertiesCollector::Finish( UserCollectedProperties* properties) { return collector_->Finish(properties); @@ -112,23 +58,17 @@ UserKeyTablePropertiesCollector::GetReadableProperties() const { return collector_->GetReadableProperties(); } - -const std::string InternalKeyTablePropertiesNames::kDeletedKeys - = "rocksdb.deleted.keys"; -const std::string InternalKeyTablePropertiesNames::kMergeOperands = - "rocksdb.merge.operands"; - uint64_t GetDeletedKeys( const UserCollectedProperties& props) { bool property_present_ignored; - return GetUint64Property(props, InternalKeyTablePropertiesNames::kDeletedKeys, + return GetUint64Property(props, TablePropertiesNames::kDeletedKeys, &property_present_ignored); } uint64_t GetMergeOperands(const UserCollectedProperties& props, bool* property_present) { return GetUint64Property( - props, InternalKeyTablePropertiesNames::kMergeOperands, property_present); + props, TablePropertiesNames::kMergeOperands, property_present); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/table_properties_collector.h b/thirdparty/rocksdb/db/table_properties_collector.h index d8cd75689d..e4d6217157 100644 --- a/thirdparty/rocksdb/db/table_properties_collector.h +++ b/thirdparty/rocksdb/db/table_properties_collector.h @@ -14,11 +14,6 @@ namespace rocksdb { -struct InternalKeyTablePropertiesNames { - static const std::string kDeletedKeys; - static const std::string kMergeOperands; -}; - // Base class for internal table properties collector. 
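The new BlockAdd() hook below gives internal collectors per-block compression feedback in addition to the per-key InternalAdd() callback. A hypothetical collector using it (IntTblPropCollector is RocksDB-internal, so this is a sketch against the interface as shown, not a supported public extension point):

// Tracks how well data blocks compress; keys are ignored.
class CompressionRatioCollector : public rocksdb::IntTblPropCollector {
 public:
  rocksdb::Status InternalAdd(const rocksdb::Slice& /*key*/,
                              const rocksdb::Slice& /*value*/,
                              uint64_t /*file_size*/) override {
    return rocksdb::Status::OK();
  }
  void BlockAdd(uint64_t block_raw_bytes,
                uint64_t block_compressed_bytes_fast,
                uint64_t /*block_compressed_bytes_slow*/) override {
    raw_ += block_raw_bytes;
    compressed_ += block_compressed_bytes_fast;
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    (*props)["hypothetical.compression.ratio"] = std::to_string(
        raw_ == 0 ? 0.0 : static_cast<double>(compressed_) / raw_);
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {};
  }
  const char* Name() const override { return "CompressionRatioCollector"; }

 private:
  uint64_t raw_ = 0;
  uint64_t compressed_ = 0;
};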
class IntTblPropCollector { public: @@ -32,6 +27,10 @@ class IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) = 0; + virtual void BlockAdd(uint64_t blockRawBytes, + uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow) = 0; + virtual UserCollectedProperties GetReadableProperties() const = 0; virtual bool NeedCompact() const { return false; } @@ -49,39 +48,6 @@ class IntTblPropCollectorFactory { virtual const char* Name() const = 0; }; -// Collecting the statistics for internal keys. Visible only by internal -// rocksdb modules. -class InternalKeyPropertiesCollector : public IntTblPropCollector { - public: - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override; - - virtual Status Finish(UserCollectedProperties* properties) override; - - virtual const char* Name() const override { - return "InternalKeyPropertiesCollector"; - } - - UserCollectedProperties GetReadableProperties() const override; - - private: - uint64_t deleted_keys_ = 0; - uint64_t merge_operands_ = 0; -}; - -class InternalKeyPropertiesCollectorFactory - : public IntTblPropCollectorFactory { - public: - virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { - return new InternalKeyPropertiesCollector(); - } - - virtual const char* Name() const override { - return "InternalKeyPropertiesCollectorFactory"; - } -}; - // When rocksdb creates a new table, it will encode all "user keys" into // "internal keys", which contains meta information of a given entry. // @@ -98,6 +64,10 @@ class UserKeyTablePropertiesCollector : public IntTblPropCollector { virtual Status InternalAdd(const Slice& key, const Slice& value, uint64_t file_size) override; + virtual void BlockAdd(uint64_t blockRawBytes, + uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow) override; + virtual Status Finish(UserCollectedProperties* properties) override; virtual const char* Name() const override { return collector_->Name(); } diff --git a/thirdparty/rocksdb/db/table_properties_collector_test.cc b/thirdparty/rocksdb/db/table_properties_collector_test.cc index 66c66c0253..ea561e982f 100644 --- a/thirdparty/rocksdb/db/table_properties_collector_test.cc +++ b/thirdparty/rocksdb/db/table_properties_collector_test.cc @@ -28,7 +28,7 @@ namespace rocksdb { class TablePropertiesTest : public testing::Test, public testing::WithParamInterface { public: - virtual void SetUp() override { backward_mode_ = GetParam(); } + void SetUp() override { backward_mode_ = GetParam(); } bool backward_mode_; }; @@ -39,19 +39,21 @@ static const uint32_t kTestColumnFamilyId = 66; static const std::string kTestColumnFamilyName = "test_column_fam"; void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, std::unique_ptr* writable, std::unique_ptr* builder) { - unique_ptr wf(new test::StringSink); - writable->reset(new WritableFileWriter(std::move(wf), EnvOptions())); + std::unique_ptr wf(new test::StringSink); + writable->reset( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); int unknown_level = -1; builder->reset(NewTableBuilder( - ioptions, internal_comparator, int_tbl_prop_collector_factories, - kTestColumnFamilyId, kTestColumnFamilyName, - writable->get(), options.compression, options.compression_opts, - unknown_level)); + 
ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories, + kTestColumnFamilyId, kTestColumnFamilyName, writable->get(), + options.compression, options.sample_for_compression, + options.compression_opts, unknown_level)); } } // namespace @@ -82,8 +84,9 @@ class RegularKeysStartWithA: public TablePropertiesCollector { return Status::OK(); } - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { + Status AddUserKey(const Slice& user_key, const Slice& /*value*/, + EntryType type, SequenceNumber /*seq*/, + uint64_t file_size) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -104,7 +107,7 @@ class RegularKeysStartWithA: public TablePropertiesCollector { return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } @@ -133,7 +136,7 @@ class RegularKeysStartWithABackwardCompatible return Status::OK(); } - Status Add(const Slice& user_key, const Slice& value) override { + Status Add(const Slice& user_key, const Slice& /*value*/) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -141,7 +144,7 @@ class RegularKeysStartWithABackwardCompatible return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } @@ -161,8 +164,8 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { return Status::OK(); } - Status InternalAdd(const Slice& user_key, const Slice& value, - uint64_t file_size) override { + Status InternalAdd(const Slice& user_key, const Slice& /*value*/, + uint64_t /*file_size*/) override { // simply assume all user keys are not empty. if (user_key.data()[0] == 'A') { ++count_; @@ -170,7 +173,14 @@ class RegularKeysStartWithAInternal : public IntTblPropCollector { return Status::OK(); } - virtual UserCollectedProperties GetReadableProperties() const override { + void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) override { + // Nothing to do.
+ return; + } + + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } @@ -183,7 +193,7 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, public: explicit RegularKeysStartWithAFactory(bool backward_mode) : backward_mode_(backward_mode) {} - virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollector* CreateTablePropertiesCollector( TablePropertiesCollectorFactory::Context context) override { EXPECT_EQ(kTestColumnFamilyId, context.column_family_id); if (!backward_mode_) { @@ -192,8 +202,8 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, return new RegularKeysStartWithABackwardCompatible(); } } - virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + IntTblPropCollector* CreateIntTblPropCollector( + uint32_t /*column_family_id*/) override { return new RegularKeysStartWithAInternal(); } const char* Name() const override { return "RegularKeysStartWithA"; } @@ -203,7 +213,7 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory, class FlushBlockEveryThreePolicy : public FlushBlockPolicy { public: - virtual bool Update(const Slice& key, const Slice& value) override { + bool Update(const Slice& /*key*/, const Slice& /*value*/) override { return (++count_ % 3U == 0); } @@ -220,8 +230,8 @@ class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory { } FlushBlockPolicy* NewFlushBlockPolicy( - const BlockBasedTableOptions& table_options, - const BlockBuilder& data_block_builder) const override { + const BlockBasedTableOptions& /*table_options*/, + const BlockBuilder& /*data_block_builder*/) const override { return new FlushBlockEveryThreePolicy; } }; @@ -250,6 +260,7 @@ void TestCustomizedTablePropertiesCollector( std::unique_ptr builder; std::unique_ptr writer; const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); std::vector> int_tbl_prop_collector_factories; if (test_int_tbl_prop_collector) { @@ -258,7 +269,7 @@ void TestCustomizedTablePropertiesCollector( } else { GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); } - MakeBuilder(options, ioptions, internal_comparator, + MakeBuilder(options, ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, &writer, &builder); SequenceNumber seqNum = 0U; @@ -277,7 +288,8 @@ void TestCustomizedTablePropertiesCollector( new test::StringSource(fwf->contents()))); TableProperties* props; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + magic_number, ioptions, &props, + true /* compression_type_missing */); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -395,15 +407,13 @@ void TestInternalKeyPropertiesCollector( ImmutableCFOptions ioptions(options); GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories); options.comparator = comparator; - } else { - int_tbl_prop_collector_factories.emplace_back( - new InternalKeyPropertiesCollectorFactory); } const ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, ioptions, pikc, &int_tbl_prop_collector_factories, - &writable, &builder); + MakeBuilder(options, ioptions, moptions, pikc, + &int_tbl_prop_collector_factories, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } @@ -413,12 +423,13 @@ void TestInternalKeyPropertiesCollector( 
test::StringSink* fwf = static_cast(writable->writable_file()); - unique_ptr reader(test::GetRandomAccessFileReader( - new test::StringSource(fwf->contents()))); + std::unique_ptr reader( + test::GetRandomAccessFileReader( + new test::StringSource(fwf->contents()))); TableProperties* props; Status s = ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, - ioptions, &props); + ioptions, &props, true /* compression_type_missing */); ASSERT_OK(s); std::unique_ptr props_guard(props); diff --git a/thirdparty/rocksdb/db/transaction_log_impl.cc b/thirdparty/rocksdb/db/transaction_log_impl.cc index e22c0c4af0..f92d563eb8 100644 --- a/thirdparty/rocksdb/db/transaction_log_impl.cc +++ b/thirdparty/rocksdb/db/transaction_log_impl.cc @@ -19,19 +19,21 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( const std::string& dir, const ImmutableDBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, - std::unique_ptr files, VersionSet const* const versions) + std::unique_ptr files, VersionSet const* const versions, + const bool seq_per_batch) : dir_(dir), options_(options), read_options_(read_options), soptions_(soptions), - startingSequenceNumber_(seq), + starting_sequence_number_(seq), files_(std::move(files)), started_(false), - isValid_(false), - currentFileIndex_(0), - currentBatchSeq_(0), - currentLastSeq_(0), - versions_(versions) { + is_valid_(false), + current_file_index_(0), + current_batch_seq_(0), + current_last_seq_(0), + versions_(versions), + seq_per_batch_(seq_per_batch) { assert(files_ != nullptr); assert(versions_ != nullptr); @@ -41,70 +43,68 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( } Status TransactionLogIteratorImpl::OpenLogFile( - const LogFile* logFile, unique_ptr* file_reader) { + const LogFile* log_file, + std::unique_ptr* file_reader) { Env* env = options_->env; - unique_ptr file; + std::unique_ptr file; + std::string fname; Status s; EnvOptions optimized_env_options = env->OptimizeForLogRead(soptions_); - if (logFile->Type() == kArchivedLogFile) { - std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + if (log_file->Type() == kArchivedLogFile) { + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); s = env->NewSequentialFile(fname, &file, optimized_env_options); } else { - std::string fname = LogFileName(dir_, logFile->LogNumber()); + fname = LogFileName(dir_, log_file->LogNumber()); s = env->NewSequentialFile(fname, &file, optimized_env_options); if (!s.ok()) { // If cannot open file in DB directory. // Try the archive dir, as it could have moved in the meanwhile. - fname = ArchivedLogFileName(dir_, logFile->LogNumber()); + fname = ArchivedLogFileName(dir_, log_file->LogNumber()); s = env->NewSequentialFile(fname, &file, optimized_env_options); } } if (s.ok()) { - file_reader->reset(new SequentialFileReader(std::move(file))); + file_reader->reset(new SequentialFileReader(std::move(file), fname)); } return s; } BatchResult TransactionLogIteratorImpl::GetBatch() { - assert(isValid_); // cannot call in a non valid state. + assert(is_valid_); // cannot call in a non valid state. 
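For context on the renames that follow (camelCase members becoming snake_case), the iterator's external contract is unchanged. A typical consumer, sketched under the assumption of an already-open DB* db (the start sequence 100 is arbitrary):

// Replay all write batches recorded at or after sequence 100.
std::unique_ptr<rocksdb::TransactionLogIterator> iter;
rocksdb::Status s = db->GetUpdatesSince(100, &iter);
while (s.ok() && iter->Valid()) {
  rocksdb::BatchResult batch = iter->GetBatch();
  // batch.sequence is the first seqno in batch.writeBatchPtr; in the new
  // seq_per_batch mode a batch may also consume extra seqnos (see below).
  iter->Next();
}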
BatchResult result; - result.sequence = currentBatchSeq_; - result.writeBatchPtr = std::move(currentBatch_); + result.sequence = current_batch_seq_; + result.writeBatchPtr = std::move(current_batch_); return result; } -Status TransactionLogIteratorImpl::status() { - return currentStatus_; -} +Status TransactionLogIteratorImpl::status() { return current_status_; } -bool TransactionLogIteratorImpl::Valid() { - return started_ && isValid_; -} +bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; } bool TransactionLogIteratorImpl::RestrictedRead( Slice* record, std::string* scratch) { // Don't read if no more complete entries to read from logs - if (currentLastSeq_ >= versions_->LastSequence()) { + if (current_last_seq_ >= versions_->LastSequence()) { return false; } - return currentLogReader_->ReadRecord(record, scratch); + return current_log_reader_->ReadRecord(record, scratch); } -void TransactionLogIteratorImpl::SeekToStartSequence( - uint64_t startFileIndex, - bool strict) { +void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index, + bool strict) { std::string scratch; Slice record; started_ = false; - isValid_ = false; - if (files_->size() <= startFileIndex) { + is_valid_ = false; + if (files_->size() <= start_file_index) { return; } - Status s = OpenLogReader(files_->at(startFileIndex).get()); + Status s = + OpenLogReader(files_->at(static_cast(start_file_index)).get()); if (!s.ok()) { - currentStatus_ = s; - reporter_.Info(currentStatus_.ToString().c_str()); + current_status_ = s; + reporter_.Info(current_status_.ToString().c_str()); return; } while (RestrictedRead(&record, &scratch)) { @@ -114,21 +114,22 @@ void TransactionLogIteratorImpl::SeekToStartSequence( continue; } UpdateCurrentWriteBatch(record); - if (currentLastSeq_ >= startingSequenceNumber_) { - if (strict && currentBatchSeq_ != startingSequenceNumber_) { - currentStatus_ = Status::Corruption("Gap in sequence number. Could not " - "seek to required sequence number"); - reporter_.Info(currentStatus_.ToString().c_str()); + if (current_last_seq_ >= starting_sequence_number_) { + if (strict && current_batch_seq_ != starting_sequence_number_) { + current_status_ = Status::Corruption( + "Gap in sequence number. Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); return; } else if (strict) { reporter_.Info("Could seek required sequence number. Iterator will " "continue."); } - isValid_ = true; + is_valid_ = true; started_ = true; // set started_ as we could seek till starting sequence return; } else { - isValid_ = false; + is_valid_ = false; } } @@ -137,13 +138,15 @@ void TransactionLogIteratorImpl::SeekToStartSequence( // If strict is set, we want to seek exactly till the start sequence and it // should have been present in the file we scanned above if (strict) { - currentStatus_ = Status::Corruption("Gap in sequence number. Could not " - "seek to required sequence number"); - reporter_.Info(currentStatus_.ToString().c_str()); + current_status_ = Status::Corruption( + "Gap in sequence number. 
Could not " + "seek to required sequence number"); + reporter_.Info(current_status_.ToString().c_str()); } else if (files_->size() != 1) { - currentStatus_ = Status::Corruption("Start sequence was not found, " - "skipping to the next available"); - reporter_.Info(currentStatus_.ToString().c_str()); + current_status_ = Status::Corruption( + "Start sequence was not found, " + "skipping to the next available"); + reporter_.Info(current_status_.ToString().c_str()); // Let NextImpl find the next available entry. started_ remains false // because we don't want to check for gaps while moving to start sequence NextImpl(true); @@ -157,15 +160,15 @@ void TransactionLogIteratorImpl::Next() { void TransactionLogIteratorImpl::NextImpl(bool internal) { std::string scratch; Slice record; - isValid_ = false; + is_valid_ = false; if (!internal && !started_) { // Runs every time until we can seek to the start sequence return SeekToStartSequence(); } while(true) { - assert(currentLogReader_); - if (currentLogReader_->IsEOF()) { - currentLogReader_->UnmarkEOF(); + assert(current_log_reader_); + if (current_log_reader_->IsEOF()) { + current_log_reader_->UnmarkEOF(); } while (RestrictedRead(&record, &scratch)) { if (record.size() < WriteBatchInternal::kHeader) { @@ -186,20 +189,20 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { } // Open the next file - if (currentFileIndex_ < files_->size() - 1) { - ++currentFileIndex_; - Status s = OpenLogReader(files_->at(currentFileIndex_).get()); + if (current_file_index_ < files_->size() - 1) { + ++current_file_index_; + Status s = OpenLogReader(files_->at(current_file_index_).get()); if (!s.ok()) { - isValid_ = false; - currentStatus_ = s; + is_valid_ = false; + current_status_ = s; return; } } else { - isValid_ = false; - if (currentLastSeq_ == versions_->LastSequence()) { - currentStatus_ = Status::OK(); + is_valid_ = false; + if (current_last_seq_ == versions_->LastSequence()) { + current_status_ = Status::OK(); } else { - currentStatus_ = Status::Corruption("NO MORE DATA LEFT"); + current_status_ = Status::Corruption("NO MORE DATA LEFT"); } return; } @@ -207,17 +210,16 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { } bool TransactionLogIteratorImpl::IsBatchExpected( - const WriteBatch* batch, - const SequenceNumber expectedSeq) { + const WriteBatch* batch, const SequenceNumber expected_seq) { assert(batch); SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch); - if (batchSeq != expectedSeq) { + if (batchSeq != expected_seq) { char buf[200]; snprintf(buf, sizeof(buf), "Discontinuity in log records. 
Got seq=%" PRIu64 ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64 ".Log iterator will reseek the correct batch.", - batchSeq, expectedSeq, versions_->LastSequence()); + batchSeq, expected_seq, versions_->LastSequence()); reporter_.Info(buf); return false; } @@ -228,44 +230,90 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { std::unique_ptr batch(new WriteBatch()); WriteBatchInternal::SetContents(batch.get(), record); - SequenceNumber expectedSeq = currentLastSeq_ + 1; + SequenceNumber expected_seq = current_last_seq_ + 1; // If the iterator has started, then confirm that we get continuous batches - if (started_ && !IsBatchExpected(batch.get(), expectedSeq)) { + if (started_ && !IsBatchExpected(batch.get(), expected_seq)) { // Seek to the batch having expected sequence number - if (expectedSeq < files_->at(currentFileIndex_)->StartSequence()) { + if (expected_seq < files_->at(current_file_index_)->StartSequence()) { // Expected batch must lie in the previous log file // Avoid underflow. - if (currentFileIndex_ != 0) { - currentFileIndex_--; + if (current_file_index_ != 0) { + current_file_index_--; } } - startingSequenceNumber_ = expectedSeq; + starting_sequence_number_ = expected_seq; // currentStatus_ will be set to Ok if reseek succeeds - currentStatus_ = Status::NotFound("Gap in sequence numbers"); - return SeekToStartSequence(currentFileIndex_, true); + // Note: this is still ok in seq_pre_batch_ && two_write_queuesp_ mode + // that allows gaps in the WAL since it will still skip over the gap. + current_status_ = Status::NotFound("Gap in sequence numbers"); + // In seq_per_batch_ mode, gaps in the seq are possible so the strict mode + // should be disabled + return SeekToStartSequence(current_file_index_, !seq_per_batch_); } - currentBatchSeq_ = WriteBatchInternal::Sequence(batch.get()); - currentLastSeq_ = currentBatchSeq_ + - WriteBatchInternal::Count(batch.get()) - 1; + struct BatchCounter : public WriteBatch::Handler { + SequenceNumber sequence_; + BatchCounter(SequenceNumber sequence) : sequence_(sequence) {} + Status MarkNoop(bool empty_batch) override { + if (!empty_batch) { + sequence_++; + } + return Status::OK(); + } + Status MarkEndPrepare(const Slice&) override { + sequence_++; + return Status::OK(); + } + Status MarkCommit(const Slice&) override { + sequence_++; + return Status::OK(); + } + + Status PutCF(uint32_t /*cf*/, const Slice& /*key*/, + const Slice& /*val*/) override { + return Status::OK(); + } + Status DeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override { + return Status::OK(); + } + Status SingleDeleteCF(uint32_t /*cf*/, const Slice& /*key*/) override { + return Status::OK(); + } + Status MergeCF(uint32_t /*cf*/, const Slice& /*key*/, + const Slice& /*val*/) override { + return Status::OK(); + } + Status MarkBeginPrepare(bool) override { return Status::OK(); } + Status MarkRollback(const Slice&) override { return Status::OK(); } + }; + + current_batch_seq_ = WriteBatchInternal::Sequence(batch.get()); + if (seq_per_batch_) { + BatchCounter counter(current_batch_seq_); + batch->Iterate(&counter); + current_last_seq_ = counter.sequence_; + } else { + current_last_seq_ = + current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1; + } // currentBatchSeq_ can only change here - assert(currentLastSeq_ <= versions_->LastSequence()); + assert(current_last_seq_ <= versions_->LastSequence()); - currentBatch_ = std::move(batch); - isValid_ = true; - currentStatus_ = Status::OK(); + current_batch_ = std::move(batch); + 
+  is_valid_ = true;
+  current_status_ = Status::OK();
 }

-Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
-  unique_ptr<SequentialFileReader> file;
-  Status s = OpenLogFile(logFile, &file);
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+  std::unique_ptr<SequentialFileReader> file;
+  Status s = OpenLogFile(log_file, &file);
   if (!s.ok()) {
     return s;
   }
   assert(file);
-  currentLogReader_.reset(new log::Reader(
-      options_->info_log, std::move(file), &reporter_,
-      read_options_.verify_checksums_, 0, logFile->LogNumber()));
+  current_log_reader_.reset(
+      new log::Reader(options_->info_log, std::move(file), &reporter_,
+                      read_options_.verify_checksums_, log_file->LogNumber()));
   return Status::OK();
 }
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/db/transaction_log_impl.h b/thirdparty/rocksdb/db/transaction_log_impl.h
index 769d8339bd..6382b61a5b 100644
--- a/thirdparty/rocksdb/db/transaction_log_impl.h
+++ b/thirdparty/rocksdb/db/transaction_log_impl.h
@@ -62,7 +62,8 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
       const std::string& dir, const ImmutableDBOptions* options,
       const TransactionLogIterator::ReadOptions& read_options,
       const EnvOptions& soptions, const SequenceNumber seqNum,
-      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions);
+      std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+      const bool seq_per_batch);

   virtual bool Valid() override;

@@ -77,16 +78,16 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
   const ImmutableDBOptions* options_;
   const TransactionLogIterator::ReadOptions read_options_;
   const EnvOptions& soptions_;
-  SequenceNumber startingSequenceNumber_;
+  SequenceNumber starting_sequence_number_;
   std::unique_ptr<VectorLogPtr> files_;
   bool started_;
-  bool isValid_;  // not valid when it starts of.
-  Status currentStatus_;
-  size_t currentFileIndex_;
-  std::unique_ptr<WriteBatch> currentBatch_;
-  unique_ptr<log::Reader> currentLogReader_;
-  Status OpenLogFile(const LogFile* logFile,
-                     unique_ptr<SequentialFileReader>* file);
+  bool is_valid_;  // not valid when it starts off.
+  Status current_status_;
+  size_t current_file_index_;
+  std::unique_ptr<WriteBatch> current_batch_;
+  std::unique_ptr<log::Reader> current_log_reader_;
+  Status OpenLogFile(const LogFile* log_file,
+                     std::unique_ptr<SequentialFileReader>* file);

   struct LogReporter : public log::Reader::Reporter {
     Env* env;
@@ -98,24 +99,25 @@ class TransactionLogIteratorImpl : public TransactionLogIterator {
     virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
   } reporter_;

-  SequenceNumber currentBatchSeq_; // sequence number at start of current batch
-  SequenceNumber currentLastSeq_; // last sequence in the current batch
+  SequenceNumber
+      current_batch_seq_;  // sequence number at start of current batch
+  SequenceNumber current_last_seq_;  // last sequence in the current batch
   // Used only to get latest seq. num
   // TODO(icanadi) can this be just a callback?
   VersionSet const* const versions_;
-
+  const bool seq_per_batch_;
   // Reads from transaction log only if the writebatch record has been written
   bool RestrictedRead(Slice* record, std::string* scratch);
   // Seeks to startingSequenceNumber reading from startFileIndex in files_.
   // If strict is set, then must get a batch starting with startingSequenceNumber
-  void SeekToStartSequence(uint64_t startFileIndex = 0, bool strict = false);
+  void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
   // Implementation of Next.
SeekToStartSequence calls it internally with // internal=true to let it find next entry even if it has to jump gaps because // the iterator may start off from the first available entry but promises to // be continuous after that void NextImpl(bool internal = false); // Check if batch is expected, else return false - bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expectedSeq); + bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq); // Update current batch if a continuous batch is found, else return false void UpdateCurrentWriteBatch(const Slice& record); Status OpenLogReader(const LogFile* file); diff --git a/thirdparty/rocksdb/db/version_builder.cc b/thirdparty/rocksdb/db/version_builder.cc index e8db67527e..84e4dc6579 100644 --- a/thirdparty/rocksdb/db/version_builder.cc +++ b/thirdparty/rocksdb/db/version_builder.cc @@ -35,11 +35,11 @@ namespace rocksdb { bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->largest_seqno != b->largest_seqno) { - return a->largest_seqno > b->largest_seqno; + if (a->fd.largest_seqno != b->fd.largest_seqno) { + return a->fd.largest_seqno > b->fd.largest_seqno; } - if (a->smallest_seqno != b->smallest_seqno) { - return a->smallest_seqno > b->smallest_seqno; + if (a->fd.smallest_seqno != b->fd.smallest_seqno) { + return a->fd.smallest_seqno > b->fd.smallest_seqno; } // Break ties by file number return a->fd.GetNumber() > b->fd.GetNumber(); @@ -66,6 +66,8 @@ class VersionBuilder::Rep { enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; const InternalKeyComparator* internal_comparator; + FileComparator() : internal_comparator(nullptr) {} + bool operator()(FileMetaData* f1, FileMetaData* f2) const { switch (sort_method) { case kLevel0: @@ -160,22 +162,24 @@ class VersionBuilder::Rep { abort(); } - if (f2->smallest_seqno == f2->largest_seqno) { + if (f2->fd.smallest_seqno == f2->fd.largest_seqno) { // This is an external file that we ingested - SequenceNumber external_file_seqno = f2->smallest_seqno; - if (!(external_file_seqno < f1->largest_seqno || + SequenceNumber external_file_seqno = f2->fd.smallest_seqno; + if (!(external_file_seqno < f1->fd.largest_seqno || external_file_seqno == 0)) { - fprintf(stderr, "L0 file with seqno %" PRIu64 " %" PRIu64 - " vs. file with global_seqno %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, + fprintf(stderr, + "L0 file with seqno %" PRIu64 " %" PRIu64 + " vs. file with global_seqno %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, external_file_seqno); abort(); } - } else if (f1->smallest_seqno <= f2->smallest_seqno) { - fprintf(stderr, "L0 files seqno %" PRIu64 " %" PRIu64 - " vs. %" PRIu64 " %" PRIu64 "\n", - f1->smallest_seqno, f1->largest_seqno, f2->smallest_seqno, - f2->largest_seqno); + } else if (f1->fd.smallest_seqno <= f2->fd.smallest_seqno) { + fprintf(stderr, + "L0 files seqno %" PRIu64 " %" PRIu64 " vs. 
%" PRIu64 + " %" PRIu64 "\n", + f1->fd.smallest_seqno, f1->fd.largest_seqno, + f2->fd.smallest_seqno, f2->fd.largest_seqno); abort(); } } else { @@ -197,7 +201,7 @@ class VersionBuilder::Rep { } } - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + void CheckConsistencyForDeletes(VersionEdit* /*edit*/, uint64_t number, int level) { #ifdef NDEBUG if (!base_vstorage_->force_consistency_checks()) { @@ -274,11 +278,12 @@ class VersionBuilder::Rep { auto exising = levels_[level].added_files.find(number); if (exising != levels_[level].added_files.end()) { UnrefFile(exising->second); - levels_[level].added_files.erase(number); + levels_[level].added_files.erase(exising); } } else { - if (invalid_levels_[level].count(number) > 0) { - invalid_levels_[level].erase(number); + auto exising = invalid_levels_[level].find(number); + if (exising != invalid_levels_[level].end()) { + invalid_levels_[level].erase(exising); } else { // Deleting an non-existing file on invalid level. has_invalid_levels_ = true; @@ -319,8 +324,6 @@ class VersionBuilder::Rep { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. const auto& base_files = base_vstorage_->LevelFiles(level); - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); const auto& unordered_added_files = levels_[level].added_files; vstorage->Reserve(level, base_files.size() + unordered_added_files.size()); @@ -334,51 +337,92 @@ class VersionBuilder::Rep { std::sort(added_files.begin(), added_files.end(), cmp); #ifndef NDEBUG - FileMetaData* prev_file = nullptr; -#endif - + FileMetaData* prev_added_file = nullptr; for (const auto& added : added_files) { -#ifndef NDEBUG - if (level > 0 && prev_file != nullptr) { + if (level > 0 && prev_added_file != nullptr) { assert(base_vstorage_->InternalComparator()->Compare( - prev_file->smallest, added->smallest) <= 0); + prev_added_file->smallest, added->smallest) <= 0); } - prev_file = added; + prev_added_file = added; + } #endif - // Add all smaller files listed in base_ - for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + auto added_iter = added_files.begin(); + auto added_end = added_files.end(); + while (added_iter != added_end || base_iter != base_end) { + if (base_iter == base_end || + (added_iter != added_end && cmp(*added_iter, *base_iter))) { + MaybeAddFile(vstorage, level, *added_iter++); + } else { + MaybeAddFile(vstorage, level, *base_iter++); } - - MaybeAddFile(vstorage, level, added); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(vstorage, level, *base_iter); } } CheckConsistency(vstorage); } - void LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache) { + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const SliceTransform* prefix_extractor) { assert(table_cache_ != nullptr); + + size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity(); + bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity); + size_t max_load = port::kMaxSizet; + + if (!always_load) { + // If it is initial loading and not set to always laoding all the + // files, we only load up to kInitialLoadLimit files, to limit the + // 
time reopening the DB. + const size_t kInitialLoadLimit = 16; + size_t load_limit; + // If the table cache is not 1/4 full, we pin the table handle to + // file metadata to avoid the cache read costs when reading the file. + // The downside of pinning those files is that LRU won't be followed + // for those files. This doesn't matter much because if number of files + // of the DB excceeds table cache capacity, eventually no table reader + // will be pinned and LRU will be followed. + if (is_initial_load) { + load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4); + } else { + load_limit = table_cache_capacity / 4; + } + + size_t table_cache_usage = table_cache_->get_cache()->GetUsage(); + if (table_cache_usage >= load_limit) { + // TODO (yanqin) find a suitable status code. + return Status::OK(); + } else { + max_load = load_limit - table_cache_usage; + } + } + // std::vector> files_meta; + std::vector statuses; for (int level = 0; level < num_levels_; level++) { for (auto& file_meta_pair : levels_[level].added_files) { auto* file_meta = file_meta_pair.second; - assert(!file_meta->table_reader_handle); - files_meta.emplace_back(file_meta, level); + // If the file has been opened before, just skip it. + if (!file_meta->table_reader_handle) { + files_meta.emplace_back(file_meta, level); + statuses.emplace_back(Status::OK()); + } + if (files_meta.size() >= max_load) { + break; + } + } + if (files_meta.size() >= max_load) { + break; } } std::atomic next_file_meta_idx(0); - std::function load_handlers_func = [&]() { + std::function load_handlers_func([&]() { while (true) { size_t file_idx = next_file_meta_idx.fetch_add(1); if (file_idx >= files_meta.size()) { @@ -387,37 +431,39 @@ class VersionBuilder::Rep { auto* file_meta = files_meta[file_idx].first; int level = files_meta[file_idx].second; - table_cache_->FindTable(env_options_, - *(base_vstorage_->InternalComparator()), - file_meta->fd, &file_meta->table_reader_handle, - false /*no_io */, true /* record_read_stats */, - internal_stats->GetFileReadHist(level), false, - level, prefetch_index_and_filter_in_cache); + statuses[file_idx] = table_cache_->FindTable( + env_options_, *(base_vstorage_->InternalComparator()), + file_meta->fd, &file_meta->table_reader_handle, prefix_extractor, + false /*no_io */, true /* record_read_stats */, + internal_stats->GetFileReadHist(level), false, level, + prefetch_index_and_filter_in_cache); if (file_meta->table_reader_handle != nullptr) { // Load table_reader file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( file_meta->table_reader_handle); } } - }; + }); - if (max_threads <= 1) { - load_handlers_func(); - } else { - std::vector threads; - for (int i = 0; i < max_threads; i++) { - threads.emplace_back(load_handlers_func); - } - - for (auto& t : threads) { - t.join(); + std::vector threads; + for (int i = 1; i < max_threads; i++) { + threads.emplace_back(load_handlers_func); + } + load_handlers_func(); + for (auto& t : threads) { + t.join(); + } + for (const auto& s : statuses) { + if (!s.ok()) { + return s; } } + return Status::OK(); } void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // f is to-be-delected table file + // f is to-be-deleted table file vstorage->RemoveCurrentStats(f); } else { vstorage->AddFile(level, f, info_log_); @@ -452,11 +498,13 @@ void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { rep_->SaveTo(vstorage); } -void VersionBuilder::LoadTableHandlers( 
+Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache) { - rep_->LoadTableHandlers(internal_stats, max_threads, - prefetch_index_and_filter_in_cache); + bool prefetch_index_and_filter_in_cache, bool is_initial_load, + const SliceTransform* prefix_extractor) { + return rep_->LoadTableHandlers(internal_stats, max_threads, + prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor); } void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, diff --git a/thirdparty/rocksdb/db/version_builder.h b/thirdparty/rocksdb/db/version_builder.h index 440d4eaf6b..168301fdd6 100644 --- a/thirdparty/rocksdb/db/version_builder.h +++ b/thirdparty/rocksdb/db/version_builder.h @@ -9,6 +9,7 @@ // #pragma once #include "rocksdb/env.h" +#include "rocksdb/slice_transform.h" namespace rocksdb { @@ -32,8 +33,10 @@ class VersionBuilder { bool CheckConsistencyForNumLevels(); void Apply(VersionEdit* edit); void SaveTo(VersionStorageInfo* vstorage); - void LoadTableHandlers(InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache); + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const SliceTransform* prefix_extractor); void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); private: diff --git a/thirdparty/rocksdb/db/version_builder_test.cc b/thirdparty/rocksdb/db/version_builder_test.cc index 304df2a045..514952bb5b 100644 --- a/thirdparty/rocksdb/db/version_builder_test.cc +++ b/thirdparty/rocksdb/db/version_builder_test.cc @@ -37,7 +37,7 @@ class VersionBuilderTest : public testing::Test { size_being_compacted_.resize(options_.num_levels); } - ~VersionBuilderTest() { + ~VersionBuilderTest() override { for (int i = 0; i < vstorage_.num_levels(); i++) { for (auto* f : vstorage_.LevelFiles(i)) { if (--f->refs == 0) { @@ -63,8 +63,8 @@ class VersionBuilderTest : public testing::Test { f->fd = FileDescriptor(file_number, path_id, file_size); f->smallest = GetInternalKey(smallest, smallest_seq); f->largest = GetInternalKey(largest, largest_seq); - f->smallest_seqno = smallest_seqno; - f->largest_seqno = largest_seqno; + f->fd.smallest_seqno = smallest_seqno; + f->fd.largest_seqno = largest_seqno; f->compensated_file_size = file_size; f->refs = 0; f->num_entries = num_entries; diff --git a/thirdparty/rocksdb/db/version_edit.cc b/thirdparty/rocksdb/db/version_edit.cc index b01f7bbdf7..01ec44515a 100644 --- a/thirdparty/rocksdb/db/version_edit.cc +++ b/thirdparty/rocksdb/db/version_edit.cc @@ -20,7 +20,7 @@ namespace rocksdb { // Tag numbers for serialized VersionEdit. These numbers are written to // disk and should not be changed. -enum Tag { +enum Tag : uint32_t { kComparator = 1, kLogNumber = 2, kNextFileNumber = 3, @@ -30,6 +30,7 @@ enum Tag { kNewFile = 7, // 8 was used for large value refs kPrevLogNumber = 9, + kMinLogNumberToKeep = 10, // these are new formats divergent from open source leveldb kNewFile2 = 100, @@ -39,11 +40,21 @@ enum Tag { kColumnFamilyAdd = 201, kColumnFamilyDrop = 202, kMaxColumnFamily = 203, + + kInAtomicGroup = 300, }; -enum CustomTag { +// Mask for an identified tag from the future which can be safely ignored. 
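// [Editor's sketch, not part of the upstream patch] Any tag with this bit set
// must be encoded as tag + varint32 length + payload, which is what lets an
// older binary skip fields written by a newer one (see the kTagSafeIgnoreMask
// branch added to VersionEdit::DecodeFrom further below). A hypothetical
// writer honoring that contract, built only from the existing coding helpers
// (PutIgnorableField itself is illustrative, not a real RocksDB function):
//
//   void PutIgnorableField(std::string* dst, uint32_t tag,
//                          const Slice& payload) {
//     assert(tag & kTagSafeIgnoreMask);  // readers may skip it wholesale
//     PutVarint32(dst, tag);
//     PutLengthPrefixedSlice(dst, payload);  // varint32 length + raw bytes
//   }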
+uint32_t kTagSafeIgnoreMask = 1 << 13;
+
+enum CustomTag : uint32_t {
   kTerminate = 1,  // The end of customized fields
   kNeedCompaction = 2,
+  // Since Manifest is not entirely currently forward-compatible, and the only
+  // forward-compatible part is the CustomTag of kNewFile, we currently encode
+  // kMinLogNumberToKeep as part of a CustomTag as a hack. This should be
+  // removed when manifest becomes forward-compatible.
+  kMinLogNumberToKeepHack = 3,
   kPathId = 65,
 };
 // If this bit for the custom tag is set, opening DB should fail if
@@ -63,18 +74,22 @@ void VersionEdit::Clear() {
   last_sequence_ = 0;
   next_file_number_ = 0;
   max_column_family_ = 0;
+  min_log_number_to_keep_ = 0;
   has_comparator_ = false;
   has_log_number_ = false;
   has_prev_log_number_ = false;
   has_next_file_number_ = false;
   has_last_sequence_ = false;
   has_max_column_family_ = false;
+  has_min_log_number_to_keep_ = false;
   deleted_files_.clear();
   new_files_.clear();
   column_family_ = 0;
   is_column_family_add_ = 0;
   is_column_family_drop_ = 0;
   column_family_name_.clear();
+  is_in_atomic_group_ = false;
+  remaining_entries_ = 0;
 }

 bool VersionEdit::EncodeTo(std::string* dst) const {
@@ -97,19 +112,19 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
   if (has_max_column_family_) {
     PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
   }
-
   for (const auto& deleted : deleted_files_) {
     PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
                                 deleted.second /* file number */);
   }

+  bool min_log_num_written = false;
   for (size_t i = 0; i < new_files_.size(); i++) {
     const FileMetaData& f = new_files_[i].second;
     if (!f.smallest.Valid() || !f.largest.Valid()) {
       return false;
     }
     bool has_customized_fields = false;
-    if (f.marked_for_compaction) {
+    if (f.marked_for_compaction || has_min_log_number_to_keep_) {
       PutVarint32(dst, kNewFile4);
       has_customized_fields = true;
     } else if (f.fd.GetPathId() == 0) {
@@ -127,7 +142,7 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
     PutVarint64(dst, f.fd.GetFileSize());
     PutLengthPrefixedSlice(dst, f.smallest.Encode());
     PutLengthPrefixedSlice(dst, f.largest.Encode());
-    PutVarint64Varint64(dst, f.smallest_seqno, f.largest_seqno);
+    PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
     if (has_customized_fields) {
       // Customized fields' format:
       // +-----------------------------+
@@ -165,6 +180,13 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
       char p = static_cast<char>(1);
       PutLengthPrefixedSlice(dst, Slice(&p, 1));
     }
+    if (has_min_log_number_to_keep_ && !min_log_num_written) {
+      PutVarint32(dst, CustomTag::kMinLogNumberToKeepHack);
+      std::string varint_log_number;
+      PutFixed64(&varint_log_number, min_log_number_to_keep_);
+      PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+      min_log_num_written = true;
+    }
     TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
                              dst);
@@ -185,6 +207,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
   if (is_column_family_drop_) {
     PutVarint32(dst, kColumnFamilyDrop);
   }
+
+  if (is_in_atomic_group_) {
+    PutVarint32(dst, kInAtomicGroup);
+    PutVarint32(dst, remaining_entries_);
+  }
   return true;
 }

@@ -198,7 +225,7 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
   }
 }

-bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
   uint32_t v;
   if (GetVarint32(input, &v)) {
     *level = v;
@@ -218,11 +245,16 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
   uint64_t number;
   uint32_t path_id = 0;
   uint64_t file_size;
+  SequenceNumber smallest_seqno;
+  SequenceNumber largest_seqno;
+  // Since this is the only forward-compatible part of the code, we hack new
+  // extension into this record. When we do, we set this boolean to distinguish
+  // the record from the normal NewFile records.
   if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
       GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
       GetInternalKey(input, &f.largest) &&
-      GetVarint64(input, &f.smallest_seqno) &&
-      GetVarint64(input, &f.largest_seqno)) {
+      GetVarint64(input, &smallest_seqno) &&
+      GetVarint64(input, &largest_seqno)) {
     // See comments in VersionEdit::EncodeTo() for format of customized fields
     while (true) {
       uint32_t custom_tag;
@@ -234,7 +266,7 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
         break;
       }
       if (!GetLengthPrefixedSlice(input, &field)) {
-        return "new-file4 custom field lenth prefixed slice error";
+        return "new-file4 custom field length prefixed slice error";
       }
       switch (custom_tag) {
         case kPathId:
@@ -252,6 +284,14 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
           }
           f.marked_for_compaction = (field[0] == 1);
           break;
+        case kMinLogNumberToKeepHack:
+          // This is a hack to encode kMinLogNumberToKeep in a
+          // forward-compatible fashion.
+          if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+            return "deleted log number malformed";
+          }
+          has_min_log_number_to_keep_ = true;
+          break;
         default:
           if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
             // Should not proceed if cannot understand it
@@ -263,7 +303,8 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) {
   } else {
     return "new-file4 entry";
   }
-  f.fd = FileDescriptor(number, path_id, file_size);
+  f.fd =
+      FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
   new_files_.push_back(std::make_pair(level, f));
   return nullptr;
 }
@@ -331,6 +372,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         }
         break;

+      case kMinLogNumberToKeep:
+        if (GetVarint64(&input, &min_log_number_to_keep_)) {
+          has_min_log_number_to_keep_ = true;
+        } else {
+          msg = "min log number to keep";
+        }
+        break;
+
       case kCompactPointer:
         if (GetLevel(&input, &level, &msg) &&
             GetInternalKey(&input, &key)) {
@@ -375,13 +424,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
       case kNewFile2: {
         uint64_t number;
         uint64_t file_size;
+        SequenceNumber smallest_seqno;
+        SequenceNumber largest_seqno;
         if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
             GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
-            GetVarint64(&input, &f.smallest_seqno) &&
-            GetVarint64(&input, &f.largest_seqno)) {
-          f.fd = FileDescriptor(number, 0, file_size);
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+                                largest_seqno);
           new_files_.push_back(std::make_pair(level, f));
         } else {
           if (!msg) {
@@ -395,13 +447,16 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         uint64_t number;
         uint32_t path_id;
         uint64_t file_size;
+        SequenceNumber smallest_seqno;
+        SequenceNumber largest_seqno;
         if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
             GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
             GetInternalKey(&input, &f.smallest) &&
             GetInternalKey(&input, &f.largest) &&
-            GetVarint64(&input, &f.smallest_seqno) &&
-            GetVarint64(&input, &f.largest_seqno)) {
-          f.fd = FileDescriptor(number, path_id, file_size);
+            GetVarint64(&input, &smallest_seqno) &&
+            GetVarint64(&input, &largest_seqno)) {
+          f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+                                largest_seqno);
           new_files_.push_back(std::make_pair(level, f));
         } else {
           if (!msg) {
@@ -439,8 +494,31 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
         is_column_family_drop_ = true;
         break;

+      case kInAtomicGroup:
+        is_in_atomic_group_ = true;
+        if (!GetVarint32(&input, &remaining_entries_)) {
+          if (!msg) {
+            msg = "remaining entries";
+          }
+        }
+        break;
+
       default:
-        msg = "unknown tag";
+        if (tag & kTagSafeIgnoreMask) {
+          // Tag from future which can be safely ignored.
+          // The next field must be the length of the entry.
+          uint32_t field_len;
+          if (!GetVarint32(&input, &field_len) ||
+              static_cast<size_t>(field_len) > input.size()) {
+            if (!msg) {
+              msg = "safely ignorable tag length error";
+            }
+          } else {
+            input.remove_prefix(static_cast<size_t>(field_len));
+          }
+        } else {
+          msg = "unknown tag";
+        }
         break;
     }
   }
@@ -475,6 +553,10 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     r.append("\n  NextFileNumber: ");
     AppendNumberTo(&r, next_file_number_);
   }
+  if (has_min_log_number_to_keep_) {
+    r.append("\n  MinLogNumberToKeep: ");
+    AppendNumberTo(&r, min_log_number_to_keep_);
+  }
   if (has_last_sequence_) {
     r.append("\n  LastSeq: ");
     AppendNumberTo(&r, last_sequence_);
@@ -513,6 +595,11 @@ std::string VersionEdit::DebugString(bool hex_key) const {
     r.append("\n  MaxColumnFamily: ");
     AppendNumberTo(&r, max_column_family_);
   }
+  if (is_in_atomic_group_) {
+    r.append("\n  AtomicGroup: ");
+    AppendNumberTo(&r, remaining_entries_);
+    r.append(" entries remain");
+  }
   r.append("\n}\n");
   return r;
 }
@@ -582,6 +669,12 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
   if (has_max_column_family_) {
     jw << "MaxColumnFamily" << max_column_family_;
   }
+  if (has_min_log_number_to_keep_) {
+    jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+  }
+  if (is_in_atomic_group_) {
+    jw << "AtomicGroup" << remaining_entries_;
+  }

   jw.EndObject();
diff --git a/thirdparty/rocksdb/db/version_edit.h b/thirdparty/rocksdb/db/version_edit.h
index 47ebf5b1c7..ee6499cdc3 100644
--- a/thirdparty/rocksdb/db/version_edit.h
+++ b/thirdparty/rocksdb/db/version_edit.h
@@ -27,7 +27,7 @@ const uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
 extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);

 // A copyable structure contains information needed to read data from an SST
-// file. It can contains a pointer to a table reader opened for the file, or
+// file. It can contain a pointer to a table reader opened for the file, or
 // file number and size, which can be used to create a new table reader for it.
 // The behavior is undefined when a copy of the structure is used when the
 // file is not in any live version any more.
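// [Editor's sketch, not part of the upstream patch] The header hunks below
// relocate smallest_seqno/largest_seqno from FileMetaData into FileDescriptor,
// so code that only holds an fd can still reason about a file's
// sequence-number range. Call sites migrate mechanically, roughly
// (illustrative comparison only):
//
//   // before the patch                  // after the patch
//   f->smallest_seqno                    f->fd.smallest_seqno
//   f->largest_seqno                     f->fd.largest_seqno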
@@ -36,18 +36,28 @@ struct FileDescriptor { TableReader* table_reader; uint64_t packed_number_and_path_id; uint64_t file_size; // File size in bytes + SequenceNumber smallest_seqno; // The smallest seqno in this file + SequenceNumber largest_seqno; // The largest seqno in this file FileDescriptor() : FileDescriptor(0, 0, 0) {} FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) + : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {} + + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size, + SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno) : table_reader(nullptr), packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), - file_size(_file_size) {} + file_size(_file_size), + smallest_seqno(_smallest_seqno), + largest_seqno(_largest_seqno) {} FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; packed_number_and_path_id = fd.packed_number_and_path_id; file_size = fd.file_size; + smallest_seqno = fd.smallest_seqno; + largest_seqno = fd.largest_seqno; return *this; } @@ -77,8 +87,6 @@ struct FileMetaData { FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table - SequenceNumber smallest_seqno; // The smallest seqno in this file - SequenceNumber largest_seqno; // The largest seqno in this file // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; @@ -108,9 +116,7 @@ struct FileMetaData { // file. FileMetaData() - : smallest_seqno(kMaxSequenceNumber), - largest_seqno(0), - table_reader_handle(nullptr), + : table_reader_handle(nullptr), compensated_file_size(0), num_entries(0), num_deletions(0), @@ -128,8 +134,23 @@ struct FileMetaData { smallest.DecodeFrom(key); } largest.DecodeFrom(key); - smallest_seqno = std::min(smallest_seqno, seqno); - largest_seqno = std::max(largest_seqno, seqno); + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); + } + + // Unlike UpdateBoundaries, ranges do not need to be presented in any + // particular order. + void UpdateBoundariesForRange(const InternalKey& start, + const InternalKey& end, SequenceNumber seqno, + const InternalKeyComparator& icmp) { + if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) { + smallest = start; + } + if (largest.size() == 0 || icmp.Compare(largest, end) < 0) { + largest = end; + } + fd.smallest_seqno = std::min(fd.smallest_seqno, seqno); + fd.largest_seqno = std::max(fd.largest_seqno, seqno); } }; @@ -144,6 +165,7 @@ struct FdWithKeyRange { FdWithKeyRange() : fd(), + file_metadata(nullptr), smallest_key(), largest_key() { } @@ -198,6 +220,18 @@ class VersionEdit { has_max_column_family_ = true; max_column_family_ = max_column_family; } + void SetMinLogNumberToKeep(uint64_t num) { + has_min_log_number_to_keep_ = true; + min_log_number_to_keep_ = num; + } + + bool has_log_number() { return has_log_number_; } + + uint64_t log_number() { return log_number_; } + + bool has_next_file_number() const { return has_next_file_number_; } + + uint64_t next_file_number() const { return next_file_number_; } // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) @@ -209,17 +243,18 @@ class VersionEdit { bool marked_for_compaction) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_path_id, file_size); + f.fd = FileDescriptor(file, file_path_id, file_size, smallest_seqno, + largest_seqno); f.smallest = smallest; f.largest = largest; - f.smallest_seqno = smallest_seqno; - f.largest_seqno = largest_seqno; + f.fd.smallest_seqno = smallest_seqno; + f.fd.largest_seqno = largest_seqno; f.marked_for_compaction = marked_for_compaction; new_files_.emplace_back(level, std::move(f)); } void AddFile(int level, const FileMetaData& f) { - assert(f.smallest_seqno <= f.largest_seqno); + assert(f.fd.smallest_seqno <= f.fd.largest_seqno); new_files_.emplace_back(level, f); } @@ -269,10 +304,16 @@ class VersionEdit { return new_files_; } + void MarkAtomicGroup(uint32_t remaining_entries) { + is_in_atomic_group_ = true; + remaining_entries_ = remaining_entries; + } + std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; private: + friend class ReactiveVersionSet; friend class VersionSet; friend class Version; @@ -284,6 +325,8 @@ class VersionEdit { uint64_t prev_log_number_; uint64_t next_file_number_; uint32_t max_column_family_; + // The most recent WAL log number that is deleted + uint64_t min_log_number_to_keep_; SequenceNumber last_sequence_; bool has_comparator_; bool has_log_number_; @@ -291,11 +334,12 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; bool has_max_column_family_; + bool has_min_log_number_to_keep_; DeletedFileSet deleted_files_; std::vector> new_files_; - // Each version edit record should have column_family_id set + // Each version edit record should have column_family_ set // If it's not set, it is default (0) uint32_t column_family_; // a version edit can be either column_family add or @@ -304,6 +348,9 @@ class VersionEdit { bool is_column_family_drop_; bool is_column_family_add_; std::string column_family_name_; + + bool is_in_atomic_group_; + uint32_t remaining_entries_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/version_edit_test.cc b/thirdparty/rocksdb/db/version_edit_test.cc index 338bb36f60..64d1fd77bc 100644 --- a/thirdparty/rocksdb/db/version_edit_test.cc +++ b/thirdparty/rocksdb/db/version_edit_test.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
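// [Editor's sketch, not part of the upstream patch] How a manifest writer is
// expected to use the new MarkAtomicGroup() API declared above: each edit in
// a group records how many entries follow it, so a reader buffers records
// until the countdown reaches zero and then applies them all or none.
// MakeEdits() is a hypothetical helper; MarkAtomicGroup() and EncodeTo() are
// the real members from version_edit.h:
//
//   std::vector<VersionEdit> group = MakeEdits();
//   for (size_t i = 0; i < group.size(); ++i) {
//     group[i].MarkAtomicGroup(
//         static_cast<uint32_t>(group.size() - 1 - i));  // last one gets 0
//     std::string record;
//     group[i].EncodeTo(&record);  // then append the record to the MANIFEST
//   }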
#include "db/version_edit.h" +#include "util/coding.h" #include "util/sync_point.h" #include "util/testharness.h" @@ -181,6 +182,63 @@ TEST_F(VersionEditTest, ColumnFamilyTest) { TestEncodeDecode(edit); } +TEST_F(VersionEditTest, MinLogNumberToKeep) { + VersionEdit edit; + edit.SetMinLogNumberToKeep(13); + TestEncodeDecode(edit); + + edit.Clear(); + edit.SetMinLogNumberToKeep(23); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, AtomicGroupTest) { + VersionEdit edit; + edit.MarkAtomicGroup(1); + TestEncodeDecode(edit); +} + +TEST_F(VersionEditTest, IgnorableField) { + VersionEdit ve; + std::string encoded; + + // Size of ignorable field is too large + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, + 0x2710 /* A field with kTagSafeIgnoreMask set */, + 5 /* fieldlength 5 */); + encoded += "abc"; // Only fills 3 bytes, + ASSERT_NOK(ve.DecodeFrom(encoded)); + + encoded.clear(); + // Error when seeing unidentified tag that is not ignorable + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* next file number */, 88); + ASSERT_NOK(ve.DecodeFrom(encoded)); + + // Safely ignore an identified but safely ignorable entry + encoded.clear(); + PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66); + // This is a customized ignorable tag + PutVarint32Varint64(&encoded, + 0x2710 /* A field with kTagSafeIgnoreMask set */, + 3 /* fieldlength 3 */); + encoded += "abc"; // Fill 3 bytes + PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88); + + ASSERT_OK(ve.DecodeFrom(encoded)); + + ASSERT_TRUE(ve.has_log_number()); + ASSERT_TRUE(ve.has_next_file_number()); + ASSERT_EQ(66, ve.log_number()); + ASSERT_EQ(88, ve.next_file_number()); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/version_set.cc b/thirdparty/rocksdb/db/version_set.cc index 782ebc263e..6c7b77a900 100644 --- a/thirdparty/rocksdb/db/version_set.cc +++ b/thirdparty/rocksdb/db/version_set.cc @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include @@ -51,6 +51,7 @@ #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" +#include "util/user_comparator_wrapper.h" namespace rocksdb { @@ -63,20 +64,39 @@ int FindFileInRange(const InternalKeyComparator& icmp, const Slice& key, uint32_t left, uint32_t right) { - while (left < right) { - uint32_t mid = (left + right) / 2; - const FdWithKeyRange& f = file_level.files[mid]; - if (icmp.InternalKeyComparator::Compare(f.largest_key, key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. 
- right = mid; + auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool { + return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0; + }; + const auto &b = file_level.files; + return static_cast(std::lower_bound(b + left, + b + right, key, cmp) - b); +} + +Status OverlapWithIterator(const Comparator* ucmp, + const Slice& smallest_user_key, + const Slice& largest_user_key, + InternalIterator* iter, + bool* overlap) { + InternalKey range_start(smallest_user_key, kMaxSequenceNumber, + kValueTypeForSeek); + iter->Seek(range_start.Encode()); + if (!iter->status().ok()) { + return iter->status(); + } + + *overlap = false; + if (iter->Valid()) { + ParsedInternalKey seek_result; + if (!ParseInternalKey(iter->key(), &seek_result)) { + return Status::Corruption("DB have corrupted keys"); + } + + if (ucmp->Compare(seek_result.user_key, largest_user_key) <= 0) { + *overlap = true; } } - return right; + + return iter->status(); } // Class to help choose the next file to search for the particular key. @@ -103,11 +123,15 @@ class FilePicker { #endif level_files_brief_(file_levels), is_hit_file_last_in_level_(false), + curr_file_level_(nullptr), user_key_(user_key), ikey_(ikey), file_indexer_(file_indexer), user_comparator_(user_comparator), internal_comparator_(internal_comparator) { +#ifdef NDEBUG + (void)files; +#endif // Setup member variables to search first level. search_ended_ = !PrepareNextLevel(); if (!search_ended_) { @@ -278,17 +302,28 @@ class FilePicker { // On Level-n (n>=1), files are sorted. Binary search to find the // earliest file whose largest key >= ikey. Search left bound and // right bound are used to narrow the range. - if (search_left_bound_ == search_right_bound_) { - start_index = search_left_bound_; - } else if (search_left_bound_ < search_right_bound_) { + if (search_left_bound_ <= search_right_bound_) { if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { search_right_bound_ = static_cast(curr_file_level_->num_files) - 1; } + // `search_right_bound_` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound_`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. start_index = FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, static_cast(search_left_bound_), - static_cast(search_right_bound_)); + static_cast(search_right_bound_) + 1); + if (start_index == search_right_bound_ + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } } else { // search_left_bound > search_right_bound, key does not exist in // this level. 
Since no comparison is done in this level, it will @@ -328,7 +363,11 @@ Version::~Version() { assert(f->refs > 0); f->refs--; if (f->refs <= 0) { - vset_->obsolete_files_.push_back(f); + assert(cfd_ != nullptr); + uint32_t path_id = f->fd.GetPathId(); + assert(path_id < cfd_->ioptions()->cf_paths.size()); + vset_->obsolete_files_.push_back( + ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path)); } } } @@ -409,9 +448,9 @@ bool SomeFileOverlapsRange( // Binary search over file list uint32_t index = 0; if (smallest_user_key != nullptr) { - // Find the earliest possible internal key for smallest_user_key + // Find the leftmost possible internal key for smallest_user_key InternalKey small; - small.SetMaxPossibleForUserKey(*smallest_user_key); + small.SetMinPossibleForUserKey(*smallest_user_key); index = FindFile(icmp, file_level, small.Encode()); } @@ -425,128 +464,258 @@ bool SomeFileOverlapsRange( namespace { -// An internal iterator. For a given version/level pair, yields -// information about the files in the level. For a given entry, key() -// is the largest key that occurs in the file, and value() is an -// 16-byte value containing the file number and file size, both -// encoded using EncodeFixed64. -class LevelFileNumIterator : public InternalIterator { +class LevelIterator final : public InternalIterator { public: - LevelFileNumIterator(const InternalKeyComparator& icmp, - const LevelFilesBrief* flevel, bool should_sample) - : icmp_(icmp), + LevelIterator( + TableCache* table_cache, const ReadOptions& read_options, + const EnvOptions& env_options, const InternalKeyComparator& icomparator, + const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, + bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, + bool skip_filters, int level, RangeDelAggregator* range_del_agg, + const std::vector* compaction_boundaries = + nullptr) + : table_cache_(table_cache), + read_options_(read_options), + env_options_(env_options), + icomparator_(icomparator), + user_comparator_(icomparator.user_comparator()), flevel_(flevel), - index_(static_cast(flevel->num_files)), - current_value_(0, 0, 0), // Marks as invalid - should_sample_(should_sample) {} - virtual bool Valid() const override { return index_ < flevel_->num_files; } - virtual void Seek(const Slice& target) override { - index_ = FindFile(icmp_, *flevel_, target); - } - virtual void SeekForPrev(const Slice& target) override { - SeekForPrevImpl(target, &icmp_); + prefix_extractor_(prefix_extractor), + file_read_hist_(file_read_hist), + should_sample_(should_sample), + for_compaction_(for_compaction), + skip_filters_(skip_filters), + file_index_(flevel_->num_files), + level_(level), + range_del_agg_(range_del_agg), + pinned_iters_mgr_(nullptr), + compaction_boundaries_(compaction_boundaries) { + // Empty level is not supported. + assert(flevel_ != nullptr && flevel_->num_files > 0); } - virtual void SeekToFirst() override { index_ = 0; } - virtual void SeekToLast() override { - index_ = (flevel_->num_files == 0) - ? 
0 - : static_cast(flevel_->num_files) - 1; - } - virtual void Next() override { - assert(Valid()); - index_++; - } - virtual void Prev() override { - assert(Valid()); - if (index_ == 0) { - index_ = static_cast(flevel_->num_files); // Marks as invalid - } else { - index_--; - } - } + ~LevelIterator() override { delete file_iter_.Set(nullptr); } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; + + bool Valid() const override { return file_iter_.Valid(); } Slice key() const override { assert(Valid()); - return flevel_->files[index_].largest_key; + return file_iter_.key(); } Slice value() const override { assert(Valid()); - - auto file_meta = flevel_->files[index_]; - if (should_sample_) { - sample_file_read_inc(file_meta.file_metadata); + return file_iter_.value(); + } + Status status() const override { + return file_iter_.iter() ? file_iter_.status() : Status::OK(); + } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + if (file_iter_.iter()) { + file_iter_.SetPinnedItersMgr(pinned_iters_mgr); } - current_value_ = file_meta.fd; - return Slice(reinterpret_cast(¤t_value_), - sizeof(FileDescriptor)); } - virtual Status status() const override { return Status::OK(); } + bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_.iter() && file_iter_.IsKeyPinned(); + } + bool IsValuePinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + file_iter_.iter() && file_iter_.IsValuePinned(); + } private: - const InternalKeyComparator icmp_; - const LevelFilesBrief* flevel_; - uint32_t index_; - mutable FileDescriptor current_value_; - bool should_sample_; -}; - -class LevelFileIteratorState : public TwoLevelIteratorState { - public: - // @param skip_filters Disables loading/accessing the filter block - LevelFileIteratorState(TableCache* table_cache, - const ReadOptions& read_options, - const EnvOptions& env_options, - const InternalKeyComparator& icomparator, - HistogramImpl* file_read_hist, bool for_compaction, - bool prefix_enabled, bool skip_filters, int level, - RangeDelAggregator* range_del_agg) - : TwoLevelIteratorState(prefix_enabled), - table_cache_(table_cache), - read_options_(read_options), - env_options_(env_options), - icomparator_(icomparator), - file_read_hist_(file_read_hist), - for_compaction_(for_compaction), - skip_filters_(skip_filters), - level_(level), - range_del_agg_(range_del_agg) {} + void SkipEmptyFileForward(); + void SkipEmptyFileBackward(); + void SetFileIterator(InternalIterator* iter); + void InitFileIterator(size_t new_file_index); - InternalIterator* NewSecondaryIterator(const Slice& meta_handle) override { - if (meta_handle.size() != sizeof(FileDescriptor)) { - return NewErrorInternalIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } - const FileDescriptor* fd = - reinterpret_cast(meta_handle.data()); - return table_cache_->NewIterator( - read_options_, env_options_, icomparator_, *fd, range_del_agg_, - nullptr /* don't need reference to table */, file_read_hist_, - for_compaction_, nullptr /* arena */, skip_filters_, level_); + const Slice& file_smallest_key(size_t file_index) { + assert(file_index < flevel_->num_files); + return flevel_->files[file_index].smallest_key; } - bool PrefixMayMatch(const Slice& internal_key) 
override { - return true; + bool KeyReachedUpperBound(const Slice& internal_key) { + return read_options_.iterate_upper_bound != nullptr && + user_comparator_.Compare(ExtractUserKey(internal_key), + *read_options_.iterate_upper_bound) >= 0; } - bool KeyReachedUpperBound(const Slice& internal_key) override { - return read_options_.iterate_upper_bound != nullptr && - icomparator_.user_comparator()->Compare( - ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; + InternalIterator* NewFileIterator() { + assert(file_index_ < flevel_->num_files); + auto file_meta = flevel_->files[file_index_]; + if (should_sample_) { + sample_file_read_inc(file_meta.file_metadata); + } + + const InternalKey* smallest_compaction_key = nullptr; + const InternalKey* largest_compaction_key = nullptr; + if (compaction_boundaries_ != nullptr) { + smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest; + largest_compaction_key = (*compaction_boundaries_)[file_index_].largest; + } + return table_cache_->NewIterator( + read_options_, env_options_, icomparator_, *file_meta.file_metadata, + range_del_agg_, prefix_extractor_, + nullptr /* don't need reference to table */, + file_read_hist_, for_compaction_, nullptr /* arena */, skip_filters_, + level_, smallest_compaction_key, largest_compaction_key); } - private: TableCache* table_cache_; const ReadOptions read_options_; const EnvOptions& env_options_; const InternalKeyComparator& icomparator_; + const UserComparatorWrapper user_comparator_; + const LevelFilesBrief* flevel_; + mutable FileDescriptor current_value_; + const SliceTransform* prefix_extractor_; + HistogramImpl* file_read_hist_; + bool should_sample_; bool for_compaction_; bool skip_filters_; + size_t file_index_; int level_; RangeDelAggregator* range_del_agg_; + IteratorWrapper file_iter_; // May be nullptr + PinnedIteratorsManager* pinned_iters_mgr_; + + // To be propagated to RangeDelAggregator in order to safely truncate range + // tombstones. 
+ const std::vector* compaction_boundaries_; }; +void LevelIterator::Seek(const Slice& target) { + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + + InitFileIterator(new_file_index); + if (file_iter_.iter() != nullptr) { + file_iter_.Seek(target); + } + SkipEmptyFileForward(); +} + +void LevelIterator::SeekForPrev(const Slice& target) { + size_t new_file_index = FindFile(icomparator_, *flevel_, target); + if (new_file_index >= flevel_->num_files) { + new_file_index = flevel_->num_files - 1; + } + + InitFileIterator(new_file_index); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekForPrev(target); + SkipEmptyFileBackward(); + } +} + +void LevelIterator::SeekToFirst() { + InitFileIterator(0); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToFirst(); + } + SkipEmptyFileForward(); +} + +void LevelIterator::SeekToLast() { + InitFileIterator(flevel_->num_files - 1); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToLast(); + } + SkipEmptyFileBackward(); +} + +void LevelIterator::Next() { + assert(Valid()); + file_iter_.Next(); + SkipEmptyFileForward(); +} + +void LevelIterator::Prev() { + assert(Valid()); + file_iter_.Prev(); + SkipEmptyFileBackward(); +} + +void LevelIterator::SkipEmptyFileForward() { + while (file_iter_.iter() == nullptr || + (!file_iter_.Valid() && file_iter_.status().ok() && + !file_iter_.iter()->IsOutOfBound())) { + // Move to next file + if (file_index_ >= flevel_->num_files - 1) { + // Already at the last file + SetFileIterator(nullptr); + return; + } + if (KeyReachedUpperBound(file_smallest_key(file_index_ + 1))) { + SetFileIterator(nullptr); + return; + } + InitFileIterator(file_index_ + 1); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToFirst(); + } + } +} + +void LevelIterator::SkipEmptyFileBackward() { + while (file_iter_.iter() == nullptr || + (!file_iter_.Valid() && file_iter_.status().ok())) { + // Move to previous file + if (file_index_ == 0) { + // Already the first file + SetFileIterator(nullptr); + return; + } + InitFileIterator(file_index_ - 1); + if (file_iter_.iter() != nullptr) { + file_iter_.SeekToLast(); + } + } +} + +void LevelIterator::SetFileIterator(InternalIterator* iter) { + if (pinned_iters_mgr_ && iter) { + iter->SetPinnedItersMgr(pinned_iters_mgr_); + } + + InternalIterator* old_iter = file_iter_.Set(iter); + if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { + pinned_iters_mgr_->PinIterator(old_iter); + } else { + delete old_iter; + } +} + +void LevelIterator::InitFileIterator(size_t new_file_index) { + if (new_file_index >= flevel_->num_files) { + file_index_ = new_file_index; + SetFileIterator(nullptr); + return; + } else { + // If the file iterator shows incomplete, we try it again if users seek + // to the same file, as this time we may go to a different data block + // which is cached in block cache. + // + if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() && + new_file_index == file_index_) { + // file_iter_ is already constructed with this iterator, so + // no need to change anything + } else { + file_index_ = new_file_index; + InternalIterator* iter = NewFileIterator(); + SetFileIterator(iter); + } + } +} +} // anonymous namespace + // A wrapper of version builder which references the current version in // constructor and unref it in the destructor. // Both of the constructor and destructor need to be called inside DB Mutex. 
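// [Editor's sketch, not part of the upstream patch] The LevelIterator above
// replaces the old LevelFileNumIterator + TwoLevelIterator pairing with one
// class that owns the current file's iterator directly. Its invariant: after
// any Seek*/Next/Prev call, file_iter_ is either positioned on a valid entry
// or the level is exhausted, because SkipEmptyFileForward()/Backward() run
// internally. A caller therefore drives it like any InternalIterator
// (Use() is a hypothetical consumer):
//
//   it->Seek(target);
//   while (it->Valid()) {
//     Use(it->key(), it->value());  // file boundaries crossed transparently
//     it->Next();
//   }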
@@ -560,16 +729,14 @@ class BaseReferencedVersionBuilder { version_->Ref(); } ~BaseReferencedVersionBuilder() { - delete version_builder_; version_->Unref(); } - VersionBuilder* version_builder() { return version_builder_; } + VersionBuilder* version_builder() { return version_builder_.get(); } private: - VersionBuilder* version_builder_; + std::unique_ptr version_builder_; Version* version_; }; -} // anonymous namespace Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, @@ -577,8 +744,8 @@ Status Version::GetTableProperties(std::shared_ptr* tp, auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, - tp, true /* no io */); + env_options_, cfd_->internal_comparator(), file_meta->fd, tp, + mutable_cf_options_.prefix_extractor.get(), true /* no io */); if (s.ok()) { return s; } @@ -597,10 +764,10 @@ Status Version::GetTableProperties(std::shared_ptr* tp, file_name = *fname; } else { file_name = - TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); } - s = ioptions->env->NewRandomAccessFile(file_name, &file, vset_->env_options_); + s = ioptions->env->NewRandomAccessFile(file_name, &file, env_options_); if (!s.ok()) { return s; } @@ -609,10 +776,15 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // By setting the magic number to kInvalidTableMagicNumber, we can by // pass the magic number check in the footer. std::unique_ptr file_reader( - new RandomAccessFileReader(std::move(file), file_name)); + new RandomAccessFileReader( + std::move(file), file_name, nullptr /* env */, nullptr /* stats */, + 0 /* hist_type */, nullptr /* file_read_hist */, + nullptr /* rate_limiter */, false /* for_compaction*/, + ioptions->listeners)); s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, &raw_table_properties); + Footer::kInvalidTableMagicNumber /* table's magic number */, *ioptions, + &raw_table_properties, false /* compression_type_missing */); if (!s.ok()) { return s; } @@ -638,7 +810,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = - TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); // 1. If the table is already present in table cache, load table // properties from there. @@ -666,7 +838,7 @@ Status Version::GetPropertiesOfTablesInRange( false); for (const auto& file_meta : files) { auto fname = - TableFileName(vset_->db_options_->db_paths, + TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); if (props->count(fname) == 0) { // 1. 
If the table is already present in table cache, load table @@ -712,8 +884,8 @@ size_t Version::GetMemoryUsageByTableReaders() { for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - vset_->env_options_, cfd_->internal_comparator(), - file_level.files[i].fd); + env_options_, cfd_->internal_comparator(), file_level.files[i].fd, + mutable_cf_options_.prefix_extractor.get()); } } return total_usage; @@ -738,19 +910,24 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { for (const auto& file : vstorage->LevelFiles(level)) { uint32_t path_id = file->fd.GetPathId(); std::string file_path; - if (path_id < ioptions->db_paths.size()) { - file_path = ioptions->db_paths[path_id].path; + if (path_id < ioptions->cf_paths.size()) { + file_path = ioptions->cf_paths[path_id].path; } else { - assert(!ioptions->db_paths.empty()); - file_path = ioptions->db_paths.back().path; + assert(!ioptions->cf_paths.empty()); + file_path = ioptions->cf_paths.back().path; } - files.emplace_back( - MakeTableFileName("", file->fd.GetNumber()), file_path, - file->fd.GetFileSize(), file->smallest_seqno, file->largest_seqno, + files.emplace_back(SstFileMetaData{ + MakeTableFileName("", file->fd.GetNumber()), + file_path, + static_cast(file->fd.GetFileSize()), + file->fd.smallest_seqno, + file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted); + file->being_compacted}); + files.back().num_entries = file->num_entries; + files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); } cf_meta->levels.emplace_back( @@ -759,6 +936,15 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { } } +uint64_t Version::GetSstFilesSize() { + uint64_t sst_files_size = 0; + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.LevelFiles(level)) { + sst_files_size += file_meta->fd.GetFileSize(); + } + } + return sst_files_size; +} uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be inaccurate when: @@ -841,9 +1027,10 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd, - range_del_agg, nullptr, cfd_->internal_stats()->GetFileReadHist(0), - false, arena, false /* skip_filters */, 0 /* level */)); + read_options, soptions, cfd_->internal_comparator(), *file.file_metadata, + range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), false, arena, + false /* skip_filters */, 0 /* level */)); } if (should_sample) { // Count ones for every L0 files. This is done per iterator creation @@ -854,41 +1041,74 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, sample_file_read_inc(meta); } } - } else { + } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. 
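// [Editor's note, not part of the upstream patch] "Lazily" is what keeps this
// cheap: the concatenating iterator only opens a table reader when the cursor
// actually enters a file, so a point seek into a level with thousands of
// non-overlapping files still opens at most one of them.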
- auto* mem = arena->AllocateAligned(sizeof(LevelFileIteratorState)); - auto* state = new (mem) - LevelFileIteratorState(cfd_->table_cache(), read_options, soptions, - cfd_->internal_comparator(), - cfd_->internal_stats()->GetFileReadHist(level), - false /* for_compaction */, - cfd_->ioptions()->prefix_extractor != nullptr, - IsFilterSkipped(level), level, range_del_agg); - mem = arena->AllocateAligned(sizeof(LevelFileNumIterator)); - auto* first_level_iter = new (mem) LevelFileNumIterator( + auto* mem = arena->AllocateAligned(sizeof(LevelIterator)); + merge_iter_builder->AddIterator(new (mem) LevelIterator( + cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - should_sample_file_read()); - merge_iter_builder->AddIterator( - NewTwoLevelIterator(state, first_level_iter, arena, false)); + mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + cfd_->internal_stats()->GetFileReadHist(level), + false /* for_compaction */, IsFilterSkipped(level), level, + range_del_agg)); } } -void Version::AddRangeDelIteratorsForLevel( - const ReadOptions& read_options, const EnvOptions& soptions, int level, - std::vector* range_del_iters) { - range_del_iters->clear(); - for (size_t i = 0; i < storage_info_.LevelFilesBrief(level).num_files; i++) { - const auto& file = storage_info_.LevelFilesBrief(level).files[i]; - auto* range_del_iter = cfd_->table_cache()->NewRangeTombstoneIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd, - cfd_->internal_stats()->GetFileReadHist(level), - false /* skip_filters */, level); - if (range_del_iter != nullptr) { - range_del_iters->push_back(range_del_iter); +Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, + const EnvOptions& env_options, + const Slice& smallest_user_key, + const Slice& largest_user_key, + int level, bool* overlap) { + assert(storage_info_.finalized_); + + auto icmp = cfd_->internal_comparator(); + auto ucmp = icmp.user_comparator(); + + Arena arena; + Status status; + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); + + *overlap = false; + + if (level == 0) { + for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { + const auto file = &storage_info_.LevelFilesBrief(0).files[i]; + if (AfterFile(ucmp, &smallest_user_key, file) || + BeforeFile(ucmp, &largest_user_key, file)) { + continue; + } + ScopedArenaIterator iter(cfd_->table_cache()->NewIterator( + read_options, env_options, cfd_->internal_comparator(), *file->file_metadata, + &range_del_agg, mutable_cf_options_.prefix_extractor.get(), nullptr, + cfd_->internal_stats()->GetFileReadHist(0), false, &arena, + false /* skip_filters */, 0 /* level */)); + status = OverlapWithIterator( + ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + if (!status.ok() || *overlap) { + break; + } } + } else if (storage_info_.LevelFilesBrief(level).num_files > 0) { + auto mem = arena.AllocateAligned(sizeof(LevelIterator)); + ScopedArenaIterator iter(new (mem) LevelIterator( + cfd_->table_cache(), read_options, env_options, + cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), + mutable_cf_options_.prefix_extractor.get(), should_sample_file_read(), + cfd_->internal_stats()->GetFileReadHist(level), + false /* for_compaction */, IsFilterSkipped(level), level, + &range_del_agg)); + status = OverlapWithIterator( + ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); + } + + if (status.ok() && *overlap == false 
&& + range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) { + *overlap = true; } + return status; } VersionStorageInfo::VersionStorageInfo( @@ -905,6 +1125,7 @@ VersionStorageInfo::VersionStorageInfo( compaction_style_(compaction_style), files_(new std::vector[num_levels_]), base_level_(num_levels_ == 1 ? -1 : 1), + level_multiplier_(0.0), files_by_compaction_pri_(num_levels_), level0_non_overlapping_(false), next_file_to_compact_by_size_(num_levels_), @@ -932,10 +1153,13 @@ VersionStorageInfo::VersionStorageInfo( current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_; current_num_deletions_ = ref_vstorage->current_num_deletions_; current_num_samples_ = ref_vstorage->current_num_samples_; + oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_; } } Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, + const EnvOptions& env_opt, + const MutableCFOptions mutable_cf_options, uint64_t version_number) : env_(vset->env_), cfd_(column_family_data), @@ -959,13 +1183,16 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, next_(this), prev_(this), refs_(0), + env_options_(env_opt), + mutable_cf_options_(mutable_cf_options), version_number_(version_number) {} void Version::Get(const ReadOptions& read_options, const LookupKey& k, PinnableSlice* value, Status* status, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, bool* value_found, - bool* key_exists, SequenceNumber* seq, bool* is_blob) { + SequenceNumber* max_covering_tombstone_seq, bool* value_found, + bool* key_exists, SequenceNumber* seq, ReadCallback* callback, + bool* is_blob) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); @@ -980,8 +1207,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, GetContext get_context( user_comparator(), merge_operator_, info_log_, db_statistics_, status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key, - value, value_found, merge_context, range_del_agg, this->env_, seq, - merge_operator_ ? &pinned_iters_mgr : nullptr, is_blob); + value, value_found, merge_context, max_covering_tombstone_seq, this->env_, + seq, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, is_blob); // Pin blocks that we read to hold merge operands if (merge_operator_) { @@ -993,25 +1220,50 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); + while (f != nullptr) { + if (*max_covering_tombstone_seq > 0) { + // The remaining files we look at will only contain covered keys, so we + // stop here. 
+ break; + } if (get_context.sample()) { sample_file_read_inc(f->file_metadata); } + + bool timer_enabled = + GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex && + get_perf_context()->per_level_perf_context_enabled; + StopWatchNano timer(env_, timer_enabled /* auto_start */); *status = table_cache_->Get( - read_options, *internal_comparator(), f->fd, ikey, &get_context, + read_options, *internal_comparator(), *f->file_metadata, ikey, + &get_context, mutable_cf_options_.prefix_extractor.get(), cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), fp.GetCurrentLevel()); // TODO: examine the behavior for corrupted key + if (timer_enabled) { + PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(), + fp.GetCurrentLevel()); + } if (!status->ok()) { return; } + // report the counters before returning + if (get_context.State() != GetContext::kNotFound && + get_context.State() != GetContext::kMerge && + db_statistics_ != nullptr) { + get_context.ReportCounters(); + } switch (get_context.State()) { case GetContext::kNotFound: // Keep searching in other files break; + case GetContext::kMerge: + // TODO: update per-level perfcontext user_key_return_count for kMerge + break; case GetContext::kFound: if (fp.GetHitFileLevel() == 0) { RecordTick(db_statistics_, GET_HIT_L0); @@ -1020,6 +1272,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } else if (fp.GetHitFileLevel() >= 2) { RecordTick(db_statistics_, GET_HIT_L2_AND_UP); } + PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); return; case GetContext::kDeleted: // Use empty error message for speed @@ -1028,8 +1281,6 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case GetContext::kMerge: - break; case GetContext::kBlobIndex: ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index."); *status = Status::NotSupported( @@ -1040,6 +1291,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, f = fp.GetNextFile(); } + if (db_statistics_ != nullptr) { + get_context.ReportCounters(); + } if (GetContext::kMerge == get_context.State()) { if (!merge_operator_) { *status = Status::InvalidArgument( @@ -1090,6 +1344,7 @@ void Version::PrepareApply( storage_info_.GenerateFileIndexer(); storage_info_.GenerateLevelFilesBrief(); storage_info_.GenerateLevel0NonOverlapping(); + storage_info_.GenerateBottommostFiles(); } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { @@ -1109,7 +1364,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { } if (tp.get() == nullptr) return false; file_meta->num_entries = tp->num_entries; - file_meta->num_deletions = GetDeletedKeys(tp->user_collected_properties); + file_meta->num_deletions = tp->num_deletions; file_meta->raw_value_size = tp->raw_value_size; file_meta->raw_key_size = tp->raw_key_size; @@ -1328,6 +1583,7 @@ void VersionStorageInfo::EstimateCompactionBytesNeeded( namespace { uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, const std::vector& files) { uint32_t ttl_expired_files_count = 0; @@ -1341,8 +1597,7 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableCFOptions& ioptions, auto creation_time = f->fd.table_reader->GetTableProperties()->creation_time; if (creation_time > 0 && - creation_time < - (current_time - 
ioptions.compaction_options_fifo.ttl)) { + creation_time < (current_time - mutable_cf_options.ttl)) { ttl_expired_files_count++; } } @@ -1389,19 +1644,19 @@ void VersionStorageInfo::ComputeCompactionScore( } if (compaction_style_ == kCompactionStyleFIFO) { - score = - static_cast(total_size) / - immutable_cf_options.compaction_options_fifo.max_table_files_size; - if (immutable_cf_options.compaction_options_fifo.allow_compaction) { + score = static_cast(total_size) / + mutable_cf_options.compaction_options_fifo.max_table_files_size; + if (mutable_cf_options.compaction_options_fifo.allow_compaction) { score = std::max( static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger, score); } - if (immutable_cf_options.compaction_options_fifo.ttl > 0) { - score = std::max(static_cast(GetExpiredTtlFilesCount( - immutable_cf_options, files_[level])), - score); + if (mutable_cf_options.ttl > 0) { + score = std::max( + static_cast(GetExpiredTtlFilesCount( + immutable_cf_options, mutable_cf_options, files_[level])), + score); } } else { @@ -1446,6 +1701,10 @@ void VersionStorageInfo::ComputeCompactionScore( } } ComputeFilesMarkedForCompaction(); + ComputeBottommostFilesMarkedForCompaction(); + if (mutable_cf_options.ttl > 0) { + ComputeExpiredTtlFiles(immutable_cf_options, mutable_cf_options.ttl); + } EstimateCompactionBytesNeeded(mutable_cf_options); } @@ -1472,6 +1731,33 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { } } +void VersionStorageInfo::ComputeExpiredTtlFiles( + const ImmutableCFOptions& ioptions, const uint64_t ttl) { + assert(ttl > 0); + + expired_ttl_files_.clear(); + + int64_t _current_time; + auto status = ioptions.env->GetCurrentTime(&_current_time); + if (!status.ok()) { + return; + } + const uint64_t current_time = static_cast(_current_time); + + for (int level = 0; level < num_levels() - 1; level++) { + for (auto f : files_[level]) { + if (!f->being_compacted && f->fd.table_reader != nullptr && + f->fd.table_reader->GetTableProperties() != nullptr) { + auto creation_time = + f->fd.table_reader->GetTableProperties()->creation_time; + if (creation_time > 0 && creation_time < (current_time - ttl)) { + expired_ttl_files_.emplace_back(level, f); + } + } + } + } +} + namespace { // used to sort files by size @@ -1508,6 +1794,8 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { } assert(false); } +#else + (void)info_log; #endif f->refs++; level_files->push_back(f); @@ -1521,6 +1809,7 @@ void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { // 4. GenerateFileIndexer(); // 5. GenerateLevelFilesBrief(); // 6. GenerateLevel0NonOverlapping(); +// 7. 
GenerateBottommostFiles(); void VersionStorageInfo::SetFinalized() { finalized_ = true; #ifndef NDEBUG @@ -1597,9 +1886,9 @@ void SortFileByOverlappingRatio( next_level_it++; } - assert(file->fd.file_size != 0); + assert(file->compensated_file_size != 0); file_to_order[file->fd.GetNumber()] = - overlapping_bytes * 1024u / file->fd.file_size; + overlapping_bytes * 1024u / file->compensated_file_size; } std::sort(temp->begin(), temp->end(), @@ -1612,7 +1901,8 @@ void SortFileByOverlappingRatio( void VersionStorageInfo::UpdateFilesByCompactionPri( CompactionPri compaction_pri) { - if (compaction_style_ == kCompactionStyleFIFO || + if (compaction_style_ == kCompactionStyleNone || + compaction_style_ == kCompactionStyleFIFO || compaction_style_ == kCompactionStyleUniversal) { // don't need this return; @@ -1643,13 +1933,15 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( case kOldestLargestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->largest_seqno < f2.file->largest_seqno; + return f1.file->fd.largest_seqno < + f2.file->fd.largest_seqno; }); break; case kOldestSmallestSeqFirst: std::sort(temp.begin(), temp.end(), [](const Fsize& f1, const Fsize& f2) -> bool { - return f1.file->smallest_seqno < f2.file->smallest_seqno; + return f1.file->fd.smallest_seqno < + f2.file->fd.smallest_seqno; }); break; case kMinOverlappingRatio: @@ -1697,6 +1989,60 @@ void VersionStorageInfo::GenerateLevel0NonOverlapping() { } } +void VersionStorageInfo::GenerateBottommostFiles() { + assert(!finalized_); + assert(bottommost_files_.empty()); + for (size_t level = 0; level < level_files_brief_.size(); ++level) { + for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files; + ++file_idx) { + const FdWithKeyRange& f = level_files_brief_[level].files[file_idx]; + int l0_file_idx; + if (level == 0) { + l0_file_idx = static_cast(file_idx); + } else { + l0_file_idx = -1; + } + Slice smallest_user_key = ExtractUserKey(f.smallest_key); + Slice largest_user_key = ExtractUserKey(f.largest_key); + if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key, + static_cast(level), + l0_file_idx)) { + bottommost_files_.emplace_back(static_cast(level), + f.file_metadata); + } + } + } +} + +void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) { + assert(seqnum >= oldest_snapshot_seqnum_); + oldest_snapshot_seqnum_ = seqnum; + if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { + ComputeBottommostFilesMarkedForCompaction(); + } +} + +void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { + bottommost_files_marked_for_compaction_.clear(); + bottommost_files_mark_threshold_ = kMaxSequenceNumber; + for (auto& level_and_file : bottommost_files_) { + if (!level_and_file.second->being_compacted && + level_and_file.second->fd.largest_seqno != 0 && + level_and_file.second->num_deletions > 1) { + // largest_seqno might be nonzero due to containing the final key in an + // earlier compaction, whose seqnum we didn't zero out. Multiple deletions + // ensures the file really contains deleted or overwritten keys. 
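To make the marking rule implemented by the lines that follow concrete, here is a compilable miniature (a toy ToyFile struct standing in for RocksDB's file metadata; field names and values are illustrative assumptions, not the library's types): a bottommost file is marked for compaction only when it is not already being compacted, actually carries deletions, and its largest sequence number is older than every live snapshot; otherwise the smallest disqualifying seqno is remembered as the threshold for re-checking later.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <vector>

    // Toy model of the marking rule (assumption: simplified fields only).
    struct ToyFile {
      uint64_t largest_seqno;
      uint64_t num_deletions;
      bool being_compacted;
    };

    int main() {
      const uint64_t kMax = std::numeric_limits<uint64_t>::max();
      uint64_t oldest_snapshot = 50;
      std::vector<ToyFile> bottommost = {{10, 4, false}, {70, 9, false}, {0, 3, false}};
      std::vector<ToyFile*> marked;
      uint64_t mark_threshold = kMax;
      for (auto& f : bottommost) {
        if (!f.being_compacted && f.largest_seqno != 0 && f.num_deletions > 1) {
          if (f.largest_seqno < oldest_snapshot) {
            marked.push_back(&f);  // its deletions are invisible to all snapshots
          } else {
            mark_threshold = std::min(mark_threshold, f.largest_seqno);
          }
        }
      }
      // When the oldest snapshot later advances past mark_threshold, the
      // recomputation is rerun (cf. UpdateOldestSnapshot in the diff).
      std::cout << marked.size() << " file(s) marked, threshold=" << mark_threshold << "\n";
    }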
+      if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+        bottommost_files_marked_for_compaction_.push_back(level_and_file);
+      } else {
+        bottommost_files_mark_threshold_ =
+            std::min(bottommost_files_mark_threshold_,
+                     level_and_file.second->fd.largest_seqno);
+      }
+    }
+  }
+}
+
 void Version::Ref() {
   ++refs_;
 }
@@ -1730,13 +2076,29 @@ bool VersionStorageInfo::OverlapInLevel(int level,
 void VersionStorageInfo::GetOverlappingInputs(
     int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
-    bool expand_range) const {
+    bool expand_range, InternalKey** next_smallest) const {
   if (level >= num_non_empty_levels_) {
     // this level is empty, no overlapping inputs
     return;
   }
   inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  const Comparator* user_cmp = user_comparator_;
+  if (level > 0) {
+    GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+                                          file_index, false, next_smallest);
+    return;
+  }
+
+  if (next_smallest) {
+    // next_smallest key only makes sense for non-level 0, where files are
+    // non-overlapping
+    *next_smallest = nullptr;
+  }
+
   Slice user_begin, user_end;
   if (begin != nullptr) {
     user_begin = begin->user_key();
@@ -1744,43 +2106,52 @@
   if (end != nullptr) {
     user_end = end->user_key();
   }
-  if (file_index) {
-    *file_index = -1;
-  }
-  const Comparator* user_cmp = user_comparator_;
-  if (begin != nullptr && end != nullptr && level > 0) {
-    GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
-                                          hint_index, file_index);
-    return;
-  }
-  for (size_t i = 0; i < level_files_brief_[level].num_files; ) {
-    FdWithKeyRange* f = &(level_files_brief_[level].files[i++]);
-    const Slice file_start = ExtractUserKey(f->smallest_key);
-    const Slice file_limit = ExtractUserKey(f->largest_key);
-    if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
-      // "f" is completely before specified range; skip it
-    } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
-      // "f" is completely after specified range; skip it
-    } else {
-      inputs->push_back(files_[level][i-1]);
-      if (level == 0 && expand_range) {
-        // Level-0 files may overlap each other. So check if the newly
-        // added file has expanded the range. If so, restart search.
-        if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
-          user_begin = file_start;
-          inputs->clear();
-          i = 0;
-        } else if (end != nullptr
-                   && user_cmp->Compare(file_limit, user_end) > 0) {
-          user_end = file_limit;
-          inputs->clear();
-          i = 0;
+  // index stores the file indices that still need to be checked.
+  std::list<size_t> index;
+  for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+    index.emplace_back(i);
+  }
+
+  while (!index.empty()) {
+    bool found_overlapping_file = false;
+    auto iter = index.begin();
+    while (iter != index.end()) {
+      FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+      const Slice file_start = ExtractUserKey(f->smallest_key);
+      const Slice file_limit = ExtractUserKey(f->largest_key);
+      if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
+        // "f" is completely before specified range; skip it
+        iter++;
+      } else if (end != nullptr &&
+                 user_cmp->Compare(file_start, user_end) > 0) {
+        // "f" is completely after specified range; skip it
+        iter++;
+      } else {
+        // "f" overlaps the specified range
+        inputs->emplace_back(files_[level][*iter]);
+        found_overlapping_file = true;
+        // record the first file index.
+        if (file_index && *file_index == -1) {
+          *file_index = static_cast<int>(*iter);
+        }
+        // this file overlaps, so erase it to avoid checking it again.
+        iter = index.erase(iter);
+        if (expand_range) {
+          if (begin != nullptr &&
+              user_cmp->Compare(file_start, user_begin) < 0) {
+            user_begin = file_start;
+          }
+          if (end != nullptr && user_cmp->Compare(file_limit, user_end) > 0) {
+            user_end = file_limit;
+          }
         }
-      } else if (file_index) {
-        *file_index = static_cast<int>(i) - 1;
       }
     }
+    // if none of the remaining files overlap, stop searching
+    if (!found_overlapping_file) {
+      break;
+    }
   }
 }
@@ -1793,27 +2164,28 @@
 void VersionStorageInfo::GetCleanInputsWithinInterval(
     int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
-  if (level >= num_non_empty_levels_) {
+  inputs->clear();
+  if (file_index) {
+    *file_index = -1;
+  }
+  if (level >= num_non_empty_levels_ || level == 0 ||
+      level_files_brief_[level].num_files == 0) {
     // this level is empty, no inputs within range
+    // also don't support clean input interval within L0
     return;
   }
-  inputs->clear();
-  Slice user_begin, user_end;
-  if (begin != nullptr) {
-    user_begin = begin->user_key();
-  }
-  if (end != nullptr) {
-    user_end = end->user_key();
+  const auto& level_files = level_files_brief_[level];
+  if (begin == nullptr) {
+    begin = &level_files.files[0].file_metadata->smallest;
   }
-  if (file_index) {
-    *file_index = -1;
-  }
-  if (begin != nullptr && end != nullptr && level > 0) {
-    GetOverlappingInputsRangeBinarySearch(level, user_begin, user_end, inputs,
-                                          hint_index, file_index,
-                                          true /* within_interval */);
+  if (end == nullptr) {
+    end = &level_files.files[level_files.num_files - 1].file_metadata->largest;
   }
+
+  GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs,
+                                        hint_index, file_index,
+                                        true /* within_interval */);
 }

 // Store in "*inputs" all files in "level" that overlap [begin,end]
@@ -1824,15 +2196,15 @@ void VersionStorageInfo::GetCleanInputsWithinInterval(
 // within range [begin, end]. "clean" means there is a boundary
 // between the files in "*inputs" and the surrounding files
 void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
-    int level, const Slice& user_begin, const Slice& user_end,
+    int level, const InternalKey* begin, const InternalKey* end,
     std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
-    bool within_interval) const {
+    bool within_interval, InternalKey** next_smallest) const {
   assert(level > 0);
   int min = 0;
   int mid = 0;
   int max = static_cast<int>(files_[level].size()) - 1;
   bool foundOverlap = false;
-  const Comparator* user_cmp = user_comparator_;
+  auto user_cmp = user_comparator_;

   // if the caller already knows the index of a file that has overlap,
   // then we can skip the binary search.
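The rewritten GetOverlappingInputs above replaces the old restart-from-scratch L0 scan with an iterative fixpoint: overlapping files are absorbed, the search range expands, and passes repeat until a full pass adds nothing. The following standalone sketch shows the same idea with integer key ranges (a toy Range type; the real code compares user keys through a Comparator, so the types here are assumptions for brevity):

    #include <algorithm>
    #include <iostream>
    #include <list>
    #include <utility>
    #include <vector>

    using Range = std::pair<int, int>;  // [smallest, largest]

    std::vector<Range> Overlapping(std::vector<Range> files, int begin, int end) {
      std::list<size_t> index;
      for (size_t i = 0; i < files.size(); i++) index.push_back(i);
      std::vector<Range> inputs;
      while (!index.empty()) {
        bool found = false;
        for (auto it = index.begin(); it != index.end();) {
          const Range& f = files[*it];
          if (f.second < begin || f.first > end) {
            ++it;  // entirely outside the current range
          } else {
            inputs.push_back(f);
            found = true;
            begin = std::min(begin, f.first);  // expand the range; files skipped
            end = std::max(end, f.second);     // earlier get rechecked next pass
            it = index.erase(it);
          }
        }
        if (!found) break;  // a full pass added nothing; fixpoint reached
      }
      return inputs;
    }

    int main() {
      // The first file is pulled in only after the middle file expands the range.
      auto in = Overlapping({{0, 4}, {3, 8}, {20, 25}}, 7, 10);
      std::cout << in.size() << " overlapping files\n";  // prints 2
    }

Erasing each absorbed file means every remaining file is examined at most once per pass, which avoids the quadratic restarts of the old loop while producing the same transitive closure of overlaps.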
@@ -1844,15 +2216,15 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( while (!foundOverlap && min <= max) { mid = (min + max)/2; FdWithKeyRange* f = &(level_files_brief_[level].files[mid]); - const Slice file_start = ExtractUserKey(f->smallest_key); - const Slice file_limit = ExtractUserKey(f->largest_key); - if ((!within_interval && user_cmp->Compare(file_limit, user_begin) < 0) || - (within_interval && user_cmp->Compare(file_start, user_begin) < 0)) { + auto& smallest = f->file_metadata->smallest; + auto& largest = f->file_metadata->largest; + if ((!within_interval && sstableKeyCompare(user_cmp, begin, largest) > 0) || + (within_interval && sstableKeyCompare(user_cmp, begin, smallest) > 0)) { min = mid + 1; } else if ((!within_interval && - user_cmp->Compare(user_end, file_start) < 0) || + sstableKeyCompare(user_cmp, smallest, end) > 0) || (within_interval && - user_cmp->Compare(user_end, file_limit) < 0)) { + sstableKeyCompare(user_cmp, largest, end) > 0)) { max = mid - 1; } else { foundOverlap = true; @@ -1862,6 +2234,9 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( // If there were no overlapping files, return immediately. if (!foundOverlap) { + if (next_smallest) { + *next_smallest = nullptr; + } return; } // returns the index where an overlap is found @@ -1871,17 +2246,26 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( int start_index, end_index; if (within_interval) { - ExtendFileRangeWithinInterval(level, user_begin, user_end, mid, &start_index, - &end_index); + ExtendFileRangeWithinInterval(level, begin, end, mid, + &start_index, &end_index); } else { - ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid, + ExtendFileRangeOverlappingInterval(level, begin, end, mid, &start_index, &end_index); + assert(end_index >= start_index); } - assert(end_index >= start_index); // insert overlapping files into vector for (int i = start_index; i <= end_index; i++) { inputs->push_back(files_[level][i]); } + + if (next_smallest != nullptr) { + // Provide the next key outside the range covered by inputs + if (++end_index < static_cast(files_[level].size())) { + **next_smallest = files_[level][end_index]->smallest; + } else { + *next_smallest = nullptr; + } + } } // Store in *start_index and *end_index the range of all files in @@ -1891,33 +2275,41 @@ void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch( // and forward to find all overlapping files. 
// Use FileLevel in searching, make it faster void VersionStorageInfo::ExtendFileRangeOverlappingInterval( - int level, const Slice& user_begin, const Slice& user_end, + int level, const InternalKey* begin, const InternalKey* end, unsigned int mid_index, int* start_index, int* end_index) const { - const Comparator* user_cmp = user_comparator_; + auto user_cmp = user_comparator_; const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG { // assert that the file at mid_index overlaps with the range assert(mid_index < level_files_brief_[level].num_files); const FdWithKeyRange* f = &files[mid_index]; - const Slice fstart = ExtractUserKey(f->smallest_key); - const Slice flimit = ExtractUserKey(f->largest_key); - if (user_cmp->Compare(fstart, user_begin) >= 0) { - assert(user_cmp->Compare(fstart, user_end) <= 0); + auto& smallest = f->file_metadata->smallest; + auto& largest = f->file_metadata->largest; + if (sstableKeyCompare(user_cmp, begin, smallest) <= 0) { + assert(sstableKeyCompare(user_cmp, smallest, end) <= 0); } else { - assert(user_cmp->Compare(flimit, user_begin) >= 0); + // fprintf(stderr, "ExtendFileRangeOverlappingInterval\n%s - %s\n%s - %s\n%d %d\n", + // begin ? begin->DebugString().c_str() : "(null)", + // end ? end->DebugString().c_str() : "(null)", + // smallest->DebugString().c_str(), + // largest->DebugString().c_str(), + // sstableKeyCompare(user_cmp, smallest, begin), + // sstableKeyCompare(user_cmp, largest, begin)); + assert(sstableKeyCompare(user_cmp, begin, largest) <= 0); } } #endif *start_index = mid_index + 1; *end_index = mid_index; - int count __attribute__((unused)) = 0; + int count __attribute__((__unused__)); + count = 0; // check backwards from 'mid' to lower indices for (int i = mid_index; i >= 0 ; i--) { const FdWithKeyRange* f = &files[i]; - const Slice file_limit = ExtractUserKey(f->largest_key); - if (user_cmp->Compare(file_limit, user_begin) >= 0) { + auto& largest = f->file_metadata->largest; + if (sstableKeyCompare(user_cmp, begin, largest) <= 0) { *start_index = i; assert((count++, true)); } else { @@ -1928,8 +2320,8 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval( for (unsigned int i = mid_index+1; i < level_files_brief_[level].num_files; i++) { const FdWithKeyRange* f = &files[i]; - const Slice file_start = ExtractUserKey(f->smallest_key); - if (user_cmp->Compare(file_start, user_end) <= 0) { + auto& smallest = f->file_metadata->smallest; + if (sstableKeyCompare(user_cmp, smallest, end) <= 0) { assert((count++, true)); *end_index = i; } else { @@ -1947,39 +2339,36 @@ void VersionStorageInfo::ExtendFileRangeOverlappingInterval( // the clean range required. 
// Use FileLevel in searching, make it faster void VersionStorageInfo::ExtendFileRangeWithinInterval( - int level, const Slice& user_begin, const Slice& user_end, + int level, const InternalKey* begin, const InternalKey* end, unsigned int mid_index, int* start_index, int* end_index) const { assert(level != 0); - const Comparator* user_cmp = user_comparator_; + auto* user_cmp = user_comparator_; const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG { // assert that the file at mid_index is within the range assert(mid_index < level_files_brief_[level].num_files); const FdWithKeyRange* f = &files[mid_index]; - const Slice fstart = ExtractUserKey(f->smallest_key); - const Slice flimit = ExtractUserKey(f->largest_key); - assert(user_cmp->Compare(fstart, user_begin) >= 0 && - user_cmp->Compare(flimit, user_end) <= 0); + auto& smallest = f->file_metadata->smallest; + auto& largest = f->file_metadata->largest; + assert(sstableKeyCompare(user_cmp, begin, smallest) <= 0 && + sstableKeyCompare(user_cmp, largest, end) <= 0); } #endif - ExtendFileRangeOverlappingInterval(level, user_begin, user_end, mid_index, + ExtendFileRangeOverlappingInterval(level, begin, end, mid_index, start_index, end_index); int left = *start_index; int right = *end_index; // shrink from left to right while (left <= right) { - const Slice& first_key_in_range = ExtractUserKey(files[left].smallest_key); - if (user_cmp->Compare(first_key_in_range, user_begin) < 0) { + auto& smallest = files[left].file_metadata->smallest; + if (sstableKeyCompare(user_cmp, begin, smallest) > 0) { left++; continue; } if (left > 0) { // If not first file - const Slice& last_key_before = - ExtractUserKey(files[left - 1].largest_key); - if (user_cmp->Equal(first_key_in_range, last_key_before)) { - // The first user key in range overlaps with the previous file's last - // key + auto& largest = files[left - 1].file_metadata->largest; + if (sstableKeyCompare(user_cmp, smallest, largest) == 0) { left++; continue; } @@ -1988,16 +2377,15 @@ void VersionStorageInfo::ExtendFileRangeWithinInterval( } // shrink from right to left while (left <= right) { - const Slice last_key_in_range = ExtractUserKey(files[right].largest_key); - if (user_cmp->Compare(last_key_in_range, user_end) > 0) { + auto& largest = files[right].file_metadata->largest; + if (sstableKeyCompare(user_cmp, largest, end) > 0) { right--; continue; } if (right < static_cast(level_files_brief_[level].num_files) - 1) { // If not the last file - const Slice first_key_after = - ExtractUserKey(files[right + 1].smallest_key); - if (user_cmp->Equal(last_key_in_range, first_key_after)) { + auto& smallest = files[right + 1].file_metadata->smallest; + if (sstableKeyCompare(user_cmp, smallest, largest) == 0) { // The last user key in range overlaps with the next file's first key right--; continue; @@ -2021,9 +2409,12 @@ const char* VersionStorageInfo::LevelSummary( int len = 0; if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) { assert(base_level_ < static_cast(level_max_bytes_.size())); - len = snprintf(scratch->buffer, sizeof(scratch->buffer), - "base level %d max bytes base %" PRIu64 " ", base_level_, - level_max_bytes_[base_level_]); + if (level_multiplier_ != 0.0) { + len = snprintf( + scratch->buffer, sizeof(scratch->buffer), + "base level %d level multiplier %.2f max bytes base %" PRIu64 " ", + base_level_, level_multiplier_, level_max_bytes_[base_level_]); + } } len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files["); @@ -2058,7 
+2449,7 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
     AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
     int ret = snprintf(scratch->buffer + len, sz,
                        "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
-                       f->fd.GetNumber(), f->smallest_seqno, sztxt,
+                       f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
                        static_cast<int>(f->being_compacted));
     if (ret < 0 || ret >= sz) break;
@@ -2159,7 +2550,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
     // No compaction from L1+ needs to be scheduled.
     base_level_ = num_levels_ - 1;
   } else {
-    uint64_t base_bytes_max = options.max_bytes_for_level_base;
+    uint64_t l0_size = 0;
+    for (const auto& f : files_[0]) {
+      l0_size += f->fd.GetFileSize();
+    }
+
+    uint64_t base_bytes_max =
+        std::max(options.max_bytes_for_level_base, l0_size);
     uint64_t base_bytes_min = static_cast<uint64_t>(
         base_bytes_max / options.max_bytes_for_level_multiplier);
@@ -2199,11 +2596,33 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
       }
     }

+    level_multiplier_ = options.max_bytes_for_level_multiplier;
+    assert(base_level_size > 0);
+    if (l0_size > base_level_size &&
+        (l0_size > options.max_bytes_for_level_base ||
+         static_cast<int>(files_[0].size() / 2) >=
+             options.level0_file_num_compaction_trigger)) {
+      // We adjust the base level according to actual L0 size, and adjust
+      // the level multiplier accordingly, when:
+      //   1. the L0 size is larger than level size base, or
+      //   2. number of L0 files reaches twice the L0->L1 compaction trigger
+      // We don't do this otherwise to keep the LSM-tree structure stable
+      // unless the L0 compaction is backlogged.
+      base_level_size = l0_size;
+      if (base_level_ == num_levels_ - 1) {
+        level_multiplier_ = 1.0;
+      } else {
+        level_multiplier_ = std::pow(
+            static_cast<double>(max_level_size) /
+                static_cast<double>(base_level_size),
+            1.0 / static_cast<double>(num_levels_ - base_level_ - 1));
+      }
+    }
+
     uint64_t level_size = base_level_size;
     for (int i = base_level_; i < num_levels_; i++) {
       if (i > base_level_) {
-        level_size = MultiplyCheckOverflow(
-            level_size, options.max_bytes_for_level_multiplier);
+        level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
       }
       // Don't set any level below base_bytes_max. Otherwise, the LSM can
       // assume an hourglass shape where L1+ sizes are smaller than L0. This
@@ -2249,6 +2668,36 @@ uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
   return size;
 }

+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+    const Slice& smallest_user_key, const Slice& largest_user_key,
+    int last_level, int last_l0_idx) {
+  assert((last_l0_idx != -1) == (last_level == 0));
+  // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+  // bottommost only if it's the oldest L0 file and there are no files on older
+  // levels. It'd be better to consider it bottommost if there's no overlap in
+  // older levels/files.
+  if (last_level == 0 &&
+      last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+    return true;
+  }
+
+  // Checks whether there are files living beyond the `last_level`. If lower
+  // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottom-level optimizations can be made if
+  // there are no files in lower levels or if there is no overlap with the
+  // files in the lower levels.
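The loop that follows implements exactly this check. As a compilable miniature (assumptions: user keys modeled as plain strings compared lexicographically, and each level holding [smallest, largest] per file, rather than RocksDB's Slice/Comparator machinery):

    #include <iostream>
    #include <string>
    #include <vector>

    struct KeyRange { std::string smallest, largest; };

    bool RangeMightExistBelow(const std::vector<std::vector<KeyRange>>& levels,
                              const std::string& smallest,
                              const std::string& largest, int last_level) {
      for (size_t level = last_level + 1; level < levels.size(); level++) {
        for (const KeyRange& f : levels[level]) {
          // Overlap unless the file lies entirely before or after the range.
          if (!(f.largest < smallest || f.smallest > largest)) {
            return true;  // older data below may still cover these keys
          }
        }
      }
      return false;  // bottommost: deletions in this run can be dropped safely
    }

    int main() {
      std::vector<std::vector<KeyRange>> levels(3);
      levels[2].push_back({"k", "p"});
      std::cout << RangeMightExistBelow(levels, "a", "c", 1) << " "   // 0
                << RangeMightExistBelow(levels, "m", "z", 1) << "\n";  // 1
    }

The real function additionally treats any non-empty lower level as overlapping when last_level is 0, per the TODO above; the sketch omits that special case.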
+ for (int level = last_level + 1; level < num_levels(); level++) { + // The range is not in the bottommost level if there are files in lower + // levels when the `last_level` is 0 or if there are files in lower levels + // which overlap with [`smallest_key`, `largest_key`]. + if (files_[level].size() > 0 && + (last_level == 0 || + OverlapInLevel(level, &smallest_user_key, &largest_user_key))) { + return true; + } + } + return false; +} void Version::AddLiveFiles(std::vector* live) { for (int level = 0; level < storage_info_.num_levels(); level++) { @@ -2303,35 +2752,41 @@ struct VersionSet::ManifestWriter { bool done; InstrumentedCondVar cv; ColumnFamilyData* cfd; + const MutableCFOptions mutable_cf_options; const autovector& edit_list; explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, + const MutableCFOptions& cf_options, const autovector& e) - : done(false), cv(mu), cfd(_cfd), edit_list(e) {} + : done(false), + cv(mu), + cfd(_cfd), + mutable_cf_options(cf_options), + edit_list(e) {} }; VersionSet::VersionSet(const std::string& dbname, - const ImmutableDBOptions* db_options, + const ImmutableDBOptions* _db_options, const EnvOptions& storage_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller) : column_family_set_( - new ColumnFamilySet(dbname, db_options, storage_options, table_cache, + new ColumnFamilySet(dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller)), - env_(db_options->env), + env_(_db_options->env), dbname_(dbname), - db_options_(db_options), + db_options_(_db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() + options_file_number_(0), pending_manifest_file_number_(0), last_sequence_(0), - last_to_be_written_sequence_(0), + last_allocated_sequence_(0), + last_published_sequence_(0), prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - env_options_(storage_options), - env_options_compactions_( - env_->OptimizeForCompactionTableRead(env_options_, *db_options_)) {} + env_options_(storage_options) {} void CloseTables(void* ptr, size_t) { TableReader* table_reader = reinterpret_cast(ptr); @@ -2344,12 +2799,12 @@ VersionSet::~VersionSet() { Cache* table_cache = column_family_set_->get_table_cache(); table_cache->ApplyToAllCacheEntries(&CloseTables, false /* thread_safe */); column_family_set_.reset(); - for (auto file : obsolete_files_) { - if (file->table_reader_handle) { - table_cache->Release(file->table_reader_handle); - TableCache::Evict(table_cache, file->fd.GetNumber()); + for (auto& file : obsolete_files_) { + if (file.metadata->table_reader_handle) { + table_cache->Release(file.metadata->table_reader_handle); + TableCache::Evict(table_cache, file.metadata->fd.GetNumber()); } - delete file; + file.DeleteMetadata(); } obsolete_files_.clear(); } @@ -2382,95 +2837,161 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, v->next_->prev_ = v; } -Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, - const autovector& edit_list, - InstrumentedMutex* mu, Directory* db_directory, - bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { - mu->AssertHeld(); - // num of edits - auto num_edits = edit_list.size(); - if (num_edits == 0) { - return Status::OK(); - } else if (num_edits > 1) { -#ifndef NDEBUG - // no group commits for column family add or drop - for (auto& edit : edit_list) { - 
assert(!edit->IsColumnFamilyManipulation()); - } -#endif - } +Status VersionSet::ProcessManifestWrites( + std::deque& writers, InstrumentedMutex* mu, + Directory* db_directory, bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options) { + assert(!writers.empty()); + ManifestWriter& first_writer = writers.front(); + ManifestWriter* last_writer = &first_writer; - // column_family_data can be nullptr only if this is column_family_add. - // in that case, we also need to specify ColumnFamilyOptions - if (column_family_data == nullptr) { - assert(num_edits == 1); - assert(edit_list[0]->is_column_family_add_); - assert(new_cf_options != nullptr); - } - - // queue our request - ManifestWriter w(mu, column_family_data, edit_list); - manifest_writers_.push_back(&w); - while (!w.done && &w != manifest_writers_.front()) { - w.cv.Wait(); - } - if (w.done) { - return w.status; - } - if (column_family_data != nullptr && column_family_data->IsDropped()) { - // if column family is dropped by the time we get here, no need to write - // anything to the manifest - manifest_writers_.pop_front(); - // Notify new head of write queue - if (!manifest_writers_.empty()) { - manifest_writers_.front()->cv.Signal(); - } - // we steal this code to also inform about cf-drop - return Status::ShutdownInProgress(); - } + assert(!manifest_writers_.empty()); + assert(manifest_writers_.front() == &first_writer); autovector batch_edits; - Version* v = nullptr; - std::unique_ptr builder_guard(nullptr); - - // process all requests in the queue - ManifestWriter* last_writer = &w; - assert(!manifest_writers_.empty()); - assert(manifest_writers_.front() == &w); - if (w.edit_list.front()->IsColumnFamilyManipulation()) { - // no group commits for column family add or drop - LogAndApplyCFHelper(w.edit_list.front()); - batch_edits.push_back(w.edit_list.front()); + autovector versions; + autovector mutable_cf_options_ptrs; + std::vector> builder_guards; + + if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + // No group commits for column family add or drop + LogAndApplyCFHelper(first_writer.edit_list.front()); + batch_edits.push_back(first_writer.edit_list.front()); } else { - v = new Version(column_family_data, this, current_version_number_++); - builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data)); - auto* builder = builder_guard->version_builder(); - for (const auto& writer : manifest_writers_) { - if (writer->edit_list.front()->IsColumnFamilyManipulation() || - writer->cfd->GetID() != column_family_data->GetID()) { + auto it = manifest_writers_.cbegin(); + size_t group_start = std::numeric_limits::max(); + while (it != manifest_writers_.cend()) { + if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) { // no group commits for column family add or drop - // also, group commits across column families are not supported break; } - last_writer = writer; - for (const auto& edit : writer->edit_list) { - LogAndApplyHelper(column_family_data, builder, v, edit, mu); - batch_edits.push_back(edit); + last_writer = *(it++); + assert(last_writer != nullptr); + assert(last_writer->cfd != nullptr); + if (last_writer->cfd->IsDropped()) { + // If we detect a dropped CF at this point, and the corresponding + // version edits belong to an atomic group, then we need to find out + // the preceding version edits in the same atomic group, and update + // their `remaining_entries_` member variable because we are NOT going + // to write the version edits' of dropped CF to the MANIFEST. 
If we + // don't update, then Recover can report corrupted atomic group because + // the `remaining_entries_` do not match. + if (!batch_edits.empty()) { + if (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ > 0) { + assert(group_start < batch_edits.size()); + const auto& edit_list = last_writer->edit_list; + size_t k = 0; + while (k < edit_list.size()) { + if (!edit_list[k]->is_in_atomic_group_) { + break; + } else if (edit_list[k]->remaining_entries_ == 0) { + ++k; + break; + } + ++k; + } + for (auto i = group_start; i < batch_edits.size(); ++i) { + assert(static_cast(k) <= + batch_edits.back()->remaining_entries_); + batch_edits[i]->remaining_entries_ -= static_cast(k); + } + } + } + continue; + } + // We do a linear search on versions because versions is small. + // TODO(yanqin) maybe consider unordered_map + Version* version = nullptr; + VersionBuilder* builder = nullptr; + for (int i = 0; i != static_cast(versions.size()); ++i) { + uint32_t cf_id = last_writer->cfd->GetID(); + if (versions[i]->cfd()->GetID() == cf_id) { + version = versions[i]; + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + builder = builder_guards[i]->version_builder(); + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id); + break; + } + } + if (version == nullptr) { + version = new Version(last_writer->cfd, this, env_options_, + last_writer->mutable_cf_options, + current_version_number_++); + versions.push_back(version); + mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options); + builder_guards.emplace_back( + new BaseReferencedVersionBuilder(last_writer->cfd)); + builder = builder_guards.back()->version_builder(); + } + assert(builder != nullptr); // make checker happy + for (const auto& e : last_writer->edit_list) { + if (e->is_in_atomic_group_) { + if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || + (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ == 0)) { + group_start = batch_edits.size(); + } + } else if (group_start != std::numeric_limits::max()) { + group_start = std::numeric_limits::max(); + } + LogAndApplyHelper(last_writer->cfd, builder, e, mu); + batch_edits.push_back(e); + } + } + for (int i = 0; i < static_cast(versions.size()); ++i) { + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + auto* builder = builder_guards[i]->version_builder(); + builder->SaveTo(versions[i]->storage_info()); + } + } + +#ifndef NDEBUG + // Verify that version edits of atomic groups have correct + // remaining_entries_. 
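The verification block that follows checks the atomic-group invariant: within one group, each edit's remaining_entries_ counts down to 0, so position k and position i must satisfy i - k + remaining[i] == remaining[k]. A standalone illustration of the encoding (a toy ToyEdit carrying only the group fields; the real VersionEdit serializes these alongside the file changes):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct ToyEdit { bool in_group; uint32_t remaining; };

    // Tag `n` edits as one atomic group: remaining counts down n-1 .. 0.
    std::vector<ToyEdit> MakeGroup(uint32_t n) {
      std::vector<ToyEdit> g;
      for (uint32_t i = 0; i < n; i++) g.push_back({true, n - 1 - i});
      return g;
    }

    int main() {
      auto batch = MakeGroup(4);
      // Same invariant the NDEBUG block checks in the diff.
      for (size_t i = 0; i < batch.size(); i++) {
        assert(batch[i].in_group);
        assert(i + batch[i].remaining == batch[0].remaining);
      }
      assert(batch.back().remaining == 0);  // recovery knows the group is complete
      return 0;
    }

This countdown is what lets Recover detect a truncated group: if the MANIFEST ends before an edit with remaining == 0 is seen, the whole group is rejected, which is also why the dropped-CF path above must patch remaining_entries_ when it skips edits.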
+ size_t k = 0; + while (k < batch_edits.size()) { + while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) { + ++k; + } + if (k == batch_edits.size()) { + break; + } + size_t i = k; + while (i < batch_edits.size()) { + if (!batch_edits[i]->is_in_atomic_group_) { + break; + } + assert(i - k + batch_edits[i]->remaining_entries_ == + batch_edits[k]->remaining_entries_); + if (batch_edits[i]->remaining_entries_ == 0) { + ++i; + break; } + ++i; } - builder->SaveTo(v->storage_info()); + assert(batch_edits[i - 1]->is_in_atomic_group_); + assert(0 == batch_edits[i - 1]->remaining_entries_); + std::vector tmp; + for (size_t j = k; j != i; ++j) { + tmp.emplace_back(batch_edits[j]); + } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp); + k = i; } +#endif // NDEBUG - // Initialize new descriptor log file if necessary by creating - // a temporary file that contains a snapshot of the current version. uint64_t new_manifest_file_size = 0; Status s; assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || manifest_file_size_ > db_options_->max_manifest_file_size) { + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest"); pending_manifest_file_number_ = NewFileNumber(); batch_edits.back()->SetNextFile(next_file_number_.load()); new_descriptor_log = true; @@ -2479,70 +3000,87 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } if (new_descriptor_log) { - // if we're writing out new snapshot make sure to persist max column family + // if we are writing out new snapshot make sure to persist max column + // family. if (column_family_set_->GetMaxColumnFamily() > 0) { - w.edit_list.front()->SetMaxColumnFamily( + first_writer.edit_list.front()->SetMaxColumnFamily( column_family_set_->GetMaxColumnFamily()); } } - // Unlock during expensive operations. New writes cannot get here - // because &w is ensuring that all new writes get queued. { - + EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); mu->Unlock(); TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); - if (!w.edit_list.front()->IsColumnFamilyManipulation() && - this->GetColumnFamilySet()->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. 
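Both the old and the rewritten code release the DB mutex around this slow manifest work so other threads can make progress, relying on the writer queue to keep commits ordered. The lock choreography in isolation (hypothetical SlowManifestWrite helper; a plain std::mutex stands in for InstrumentedMutex):

    #include <mutex>

    std::mutex db_mutex;

    void SlowManifestWrite() { /* file I/O, fsync, table-handle loading ... */ }

    void LogAndApplyLike() {
      std::unique_lock<std::mutex> lock(db_mutex);  // caller holds the mutex
      // ... mutate in-memory queue state under the lock ...
      lock.unlock();  // expensive work happens without the lock
      SlowManifestWrite();
      lock.lock();    // re-acquire before installing the new version
      // ... install results, signal the next queued writer ...
    }

    int main() { LogAndApplyLike(); }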
- builder_guard->version_builder()->LoadTableHandlers( - column_family_data->internal_stats(), - column_family_data->ioptions()->optimize_filters_for_hits, - true /* prefetch_index_and_filter_in_cache */); + if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + for (int i = 0; i < static_cast(versions.size()); ++i) { + assert(!builder_guards.empty() && + builder_guards.size() == versions.size()); + assert(!mutable_cf_options_ptrs.empty() && + builder_guards.size() == versions.size()); + ColumnFamilyData* cfd = versions[i]->cfd_; + builder_guards[i]->version_builder()->LoadTableHandlers( + cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits, + true /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + mutable_cf_options_ptrs[i]->prefix_extractor.get()); + } } // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time if (new_descriptor_log) { - // create manifest file + // create new manifest file ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); - unique_ptr descriptor_file; - EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); - s = NewWritableFile( - env_, DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, opt_env_opts); + std::string descriptor_fname = + DescriptorFileName(dbname_, pending_manifest_file_number_); + std::unique_ptr descriptor_file; + s = NewWritableFile(env_, descriptor_fname, &descriptor_file, + opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - unique_ptr file_writer( - new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); + std::unique_ptr file_writer(new WritableFileWriter( + std::move(descriptor_file), descriptor_fname, opt_env_opts, env_, + nullptr, db_options_->listeners)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); s = WriteSnapshot(descriptor_log_.get()); } } - if (!w.edit_list.front()->IsColumnFamilyManipulation()) { - // This is cpu-heavy operations, which should be called outside mutex. 
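The manifest rollover just above follows a write-then-point protocol: write the snapshot and new records into a fresh MANIFEST-N file, sync it, and only then flip the CURRENT pointer. A sketch of that choreography under stated assumptions (std::filesystem and streams instead of the Env/WritableFileWriter/log::Writer stack, and a hypothetical InstallNewManifest name):

    #include <cstdint>
    #include <filesystem>
    #include <fstream>
    #include <string>

    namespace fs = std::filesystem;

    bool InstallNewManifest(const fs::path& dbdir, uint64_t number,
                            const std::string& snapshot_records) {
      fs::path manifest = dbdir / ("MANIFEST-" + std::to_string(number));
      {
        std::ofstream out(manifest, std::ios::binary);
        out << snapshot_records;         // full snapshot first, then new edits
        if (!out.flush()) return false;  // stand-in for SyncManifest()
      }
      // CURRENT must flip atomically: write a temp file, then rename over it.
      fs::path tmp = dbdir / "CURRENT.dbtmp";
      {
        std::ofstream out(tmp, std::ios::binary);
        out << manifest.filename().string() << "\n";  // trailing newline matters
        if (!out.flush()) return false;
      }
      std::error_code ec;
      fs::rename(tmp, dbdir / "CURRENT", ec);  // atomic on POSIX filesystems
      return !ec;
    }

    int main() {
      fs::create_directories("demo_db");
      return InstallNewManifest("demo_db", 7, "snapshot...\n") ? 0 : 1;
    }

If the process crashes before the rename, CURRENT still names the old, fully synced manifest, which is why the error path in the diff deletes the pending descriptor file rather than trying to repair it.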
- v->PrepareApply(mutable_cf_options, true); + if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) { + for (int i = 0; i < static_cast(versions.size()); ++i) { + versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true); + } } - // Write new record to MANIFEST log + // Write new records to MANIFEST log if (s.ok()) { +#ifndef NDEBUG + size_t idx = 0; +#endif for (auto& e : batch_edits) { std::string record; if (!e->EncodeTo(&record)) { - s = Status::Corruption( - "Unable to Encode VersionEdit:" + e->DebugString(true)); + s = Status::Corruption("Unable to encode VersionEdit:" + + e->DebugString(true)); break; } TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", rocksdb_kill_odds * REDUCE_ODDS2); +#ifndef NDEBUG + if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) { + TEST_SYNC_POINT( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"); + TEST_SYNC_POINT( + "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"); + } + ++idx; +#endif /* !NDEBUG */ s = descriptor_log_->AddRecord(record); if (!s.ok()) { break; @@ -2552,7 +3090,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, s = SyncManifest(env_, db_options_, descriptor_log_->file()); } if (!s.ok()) { - ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write: %s\n", + ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n", s.ToString().c_str()); } } @@ -2562,6 +3100,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, db_directory); + TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:AfterNewManifest"); } if (s.ok()) { @@ -2569,7 +3108,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - if (w.edit_list.front()->is_column_family_drop_) { + if (first_writer.edit_list.front()->is_column_family_drop_) { TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); @@ -2580,88 +3119,207 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, mu->Lock(); } - // Append the old mainfest file to the obsolete_manifests_ list to be deleted + // Append the old manifest file to the obsolete_manifest_ list to be deleted // by PurgeObsoleteFiles later. if (s.ok() && new_descriptor_log) { obsolete_manifests_.emplace_back( DescriptorFileName("", manifest_file_number_)); } - // Install the new version + // Install the new versions if (s.ok()) { - if (w.edit_list.front()->is_column_family_add_) { - // no group commit on column family add + if (first_writer.edit_list.front()->is_column_family_add_) { assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); - CreateColumnFamily(*new_cf_options, w.edit_list.front()); - } else if (w.edit_list.front()->is_column_family_drop_) { + CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); + } else if (first_writer.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); - column_family_data->SetDropped(); - if (column_family_data->Unref()) { - delete column_family_data; + first_writer.cfd->SetDropped(); + if (first_writer.cfd->Unref()) { + delete first_writer.cfd; } } else { - uint64_t max_log_number_in_batch = 0; + // Each version in versions corresponds to a column family. 
+      // For each column family, update its log number indicating that logs
+      // with number smaller than this should be ignored.
+      for (const auto version : versions) {
+        uint64_t max_log_number_in_batch = 0;
+        uint32_t cf_id = version->cfd_->GetID();
+        for (const auto& e : batch_edits) {
+          if (e->has_log_number_ && e->column_family_ == cf_id) {
+            max_log_number_in_batch =
+                std::max(max_log_number_in_batch, e->log_number_);
+          }
+        }
+        if (max_log_number_in_batch != 0) {
+          assert(version->cfd_->GetLogNumber() <= max_log_number_in_batch);
+          version->cfd_->SetLogNumber(max_log_number_in_batch);
+        }
+      }
+
+      uint64_t last_min_log_number_to_keep = 0;
       for (auto& e : batch_edits) {
-        if (e->has_log_number_) {
-          max_log_number_in_batch =
-              std::max(max_log_number_in_batch, e->log_number_);
+        if (e->has_min_log_number_to_keep_) {
+          last_min_log_number_to_keep =
+              std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
         }
       }
-      if (max_log_number_in_batch != 0) {
-        assert(column_family_data->GetLogNumber() <= max_log_number_in_batch);
-        column_family_data->SetLogNumber(max_log_number_in_batch);
+
+      if (last_min_log_number_to_keep != 0) {
+        // Should only be set in 2PC mode.
+        MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
       }
-      AppendVersion(column_family_data, v);
-    }
+
+      for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+        ColumnFamilyData* cfd = versions[i]->cfd_;
+        AppendVersion(cfd, versions[i]);
+      }
+    }
     manifest_file_number_ = pending_manifest_file_number_;
     manifest_file_size_ = new_manifest_file_size;
-    prev_log_number_ = w.edit_list.front()->prev_log_number_;
+    prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
   } else {
     std::string version_edits;
     for (auto& e : batch_edits) {
-      version_edits = version_edits + "\n" + e->DebugString(true);
+      version_edits += ("\n" + e->DebugString(true));
+    }
+    ROCKS_LOG_ERROR(db_options_->info_log,
+                    "Error in committing version edit to MANIFEST: %s",
+                    version_edits.c_str());
+    for (auto v : versions) {
+      delete v;
     }
-    ROCKS_LOG_ERROR(
-        db_options_->info_log,
-        "[%s] Error in committing version edit to MANIFEST: %s",
-        column_family_data ? column_family_data->GetName().c_str() : "",
-        version_edits.c_str());
-    delete v;
     if (new_descriptor_log) {
-      ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64
-                     " current manifest %" PRIu64 "\n",
+      ROCKS_LOG_INFO(db_options_->info_log,
+                     "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+                     "\n",
                      manifest_file_number_, pending_manifest_file_number_);
       descriptor_log_.reset();
       env_->DeleteFile(
           DescriptorFileName(dbname_, pending_manifest_file_number_));
     }
   }
+  pending_manifest_file_number_ = 0;

   // wake up all the waiting writers
   while (true) {
     ManifestWriter* ready = manifest_writers_.front();
     manifest_writers_.pop_front();
-    if (ready != &w) {
-      ready->status = s;
-      ready->done = true;
+    bool need_signal = true;
+    for (const auto& w : writers) {
+      if (&w == ready) {
+        need_signal = false;
+        break;
+      }
+    }
+    ready->status = s;
+    ready->done = true;
+    if (need_signal) {
       ready->cv.Signal();
     }
-    if (ready == last_writer) break;
+    if (ready == last_writer) {
+      break;
+    }
   }
-  // Notify new head of write queue
   if (!manifest_writers_.empty()) {
     manifest_writers_.front()->cv.Signal();
   }
   return s;
 }

+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+    const autovector<ColumnFamilyData*>& column_family_datas,
+    const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+    const autovector<autovector<VersionEdit*>>& edit_lists,
+    InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log,
+    const ColumnFamilyOptions* new_cf_options) {
+  mu->AssertHeld();
+  int num_edits = 0;
+  for (const auto& elist : edit_lists) {
+    num_edits += static_cast<int>(elist.size());
+  }
+  if (num_edits == 0) {
+    return Status::OK();
+  } else if (num_edits > 1) {
+#ifndef NDEBUG
+    for (const auto& edit_list : edit_lists) {
+      for (const auto& edit : edit_list) {
+        assert(!edit->IsColumnFamilyManipulation());
+      }
+    }
+#endif /* ! NDEBUG */
+  }
+
+  int num_cfds = static_cast<int>(column_family_datas.size());
+  if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+    assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+    assert(edit_lists[0][0]->is_column_family_add_);
+    assert(new_cf_options != nullptr);
+  }
+  std::deque<ManifestWriter> writers;
+  if (num_cfds > 0) {
+    assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+    assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+  }
+  for (int i = 0; i < num_cfds; ++i) {
+    writers.emplace_back(mu, column_family_datas[i],
+                         *mutable_cf_options_list[i], edit_lists[i]);
+    manifest_writers_.push_back(&writers[i]);
+  }
+  assert(!writers.empty());
+  ManifestWriter& first_writer = writers.front();
+  while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+    first_writer.cv.Wait();
+  }
+  if (first_writer.done) {
+    // All non-CF-manipulation operations can be grouped together and committed
+    // to MANIFEST. They should all have finished. The status code is stored in
+    // the first manifest writer.
+#ifndef NDEBUG
+    for (const auto& writer : writers) {
+      assert(writer.done);
+    }
+#endif /* !NDEBUG */
+    return first_writer.status;
+  }
+
+  int num_undropped_cfds = 0;
+  for (auto cfd : column_family_datas) {
+    // if cfd == nullptr, it is a column family add.
+    if (cfd == nullptr || !cfd->IsDropped()) {
+      ++num_undropped_cfds;
+    }
+  }
+  if (0 == num_undropped_cfds) {
+    // TODO (yanqin) maybe use a different status code to denote column family
+    // drop other than OK and ShutdownInProgress
+    for (int i = 0; i != num_cfds; ++i) {
+      manifest_writers_.pop_front();
+    }
+    // Notify new head of manifest write queue.
+    if (!manifest_writers_.empty()) {
+      manifest_writers_.front()->cv.Signal();
+    }
+    return Status::ShutdownInProgress();
+  }
+
+  return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
+                               new_cf_options);
+}
+
 void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
   assert(edit->IsColumnFamilyManipulation());
   edit->SetNextFile(next_file_number_.load());
-  edit->SetLastSequence(last_sequence_);
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+  edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+                                                      : last_sequence_);
   if (edit->is_column_family_drop_) {
     // if we drop column family, we have to make sure to save max column family,
     // so that we don't reuse existing ID
@@ -2670,8 +3328,11 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
 }

 void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
-                                   VersionBuilder* builder, Version* v,
-                                   VersionEdit* edit, InstrumentedMutex* mu) {
+                                   VersionBuilder* builder, VersionEdit* edit,
+                                   InstrumentedMutex* mu) {
+#ifdef NDEBUG
+  (void)cfd;
+#endif
   mu->AssertHeld();
   assert(!edit->IsColumnFamilyManipulation());
@@ -2684,11 +3345,181 @@ void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
     edit->SetPrevLogNumber(prev_log_number_);
   }
   edit->SetNextFile(next_file_number_.load());
-  edit->SetLastSequence(last_sequence_);
+  // The log might have data that is not visible to the memtable and hence has
+  // not updated the last_sequence_ yet. It is also possible that the log is
+  // expecting some new data that is not written yet. Since LastSequence is an
+  // upper bound on the sequence, it is ok to record
+  // last_allocated_sequence_ as the last sequence.
+  edit->SetLastSequence(db_options_->two_write_queues ? last_allocated_sequence_
+                                                      : last_sequence_);

   builder->Apply(edit);
 }

+Status VersionSet::ApplyOneVersionEditToBuilder(
+    VersionEdit& edit,
+    const std::unordered_map<std::string, ColumnFamilyOptions>& name_to_options,
+    std::unordered_map<uint32_t, std::string>& column_families_not_found,
+    std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>&
+        builders,
+    bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number,
+    uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file,
+    bool* have_last_sequence, SequenceNumber* last_sequence,
+    uint64_t* min_log_number_to_keep, uint32_t* max_column_family) {
+  // Not found means that user didn't supply that column
+  // family option AND we encountered column family add
+  // record. Once we encounter column family drop record,
+  // we will delete the column family from
+  // column_families_not_found.
+  bool cf_in_not_found = (column_families_not_found.find(edit.column_family_) !=
+                          column_families_not_found.end());
+  // in builders means that user supplied that column family
+  // option AND that we encountered column family add record
+  bool cf_in_builders = builders.find(edit.column_family_) != builders.end();
+
+  // they can't both be true
+  assert(!(cf_in_not_found && cf_in_builders));
+
+  ColumnFamilyData* cfd = nullptr;
+
+  if (edit.is_column_family_add_) {
+    if (cf_in_builders || cf_in_not_found) {
+      return Status::Corruption(
+          "Manifest adding the same column family twice: " +
+          edit.column_family_name_);
+    }
+    auto cf_options = name_to_options.find(edit.column_family_name_);
+    if (cf_options == name_to_options.end()) {
+      column_families_not_found.insert(
+          {edit.column_family_, edit.column_family_name_});
+    } else {
+      cfd = CreateColumnFamily(cf_options->second, &edit);
+      cfd->set_initialized();
+      builders.insert(std::make_pair(
+          edit.column_family_, std::unique_ptr<BaseReferencedVersionBuilder>(
+                                   new BaseReferencedVersionBuilder(cfd))));
+    }
+  } else if (edit.is_column_family_drop_) {
+    if (cf_in_builders) {
+      auto builder = builders.find(edit.column_family_);
+      assert(builder != builders.end());
+      builders.erase(builder);
+      cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+      assert(cfd != nullptr);
+      if (cfd->Unref()) {
+        delete cfd;
+        cfd = nullptr;
+      } else {
+        // who else can have reference to cfd!?
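The manifest_writers_ queue used by LogAndApply above implements a leader/follower group commit: the first queued writer performs the write for everyone queued behind it, then marks them done. A heavily simplified, self-contained model of that protocol (assumptions: a toy Writer with only a done flag, a fake sleep for the I/O, and no Status propagation):

    #include <chrono>
    #include <condition_variable>
    #include <deque>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Writer { bool done = false; };

    std::mutex mu;
    std::condition_variable cv;
    std::deque<Writer*> write_queue;

    void Commit(Writer* w) {
      std::unique_lock<std::mutex> lock(mu);
      write_queue.push_back(w);
      cv.wait(lock, [w] { return w->done || write_queue.front() == w; });
      if (w->done) return;  // an earlier leader committed this write
      // We are the leader: snapshot the current batch, then do the slow
      // "manifest write" with the mutex dropped so more writers can queue.
      std::vector<Writer*> batch(write_queue.begin(), write_queue.end());
      lock.unlock();
      std::this_thread::sleep_for(std::chrono::milliseconds(1));  // fake I/O
      lock.lock();
      for (Writer* q : batch) q->done = true;
      write_queue.erase(write_queue.begin(), write_queue.begin() + batch.size());
      cv.notify_all();  // wake followers and the next leader
    }

    int main() {
      std::vector<Writer> writers(4);
      std::vector<std::thread> threads;
      for (auto& w : writers) threads.emplace_back(Commit, &w);
      for (auto& t : threads) t.join();
      std::cout << "all writers committed\n";
    }

Snapshotting the batch before releasing the mutex matters: writers that arrive during the I/O must wait for the next leader, exactly as the real queue only wakes writers up to last_writer.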
+        assert(false);
+      }
+    } else if (cf_in_not_found) {
+      column_families_not_found.erase(edit.column_family_);
+    } else {
+      return Status::Corruption(
+          "Manifest - dropping non-existing column family");
+    }
+  } else if (!cf_in_not_found) {
+    if (!cf_in_builders) {
+      return Status::Corruption(
+          "Manifest record referencing unknown column family");
+    }
+
+    cfd = column_family_set_->GetColumnFamily(edit.column_family_);
+    // this should never happen since cf_in_builders is true
+    assert(cfd != nullptr);
+
+    // if it is not column family add or column family drop,
+    // then it's a file add/delete, which should be forwarded
+    // to builder
+    auto builder = builders.find(edit.column_family_);
+    assert(builder != builders.end());
+    builder->second->version_builder()->Apply(&edit);
+  }
+  return ExtractInfoFromVersionEdit(
+      cfd, edit, have_log_number, log_number, have_prev_log_number,
+      previous_log_number, have_next_file, next_file, have_last_sequence,
+      last_sequence, min_log_number_to_keep, max_column_family);
+}
+
+Status VersionSet::ExtractInfoFromVersionEdit(
+    ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number,
+    uint64_t* log_number, bool* have_prev_log_number,
+    uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file,
+    bool* have_last_sequence, SequenceNumber* last_sequence,
+    uint64_t* min_log_number_to_keep, uint32_t* max_column_family) {
+  if (cfd != nullptr) {
+    if (edit.has_log_number_) {
+      if (cfd->GetLogNumber() > edit.log_number_) {
+        ROCKS_LOG_WARN(
+            db_options_->info_log,
+            "MANIFEST corruption detected, but ignored - Log numbers in "
+            "records NOT monotonically increasing");
+      } else {
+        cfd->SetLogNumber(edit.log_number_);
+        *have_log_number = true;
+        *log_number = edit.log_number_;
+      }
+    }
+    if (edit.has_comparator_ &&
+        edit.comparator_ != cfd->user_comparator()->Name()) {
+      return Status::InvalidArgument(
+          cfd->user_comparator()->Name(),
+          "does not match existing comparator " + edit.comparator_);
+    }
+  }
+
+  if (edit.has_prev_log_number_) {
+    *previous_log_number = edit.prev_log_number_;
+    *have_prev_log_number = true;
+  }
+
+  if (edit.has_next_file_number_) {
+    *next_file = edit.next_file_number_;
+    *have_next_file = true;
+  }
+
+  if (edit.has_max_column_family_) {
+    *max_column_family = edit.max_column_family_;
+  }
+
+  if (edit.has_min_log_number_to_keep_) {
+    *min_log_number_to_keep =
+        std::max(*min_log_number_to_keep, edit.min_log_number_to_keep_);
+  }
+
+  if (edit.has_last_sequence_) {
+    *last_sequence = edit.last_sequence_;
+    *have_last_sequence = true;
+  }
+  return Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(std::string* manifest_path) {
+  assert(manifest_path != nullptr);
+  std::string fname;
+  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &fname);
+  if (!s.ok()) {
+    return s;
+  }
+  if (fname.empty() || fname.back() != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  // remove the trailing '\n'
+  fname.resize(fname.size() - 1);
+  FileType type;
+  bool parse_ok = ParseFileName(fname, &manifest_file_number_, &type);
+  if (!parse_ok || type != kDescriptorFile) {
+    return Status::Corruption("CURRENT file corrupted");
+  }
+  *manifest_path = dbname_;
+  if (dbname_.back() != '/') {
+    manifest_path->push_back('/');
+  }
+  *manifest_path += fname;
+  return Status::OK();
+}
+
 Status VersionSet::Recover(
     const std::vector<ColumnFamilyDescriptor>& column_families,
     bool read_only) {
@@ -2702,43 +3533,28 @@ Status VersionSet::Recover(
   std::unordered_map<int, std::string> column_families_not_found;
 
   // Read "CURRENT" file, which contains a pointer to the current manifest file
-  std::string manifest_filename;
-  Status s = ReadFileToString(
-      env_, CurrentFileName(dbname_), &manifest_filename
-  );
+  std::string manifest_path;
+  Status s = GetCurrentManifestPath(&manifest_path);
   if (!s.ok()) {
     return s;
   }
-  if (manifest_filename.empty() ||
-      manifest_filename.back() != '\n') {
-    return Status::Corruption("CURRENT file does not end with newline");
-  }
-  // remove the trailing '\n'
-  manifest_filename.resize(manifest_filename.size() - 1);
-  FileType type;
-  bool parse_ok =
-      ParseFileName(manifest_filename, &manifest_file_number_, &type);
-  if (!parse_ok || type != kDescriptorFile) {
-    return Status::Corruption("CURRENT file corrupted");
-  }
 
   ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
-                 manifest_filename.c_str());
+                 manifest_path.c_str());
 
-  manifest_filename = dbname_ + "/" + manifest_filename;
-  unique_ptr<SequentialFileReader> manifest_file_reader;
+  std::unique_ptr<SequentialFileReader> manifest_file_reader;
   {
-    unique_ptr<SequentialFile> manifest_file;
-    s = env_->NewSequentialFile(manifest_filename, &manifest_file,
+    std::unique_ptr<SequentialFile> manifest_file;
+    s = env_->NewSequentialFile(manifest_path, &manifest_file,
                                 env_->OptimizeForManifestRead(env_options_));
     if (!s.ok()) {
      return s;
    }
    manifest_file_reader.reset(
-        new SequentialFileReader(std::move(manifest_file)));
+        new SequentialFileReader(std::move(manifest_file), manifest_path));
  }
  uint64_t current_manifest_file_size;
-  s = env_->GetFileSize(manifest_filename, &current_manifest_file_size);
+  s = env_->GetFileSize(manifest_path, &current_manifest_file_size);
  if (!s.ok()) {
    return s;
  }
@@ -2752,7 +3568,9 @@ Status VersionSet::Recover(
   uint64_t log_number = 0;
   uint64_t previous_log_number = 0;
   uint32_t max_column_family = 0;
-  std::unordered_map<uint32_t, BaseReferencedVersionBuilder*> builders;
+  uint64_t min_log_number_to_keep = 0;
+  std::unordered_map<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>
+      builders;
 
   // add default column family
   auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName);
@@ -2767,15 +3585,19 @@ Status VersionSet::Recover(
   // In recovery, nobody else can access it, so it's fine to set it to be
   // initialized earlier.
   default_cfd->set_initialized();
-  builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)});
+  builders.insert(
+      std::make_pair(0, std::unique_ptr<BaseReferencedVersionBuilder>(
+                            new BaseReferencedVersionBuilder(default_cfd))));
 
   {
     VersionSet::LogReporter reporter;
     reporter.status = &s;
-    log::Reader reader(NULL, std::move(manifest_file_reader), &reporter,
-                       true /*checksum*/, 0 /*initial_offset*/, 0);
+    log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+                       true /* checksum */, 0 /* log_number */);
     Slice record;
     std::string scratch;
+    std::vector<VersionEdit> replay_buffer;
+    size_t num_entries_decoded = 0;
     while (reader.ReadRecord(&record, &scratch) && s.ok()) {
       VersionEdit edit;
       s = edit.DecodeFrom(record);
@@ -2783,118 +3605,55 @@ Status VersionSet::Recover(
       if (!s.ok()) {
        break;
      }
 
-      // Not found means that user didn't supply that column
-      // family option AND we encountered column family add
-      // record. Once we encounter column family drop record,
-      // we will delete the column family from
-      // column_families_not_found.
- bool cf_in_not_found = - column_families_not_found.find(edit.column_family_) != - column_families_not_found.end(); - // in builders means that user supplied that column family - // option AND that we encountered column family add record - bool cf_in_builders = - builders.find(edit.column_family_) != builders.end(); - - // they can't both be true - assert(!(cf_in_not_found && cf_in_builders)); - - ColumnFamilyData* cfd = nullptr; - - if (edit.is_column_family_add_) { - if (cf_in_builders || cf_in_not_found) { - s = Status::Corruption( - "Manifest adding the same column family twice"); - break; - } - auto cf_options = cf_name_to_options.find(edit.column_family_name_); - if (cf_options == cf_name_to_options.end()) { - column_families_not_found.insert( - {edit.column_family_, edit.column_family_name_}); - } else { - cfd = CreateColumnFamily(cf_options->second, &edit); - cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); - } - } else if (edit.is_column_family_drop_) { - if (cf_in_builders) { - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - delete builder->second; - builders.erase(builder); - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - if (cfd->Unref()) { - delete cfd; - cfd = nullptr; - } else { - // who else can have reference to cfd!? - assert(false); - } - } else if (cf_in_not_found) { - column_families_not_found.erase(edit.column_family_); - } else { - s = Status::Corruption( - "Manifest - dropping non-existing column family"); - break; + if (edit.is_in_atomic_group_) { + if (replay_buffer.empty()) { + replay_buffer.resize(edit.remaining_entries_ + 1); + TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:FirstInAtomicGroup", + &edit); } - } else if (!cf_in_not_found) { - if (!cf_in_builders) { - s = Status::Corruption( - "Manifest record referencing unknown column family"); + ++num_entries_decoded; + if (num_entries_decoded + edit.remaining_entries_ != + static_cast(replay_buffer.size())) { + TEST_SYNC_POINT_CALLBACK( + "VersionSet::Recover:IncorrectAtomicGroupSize", &edit); + s = Status::Corruption("corrupted atomic group"); break; } - - cfd = column_family_set_->GetColumnFamily(edit.column_family_); - // this should never happen since cf_in_builders is true - assert(cfd != nullptr); - - // if it is not column family add or column family drop, - // then it's a file add/delete, which should be forwarded - // to builder - auto builder = builders.find(edit.column_family_); - assert(builder != builders.end()); - builder->second->version_builder()->Apply(&edit); - } - - if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { - ROCKS_LOG_WARN( - db_options_->info_log, - "MANIFEST corruption detected, but ignored - Log numbers in " - "records NOT monotonically increasing"); - } else { - cfd->SetLogNumber(edit.log_number_); - have_log_number = true; + replay_buffer[num_entries_decoded - 1] = std::move(edit); + if (num_entries_decoded == replay_buffer.size()) { + TEST_SYNC_POINT_CALLBACK("VersionSet::Recover:LastInAtomicGroup", + &edit); + for (auto& e : replay_buffer) { + s = ApplyOneVersionEditToBuilder( + e, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } } + replay_buffer.clear(); + num_entries_decoded = 
0; } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); + TEST_SYNC_POINT("VersionSet::Recover:AtomicGroup"); + } else { + if (!replay_buffer.empty()) { + TEST_SYNC_POINT_CALLBACK( + "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", &edit); + s = Status::Corruption("corrupted atomic group"); break; } + s = ApplyOneVersionEditToBuilder( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); } - - if (edit.has_prev_log_number_) { - previous_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_max_column_family_) { - max_column_family = edit.max_column_family_; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; + if (!s.ok()) { + break; } } } @@ -2914,8 +3673,11 @@ Status VersionSet::Recover( column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkFileNumberUsedDuringRecovery(previous_log_number); - MarkFileNumberUsedDuringRecovery(log_number); + // When reading DB generated using old release, min_log_number_to_keep=0. + // All log files will be scanned for potential prepare entries. + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + MarkFileNumberUsed(previous_log_number); + MarkFileNumberUsed(log_number); } // there were some column families in the MANIFEST that weren't specified @@ -2948,21 +3710,25 @@ Status VersionSet::Recover( if (cfd->IsDropped()) { continue; } + if (read_only) { + cfd->table_cache()->SetTablesAreImmortal(); + } assert(cfd->initialized()); auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto* builder = builders_iter->second->version_builder(); - - if (GetColumnFamilySet()->get_table_cache()->GetCapacity() == - TableCache::kInfiniteCapacity) { - // unlimited table cache. Pre-load table handle now. - // Need to do it out of the mutex. - builder->LoadTableHandlers( - cfd->internal_stats(), db_options_->max_file_opening_threads, - false /* prefetch_index_and_filter_in_cache */); - } + auto builder = builders_iter->second->version_builder(); - Version* v = new Version(cfd, this, current_version_number_++); + // unlimited table cache. Pre-load table handle now. + // Need to do it out of the mutex. 
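// Note on the pre-load below: the old explicit guard
//
//   if (GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
//       TableCache::kInfiniteCapacity) { ... }
//
// is gone; recovery now always requests the pre-load and passes
// is_initial_load=true, which (as far as this patch shows) leaves it to
// LoadTableHandlers() itself to decide how many handles to open, and the
// prefix extractor of the latest mutable CF options is handed through to the
// table readers.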
+ builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + true /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + + Version* v = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); builder->SaveTo(v->storage_info()); // Install recovered version @@ -2973,36 +3739,34 @@ Status VersionSet::Recover( manifest_file_size_ = current_manifest_file_size; next_file_number_.store(next_file + 1); - last_to_be_written_sequence_ = last_sequence; + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; ROCKS_LOG_INFO( db_options_->info_log, "Recovered from manifest file:%s succeeded," - "manifest_file_number is %lu, next_file_number is %lu, " - "last_sequence is %lu, log_number is %lu," - "prev_log_number is %lu," - "max_column_family is %u\n", - manifest_filename.c_str(), (unsigned long)manifest_file_number_, - (unsigned long)next_file_number_.load(), (unsigned long)last_sequence_, - (unsigned long)log_number, (unsigned long)prev_log_number_, - column_family_set_->GetMaxColumnFamily()); + "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64 + ", last_sequence is %" PRIu64 ", log_number is %" PRIu64 + ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32 + ",min_log_number_to_keep is %" PRIu64 "\n", + manifest_path.c_str(), manifest_file_number_, + next_file_number_.load(), last_sequence_.load(), log_number, + prev_log_number_, column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { continue; } ROCKS_LOG_INFO(db_options_->info_log, - "Column family [%s] (ID %u), log number is %" PRIu64 "\n", + "Column family [%s] (ID %" PRIu32 + "), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } } - for (auto& builder : builders) { - delete builder.second; - } - return s; } @@ -3024,14 +3788,14 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, std::string dscname = dbname + "/" + current; - unique_ptr file_reader; + std::unique_ptr file_reader; { - unique_ptr file; - s = env->NewSequentialFile(dscname, &file, soptions); - if (!s.ok()) { - return s; + std::unique_ptr file; + s = env->NewSequentialFile(dscname, &file, soptions); + if (!s.ok()) { + return s; } - file_reader.reset(new SequentialFileReader(std::move(file))); + file_reader.reset(new SequentialFileReader(std::move(file), dscname)); } std::map column_family_names; @@ -3039,8 +3803,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/, 0); + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3166,16 +3930,16 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex, bool json) { // Open the specified manifest file. 
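// DumpManifest() is what the `ldb manifest_dump` tool ends up calling; a
// hypothetical invocation would be
//
//   ldb manifest_dump --path=/path/to/db/MANIFEST-000042 --verbose
//
// which prints each column family, its files per level, and the final
// next_file_number / last_sequence / prev_log_number counters produced by the
// printf calls below.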
- unique_ptr file_reader; + std::unique_ptr file_reader; Status s; { - unique_ptr file; + std::unique_ptr file; s = options.env->NewSequentialFile( dscname, &file, env_->OptimizeForManifestRead(env_options_)); if (!s.ok()) { return s; } - file_reader.reset(new SequentialFileReader(std::move(file))); + file_reader.reset(new SequentialFileReader(std::move(file), dscname)); } bool have_prev_log_number = false; @@ -3186,7 +3950,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, uint64_t previous_log_number = 0; int count = 0; std::unordered_map comparators; - std::unordered_map builders; + std::unordered_map> + builders; // add default column family VersionEdit default_cf_edit; @@ -3194,13 +3959,15 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(NULL, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, 0); + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3235,8 +4002,9 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); cfd->set_initialized(); - builders.insert( - {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); + builders.insert(std::make_pair( + edit.column_family_, std::unique_ptr( + new BaseReferencedVersionBuilder(cfd)))); } else if (edit.is_column_family_drop_) { if (!cf_in_builders) { s = Status::Corruption( @@ -3244,7 +4012,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, break; } auto builder_iter = builders.find(edit.column_family_); - delete builder_iter->second; builders.erase(builder_iter); comparators.erase(edit.column_family_); cfd = column_family_set_->GetColumnFamily(edit.column_family_); @@ -3275,6 +4042,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, cfd->SetLogNumber(edit.log_number_); } + if (edit.has_prev_log_number_) { previous_log_number = edit.prev_log_number_; have_prev_log_number = true; @@ -3293,6 +4061,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, if (edit.has_max_column_family_) { column_family_set_->UpdateMaxColumnFamily(edit.max_column_family_); } + + if (edit.has_min_log_number_to_keep_) { + MarkMinLogNumberToKeep2PC(edit.min_log_number_to_keep_); + } } } file_reader.reset(); @@ -3320,13 +4092,16 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, assert(builders_iter != builders.end()); auto builder = builders_iter->second->version_builder(); - Version* v = new Version(cfd, this, current_version_number_++); + Version* v = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); builder->SaveTo(v->storage_info()); v->PrepareApply(*cfd->GetLatestMutableCFOptions(), false); - printf("--------------- Column family \"%s\" (ID %u) --------------\n", - cfd->GetName().c_str(), (unsigned int)cfd->GetID()); - printf("log number: %lu\n", (unsigned long)cfd->GetLogNumber()); + 
printf("--------------- Column family \"%s\" (ID %" PRIu32 + ") --------------\n", + cfd->GetName().c_str(), cfd->GetID()); + printf("log number: %" PRIu64 "\n", cfd->GetLogNumber()); auto comparator = comparators.find(cfd->GetID()); if (comparator != comparators.end()) { printf("comparator: %s\n", comparator->second.c_str()); @@ -3337,36 +4112,41 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, delete v; } - // Free builders - for (auto& builder : builders) { - delete builder.second; - } - next_file_number_.store(next_file + 1); - last_to_be_written_sequence_ = last_sequence; + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; last_sequence_ = last_sequence; prev_log_number_ = previous_log_number; - printf( - "next_file_number %lu last_sequence " - "%lu prev_log_number %lu max_column_family %u\n", - (unsigned long)next_file_number_.load(), (unsigned long)last_sequence, - (unsigned long)previous_log_number, - column_family_set_->GetMaxColumnFamily()); + printf("next_file_number %" PRIu64 " last_sequence %" PRIu64 + " prev_log_number %" PRIu64 " max_column_family %" PRIu32 + " min_log_number_to_keep " + "%" PRIu64 "\n", + next_file_number_.load(), last_sequence, previous_log_number, + column_family_set_->GetMaxColumnFamily(), + min_log_number_to_keep_2pc()); } return s; } #endif // ROCKSDB_LITE -void VersionSet::MarkFileNumberUsedDuringRecovery(uint64_t number) { - // only called during recovery which is single threaded, so this works because - // there can't be concurrent calls +void VersionSet::MarkFileNumberUsed(uint64_t number) { + // only called during recovery and repair which are single threaded, so this + // works because there can't be concurrent calls if (next_file_number_.load(std::memory_order_relaxed) <= number) { next_file_number_.store(number + 1, std::memory_order_relaxed); } } +// Called only either from ::LogAndApply which is protected by mutex or during +// recovery which is single-threaded. +void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { + if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed); + } +} + Status VersionSet::WriteSnapshot(log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? @@ -3412,7 +4192,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { cfd->current()->storage_info()->LevelFiles(level)) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno, + f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction); } } @@ -3532,8 +4312,9 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, // approximate offset of "key" within the table. 
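// This per-file estimate ultimately serves the public
// DB::GetApproximateSizes() API; a minimal caller sketch (the db handle and
// keys are hypothetical):
//
//   rocksdb::Range r("user_key_a", "user_key_z");
//   uint64_t bytes = 0;
//   db->GetApproximateSizes(&r, 1, &bytes);
//   // bytes is roughly the on-disk size of SST data overlapping the range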
TableReader* table_reader_ptr; InternalIterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), env_options_, v->cfd_->internal_comparator(), f.fd, - nullptr /* range_del_agg */, &table_reader_ptr); + ReadOptions(), v->env_options_, v->cfd_->internal_comparator(), + *f.file_metadata, nullptr /* range_del_agg */, + v->GetMutableCFOptions().prefix_extractor.get(), &table_reader_ptr); if (table_reader_ptr != nullptr) { result = table_reader_ptr->ApproximateOffsetOf(key); } @@ -3585,7 +4366,8 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } InternalIterator* VersionSet::MakeInputIterator( - const Compaction* c, RangeDelAggregator* range_del_agg) { + const Compaction* c, RangeDelAggregator* range_del_agg, + const EnvOptions& env_options_compactions) { auto cfd = c->column_family_data(); ReadOptions read_options; read_options.verify_checksums = true; @@ -3610,26 +4392,25 @@ InternalIterator* VersionSet::MakeInputIterator( const LevelFilesBrief* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( - read_options, env_options_compactions_, - cfd->internal_comparator(), flevel->files[i].fd, range_del_agg, + read_options, env_options_compactions, cfd->internal_comparator(), + *flevel->files[i].file_metadata, range_del_agg, + c->mutable_cf_options()->prefix_extractor.get(), nullptr /* table_reader_ptr */, nullptr /* no per level latency histogram */, true /* for_compaction */, nullptr /* arena */, - false /* skip_filters */, (int)which /* level */); + false /* skip_filters */, static_cast(which) /* level */); } } else { // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new LevelFileIteratorState( - cfd->table_cache(), read_options, env_options_compactions_, - cfd->internal_comparator(), - nullptr /* no per level latency histogram */, - true /* for_compaction */, false /* prefix enabled */, - false /* skip_filters */, (int)which /* level */, - range_del_agg), - new LevelFileNumIterator(cfd->internal_comparator(), - c->input_levels(which), - false /* don't sample compaction */)); + list[num++] = new LevelIterator( + cfd->table_cache(), read_options, env_options_compactions, + cfd->internal_comparator(), c->input_levels(which), + c->mutable_cf_options()->prefix_extractor.get(), + false /* should_sample */, + nullptr /* no per level latency histogram */, + true /* for_compaction */, false /* skip_filters */, + static_cast(which) /* level */, range_del_agg, + c->boundaries(which)); } } } @@ -3686,6 +4467,8 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { } } } +#else + (void)c; #endif return true; // everything good } @@ -3724,36 +4507,41 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); - if (path_id < db_options_->db_paths.size()) { - filemetadata.db_path = db_options_->db_paths[path_id].path; + if (path_id < cfd->ioptions()->cf_paths.size()) { + filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path; } else { - assert(!db_options_->db_paths.empty()); - filemetadata.db_path = db_options_->db_paths.back().path; + assert(!cfd->ioptions()->cf_paths.empty()); + filemetadata.db_path = cfd->ioptions()->cf_paths.back().path; } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; - filemetadata.size = file->fd.GetFileSize(); + filemetadata.size = 
static_cast(file->fd.GetFileSize()); filemetadata.smallestkey = file->smallest.user_key().ToString(); filemetadata.largestkey = file->largest.user_key().ToString(); - filemetadata.smallest_seqno = file->smallest_seqno; - filemetadata.largest_seqno = file->largest_seqno; + filemetadata.smallest_seqno = file->fd.smallest_seqno; + filemetadata.largest_seqno = file->fd.largest_seqno; + filemetadata.num_reads_sampled = file->stats.num_reads_sampled.load( + std::memory_order_relaxed); + filemetadata.being_compacted = file->being_compacted; + filemetadata.num_entries = file->num_entries; + filemetadata.num_deletions = file->num_deletions; metadata->push_back(filemetadata); } } } } -void VersionSet::GetObsoleteFiles(std::vector* files, +void VersionSet::GetObsoleteFiles(std::vector* files, std::vector* manifest_filenames, uint64_t min_pending_output) { assert(manifest_filenames->empty()); obsolete_manifests_.swap(*manifest_filenames); - std::vector pending_files; - for (auto f : obsolete_files_) { - if (f->fd.GetNumber() < min_pending_output) { - files->push_back(f); + std::vector pending_files; + for (auto& f : obsolete_files_) { + if (f.metadata->fd.GetNumber() < min_pending_output) { + files->push_back(std::move(f)); } else { - pending_files.push_back(f); + pending_files.push_back(std::move(f)); } } obsolete_files_.swap(pending_files); @@ -3763,7 +4551,9 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( const ColumnFamilyOptions& cf_options, VersionEdit* edit) { assert(edit->is_column_family_add_); - Version* dummy_versions = new Version(nullptr, this); + MutableCFOptions dummy_cf_options; + Version* dummy_versions = + new Version(nullptr, this, env_options_, dummy_cf_options); // Ref() dummy version once so that later we can call Unref() to delete it // by avoiding calling "delete" explicitly (~Version is private) dummy_versions->Ref(); @@ -3771,7 +4561,9 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( edit->column_family_name_, edit->column_family_, dummy_versions, cf_options); - Version* v = new Version(new_cfd, this, current_version_number_++); + Version* v = new Version(new_cfd, this, env_options_, + *new_cfd->GetLatestMutableCFOptions(), + current_version_number_++); // Fill level target base information. 
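// For reference, with level_compaction_dynamic_level_bytes=false the base
// calculation below targets roughly
//
//   target(L) = max_bytes_for_level_base * max_bytes_for_level_multiplier^(L-1)
//
// per level L >= 1, e.g. with the defaults (256 MB base, multiplier 10):
// L1 = 256 MB, L2 = 2.5 GB, L3 = 25 GB; the dynamic mode instead works
// backwards from the observed size of the last level.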
v->storage_info()->CalculateBaseBytes(*new_cfd->ioptions(), @@ -3811,4 +4603,405 @@ uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) { return total_files_size; } +ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const EnvOptions& _env_options, + Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller) + : VersionSet(dbname, _db_options, _env_options, table_cache, + write_buffer_manager, write_controller) {} + +ReactiveVersionSet::~ReactiveVersionSet() {} + +Status ReactiveVersionSet::Recover( + const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status) { + assert(manifest_reader != nullptr); + assert(manifest_reporter != nullptr); + assert(manifest_reader_status != nullptr); + + std::unordered_map cf_name_to_options; + for (const auto& cf : column_families) { + cf_name_to_options.insert({cf.name, cf.options}); + } + + // add default column family + auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); + if (default_cf_iter == cf_name_to_options.end()) { + return Status::InvalidArgument("Default column family not specified"); + } + VersionEdit default_cf_edit; + default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName); + default_cf_edit.SetColumnFamily(0); + ColumnFamilyData* default_cfd = + CreateColumnFamily(default_cf_iter->second, &default_cf_edit); + // In recovery, nobody else can access it, so it's fine to set it to be + // initialized earlier. + default_cfd->set_initialized(); + + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + std::unordered_map> + builders; + std::unordered_map column_families_not_found; + builders.insert( + std::make_pair(0, std::unique_ptr( + new BaseReferencedVersionBuilder(default_cfd)))); + + manifest_reader_status->reset(new Status()); + manifest_reporter->reset(new LogReporter()); + static_cast(manifest_reporter->get())->status = + manifest_reader_status->get(); + Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader); + log::Reader* reader = manifest_reader->get(); + + int retry = 0; + while (s.ok() && retry < 1) { + assert(reader != nullptr); + Slice record; + std::string scratch; + while (s.ok() && reader->ReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + s = ApplyOneVersionEditToBuilder( + edit, cf_name_to_options, column_families_not_found, builders, + &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + } + if (s.ok()) { + bool enough = have_next_file && have_log_number && have_last_sequence; + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + if (cfd == nullptr) { + enough = false; + break; + } + } + } + if (enough) { + for (const auto& cf : column_families) { + auto cfd = column_family_set_->GetColumnFamily(cf.name); + assert(cfd != nullptr); + if (!cfd->IsDropped()) { + auto builder_iter = builders.find(cfd->GetID()); + assert(builder_iter != 
builders.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + true /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + if (!s.ok()) { + enough = false; + if (s.IsPathNotFound()) { + s = Status::OK(); + } + break; + } + } + } + } + if (enough) { + break; + } + } + ++retry; + } + + if (s.ok()) { + if (!have_prev_log_number) { + previous_log_number = 0; + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + MarkFileNumberUsed(previous_log_number); + MarkFileNumberUsed(log_number); + + for (auto cfd : *column_family_set_) { + assert(builders.count(cfd->GetID()) > 0); + auto builder = builders[cfd->GetID()]->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + auto builders_iter = builders.find(cfd->GetID()); + assert(builders_iter != builders.end()); + auto* builder = builders_iter->second->version_builder(); + + Version* v = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(v->storage_info()); + + // Install recovered version + v->PrepareApply(*cfd->GetLatestMutableCFOptions(), + !(db_options_->skip_stats_update_on_db_open)); + AppendVersion(cfd, v); + } + next_file_number_.store(next_file + 1); + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + prev_log_number_ = previous_log_number; + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + ROCKS_LOG_INFO(db_options_->info_log, + "Column family [%s] (ID %u), log number is %" PRIu64 "\n", + cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); + } + } + return s; +} + +Status ReactiveVersionSet::ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed) { + assert(manifest_reader != nullptr); + assert(cfds_changed != nullptr); + mu->AssertHeld(); + + Status s; + bool have_log_number = false; + bool have_prev_log_number = false; + bool have_next_file = false; + bool have_last_sequence = false; + uint64_t next_file = 0; + uint64_t last_sequence = 0; + uint64_t log_number = 0; + uint64_t previous_log_number = 0; + uint32_t max_column_family = 0; + uint64_t min_log_number_to_keep = 0; + + while (s.ok()) { + Slice record; + std::string scratch; + log::Reader* reader = manifest_reader->get(); + std::string old_manifest_path = reader->file()->file_name(); + while (reader->ReadRecord(&record, &scratch)) { + VersionEdit edit; + s = edit.DecodeFrom(record); + if (!s.ok()) { + break; + } + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + // If we cannot find this column family in our column family set, then it + // may be a new column family created by the primary after the secondary + // starts. Ignore it for now. 
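// Sketch of the intended driving loop for ReadAndApply() (names are
// hypothetical; in the full secondary-instance feature the public entry point
// is DB::OpenAsSecondary() followed by TryCatchUpWithPrimary()):
//
//   std::unordered_set<ColumnFamilyData*> cfds_changed;
//   mu->Lock();
//   Status s = reactive_versions->ReadAndApply(mu, &manifest_reader,
//                                              &cfds_changed);
//   mu->Unlock();
//   // cfds_changed now holds every column family whose Version advanced,
//   // telling the caller which superversions to refresh.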
+ if (nullptr == cfd) { + continue; + } + if (active_version_builders_.find(edit.column_family_) == + active_version_builders_.end()) { + std::unique_ptr builder_guard( + new BaseReferencedVersionBuilder(cfd)); + active_version_builders_.insert( + std::make_pair(edit.column_family_, std::move(builder_guard))); + } + s = ApplyOneVersionEditToBuilder( + edit, &have_log_number, &log_number, &have_prev_log_number, + &previous_log_number, &have_next_file, &next_file, + &have_last_sequence, &last_sequence, &min_log_number_to_keep, + &max_column_family); + if (!s.ok()) { + break; + } + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + s = builder->LoadTableHandlers( + cfd->internal_stats(), db_options_->max_file_opening_threads, + false /* prefetch_index_and_filter_in_cache */, + false /* is_initial_load */, + cfd->GetLatestMutableCFOptions()->prefix_extractor.get()); + TEST_SYNC_POINT_CALLBACK( + "ReactiveVersionSet::ReadAndApply:AfterLoadTableHandlers", &s); + if (!s.ok() && !s.IsPathNotFound()) { + break; + } else if (s.IsPathNotFound()) { + s = Status::OK(); + } else { // s.ok() == true + auto version = new Version(cfd, this, env_options_, + *cfd->GetLatestMutableCFOptions(), + current_version_number_++); + builder->SaveTo(version->storage_info()); + version->PrepareApply(*cfd->GetLatestMutableCFOptions(), true); + AppendVersion(cfd, version); + active_version_builders_.erase(builder_iter); + if (cfds_changed->count(cfd) == 0) { + cfds_changed->insert(cfd); + } + } + if (have_next_file) { + next_file_number_.store(next_file + 1); + } + if (have_last_sequence) { + last_allocated_sequence_ = last_sequence; + last_published_sequence_ = last_sequence; + last_sequence_ = last_sequence; + } + if (have_prev_log_number) { + prev_log_number_ = previous_log_number; + MarkFileNumberUsed(previous_log_number); + } + if (have_log_number) { + MarkFileNumberUsed(log_number); + } + column_family_set_->UpdateMaxColumnFamily(max_column_family); + MarkMinLogNumberToKeep2PC(min_log_number_to_keep); + } + // It's possible that: + // 1) s.IsCorruption(), indicating the current MANIFEST is corrupted. + // 2) we have finished reading the current MANIFEST. + // 3) we have encountered an IOError reading the current MANIFEST. + // We need to look for the next MANIFEST and start from there. If we cannot + // find the next MANIFEST, we should exit the loop. 
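// Note: CURRENT names exactly one manifest (e.g. "MANIFEST-000042\n"), and
// the primary may delete an old manifest right after switching CURRENT, so a
// PathNotFound here usually means the secondary lost that race:
//
//   primary:   writes MANIFEST-000044, repoints CURRENT, deletes MANIFEST-000042
//   secondary: had read CURRENT -> "MANIFEST-000042"; the open fails
//
// MaybeSwitchManifest() below re-reads CURRENT and retries until the file it
// names can be opened (see its do { ... } while (s.IsPathNotFound()) loop).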
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); + reader = manifest_reader->get(); + if (s.ok() && reader->file()->file_name() == old_manifest_path) { + break; + } + } + + if (s.ok()) { + for (auto cfd : *column_family_set_) { + auto builder_iter = active_version_builders_.find(cfd->GetID()); + if (builder_iter == active_version_builders_.end()) { + continue; + } + auto builder = builder_iter->second->version_builder(); + if (!builder->CheckConsistencyForNumLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + break; + } + } + } + return s; +} + +Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* log_number, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family) { + ColumnFamilyData* cfd = nullptr; + Status status; + if (edit.is_column_family_add_) { + // TODO (yanqin) for now the secondary ignores column families created + // after Open. This also simplifies handling of switching to a new MANIFEST + // and processing the snapshot of the system at the beginning of the + // MANIFEST. + return Status::OK(); + } else if (edit.is_column_family_drop_) { + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Drop a CF created by primary after secondary starts? Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + // Drop the column family by setting it to be 'dropped' without destroying + // the column family handle. + cfd->SetDropped(); + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } else { + cfd = column_family_set_->GetColumnFamily(edit.column_family_); + // Operation on a CF created after Open? Then ignore + if (cfd == nullptr) { + return Status::OK(); + } + auto builder_iter = active_version_builders_.find(edit.column_family_); + assert(builder_iter != active_version_builders_.end()); + auto builder = builder_iter->second->version_builder(); + assert(builder != nullptr); + builder->Apply(&edit); + } + return ExtractInfoFromVersionEdit( + cfd, edit, have_log_number, log_number, have_prev_log_number, + previous_log_number, have_next_file, next_file, have_last_sequence, + last_sequence, min_log_number_to_keep, max_column_family); +} + +Status ReactiveVersionSet::MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader) { + assert(manifest_reader != nullptr); + Status s; + do { + std::string manifest_path; + s = GetCurrentManifestPath(&manifest_path); + std::unique_ptr manifest_file; + if (s.ok()) { + if (nullptr == manifest_reader->get() || + manifest_reader->get()->file()->file_name() != manifest_path) { + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:0"); + TEST_SYNC_POINT( + "ReactiveVersionSet::MaybeSwitchManifest:" + "AfterGetCurrentManifestPath:1"); + s = env_->NewSequentialFile( + manifest_path, &manifest_file, + env_->OptimizeForManifestRead(env_options_)); + } else { + // No need to switch manifest. 
+ break; + } + } + std::unique_ptr manifest_file_reader; + if (s.ok()) { + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file), manifest_path)); + manifest_reader->reset(new log::FragmentBufferedReader( + nullptr, std::move(manifest_file_reader), reporter, + true /* checksum */, 0 /* log_number */)); + ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n", + manifest_path.c_str()); + // TODO (yanqin) every time we switch to a new MANIFEST, we clear the + // active_version_builders_ map because we choose to construct the + // versions from scratch, thanks to the first part of each MANIFEST + // written by VersionSet::WriteSnapshot. This is not necessary, but we + // choose this at present for the sake of simplicity. + active_version_builders_.clear(); + } + } while (s.IsPathNotFound()); + return s; +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/version_set.h b/thirdparty/rocksdb/db/version_set.h index 5862dea335..8b50dca76e 100644 --- a/thirdparty/rocksdb/db/version_set.h +++ b/thirdparty/rocksdb/db/version_set.h @@ -35,6 +35,7 @@ #include "db/file_indexer.h" #include "db/log_reader.h" #include "db/range_del_aggregator.h" +#include "db/read_callback.h" #include "db/table_cache.h" #include "db/version_builder.h" #include "db/version_edit.h" @@ -51,7 +52,6 @@ class Writer; } class Compaction; -class InternalIterator; class LogBuffer; class LookupKey; class MemTable; @@ -114,7 +114,7 @@ class VersionStorageInfo { // Update the accumulated stats from a file-meta. void UpdateAccumulatedStats(FileMetaData* file_meta); - // Decrease the current stat form a to-be-delected file-meta + // Decrease the current stat from a to-be-deleted file-meta void RemoveCurrentStats(FileMetaData* file_meta); void ComputeCompensatedSizes(); @@ -134,6 +134,23 @@ class VersionStorageInfo { // ComputeCompactionScore() void ComputeFilesMarkedForCompaction(); + // This computes ttl_expired_files_ and is called by + // ComputeCompactionScore() + void ComputeExpiredTtlFiles(const ImmutableCFOptions& ioptions, + const uint64_t ttl); + + // This computes bottommost_files_marked_for_compaction_ and is called by + // ComputeCompactionScore() or UpdateOldestSnapshot(). + // + // Among bottommost files (assumes they've already been computed), marks the + // ones that have keys that would be eliminated if recompacted, according to + // the seqnum of the oldest existing snapshot. Must be called every time + // oldest snapshot changes as that is when bottom-level files can become + // eligible for compaction. + // + // REQUIRES: DB mutex held + void ComputeBottommostFilesMarkedForCompaction(); + // Generate level_files_brief_ from files_ void GenerateLevelFilesBrief(); // Sort all files for this version based on their file size and @@ -146,6 +163,16 @@ class VersionStorageInfo { return level0_non_overlapping_; } + // Check whether each file in this version is bottommost (i.e., nothing in its + // key-range could possibly exist in an older file/level). + // REQUIRES: This version has not been saved + void GenerateBottommostFiles(); + + // Updates the oldest snapshot and related internal state, like the bottommost + // files marked for compaction. 
+ // REQUIRES: DB mutex held + void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); + int MaxInputLevel() const; int MaxOutputLevel(bool allow_ingest_behind) const; @@ -161,9 +188,11 @@ class VersionStorageInfo { std::vector* inputs, int hint_index = -1, // index of overlap file int* file_index = nullptr, // return index of overlap file - bool expand_range = true) // if set, returns files which overlap the - const; // range and overlap each other. If false, + bool expand_range = true, // if set, returns files which overlap the + // range and overlap each other. If false, // then just files intersecting the range + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -173,31 +202,32 @@ class VersionStorageInfo { const; void GetOverlappingInputsRangeBinarySearch( - int level, // level > 0 - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys + int level, // level > 0 + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys std::vector* inputs, int hint_index, // index of overlap file int* file_index, // return index of overlap file - bool within_interval = false) // if set, force the inputs within interval - const; + bool within_interval = false, // if set, force the inputs within interval + InternalKey** next_smallest = nullptr) // if non-null, returns the + const; // smallest key of next file not included void ExtendFileRangeOverlappingInterval( int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys - unsigned int index, // start extending from this index - int* startIndex, // return the startIndex of input range - int* endIndex) // return the endIndex of input range + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + unsigned int index, // start extending from this index + int* startIndex, // return the startIndex of input range + int* endIndex) // return the endIndex of input range const; void ExtendFileRangeWithinInterval( int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys - unsigned int index, // start extending from this index - int* startIndex, // return the startIndex of input range - int* endIndex) // return the endIndex of input range + const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys + unsigned int index, // start extending from this index + int* startIndex, // return the startIndex of input range + int* endIndex) // return the endIndex of input range const; // Returns true iff some file in the specified level overlaps @@ -263,7 +293,23 @@ class VersionStorageInfo { return files_marked_for_compaction_; } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& ExpiredTtlFiles() const { + assert(finalized_); + return expired_ttl_files_; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + // REQUIRES: DB mutex held during access + const autovector>& + BottommostFilesMarkedForCompaction() const { + assert(finalized_); + return bottommost_files_marked_for_compaction_; + } + 
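// A sketch of how these accessors are meant to be consumed when a snapshot is
// released (hypothetical call site; DB mutex held):
//
//   vstorage->UpdateOldestSnapshot(oldest_remaining_snapshot_seqnum);
//   for (auto& level_and_file :
//        vstorage->BottommostFilesMarkedForCompaction()) {
//     // level_and_file.first is the level, .second the FileMetaData*;
//     // schedule a bottom-level compaction so key versions that are no
//     // longer snapshot-protected can finally be dropped.
//   }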
int base_level() const { return base_level_; } + double level_multiplier() const { return level_multiplier_; } // REQUIRES: lock is held // Set the index that is used to offset into files_by_compaction_pri_ to find @@ -356,6 +402,20 @@ class VersionStorageInfo { bool force_consistency_checks() const { return force_consistency_checks_; } + SequenceNumber bottommost_files_mark_threshold() const { + return bottommost_files_mark_threshold_; + } + + // Returns whether any key in [`smallest_key`, `largest_key`] could appear in + // an older L0 file than `last_l0_idx` or in a greater level than `last_level` + // + // @param last_level Level after which we check for overlap + // @param last_l0_idx If `last_level == 0`, index of L0 file after which we + // check for overlap; otherwise, must be -1 + bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key, + const Slice& largest_user_key, + int last_level, int last_l0_idx); + private: const InternalKeyComparator* internal_comparator_; const Comparator* user_comparator_; @@ -380,6 +440,8 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + double level_multiplier_; + // A list for the same set of files that are stored in files_, // but files in each level are now sorted based on file // size. The file with the largest size is at the front. @@ -405,6 +467,30 @@ class VersionStorageInfo { // ComputeCompactionScore() autovector> files_marked_for_compaction_; + autovector> expired_ttl_files_; + + // These files are considered bottommost because none of their keys can exist + // at lower levels. They are not necessarily all in the same level. The marked + // ones are eligible for compaction because they contain duplicate key + // versions that are no longer protected by snapshot. These variables are + // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and + // `ComputeBottommostFilesMarkedForCompaction()`. + autovector> bottommost_files_; + autovector> + bottommost_files_marked_for_compaction_; + + // Threshold for needing to mark another bottommost file. Maintain it so we + // can quickly check when releasing a snapshot whether more bottommost files + // became eligible for compaction. It's defined as the min of the max nonzero + // seqnums of unmarked bottommost files. + SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; + + // Monotonically increases as we release old snapshots. Zero indicates no + // snapshots have been released yet. When no snapshots remain we set it to the + // current seqnum, which needs to be protected as a snapshot can still be + // created that references it. + SequenceNumber oldest_snapshot_seqnum_ = 0; + // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). 
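// Worked example for bottommost_files_mark_threshold_ declared above: if the
// unmarked bottommost files have largest seqnums {12, 30, 57}, the threshold
// is 12. Releasing a snapshot cannot newly mark anything until the oldest
// remaining snapshot's seqnum passes 12; at that point the seqnum-12 file is
// marked and the threshold advances to 30, so most snapshot releases cost a
// single comparison rather than a rescan of all bottommost files.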
@@ -428,7 +514,7 @@ class VersionStorageInfo { uint64_t accumulated_num_deletions_; // current number of non_deletion entries uint64_t current_num_non_deletions_; - // current number of delection entries + // current number of deletion entries uint64_t current_num_deletions_; // current number of file samples uint64_t current_num_samples_; @@ -462,9 +548,10 @@ class Version { MergeIteratorBuilder* merger_iter_builder, int level, RangeDelAggregator* range_del_agg); - void AddRangeDelIteratorsForLevel( - const ReadOptions& read_options, const EnvOptions& soptions, int level, - std::vector* range_del_iters); + Status OverlapWithLevelIterator(const ReadOptions&, const EnvOptions&, + const Slice& smallest_user_key, + const Slice& largest_user_key, + int level, bool* overlap); // Lookup the value for key. If found, store it in *val and // return OK. Else return a non-OK status. @@ -484,8 +571,9 @@ class Version { // REQUIRES: lock is not held void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value, Status* status, MergeContext* merge_context, - RangeDelAggregator* range_del_agg, bool* value_found = nullptr, - bool* key_exists = nullptr, SequenceNumber* seq = nullptr, + SequenceNumber* max_covering_tombstone_seq, + bool* value_found = nullptr, bool* key_exists = nullptr, + SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr, bool* is_blob = nullptr); // Loads some stats information from files. Call without mutex held. It needs @@ -506,13 +594,13 @@ class Version { // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false, bool print_stats = false) const; - // Returns the version nuber of this version + // Returns the version number of this version uint64_t GetVersionNumber() const { return version_number_; } // REQUIRES: lock is held // On success, "tp" will contains the table properties of the file // specified in "file_meta". If the file name of "file_meta" is - // known ahread, passing it by a non-null "fname" can save a + // known ahead, passing it by a non-null "fname" can save a // file-name conversion. Status GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, @@ -521,14 +609,14 @@ class Version { // REQUIRES: lock is held // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the - // tables' propertis, represented as shared_ptr. + // tables' properties, represented as std::shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, TablePropertiesCollection* props) const; // REQUIRES: lock is held - // On success, "tp" will contains the aggregated table property amoug + // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. 
Status GetAggregatedTableProperties( std::shared_ptr* tp, int level = -1); @@ -554,8 +642,13 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + uint64_t GetSstFilesSize(); + + MutableCFOptions GetMutableCFOptions() { return mutable_cf_options_; } + private: Env* env_; + friend class ReactiveVersionSet; friend class VersionSet; const InternalKeyComparator* internal_comparator() const { @@ -576,7 +669,7 @@ class Version { bool IsFilterSkipped(int level, bool is_file_last_in_level = false); // The helper function of UpdateAccumulatedStats, which may fill the missing - // fields of file_mata from its associated TableProperties. + // fields of file_meta from its associated TableProperties. // Returns true if it does initialize FileMetaData. bool MaybeInitializeFileMetaData(FileMetaData* file_meta); @@ -600,12 +693,15 @@ class Version { Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + const EnvOptions env_options_; + const MutableCFOptions mutable_cf_options_; // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; - Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); + Version(ColumnFamilyData* cfd, VersionSet* vset, const EnvOptions& env_opt, + MutableCFOptions mutable_cf_options, uint64_t version_number = 0); ~Version(); @@ -614,13 +710,45 @@ class Version { void operator=(const Version&); }; +struct ObsoleteFileInfo { + FileMetaData* metadata; + std::string path; + + ObsoleteFileInfo() noexcept : metadata(nullptr) {} + ObsoleteFileInfo(FileMetaData* f, const std::string& file_path) + : metadata(f), path(file_path) {} + + ObsoleteFileInfo(const ObsoleteFileInfo&) = delete; + ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete; + + ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : + ObsoleteFileInfo() { + *this = std::move(rhs); + } + + ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept { + path = std::move(rhs.path); + metadata = rhs.metadata; + rhs.metadata = nullptr; + + return *this; + } + + void DeleteMetadata() { + delete metadata; + metadata = nullptr; + } +}; + +class BaseReferencedVersionBuilder; + class VersionSet { public: VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options, const EnvOptions& env_options, Cache* table_cache, WriteBufferManager* write_buffer_manager, WriteController* write_controller); - ~VersionSet(); + virtual ~VersionSet(); // Apply *edit to the current version to form a new descriptor that // is both saved to persistent state and installed as the new @@ -634,9 +762,15 @@ class VersionSet { InstrumentedMutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { + autovector cfds; + cfds.emplace_back(column_family_data); + autovector mutable_cf_options_list; + mutable_cf_options_list.emplace_back(&mutable_cf_options); + autovector> edit_lists; autovector edit_list; - edit_list.push_back(edit); - return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu, + edit_list.emplace_back(edit); + edit_lists.emplace_back(edit_list); + return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, db_directory, new_descriptor_log, column_family_options); } // The batch version. 
If edit_list.size() > 1, caller must ensure that @@ -646,7 +780,29 @@ class VersionSet { const MutableCFOptions& mutable_cf_options, const autovector& edit_list, InstrumentedMutex* mu, Directory* db_directory = nullptr, bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr); + const ColumnFamilyOptions* column_family_options = nullptr) { + autovector cfds; + cfds.emplace_back(column_family_data); + autovector mutable_cf_options_list; + mutable_cf_options_list.emplace_back(&mutable_cf_options); + autovector> edit_lists; + edit_lists.emplace_back(edit_list); + return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, + db_directory, new_descriptor_log, column_family_options); + } + + // The across-multi-cf batch version. If edit_lists contains more than + // one version edit, the caller must ensure that no edit in the list is a + // column family manipulation. + virtual Status LogAndApply( + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector>& edit_lists, + InstrumentedMutex* mu, Directory* db_directory = nullptr, + bool new_descriptor_log = false, + const ColumnFamilyOptions* new_cf_options = nullptr); + + Status GetCurrentManifestPath(std::string* manifest_filename); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families @@ -691,52 +847,87 @@ class VersionSet { uint64_t current_next_file_number() const { return next_file_number_.load(); } + uint64_t min_log_number_to_keep_2pc() const { + return min_log_number_to_keep_2pc_.load(); + } + // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } + // Fetch and add n new file numbers + uint64_t FetchAddFileNumber(uint64_t n) { + return next_file_number_.fetch_add(n); + } + // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); } // Note: memory_order_acquire must be sufficient. - uint64_t LastToBeWrittenSequence() const { - return last_to_be_written_sequence_.load(std::memory_order_seq_cst); + uint64_t LastAllocatedSequence() const { + return last_allocated_sequence_.load(std::memory_order_seq_cst); + } + + // Note: memory_order_acquire must be sufficient. + uint64_t LastPublishedSequence() const { + return last_published_sequence_.load(std::memory_order_seq_cst); } // Set the last sequence number to s.
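The two inline overloads above both funnel into the across-multi-cf batch form. A minimal caller-side sketch of that batch path, assuming two already-open column families with one edit each (cfd1/cfd2, opts1/opts2, e1/e2 and the surrounding variables are hypothetical; the locking convention mirrors the test code later in this patch):

autovector<ColumnFamilyData*> cfds;
cfds.emplace_back(cfd1);
cfds.emplace_back(cfd2);
autovector<const MutableCFOptions*> cf_opts;
cf_opts.emplace_back(&opts1);
cf_opts.emplace_back(&opts2);
autovector<autovector<VersionEdit*>> edit_lists;  // one inner list per column family
autovector<VersionEdit*> list1;
list1.emplace_back(&e1);
autovector<VersionEdit*> list2;
list2.emplace_back(&e2);
edit_lists.emplace_back(list1);
edit_lists.emplace_back(list2);
mu->Lock();  // LogAndApply expects the DB mutex held; it may release and re-acquire it
Status s = versions->LogAndApply(cfds, cf_opts, edit_lists, mu);
mu->Unlock();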
void SetLastSequence(uint64_t s) { assert(s >= last_sequence_); - // Last visible seqeunce must always be less than last written seq - assert(!db_options_->concurrent_prepare || - s <= last_to_be_written_sequence_); + // Last visible sequence must always be less than last written seq + assert(!db_options_->two_write_queues || s <= last_allocated_sequence_); last_sequence_.store(s, std::memory_order_release); } // Note: memory_order_release must be sufficient - void SetLastToBeWrittenSequence(uint64_t s) { - assert(s >= last_to_be_written_sequence_); - last_to_be_written_sequence_.store(s, std::memory_order_seq_cst); + void SetLastPublishedSequence(uint64_t s) { + assert(s >= last_published_sequence_); + last_published_sequence_.store(s, std::memory_order_seq_cst); } // Note: memory_order_release must be sufficient - uint64_t FetchAddLastToBeWrittenSequence(uint64_t s) { - return last_to_be_written_sequence_.fetch_add(s, std::memory_order_seq_cst); + void SetLastAllocatedSequence(uint64_t s) { + assert(s >= last_allocated_sequence_); + last_allocated_sequence_.store(s, std::memory_order_seq_cst); + } + + // Note: memory_order_release must be sufficient + uint64_t FetchAddLastAllocatedSequence(uint64_t s) { + return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst); } // Mark the specified file number as used. - // REQUIRED: this is only called during single-threaded recovery - void MarkFileNumberUsedDuringRecovery(uint64_t number); + // REQUIRED: this is only called during single-threaded recovery or repair. + void MarkFileNumberUsed(uint64_t number); + + // Mark the specified log number as deleted + // REQUIRED: this is only called during single-threaded recovery or repair, or + // from ::LogAndApply where the global mutex is held. + void MarkMinLogNumberToKeep2PC(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. uint64_t prev_log_number() const { return prev_log_number_; } - // Returns the minimum log number such that all - // log numbers less than or equal to it can be deleted - uint64_t MinLogNumber() const { + // Returns the minimum log number which still has data not flushed to any SST + // file. + // In non-2PC mode, all the log numbers smaller than this number can be safely + // deleted. + uint64_t MinLogNumberWithUnflushedData() const { + return PreComputeMinLogNumberWithUnflushedData(nullptr); + } + // Returns the minimum log number which still has data not flushed to any SST + // file, except data from `cfd_to_skip`. + uint64_t PreComputeMinLogNumberWithUnflushedData( + const ColumnFamilyData* cfd_to_skip) const { uint64_t min_log_num = std::numeric_limits::max(); for (auto cfd : *column_family_set_) { + if (cfd == cfd_to_skip) { + continue; + } // It's safe to ignore dropped column families here: // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST. if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) { @@ -748,8 +939,9 @@ class VersionSet { // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. - InternalIterator* MakeInputIterator(const Compaction* c, - RangeDelAggregator* range_del_agg); + InternalIterator* MakeInputIterator( + const Compaction* c, RangeDelAggregator* range_del_agg, + const EnvOptions& env_options_compactions); // Add all files listed in any live version to *live. 
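Taken together, the accessors and setters above maintain the ordering last_sequence_ <= last_published_sequence_ <= last_allocated_sequence_. A simplified illustration of the allocate/publish/make-visible progression under two write queues (illustrative only; the real write path threads this through the WAL writer):

const uint64_t n = 3;  // say the batch carries three keys
uint64_t base = versions->FetchAddLastAllocatedSequence(n);  // reserves (base, base + n]
// ... append the batch to the WAL and insert it into the memtable ...
versions->SetLastPublishedSequence(base + n);  // readers may now observe up to base + n
versions->SetLastSequence(base + n);           // the classic visibility bound catches up last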
void AddLiveFiles(std::vector* live_list); @@ -775,26 +967,33 @@ class VersionSet { // This function doesn't support leveldb SST filenames void GetLiveFilesMetaData(std::vector *metadata); - void GetObsoleteFiles(std::vector* files, + void GetObsoleteFiles(std::vector* files, std::vector* manifest_filenames, uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } const EnvOptions& env_options() { return env_options_; } + void ChangeEnvOptions(const MutableDBOptions& new_options) { + env_options_.writable_file_max_buffer_size = + new_options.writable_file_max_buffer_size; + } + + const ImmutableDBOptions* db_options() const { return db_options_; } static uint64_t GetNumLiveVersions(Version* dummy_versions); static uint64_t GetTotalSstFilesSize(Version* dummy_versions); - private: + protected: struct ManifestWriter; friend class Version; friend class DBImpl; + friend class DBImplReadOnly; struct LogReporter : public log::Reader::Reporter { Status* status; - virtual void Corruption(size_t bytes, const Status& s) override { + virtual void Corruption(size_t /*bytes*/, const Status& s) override { if (this->status->ok()) *this->status = s; } }; @@ -814,23 +1013,57 @@ class VersionSet { ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( + VersionEdit& edit, + const std::unordered_map& name_to_opts, + std::unordered_map& column_families_not_found, + std::unordered_map< + uint32_t, std::unique_ptr>& builders, + bool* have_log_number, uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + + Status ExtractInfoFromVersionEdit( + ColumnFamilyData* cfd, const VersionEdit& edit, bool* have_log_number, + uint64_t* log_number, bool* have_prev_log_number, + uint64_t* previous_log_number, bool* have_next_file, uint64_t* next_file, + bool* have_last_sequence, SequenceNumber* last_sequence, + uint64_t* min_log_number_to_keep, uint32_t* max_column_family); + std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; + // Any log number equal to or lower than this should be ignored during + // recovery, and is qualified for being deleted in 2PC mode. In non-2PC mode, + // this number is ignored. + std::atomic min_log_number_to_keep_2pc_ = {0}; uint64_t manifest_file_number_; uint64_t options_file_number_; uint64_t pending_manifest_file_number_; - // The last seq visible to reads + // The last seq visible to reads. It normally indicates the last sequence in + // the memtable but when using two write queues it could also indicate the + // last sequence in the WAL visible to reads. std::atomic last_sequence_; - // The last seq with which a writer has written/will write. - std::atomic last_to_be_written_sequence_; + // The last seq that is already allocated. It is applicable only when we have + // two write queues. In that case seq might or might not have appeared in the + // memtable but it is expected to appear in the WAL. + // We have last_sequence <= last_allocated_sequence_ + std::atomic last_allocated_sequence_; + // The last allocated sequence that is also published to the readers. This is + // applicable only when last_seq_same_as_publish_seq_ is not set.
Otherwise + // last_sequence_ also indicates the last published seq. + // We have last_sequence <= last_published_sequence_ <= + // last_allocated_sequence_ + std::atomic last_published_sequence_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted // Opened lazily - unique_ptr descriptor_log_; + std::unique_ptr descriptor_log_; // generates a increasing version number for every new version uint64_t current_version_number_; @@ -841,23 +1074,83 @@ class VersionSet { // Current size of manifest file uint64_t manifest_file_size_; - std::vector obsolete_files_; + std::vector obsolete_files_; std::vector obsolete_manifests_; // env options for all reads and writes except compactions - const EnvOptions& env_options_; - - // env options used for compactions. This is a copy of - // env_options_ but with readaheads set to readahead_compactions_. - const EnvOptions env_options_compactions_; + EnvOptions env_options_; + private: // No copying allowed VersionSet(const VersionSet&); void operator=(const VersionSet&); + // REQUIRES db mutex at beginning. may release and re-acquire db mutex + Status ProcessManifestWrites(std::deque& writers, + InstrumentedMutex* mu, Directory* db_directory, + bool new_descriptor_log, + const ColumnFamilyOptions* new_cf_options); + void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, + void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, VersionEdit* edit, InstrumentedMutex* mu); }; +class ReactiveVersionSet : public VersionSet { + public: + ReactiveVersionSet(const std::string& dbname, + const ImmutableDBOptions* _db_options, + const EnvOptions& _env_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, + WriteController* write_controller); + + ~ReactiveVersionSet() override; + + Status ReadAndApply( + InstrumentedMutex* mu, + std::unique_ptr* manifest_reader, + std::unordered_set* cfds_changed); + + Status Recover(const std::vector& column_families, + std::unique_ptr* manifest_reader, + std::unique_ptr* manifest_reporter, + std::unique_ptr* manifest_reader_status); + + protected: + using VersionSet::ApplyOneVersionEditToBuilder; + + // REQUIRES db mutex + Status ApplyOneVersionEditToBuilder( + VersionEdit& edit, bool* have_log_number, uint64_t* log_number, + bool* have_prev_log_number, uint64_t* previous_log_number, + bool* have_next_file, uint64_t* next_file, bool* have_last_sequence, + SequenceNumber* last_sequence, uint64_t* min_log_number_to_keep, + uint32_t* max_column_family); + + Status MaybeSwitchManifest( + log::Reader::Reporter* reporter, + std::unique_ptr* manifest_reader); + + private: + std::unordered_map> + active_version_builders_; + + using VersionSet::LogAndApply; + using VersionSet::Recover; + + Status LogAndApply( + const autovector& /*cfds*/, + const autovector& /*mutable_cf_options_list*/, + const autovector>& /*edit_lists*/, + InstrumentedMutex* /*mu*/, Directory* /*db_directory*/, + bool /*new_descriptor_log*/, + const ColumnFamilyOptions* /*new_cf_option*/) override { + return Status::NotSupported("not supported in reactive mode"); + } + + // No copy allowed + ReactiveVersionSet(const ReactiveVersionSet&); + ReactiveVersionSet& operator=(const ReactiveVersionSet&); +}; + } // namespace rocksdb diff --git a/thirdparty/rocksdb/db/version_set_test.cc b/thirdparty/rocksdb/db/version_set_test.cc index 625d459226..43924a3add 100644 --- a/thirdparty/rocksdb/db/version_set_test.cc +++ b/thirdparty/rocksdb/db/version_set_test.cc 
@@ -8,7 +8,10 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/version_set.h" +#include "db/log_writer.h" +#include "table/mock_table.h" #include "util/logging.h" +#include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" @@ -22,7 +25,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { GenerateLevelFilesBriefTest() { } - ~GenerateLevelFilesBriefTest() { + ~GenerateLevelFilesBriefTest() override { for (size_t i = 0; i < files_.size(); i++) { delete files_[i]; } @@ -76,7 +79,7 @@ class CountingLogger : public Logger { public: CountingLogger() : log_count(0) {} using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { log_count++; } + void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; } int log_count; }; @@ -112,7 +115,7 @@ class VersionStorageInfoTest : public testing::Test { mutable_cf_options_(options_), vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, nullptr, false) {} - ~VersionStorageInfoTest() { + ~VersionStorageInfoTest() override { for (int i = 0; i < vstorage_.num_levels(); i++) { for (auto* f : vstorage_.LevelFiles(i)) { if (--f->refs == 0) { @@ -135,6 +138,35 @@ class VersionStorageInfoTest : public testing::Test { f->num_deletions = 0; vstorage_.AddFile(level, f); } + + void Add(int level, uint32_t file_number, const InternalKey& smallest, + const InternalKey& largest, uint64_t file_size = 0) { + assert(level < vstorage_.num_levels()); + FileMetaData* f = new FileMetaData; + f->fd = FileDescriptor(file_number, 0, file_size); + f->smallest = smallest; + f->largest = largest; + f->compensated_file_size = file_size; + f->refs = 0; + f->num_entries = 0; + f->num_deletions = 0; + vstorage_.AddFile(level, f); + } + + std::string GetOverlappingFiles(int level, const InternalKey& begin, + const InternalKey& end) { + std::vector inputs; + vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs); + + std::string result; + for (size_t i = 0; i < inputs.size(); ++i) { + if (i > 0) { + result += ","; + } + AppendNumberTo(&result, inputs[i]->fd.GetNumber()); + } + return result; + } }; TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) { @@ -234,6 +266,93 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) { ASSERT_EQ(0, logger_->log_count); } +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 40000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 1U, "1", "2", 10000U); + Add(0, 2U, "1", "2", 10000U); + Add(0, 3U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should stay at the configured 5 + ASSERT_EQ(vstorage_.level_multiplier(), 5.0); + // Level sizes should be 40,000, 51,450, 257,250 + ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3)); + ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4)); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; +
mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 10000U); + Add(0, 12U, "1", "2", 10000U); + Add(0, 13U, "1", "2", 10000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + +TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 10000; + mutable_cf_options_.max_bytes_for_level_multiplier = 5; + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + + Add(0, 11U, "1", "2", 5000U); + Add(0, 12U, "1", "2", 5000U); + Add(0, 13U, "1", "2", 5000U); + Add(0, 14U, "1", "2", 5000U); + Add(0, 15U, "1", "2", 5000U); + Add(0, 16U, "1", "2", 5000U); + + Add(5, 4U, "1", "2", 1286250U); + Add(4, 5U, "1", "2", 200000U); + Add(3, 6U, "1", "2", 40000U); + Add(2, 7U, "1", "2", 8000U); + + vstorage_.CalculateBaseBytes(ioptions_, mutable_cf_options_); + ASSERT_EQ(0, logger_->log_count); + ASSERT_EQ(2, vstorage_.base_level()); + // level multiplier should be 3.5 + ASSERT_LT(vstorage_.level_multiplier(), 3.6); + ASSERT_GT(vstorage_.level_multiplier(), 3.4); + // Level size should be around 30,000, 105,000, 367,500 + ASSERT_EQ(30000U, vstorage_.MaxBytesForLevel(2)); + ASSERT_LT(vstorage_.MaxBytesForLevel(3), 110000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(3), 100000U); + ASSERT_LT(vstorage_.MaxBytesForLevel(4), 370000U); + ASSERT_GT(vstorage_.MaxBytesForLevel(4), 360000U); +} + TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { // Test whether the overlaps are detected as expected Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level @@ -257,6 +376,40 @@ TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) { ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize()); } +TEST_F(VersionStorageInfoTest, GetOverlappingInputs) { + // Two files that overlap at the range deletion tombstone sentinel. + Add(1, 1U, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1); + Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1); + // Two files that overlap at the same user key. + Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1); + Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1); + // Two files that do not overlap. 
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1); + Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateLevelFilesBrief(); + + ASSERT_EQ("1,2", GetOverlappingFiles( + 1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue})); + ASSERT_EQ("1", GetOverlappingFiles( + 1, {"a", 0, kTypeValue}, {"b", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("2", GetOverlappingFiles( + 1, {"b", kMaxSequenceNumber, kTypeValue}, {"c", 0, kTypeValue})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue})); + ASSERT_EQ("3", GetOverlappingFiles( + 1, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeRangeDeletion})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"e", kMaxSequenceNumber, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("3,4", GetOverlappingFiles( + 1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue})); + ASSERT_EQ("5", GetOverlappingFiles( + 1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue})); + ASSERT_EQ("6", GetOverlappingFiles( + 1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue})); +} + + class FindLevelFileTest : public testing::Test { public: LevelFilesBrief file_level_; @@ -265,8 +418,7 @@ class FindLevelFileTest : public testing::Test { FindLevelFileTest() : disjoint_sorted_files_(true) { } - ~FindLevelFileTest() { - } + ~FindLevelFileTest() override {} void LevelFileInit(size_t num = 0) { char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange)); @@ -450,6 +602,493 @@ TEST_F(FindLevelFileTest, LevelOverlappingFiles) { ASSERT_TRUE(Overlaps("600", "700")); } +class VersionSetTestBase { + public: + const static std::string kColumnFamilyName1; + const static std::string kColumnFamilyName2; + const static std::string kColumnFamilyName3; + + VersionSetTestBase() + : env_(Env::Default()), + dbname_(test::PerThreadDBPath("version_set_test")), + db_options_(), + mutable_cf_options_(cf_options_), + table_cache_(NewLRUCache(50000, 16)), + write_buffer_manager_(db_options_.db_write_buffer_size), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_manager_, + &write_controller_)), + shutting_down_(false), + mock_table_factory_(std::make_shared()) { + EXPECT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + } + + void PrepareManifest(std::vector* column_families, + SequenceNumber* last_seqno, + std::unique_ptr* log_writer) { + assert(column_families != nullptr); + assert(last_seqno != nullptr); + assert(log_writer != nullptr); + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; + const int kInitialNumOfCfs = static_cast(cf_names.size()); + autovector new_cfs; + uint64_t last_seq = 1; + uint32_t cf_id = 1; + for (int i = 1; i != kInitialNumOfCfs; ++i) { + VersionEdit new_cf; + new_cf.AddColumnFamily(cf_names[i]); + new_cf.SetColumnFamily(cf_id++); + new_cf.SetLogNumber(0); + new_cf.SetNextFile(2); + new_cf.SetLastSequence(last_seq++); + new_cfs.emplace_back(new_cf); + } + *last_seqno = last_seq; + + const std::string manifest = DescriptorFileName(dbname_, 1); + std::unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), manifest, env_options_)); + { + 
log_writer->reset(new log::Writer(std::move(file_writer), 0, false)); + std::string record; + new_db.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + for (const auto& e : new_cfs) { + record.clear(); + e.EncodeTo(&record); + s = (*log_writer)->AddRecord(record); + ASSERT_OK(s); + } + } + ASSERT_OK(s); + + cf_options_.table_factory = mock_table_factory_; + for (const auto& cf_name : cf_names) { + column_families->emplace_back(cf_name, cf_options_); + } + } + + // Create DB with 3 column families. + void NewDB() { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + + PrepareManifest(&column_families, &last_seqno, &log_writer); + log_writer.reset(); + // Make "CURRENT" file point to the new manifest file. + Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + EXPECT_OK(versions_->Recover(column_families, false)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + } + + Env* env_; + const std::string dbname_; + EnvOptions env_options_; + ImmutableDBOptions db_options_; + ColumnFamilyOptions cf_options_; + MutableCFOptions mutable_cf_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + WriteBufferManager write_buffer_manager_; + std::shared_ptr versions_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; +}; + +const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; +const std::string VersionSetTestBase::kColumnFamilyName2 = "bob"; +const std::string VersionSetTestBase::kColumnFamilyName3 = "charles"; + +class VersionSetTest : public VersionSetTestBase, public testing::Test { + public: + VersionSetTest() : VersionSetTestBase() {} +}; + +TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { + NewDB(); + const int kGroupSize = 5; + autovector edits; + for (int i = 0; i != kGroupSize; ++i) { + edits.emplace_back(VersionEdit()); + } + autovector cfds; + autovector all_mutable_cf_options; + autovector> edit_lists; + for (int i = 0; i != kGroupSize; ++i) { + cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault()); + all_mutable_cf_options.emplace_back(&mutable_cf_options_); + autovector edit_list; + edit_list.emplace_back(&edits[i]); + edit_lists.emplace_back(edit_list); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + int count = 0; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) { + uint32_t* cf_id = reinterpret_cast(arg); + EXPECT_EQ(0, *cf_id); + ++count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + mutex_.Lock(); + Status s = + versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, &mutex_); + mutex_.Unlock(); + EXPECT_OK(s); + EXPECT_EQ(kGroupSize - 1, count); +} + +TEST_F(VersionSetTest, HandleValidAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + + // Append multiple version edits that form an atomic group + const int kAtomicGroupSize = 3; + std::vector edits(kAtomicGroupSize); + int remaining = kAtomicGroupSize; + for (size_t i = 0; i != edits.size(); ++i) { + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + edits[i].MarkAtomicGroup(--remaining); + edits[i].SetLastSequence(last_seqno++); + } + Status s; + for (const auto& edit : edits) { + std::string record; + edit.EncodeTo(&record); + s = 
log_writer->AddRecord(record); + ASSERT_OK(s); + } + log_writer.reset(); + + s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + bool first_in_atomic_group = false; + bool last_in_atomic_group = false; + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:LastInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits.back().DebugString(), + e->DebugString()); // compare based on value + EXPECT_TRUE(first_in_atomic_group); + last_in_atomic_group = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + EXPECT_OK(versions_->Recover(column_families, false)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group); + EXPECT_TRUE(last_in_atomic_group); +} + +TEST_F(VersionSetTest, HandleIncompleteTrailingAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + + // Append multiple version edits that form an atomic group + const int kAtomicGroupSize = 4; + const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1; + std::vector edits(kNumberOfPersistedVersionEdits); + int remaining = kAtomicGroupSize; + for (size_t i = 0; i != edits.size(); ++i) { + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + edits[i].MarkAtomicGroup(--remaining); + edits[i].SetLastSequence(last_seqno++); + } + Status s; + for (const auto& edit : edits) { + std::string record; + edit.EncodeTo(&record); + s = log_writer->AddRecord(record); + ASSERT_OK(s); + } + log_writer.reset(); + + s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + bool first_in_atomic_group = false; + bool last_in_atomic_group = false; + size_t num = 0; + + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:FirstInAtomicGroup", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits.front().DebugString(), + e->DebugString()); // compare based on value + first_in_atomic_group = true; + }); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:LastInAtomicGroup", + [&](void* /* arg */) { last_in_atomic_group = true; }); + SyncPoint::GetInstance()->SetCallBack("VersionSet::Recover:AtomicGroup", + [&](void* /* arg */) { ++num; }); + SyncPoint::GetInstance()->EnableProcessing(); + + EXPECT_OK(versions_->Recover(column_families, false)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(first_in_atomic_group); + EXPECT_FALSE(last_in_atomic_group); + EXPECT_EQ(kNumberOfPersistedVersionEdits, num); +} + +TEST_F(VersionSetTest, HandleCorruptedAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + + // Append multiple version edits that form an atomic group + const int kAtomicGroupSize = 4; + std::vector edits(kAtomicGroupSize); + int remaining = kAtomicGroupSize; + for (size_t i = 0; i != edits.size(); 
++i) { + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + if (i != (kAtomicGroupSize / 2)) { + edits[i].MarkAtomicGroup(--remaining); + } + edits[i].SetLastSequence(last_seqno++); + } + Status s; + for (const auto& edit : edits) { + std::string record; + edit.EncodeTo(&record); + s = log_writer->AddRecord(record); + ASSERT_OK(s); + } + log_writer.reset(); + + s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + bool mixed = false; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:AtomicGroupMixedWithNormalEdits", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits[kAtomicGroupSize / 2].DebugString(), e->DebugString()); + mixed = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_NOK(versions_->Recover(column_families, false)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(mixed); +} + +TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + + // Append multiple version edits that form an atomic group + const int kAtomicGroupSize = 4; + std::vector edits(kAtomicGroupSize); + int remaining = kAtomicGroupSize; + for (size_t i = 0; i != edits.size(); ++i) { + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + if (i != 1) { + edits[i].MarkAtomicGroup(--remaining); + } else { + edits[i].MarkAtomicGroup(remaining--); + } + edits[i].SetLastSequence(last_seqno++); + } + Status s; + for (const auto& edit : edits) { + std::string record; + edit.EncodeTo(&record); + s = log_writer->AddRecord(record); + ASSERT_OK(s); + } + log_writer.reset(); + + s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + bool incorrect_group_size = false; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::Recover:IncorrectAtomicGroupSize", [&](void* arg) { + VersionEdit* e = reinterpret_cast(arg); + EXPECT_EQ(edits[1].DebugString(), e->DebugString()); + incorrect_group_size = true; + }); + SyncPoint::GetInstance()->EnableProcessing(); + EXPECT_NOK(versions_->Recover(column_families, false)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + EXPECT_TRUE(incorrect_group_size); +} + +class VersionSetTestDropOneCF : public VersionSetTestBase, + public testing::TestWithParam { + public: + VersionSetTestDropOneCF() : VersionSetTestBase() {} +}; + +// This test simulates the following execution sequence +// Time thread1 bg_flush_thr +// | Prepare version edits (e1,e2,e3) for atomic +// | flush cf1, cf2, cf3 +// | Enqueue e to drop cfi +// | to manifest_writers_ +// | Enqueue (e1,e2,e3) to manifest_writers_ +// | +// | Apply e, +// | cfi.IsDropped() is true +// | Apply (e1,e2,e3), +// | since cfi.IsDropped() == true, we need to +// | drop ei and write the rest to MANIFEST. +// V +// +// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and +// last column family in an atomic group. 
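Before the parameterized test that the timeline above describes, one detail worth calling out: the tests in this file mark atomic groups with a countdown. Restating the loop used throughout these tests (not new code):

// int remaining = kAtomicGroupSize;       // e.g. 3
// edits[0].MarkAtomicGroup(--remaining);  // stores 2: two more edits follow
// edits[1].MarkAtomicGroup(--remaining);  // stores 1
// edits[2].MarkAtomicGroup(--remaining);  // stores 0: closes the group
// Recovery buffers edits until the stored count reaches 0 and then applies the
// whole group; a wrong count trips the IncorrectAtomicGroupSize path above.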
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + + const int kAtomicGroupSize = 3; + const std::vector non_default_cf_names = { + kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3}; + + // Drop one column family + VersionEdit drop_cf_edit; + drop_cf_edit.DropColumnFamily(); + const std::string cf_to_drop_name(GetParam()); + auto cfd_to_drop = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name); + ASSERT_NE(nullptr, cfd_to_drop); + // Increase its refcount because cfd_to_drop is used later, and we need to + // prevent it from being deleted. + cfd_to_drop->Ref(); + drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); + mutex_.Lock(); + s = versions_->LogAndApply(cfd_to_drop, + *cfd_to_drop->GetLatestMutableCFOptions(), + &drop_cf_edit, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + + std::vector edits(kAtomicGroupSize); + uint32_t remaining = kAtomicGroupSize; + size_t i = 0; + autovector cfds; + autovector mutable_cf_options_list; + autovector> edit_lists; + for (const auto& cf_name : non_default_cf_names) { + auto cfd = (cf_name != cf_to_drop_name) + ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name) + : cfd_to_drop; + ASSERT_NE(nullptr, cfd); + cfds.push_back(cfd); + mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions()); + edits[i].SetColumnFamily(cfd->GetID()); + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + edits[i].MarkAtomicGroup(--remaining); + edits[i].SetLastSequence(last_seqno++); + autovector tmp_edits; + tmp_edits.push_back(&edits[i]); + edit_lists.emplace_back(tmp_edits); + ++i; + } + int called = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) { + std::vector* tmp_edits = + reinterpret_cast*>(arg); + EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size()); + for (const auto e : *tmp_edits) { + bool found = false; + for (const auto& e2 : edits) { + if (&e2 == e) { + found = true; + break; + } + } + ASSERT_TRUE(found); + } + ++called; + }); + SyncPoint::GetInstance()->EnableProcessing(); + mutex_.Lock(); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, + &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + ASSERT_EQ(1, called); + if (cfd_to_drop->Unref()) { + delete cfd_to_drop; + cfd_to_drop = nullptr; + } +} + +INSTANTIATE_TEST_CASE_P( + AtomicGroup, VersionSetTestDropOneCF, + testing::Values(VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3)); + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/db/wal_manager.cc b/thirdparty/rocksdb/db/wal_manager.cc index 4a9ecbfdd8..62511819e4 100644 --- a/thirdparty/rocksdb/db/wal_manager.cc +++ b/thirdparty/rocksdb/db/wal_manager.cc @@ -29,6 +29,7 @@ #include "util/cast_util.h" #include "util/coding.h" #include "util/file_reader_writer.h" +#include "util/file_util.h" #include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -39,6 +40,15 @@ namespace 
rocksdb { #ifndef ROCKSDB_LITE +Status WalManager::DeleteFile(const std::string& fname, uint64_t number) { + auto s = env_->DeleteFile(db_options_.wal_dir + "/" + fname); + if (s.ok()) { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + return s; +} + Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { // First get sorted files in db dir, then get sorted files from archived // dir, to avoid a race condition where a log file is moved to archived @@ -115,7 +125,7 @@ Status WalManager::GetUpdatesSince( } iter->reset(new TransactionLogIteratorImpl( db_options_.wal_dir, &db_options_, read_options, env_options_, seq, - std::move(wal_files), version_set)); + std::move(wal_files), version_set, seq_per_batch_)); return (*iter)->status(); } @@ -181,7 +191,7 @@ void WalManager::PurgeObsoleteWALFiles() { continue; } if (now_seconds - file_m_time > db_options_.wal_ttl_seconds) { - s = env_->DeleteFile(file_path); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -207,7 +217,7 @@ void WalManager::PurgeObsoleteWALFiles() { log_file_size = std::max(log_file_size, file_size); ++log_files_num; } else { - s = env_->DeleteFile(file_path); + s = DeleteDBFile(&db_options_, file_path, archival_dir, false); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), @@ -228,7 +238,7 @@ void WalManager::PurgeObsoleteWALFiles() { } size_t const files_keep_num = - db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size; + static_cast(db_options_.wal_size_limit_mb * 1024 * 1024 / log_file_size); if (log_files_num <= files_keep_num) { return; } @@ -246,7 +256,8 @@ void WalManager::PurgeObsoleteWALFiles() { for (size_t i = 0; i < files_del_num; ++i) { std::string const file_path = archived_logs[i]->PathName(); - s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); + s = DeleteDBFile(&db_options_, db_options_.wal_dir + "/" + file_path, + db_options_.wal_dir, false); if (!s.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s", file_path.c_str(), s.ToString().c_str()); @@ -343,7 +354,7 @@ Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, // Binary Search. avoid opening all files. while (end >= start) { int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + SequenceNumber current_seq_num = all_logs.at(static_cast(mid))->StartSequence(); if (current_seq_num == target) { end = mid; break; @@ -354,7 +365,7 @@ Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, } } // end could be -ve. - size_t start_index = std::max(static_cast(0), end); + size_t start_index = static_cast(std::max(static_cast(0), end)); // The last wal file is always included all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); return Status::OK(); @@ -420,7 +431,7 @@ Status WalManager::ReadFirstLine(const std::string& fname, Status* status; bool ignore_error; // true if db_options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) override { + void Corruption(size_t bytes, const Status& s) override { ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s", (this->ignore_error ? 
"(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); @@ -434,8 +445,8 @@ Status WalManager::ReadFirstLine(const std::string& fname, std::unique_ptr file; Status status = env_->NewSequentialFile( fname, &file, env_->OptimizeForLogRead(env_options_)); - unique_ptr file_reader( - new SequentialFileReader(std::move(file))); + std::unique_ptr file_reader( + new SequentialFileReader(std::move(file), fname)); if (!status.ok()) { return status; @@ -448,7 +459,7 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/, number); + true /*checksum*/, number); std::string scratch; Slice record; diff --git a/thirdparty/rocksdb/db/wal_manager.h b/thirdparty/rocksdb/db/wal_manager.h index aa62d793bc..6caf1640c0 100644 --- a/thirdparty/rocksdb/db/wal_manager.h +++ b/thirdparty/rocksdb/db/wal_manager.h @@ -31,11 +31,12 @@ namespace rocksdb { class WalManager { public: WalManager(const ImmutableDBOptions& db_options, - const EnvOptions& env_options) + const EnvOptions& env_options, const bool seq_per_batch = false) : db_options_(db_options), env_options_(env_options), env_(db_options.env), - purge_wal_files_last_run_(0) {} + purge_wal_files_last_run_(0), + seq_per_batch_(seq_per_batch) {} Status GetSortedWalFiles(VectorLogPtr& files); @@ -48,6 +49,8 @@ class WalManager { void ArchiveWALFile(const std::string& fname, uint64_t number); + Status DeleteFile(const std::string& fname, uint64_t number); + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, SequenceNumber* sequence) { return ReadFirstRecord(type, number, sequence); @@ -86,6 +89,8 @@ class WalManager { // last time when PurgeObsoleteWALFiles ran. uint64_t purge_wal_files_last_run_; + bool seq_per_batch_; + // obsolete files will be deleted every this seconds if ttl deletion is // enabled and archive size_limit is disabled. 
static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; diff --git a/thirdparty/rocksdb/db/wal_manager_test.cc b/thirdparty/rocksdb/db/wal_manager_test.cc index 9f5cf273d2..379f12f52a 100644 --- a/thirdparty/rocksdb/db/wal_manager_test.cc +++ b/thirdparty/rocksdb/db/wal_manager_test.cc @@ -32,7 +32,7 @@ class WalManagerTest : public testing::Test { public: WalManagerTest() : env_(new MockEnv(Env::Default())), - dbname_(test::TmpDir() + "/wal_manager_test"), + dbname_(test::PerThreadDBPath("wal_manager_test")), db_options_(), table_cache_(NewLRUCache(50000, 16)), write_buffer_manager_(db_options_.db_write_buffer_size), @@ -67,18 +67,19 @@ class WalManagerTest : public testing::Test { batch.Put(key, value); WriteBatchInternal::SetSequence(&batch, seq); current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); - versions_->SetLastToBeWrittenSequence(seq); + versions_->SetLastAllocatedSequence(seq); + versions_->SetLastPublishedSequence(seq); versions_->SetLastSequence(seq); } // NOT thread safe - void RollTheLog(bool archived) { + void RollTheLog(bool /*archived*/) { current_log_number_++; std::string fname = ArchivedLogFileName(dbname_, current_log_number_); - unique_ptr file; + std::unique_ptr file; ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), env_options_)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), fname, env_options_)); current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } @@ -93,7 +94,7 @@ class WalManagerTest : public testing::Test { std::unique_ptr OpenTransactionLogIter( const SequenceNumber seq) { - unique_ptr iter; + std::unique_ptr iter; Status status = wal_manager_->GetUpdatesSince( seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); EXPECT_OK(status); @@ -117,7 +118,7 @@ class WalManagerTest : public testing::Test { TEST_F(WalManagerTest, ReadFirstRecordCache) { Init(); std::string path = dbname_ + "/000001.log"; - unique_ptr file; + std::unique_ptr file; ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); SequenceNumber s; @@ -128,8 +129,8 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s)); ASSERT_EQ(s, 0U); - unique_ptr file_writer( - new WritableFileWriter(std::move(file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(file), path, EnvOptions())); log::Writer writer(std::move(file_writer), 1, db_options_.recycle_log_file_num > 0); WriteBatch batch; @@ -302,7 +303,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/db/write_batch.cc b/thirdparty/rocksdb/db/write_batch.cc index 76fc94844a..30480f64e1 100644 --- a/thirdparty/rocksdb/db/write_batch.cc +++ b/thirdparty/rocksdb/db/write_batch.cc @@ -15,15 +15,19 @@ // kTypeValue varstring varstring // kTypeDeletion varstring // kTypeSingleDeletion varstring +// kTypeRangeDeletion varstring varstring // kTypeMerge varstring varstring // kTypeColumnFamilyValue varint32 varstring varstring -// kTypeColumnFamilyDeletion varint32 varstring varstring -// kTypeColumnFamilySingleDeletion varint32 varstring varstring +// kTypeColumnFamilyDeletion varint32 varstring +// kTypeColumnFamilySingleDeletion varint32 varstring +// kTypeColumnFamilyRangeDeletion 
varint32 varstring varstring // kTypeColumnFamilyMerge varint32 varstring varstring // kTypeBeginPrepareXID varstring // kTypeEndPrepareXID // kTypeCommitXID varstring // kTypeRollbackXID varstring +// kTypeBeginPersistedPrepareXID varstring +// kTypeBeginUnprepareXID varstring // kTypeNoop // varstring := // len: varint32 @@ -49,7 +53,9 @@ #include "monitoring/statistics.h" #include "rocksdb/merge_operator.h" #include "util/coding.h" +#include "util/duplicate_detector.h" #include "util/string_util.h" +#include "util/util.h" namespace rocksdb { @@ -68,6 +74,7 @@ enum ContentFlags : uint32_t { HAS_ROLLBACK = 1 << 8, HAS_DELETE_RANGE = 1 << 9, HAS_BLOB_INDEX = 1 << 10, + HAS_BEGIN_UNPREPARE = 1 << 11, }; struct BatchContentClassifier : public WriteBatch::Handler { @@ -103,8 +110,11 @@ struct BatchContentClassifier : public WriteBatch::Handler { return Status::OK(); } - Status MarkBeginPrepare() override { + Status MarkBeginPrepare(bool unprepare) override { content_flags |= ContentFlags::HAS_BEGIN_PREPARE; + if (unprepare) { + content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE; + } return Status::OK(); } @@ -143,6 +153,12 @@ WriteBatch::WriteBatch(const std::string& rep) max_bytes_(0), rep_(rep) {} +WriteBatch::WriteBatch(std::string&& rep) + : save_points_(nullptr), + content_flags_(ContentFlags::DEFERRED), + max_bytes_(0), + rep_(std::move(rep)) {} + WriteBatch::WriteBatch(const WriteBatch& src) : save_points_(src.save_points_), wal_term_point_(src.wal_term_point_), @@ -150,7 +166,7 @@ WriteBatch::WriteBatch(const WriteBatch& src) max_bytes_(src.max_bytes_), rep_(src.rep_) {} -WriteBatch::WriteBatch(WriteBatch&& src) +WriteBatch::WriteBatch(WriteBatch&& src) noexcept : save_points_(std::move(src.save_points_)), wal_term_point_(std::move(src.wal_term_point_)), content_flags_(src.content_flags_.load(std::memory_order_relaxed)), @@ -177,7 +193,7 @@ WriteBatch::~WriteBatch() { delete save_points_; } WriteBatch::Handler::~Handler() { } -void WriteBatch::Handler::LogData(const Slice& blob) { +void WriteBatch::Handler::LogData(const Slice& /*blob*/) { // If the user has not specified something to do with blobs, then we ignore // them. 
} @@ -292,7 +308,7 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, if (!GetVarint32(input, column_family)) { return Status::Corruption("bad WriteBatch Put"); } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeValue: if (!GetLengthPrefixedSlice(input, key) || !GetLengthPrefixedSlice(input, value)) { @@ -304,7 +320,7 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, if (!GetVarint32(input, column_family)) { return Status::Corruption("bad WriteBatch Delete"); } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeDeletion: case kTypeSingleDeletion: if (!GetLengthPrefixedSlice(input, key)) { @@ -315,7 +331,7 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, if (!GetVarint32(input, column_family)) { return Status::Corruption("bad WriteBatch DeleteRange"); } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeRangeDeletion: // for range delete, "key" is begin_key, "value" is end_key if (!GetLengthPrefixedSlice(input, key) || @@ -327,7 +343,7 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, if (!GetVarint32(input, column_family)) { return Status::Corruption("bad WriteBatch Merge"); } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeMerge: if (!GetLengthPrefixedSlice(input, key) || !GetLengthPrefixedSlice(input, value)) { @@ -338,7 +354,7 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, if (!GetVarint32(input, column_family)) { return Status::Corruption("bad WriteBatch BlobIndex"); } - // intentional fallthrough + FALLTHROUGH_INTENDED; case kTypeBlobIndex: if (!GetLengthPrefixedSlice(input, key) || !GetLengthPrefixedSlice(input, value)) { @@ -353,6 +369,11 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, break; case kTypeNoop: case kTypeBeginPrepareXID: + // This indicates that the prepared batch is also persisted in the db. + // This is used in WritePreparedTxn + case kTypeBeginPersistedPrepareXID: + // This is used in WriteUnpreparedTxn + case kTypeBeginUnprepareXID: break; case kTypeEndPrepareXID: if (!GetLengthPrefixedSlice(input, xid)) { @@ -383,16 +404,43 @@ Status WriteBatch::Iterate(Handler* handler) const { input.remove_prefix(WriteBatchInternal::kHeader); Slice key, value, blob, xid; + // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops as + // the batch boundary symbols otherwise we would mis-count the number of + // batches. We do that by checking whether the accumulated batch is empty + // before seeing the next Noop. 
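In other words (an illustrative reading of the flag's intent, not library code), the loop below treats Noop tags roughly like this:

// Tag stream within one group-committed batch -> boundaries counted
// [Noop][Put a][Put b][Noop][Put c]            -> the leading Noop follows an empty
//                                                 batch and is excluded; the second
//                                                 Noop closes a non-empty sub-batch
// [Noop][Noop][Put a]                          -> both leading Noops are excluded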
+ bool empty_batch = true; int found = 0; Status s; - while (s.ok() && !input.empty() && handler->Continue()) { - char tag = 0; - uint32_t column_family = 0; // default - - s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, - &blob, &xid); - if (!s.ok()) { - return s; + char tag = 0; + uint32_t column_family = 0; // default + bool last_was_try_again = false; + bool handler_continue = true; + while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) { + handler_continue = handler->Continue(); + if (!handler_continue) { + break; + } + + if (LIKELY(!s.IsTryAgain())) { + last_was_try_again = false; + tag = 0; + column_family = 0; // default + + s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value, + &blob, &xid); + if (!s.ok()) { + return s; + } + } else { + assert(s.IsTryAgain()); + assert(!last_was_try_again); // to detect infinite loop bugs + if (UNLIKELY(last_was_try_again)) { + return Status::Corruption( + "two consecutive TryAgain in WriteBatch handler; this is either a " + "software bug or data corruption."); + } + last_was_try_again = true; + s = Status::OK(); } switch (tag) { @@ -401,67 +449,137 @@ Status WriteBatch::Iterate(Handler* handler) const { assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_PUT)); s = handler->PutCF(column_family, key, value); - found++; + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } break; case kTypeColumnFamilyDeletion: case kTypeDeletion: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE)); s = handler->DeleteCF(column_family, key); - found++; + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } break; case kTypeColumnFamilySingleDeletion: case kTypeSingleDeletion: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE)); s = handler->SingleDeleteCF(column_family, key); - found++; + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } break; case kTypeColumnFamilyRangeDeletion: case kTypeRangeDeletion: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE)); s = handler->DeleteRangeCF(column_family, key, value); - found++; + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } break; case kTypeColumnFamilyMerge: case kTypeMerge: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE)); s = handler->MergeCF(column_family, key, value); - found++; + if (LIKELY(s.ok())) { + empty_batch = false; + found++; + } break; case kTypeColumnFamilyBlobIndex: case kTypeBlobIndex: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX)); s = handler->PutBlobIndexCF(column_family, key, value); - found++; + if (LIKELY(s.ok())) { + found++; + } break; case kTypeLogData: handler->LogData(blob); + // A batch might have nothing but LogData. It is still a batch. + empty_batch = false; break; case kTypeBeginPrepareXID: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); handler->MarkBeginPrepare(); + empty_batch = false; + if (!handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_after_commit_ is disabled (in " + "WritePrepared/WriteUnprepared mode). 
If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + if (handler->WriteBeforePrepare()) { + s = Status::NotSupported( + "WriteCommitted txn tag when write_before_prepare_ is enabled " + "(in WriteUnprepared mode). If it is not due to corruption, the " + "WAL must be emptied before changing the WritePolicy."); + } + break; + case kTypeBeginPersistedPrepareXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE)); + handler->MarkBeginPrepare(); + empty_batch = false; + if (handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WritePrepared/WriteUnprepared txn tag when write_after_commit_ " + "is enabled (in default WriteCommitted mode). If it is not due " + "to corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } + break; + case kTypeBeginUnprepareXID: + assert(content_flags_.load(std::memory_order_relaxed) & + (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE)); + handler->MarkBeginPrepare(true /* unprepared */); + empty_batch = false; + if (handler->WriteAfterCommit()) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_after_commit_ is enabled (in " + "default WriteCommitted mode). If it is not due to corruption, " + "the WAL must be emptied before changing the WritePolicy."); + } + if (!handler->WriteBeforePrepare()) { + s = Status::NotSupported( + "WriteUnprepared txn tag when write_before_prepare_ is disabled " + "(in WriteCommitted/WritePrepared mode). If it is not due to " + "corruption, the WAL must be emptied before changing the " + "WritePolicy."); + } break; case kTypeEndPrepareXID: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE)); handler->MarkEndPrepare(xid); + empty_batch = true; break; case kTypeCommitXID: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT)); handler->MarkCommit(xid); + empty_batch = true; break; case kTypeRollbackXID: assert(content_flags_.load(std::memory_order_relaxed) & (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK)); handler->MarkRollback(xid); + empty_batch = true; break; case kTypeNoop: + handler->MarkNoop(empty_batch); + empty_batch = true; break; default: return Status::Corruption("unknown WriteBatch tag"); @@ -470,13 +588,21 @@ Status WriteBatch::Iterate(Handler* handler) const { if (!s.ok()) { return s; } - if (found != WriteBatchInternal::Count(this)) { + if (handler_continue && found != WriteBatchInternal::Count(this)) { return Status::Corruption("WriteBatch has wrong count"); } else { return Status::OK(); } } +bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) { + return b->is_latest_persistent_state_; +} + +void WriteBatchInternal::SetAsLastestPersistentState(WriteBatch* b) { + b->is_latest_persistent_state_ = true; +} + int WriteBatchInternal::Count(const WriteBatch* b) { return DecodeFixed32(b->rep_.data() + 8); } @@ -493,12 +619,19 @@ void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { EncodeFixed64(&b->rep_[0], seq); } -size_t WriteBatchInternal::GetFirstOffset(WriteBatch* b) { +size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { return WriteBatchInternal::kHeader; } Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, const Slice& key, const Slice& value) { + if (key.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + 
if (value.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + LocalSavePoint save(b); WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); if (column_family_id == 0) { @@ -521,8 +654,33 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, value); } +Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value) { + size_t total_key_bytes = 0; + for (int i = 0; i < key.num_parts; ++i) { + total_key_bytes += key.parts[i].size(); + } + if (total_key_bytes >= size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + + size_t total_value_bytes = 0; + for (int i = 0; i < value.num_parts; ++i) { + total_value_bytes += value.parts[i].size(); + } + if (total_value_bytes >= size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + return Status::OK(); +} + Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id, const SliceParts& key, const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + LocalSavePoint save(b); WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); if (column_family_id == 0) { @@ -550,7 +708,9 @@ Status WriteBatchInternal::InsertNoop(WriteBatch* b) { return Status::OK(); } -Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) { +Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid, + bool write_after_commit, + bool unprepared_batch) { // a manually constructed batch can only contain one prepare section assert(b->rep_[12] == static_cast<char>(kTypeNoop)); @@ -562,13 +722,21 @@ Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid) { } // rewrite noop as begin marker - b->rep_[12] = static_cast<char>(kTypeBeginPrepareXID); + b->rep_[12] = static_cast<char>( + write_after_commit ? kTypeBeginPrepareXID + : (unprepared_batch ?
kTypeBeginUnprepareXID + : kTypeBeginPersistedPrepareXID)); b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID)); PutLengthPrefixedSlice(&b->rep_, xid); b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_END_PREPARE | ContentFlags::HAS_BEGIN_PREPARE, std::memory_order_relaxed); + if (unprepared_batch) { + b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) | + ContentFlags::HAS_BEGIN_UNPREPARE, + std::memory_order_relaxed); + } return Status::OK(); } @@ -736,6 +904,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family, Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, const Slice& key, const Slice& value) { + if (key.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("key is too large"); + } + if (value.size() > size_t{port::kMaxUint32}) { + return Status::InvalidArgument("value is too large"); + } + LocalSavePoint save(b); WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); if (column_family_id == 0) { @@ -761,6 +936,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key, Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id, const SliceParts& key, const SliceParts& value) { + Status s = CheckSlicePartsLength(key, value); + if (!s.ok()) { + return s; + } + LocalSavePoint save(b); WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1); if (column_family_id == 0) { @@ -873,14 +1053,25 @@ class MemTableInserter : public WriteBatch::Handler { // a map is too expensive in the Write() path as they // cause memory allocations though unused. // Make creation optional but do not incur - // unique_ptr additional allocation - using - MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>; - using - PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type; + // std::unique_ptr additional allocation + using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>; + using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type; PostMapType mem_post_info_map_; // current recovered transaction we are rebuilding (recovery) WriteBatch* rebuilding_trx_; + SequenceNumber rebuilding_trx_seq_; + // Increase seq number once per each write batch. Otherwise increase it once + // per key.
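The member this comment documents, seq_per_batch_, follows next. To make the two advance policies concrete, here is a toy model of the MaybeAdvanceSeq helper defined further down in this diff; the boundary-equals-policy comparison is taken from the real code, everything else is illustrative:

```cpp
// Toy model of MaybeAdvanceSeq: with seq_per_batch disabled the sequence
// advances once per key; with it enabled it advances only at batch
// boundaries (prepare/commit/rollback markers, or a terminating noop).
#include <cstdint>
#include <iostream>

struct SeqModel {
  uint64_t sequence = 0;
  bool seq_per_batch = false;
  void MaybeAdvanceSeq(bool batch_boundary = false) {
    if (batch_boundary == seq_per_batch) {
      sequence++;
    }
  }
};

int main() {
  SeqModel per_key;                 // default: one seq per key
  per_key.MaybeAdvanceSeq();        // Put -> sequence becomes 1
  per_key.MaybeAdvanceSeq();        // Put -> sequence becomes 2
  per_key.MaybeAdvanceSeq(true);    // commit marker -> unchanged

  SeqModel per_batch;
  per_batch.seq_per_batch = true;   // one seq per (sub-)batch
  per_batch.MaybeAdvanceSeq();      // Put -> unchanged
  per_batch.MaybeAdvanceSeq();      // Put -> unchanged
  per_batch.MaybeAdvanceSeq(true);  // commit marker -> sequence becomes 1

  std::cout << per_key.sequence << " " << per_batch.sequence << "\n";  // 2 1
}
```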
+ bool seq_per_batch_; + // Whether the memtable write will be done only after the commit + bool write_after_commit_; + // Whether memtable write can be done before prepare + bool write_before_prepare_; + // Whether this batch was unprepared or not + bool unprepared_batch_; + using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type; + DupDetector duplicate_detector_; + bool dup_dectector_on_; MemPostInfoMap& GetPostMap() { assert(concurrent_memtable_writes_); @@ -891,38 +1082,87 @@ class MemTableInserter : public WriteBatch::Handler { return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_); } -public: + bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) { + assert(!write_after_commit_); + assert(rebuilding_trx_ != nullptr); + if (!dup_dectector_on_) { + new (&duplicate_detector_) DuplicateDetector(db_); + dup_dectector_on_ = true; + } + return reinterpret_cast<DuplicateDetector*> + (&duplicate_detector_)->IsDuplicateKeySeq(column_family_id, key, sequence_); + } + + protected: + bool WriteBeforePrepare() const override { return write_before_prepare_; } + bool WriteAfterCommit() const override { return write_after_commit_; } + + public: // cf_mems should not be shared with concurrent inserters - MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, - FlushScheduler* flush_scheduler, - bool ignore_missing_column_families, - uint64_t recovering_log_number, DB* db, - bool concurrent_memtable_writes, - bool* has_valid_writes = nullptr) - : sequence_(_sequence), - cf_mems_(cf_mems), - flush_scheduler_(flush_scheduler), - ignore_missing_column_families_(ignore_missing_column_families), - recovering_log_number_(recovering_log_number), - log_number_ref_(0), - db_(reinterpret_cast<DBImpl*>(db)), - concurrent_memtable_writes_(concurrent_memtable_writes), - post_info_created_(false), - has_valid_writes_(has_valid_writes), - rebuilding_trx_(nullptr) { - assert(cf_mems_); - } - - ~MemTableInserter() { + MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems, + FlushScheduler* flush_scheduler, + bool ignore_missing_column_families, + uint64_t recovering_log_number, DB* db, + bool concurrent_memtable_writes, + bool* has_valid_writes = nullptr, bool seq_per_batch = false, + bool batch_per_txn = true) + : sequence_(_sequence), + cf_mems_(cf_mems), + flush_scheduler_(flush_scheduler), + ignore_missing_column_families_(ignore_missing_column_families), + recovering_log_number_(recovering_log_number), + log_number_ref_(0), + db_(reinterpret_cast<DBImpl*>(db)), + concurrent_memtable_writes_(concurrent_memtable_writes), + post_info_created_(false), + has_valid_writes_(has_valid_writes), + rebuilding_trx_(nullptr), + rebuilding_trx_seq_(0), + seq_per_batch_(seq_per_batch), + // Write after commit currently uses one seq per key (instead of per + // batch). So seq_per_batch being false indicates write_after_commit + // approach. + write_after_commit_(!seq_per_batch), + // WriteUnprepared can write WriteBatches per transaction, so + // batch_per_txn being false indicates write_before_prepare.
+ write_before_prepare_(!batch_per_txn), + unprepared_batch_(false), + duplicate_detector_(), + dup_dectector_on_(false) { + assert(cf_mems_); + } + + ~MemTableInserter() override { + if (dup_dectector_on_) { + reinterpret_cast<DuplicateDetector*> + (&duplicate_detector_)->~DuplicateDetector(); + } if (post_info_created_) { reinterpret_cast<MemPostInfoMap*> (&mem_post_info_map_)->~MemPostInfoMap(); } + delete rebuilding_trx_; } MemTableInserter(const MemTableInserter&) = delete; MemTableInserter& operator=(const MemTableInserter&) = delete; + // The batch seq is regularly restarted; In normal mode it is set when + // MemTableInserter is constructed in the write thread and in recovery mode it + // is set when a batch, which is tagged with seq, is read from the WAL. + // Within a sequenced batch, which could be a merge of multiple batches, we + // have two policies to advance the seq: i) seq_per_key (default) and ii) + // seq_per_batch. To implement the latter we need to mark the boundary between + // the individual batches. The approach is this: 1) Use the terminating + // markers to indicate the boundary (kTypeEndPrepareXID, kTypeCommitXID, + // kTypeRollbackXID) 2) Terminate a batch with kTypeNoop in the absence of a + // natural boundary marker. + void MaybeAdvanceSeq(bool batch_boundry = false) { + if (batch_boundry == seq_per_batch_) { + sequence_++; + } + } + void set_log_number_ref(uint64_t log) { log_number_ref_ = log; } SequenceNumber sequence() const { return sequence_; } @@ -980,26 +1220,46 @@ class MemTableInserter : public WriteBatch::Handler { Status PutCFImpl(uint32_t column_family_id, const Slice& key, const Slice& value, ValueType value_type) { - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); return Status::OK(); + // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - ++sequence_; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit.
+ WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } + Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); + // inplace_update_support is inconsistent with snapshots, and therefore with + // any kind of transactions including the ones that use seq_per_batch + assert(!seq_per_batch_ || !moptions->inplace_update_support); if (!moptions->inplace_update_support) { - mem->Add(sequence_, value_type, key, value, concurrent_memtable_writes_, - get_post_process_info(mem)); + bool mem_res = + mem->Add(sequence_, value_type, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } } else if (moptions->inplace_callback == nullptr) { assert(!concurrent_memtable_writes_); mem->Update(sequence_, key, value); - RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED); } else { assert(!concurrent_memtable_writes_); if (mem->UpdateCallback(sequence_, key, value)) { @@ -1008,6 +1268,9 @@ class MemTableInserter : public WriteBatch::Handler { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; + // it's going to be overwritten for sure, so no point caching data block + // containing the old version + ropts.fill_cache = false; ropts.snapshot = &read_from_snapshot; std::string prev_value; @@ -1029,82 +1292,151 @@ class MemTableInserter : public WriteBatch::Handler { value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. - mem->Add(sequence_, value_type, key, Slice(prev_buffer, prev_size)); + bool mem_res __attribute__((__unused__)); + mem_res = mem->Add( + sequence_, value_type, key, Slice(prev_buffer, prev_size)); + assert(mem_res); RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. - mem->Add(sequence_, value_type, key, Slice(merged_value)); + bool mem_res __attribute__((__unused__)); + mem_res = + mem->Add(sequence_, value_type, key, Slice(merged_value)); + assert(mem_res); RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } } } - // Since all Puts are logged in trasaction logs (if enabled), always bump + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try add the key to + // the rebuilding transaction object. + WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key, value); + } + // Since all Puts are logged in transaction logs (if enabled), always bump // sequence number. Even if the update eventually fails and does not result // in memtable add/update.
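The sequence bump this comment refers to continues right below. Note the TryAgain path above: in seq_per_batch mode the memtable refuses a second entry with an identical (key, sequence) pair, and the returned TryAgain tells the write path to retry the remainder as a new sub-batch at the next sequence number. A toy model of that protocol, under assumed semantics:

```cpp
// Toy model of the "key+seq exists" retry protocol (assumed semantics; the
// real memtable and write path are far more involved).
#include <cstdint>
#include <set>
#include <string>
#include <utility>

struct ToyMem {
  std::set<std::pair<std::string, uint64_t>> entries;
  // Mirrors mem->Add returning false when (key, seq) is already present.
  bool Add(uint64_t seq, const std::string& key) {
    return entries.insert({key, seq}).second;
  }
};

int main() {
  ToyMem mem;
  uint64_t seq = 7;
  mem.Add(seq, "k");         // first insert at seq 7 succeeds
  if (!mem.Add(seq, "k")) {  // duplicate key within the same sub-batch
    ++seq;                   // start a new sub-batch at the next seq
    mem.Add(seq, "k");       // retry succeeds at seq 8
  }
}
```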
- sequence_++; + MaybeAdvanceSeq(); CheckMemtableFull(); - return Status::OK(); + return ret_status; } - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { return PutCFImpl(column_family_id, key, value, kTypeValue); } - Status DeleteImpl(uint32_t column_family_id, const Slice& key, + Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key, const Slice& value, ValueType delete_type) { + Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); - mem->Add(sequence_, delete_type, key, value, concurrent_memtable_writes_, - get_post_process_info(mem)); - sequence_++; + bool mem_res = + mem->Add(sequence_, delete_type, key, value, + concurrent_memtable_writes_, get_post_process_info(mem)); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } + MaybeAdvanceSeq(); CheckMemtableFull(); - return Status::OK(); + return ret_status; } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { - if (rebuilding_trx_ != nullptr) { + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); return Status::OK(); + // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - ++sequence_; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } - return DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + auto ret_status = DeleteImpl(column_family_id, key, Slice(), kTypeDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try add the key to + // the rebuilding transaction object. + WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key); + } + return ret_status; } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { - if (rebuilding_trx_ != nullptr) { + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); return Status::OK(); + // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - ++sequence_; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit.
+ WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, + key); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } - return DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + auto ret_status = + DeleteImpl(column_family_id, key, Slice(), kTypeSingleDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try add the key to + // the rebuilding transaction object. + WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id, key); + } + return ret_status; } - virtual Status DeleteRangeCF(uint32_t column_family_id, - const Slice& begin_key, - const Slice& end_key) override { - if (rebuilding_trx_ != nullptr) { + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, begin_key, end_key); return Status::OK(); + // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - ++sequence_; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); + // TODO(myabandeh): when transactional DeleteRange support is added, + // check if end_key must also be added. + batch_boundry = IsDuplicateKeySeq(column_family_id, begin_key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } if (db_ != nullptr) { @@ -1121,23 +1453,45 @@ class MemTableInserter : public WriteBatch::Handler { } } - return DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + auto ret_status = + DeleteImpl(column_family_id, begin_key, end_key, kTypeRangeDeletion); + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try add the key to + // the rebuilding transaction object.
+ WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id, + begin_key, end_key); + } + return ret_status; } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { assert(!concurrent_memtable_writes_); - if (rebuilding_trx_ != nullptr) { + // optimize for non-recovery mode + if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) { WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); return Status::OK(); + // else insert the values to the memtable right away } Status seek_status; - if (!SeekToColumnFamily(column_family_id, &seek_status)) { - ++sequence_; + if (UNLIKELY(!SeekToColumnFamily(column_family_id, &seek_status))) { + bool batch_boundry = false; + if (rebuilding_trx_ != nullptr) { + assert(!write_after_commit_); + // The CF is probably flushed and hence no need for insert but we still + // need to keep track of the keys for upcoming rollback/commit. + WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, + value); + batch_boundry = IsDuplicateKeySeq(column_family_id, key); + } + MaybeAdvanceSeq(batch_boundry); return seek_status; } + Status ret_status; MemTable* mem = cf_mems_->GetMemTable(); auto* moptions = mem->GetImmutableMemTableOptions(); bool perform_merge = false; @@ -1193,22 +1547,41 @@ class MemTableInserter : public WriteBatch::Handler { perform_merge = false; } else { // 3) Add value to memtable - mem->Add(sequence_, kTypeValue, key, new_value); + bool mem_res = mem->Add(sequence_, kTypeValue, key, new_value); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } } } if (!perform_merge) { // Add merge operator to memtable - mem->Add(sequence_, kTypeMerge, key, value); + bool mem_res = mem->Add(sequence_, kTypeMerge, key, value); + if (UNLIKELY(!mem_res)) { + assert(seq_per_batch_); + ret_status = Status::TryAgain("key+seq exists"); + const bool BATCH_BOUNDRY = true; + MaybeAdvanceSeq(BATCH_BOUNDRY); + } } - sequence_++; + // optimize for non-recovery mode + if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) { + assert(!write_after_commit_); + // If the ret_status is TryAgain then let the next try add the key to + // the rebuilding transaction object. + WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key, value); + } + MaybeAdvanceSeq(); CheckMemtableFull(); - return Status::OK(); + return ret_status; } - virtual Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { // Same as PutCF except for value type. return PutCFImpl(column_family_id, key, value, kTypeBlobIndex); } @@ -1226,7 +1599,9 @@ class MemTableInserter : public WriteBatch::Handler { } } - Status MarkBeginPrepare() override { + // The write batch handler calls MarkBeginPrepare with unprepare set to true + // if it encounters the kTypeBeginUnprepareXID marker.
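The MarkBeginPrepare override this comment introduces follows right after. For reference, the three begin markers encode the write policy that produced the batch; condensing the MarkEndPrepare rewrite shown earlier in this diff into a standalone helper (the enum names here are illustrative, not the RocksDB tags):

```cpp
// How the leading noop gets rewritten into a begin marker, keyed off the
// write policy (condensed from MarkEndPrepare above; names are illustrative).
enum class BeginTag {
  kBeginPrepare,           // WriteCommitted (write_after_commit)
  kBeginPersistedPrepare,  // WritePrepared
  kBeginUnprepare          // WriteUnprepared
};

BeginTag BeginTagFor(bool write_after_commit, bool unprepared_batch) {
  return write_after_commit
             ? BeginTag::kBeginPrepare
             : (unprepared_batch ? BeginTag::kBeginUnprepare
                                 : BeginTag::kBeginPersistedPrepare);
}
```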
+ Status MarkBeginPrepare(bool unprepare) override { assert(rebuilding_trx_ == nullptr); assert(db_); @@ -1241,14 +1616,15 @@ class MemTableInserter : public WriteBatch::Handler { // we are now iterating through a prepared section rebuilding_trx_ = new WriteBatch(); + rebuilding_trx_seq_ = sequence_; + // We only call MarkBeginPrepare once per batch, and unprepared_batch_ + // is initialized to false by default. + assert(!unprepared_batch_); + unprepared_batch_ = unprepare; + if (has_valid_writes_ != nullptr) { *has_valid_writes_ = true; } - } else { - // in non-recovery we ignore prepare markers - // and insert the values directly. making sure we have a - // log for each insertion to reference. - assert(log_number_ref_ > 0); } return Status::OK(); @@ -1260,14 +1636,33 @@ class MemTableInserter : public WriteBatch::Handler { if (recovering_log_number_ != 0) { assert(db_->allow_2pc()); + size_t batch_cnt = + write_after_commit_ + ? 0 // 0 will disable further checks + : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1); db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(), - rebuilding_trx_); + rebuilding_trx_, rebuilding_trx_seq_, + batch_cnt, unprepared_batch_); rebuilding_trx_ = nullptr; } else { assert(rebuilding_trx_ == nullptr); - assert(log_number_ref_ > 0); } + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + + return Status::OK(); + } + Status MarkNoop(bool empty_batch) override { + // A hack in pessimistic transactions could result in a noop at the start + // of the write batch, which should be ignored. + if (!empty_batch) { + // In the absence of Prepare markers, a kTypeNoop tag indicates the end of + // a batch. This happens when a write batch commits while skipping the + // prepare phase. + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + } return Status::OK(); } @@ -1282,17 +1677,23 @@ class MemTableInserter : public WriteBatch::Handler { // and commit. auto trx = db_->GetRecoveredTransaction(name.ToString()); - // the log contaiting the prepared section may have + // the log containing the prepared section may have // been released in the last incarnation because the // data was flushed to L0 if (trx != nullptr) { // at this point individual CF lognumbers will prevent // duplicate re-insertion of values. assert(log_number_ref_ == 0); - // all insertes must reference this trx log number - log_number_ref_ = trx->log_number_; - s = trx->batch_->Iterate(this); - log_number_ref_ = 0; + if (write_after_commit_) { + // write_after_commit_ can only have one batch in trx. + assert(trx->batches_.size() == 1); + const auto& batch_info = trx->batches_.begin()->second; + // all inserts must reference this trx log number + log_number_ref_ = batch_info.log_number_; + s = batch_info.batch_->Iterate(this); + log_number_ref_ = 0; + } + // else the values are already inserted before the commit if (s.ok()) { db_->DeleteRecoveredTransaction(name.ToString()); @@ -1302,8 +1703,13 @@ class MemTableInserter : public WriteBatch::Handler { } } } else { - // in non recovery we simply ignore this tag + // When writes are not delayed until commit, there is no disconnect + // between a memtable write and the WAL that supports it. So the commit + // need not reference any log as the only log on which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0); } + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); return s; } @@ -1324,6 +1730,9 @@ class MemTableInserter : public WriteBatch::Handler { // in non recovery we simply ignore this tag } + const bool batch_boundry = true; + MaybeAdvanceSeq(batch_boundry); + return Status::OK(); } @@ -1342,45 +1751,56 @@ class MemTableInserter : public WriteBatch::Handler { // 2) During Write(), in a single-threaded write thread // 3) During Write(), in a concurrent context where memtables has been cloned // The reason is that it calls memtables->Seek(), which has a stateful cache -Status WriteBatchInternal::InsertInto(WriteThread::WriteGroup& write_group, - SequenceNumber sequence, - ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, - bool ignore_missing_column_families, - uint64_t recovery_log_number, DB* db, - bool concurrent_memtable_writes) { - MemTableInserter inserter(sequence, memtables, flush_scheduler, - ignore_missing_column_families, recovery_log_number, - db, concurrent_memtable_writes); +Status WriteBatchInternal::InsertInto( + WriteThread::WriteGroup& write_group, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) { + MemTableInserter inserter( + sequence, memtables, flush_scheduler, ignore_missing_column_families, + recovery_log_number, db, concurrent_memtable_writes, + nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn); for (auto w : write_group) { + if (w->CallbackFailed()) { + continue; + } + w->sequence = inserter.sequence(); if (!w->ShouldWriteToMemtable()) { + // In seq_per_batch_ mode this advances the seq by one. 
+ inserter.MaybeAdvanceSeq(true); continue; } SetSequence(w->batch, inserter.sequence()); - w->sequence = inserter.sequence(); inserter.set_log_number_ref(w->log_ref); w->status = w->batch->Iterate(&inserter); if (!w->status.ok()) { return w->status; } + assert(!seq_per_batch || w->batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt); } return Status::OK(); } -Status WriteBatchInternal::InsertInto(WriteThread::Writer* writer, - SequenceNumber sequence, - ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, - bool ignore_missing_column_families, - uint64_t log_number, DB* db, - bool concurrent_memtable_writes) { +Status WriteBatchInternal::InsertInto( + WriteThread::Writer* writer, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + bool ignore_missing_column_families, uint64_t log_number, DB* db, + bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt, + bool batch_per_txn) { +#ifdef NDEBUG + (void)batch_cnt; +#endif assert(writer->ShouldWriteToMemtable()); - MemTableInserter inserter(sequence, memtables, flush_scheduler, - ignore_missing_column_families, log_number, db, - concurrent_memtable_writes); + MemTableInserter inserter( + sequence, memtables, flush_scheduler, ignore_missing_column_families, + log_number, db, concurrent_memtable_writes, nullptr /*has_valid_writes*/, + seq_per_batch, batch_per_txn); SetSequence(writer->batch, sequence); inserter.set_log_number_ref(writer->log_ref); Status s = writer->batch->Iterate(&inserter); + assert(!seq_per_batch || batch_cnt != 0); + assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt); if (concurrent_memtable_writes) { inserter.PostProcess(); } @@ -1391,13 +1811,15 @@ Status WriteBatchInternal::InsertInto( const WriteBatch* batch, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, bool ignore_missing_column_families, uint64_t log_number, DB* db, bool concurrent_memtable_writes, - SequenceNumber* last_seq_used, bool* has_valid_writes) { + SequenceNumber* next_seq, bool* has_valid_writes, bool seq_per_batch, + bool batch_per_txn) { MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler, ignore_missing_column_families, log_number, db, - concurrent_memtable_writes, has_valid_writes); + concurrent_memtable_writes, has_valid_writes, + seq_per_batch, batch_per_txn); Status s = batch->Iterate(&inserter); - if (last_seq_used != nullptr) { - *last_seq_used = inserter.sequence(); + if (next_seq != nullptr) { + *next_seq = inserter.sequence(); } if (concurrent_memtable_writes) { inserter.PostProcess(); diff --git a/thirdparty/rocksdb/db/write_batch_internal.h b/thirdparty/rocksdb/db/write_batch_internal.h index 2408686f12..bae62bf031 100644 --- a/thirdparty/rocksdb/db/write_batch_internal.h +++ b/thirdparty/rocksdb/db/write_batch_internal.h @@ -102,7 +102,9 @@ class WriteBatchInternal { static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id, const Slice& key, const Slice& value); - static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid); + static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid, + const bool write_after_commit = true, + const bool unprepared_batch = false); static Status MarkRollback(WriteBatch* batch, const Slice& xid); @@ -116,10 +118,10 @@ class WriteBatchInternal { // Set the count for the number of entries in the batch. static void SetCount(WriteBatch* batch, int n); - // Return the seqeunce number for the start of this batch. 
+ // Return the sequence number for the start of this batch. static SequenceNumber Sequence(const WriteBatch* batch); - // Store the specified number as the seqeunce number for the start of + // Store the specified number as the sequence number for the start of // this batch. static void SetSequence(WriteBatch* batch, SequenceNumber seq); @@ -137,6 +139,9 @@ class WriteBatchInternal { static Status SetContents(WriteBatch* batch, const Slice& contents); + static Status CheckSlicePartsLength(const SliceParts& key, + const SliceParts& value); + // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive. // // If ignore_missing_column_families == true. WriteBatch @@ -154,31 +159,31 @@ class WriteBatchInternal { // // Under concurrent use, the caller is responsible for making sure that // the memtables object itself is thread-local. - static Status InsertInto(WriteThread::WriteGroup& write_group, - SequenceNumber sequence, - ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, - bool ignore_missing_column_families = false, - uint64_t log_number = 0, DB* db = nullptr, - bool concurrent_memtable_writes = false); + static Status InsertInto( + WriteThread::WriteGroup& write_group, SequenceNumber sequence, + ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, + bool ignore_missing_column_families = false, uint64_t log_number = 0, + DB* db = nullptr, bool concurrent_memtable_writes = false, + bool seq_per_batch = false, bool batch_per_txn = true); // Convenience form of InsertInto when you have only one batch - // last_seq_used returns the last sequnce number used in a MemTable insert - static Status InsertInto(const WriteBatch* batch, - ColumnFamilyMemTables* memtables, - FlushScheduler* flush_scheduler, - bool ignore_missing_column_families = false, - uint64_t log_number = 0, DB* db = nullptr, - bool concurrent_memtable_writes = false, - SequenceNumber* last_seq_used = nullptr, - bool* has_valid_writes = nullptr); + // next_seq returns the seq after the last sequence number used in a MemTable insert + static Status InsertInto( + const WriteBatch* batch, ColumnFamilyMemTables* memtables, + FlushScheduler* flush_scheduler, + bool ignore_missing_column_families = false, uint64_t log_number = 0, + DB* db = nullptr, bool concurrent_memtable_writes = false, + SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr, + bool seq_per_batch = false, bool batch_per_txn = true); static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, bool ignore_missing_column_families = false, uint64_t log_number = 0, DB* db = nullptr, - bool concurrent_memtable_writes = false); + bool concurrent_memtable_writes = false, + bool seq_per_batch = false, size_t batch_cnt = 0, + bool batch_per_txn = true); static Status Append(WriteBatch* dst, const WriteBatch* src, const bool WAL_only = false); @@ -186,6 +191,11 @@ class WriteBatchInternal { // Returns the byte size of appending a WriteBatch with ByteSize // leftByteSize and a WriteBatch with ByteSize rightByteSize static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize); + + // This write batch includes the latest state that should be persisted. Such + // state is meant to be used only during recovery.
+ static void SetAsLastestPersistentState(WriteBatch* b); + static bool IsLatestPersistentState(const WriteBatch* b); }; // LocalSavePoint is similar to a scope guard diff --git a/thirdparty/rocksdb/db/write_batch_test.cc b/thirdparty/rocksdb/db/write_batch_test.cc index 4584793abe..322bd8945b 100644 --- a/thirdparty/rocksdb/db/write_batch_test.cc +++ b/thirdparty/rocksdb/db/write_batch_test.cc @@ -18,7 +18,6 @@ #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_buffer_manager.h" #include "table/scoped_arena_iterator.h" -#include "util/logging.h" #include "util/string_util.h" #include "util/testharness.h" @@ -52,7 +51,8 @@ static std::string PrintContents(WriteBatch* b) { iter = mem->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = mem->NewRangeTombstoneIterator(ReadOptions()); + iter = mem->NewRangeTombstoneIterator(ReadOptions(), + kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } if (iter == nullptr) { @@ -236,8 +236,8 @@ TEST_F(WriteBatchTest, SingleDeletion) { namespace { struct TestHandler : public WriteBatch::Handler { std::string seen; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { if (column_family_id == 0) { seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; } else { @@ -246,8 +246,7 @@ namespace { } return Status::OK(); } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { if (column_family_id == 0) { seen += "Delete(" + key.ToString() + ")"; } else { @@ -256,8 +255,8 @@ namespace { } return Status::OK(); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { if (column_family_id == 0) { seen += "SingleDelete(" + key.ToString() + ")"; } else { @@ -266,9 +265,8 @@ namespace { } return Status::OK(); } - virtual Status DeleteRangeCF(uint32_t column_family_id, - const Slice& begin_key, - const Slice& end_key) override { + Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key, + const Slice& end_key) override { if (column_family_id == 0) { seen += "DeleteRange(" + begin_key.ToString() + ", " + end_key.ToString() + ")"; @@ -278,8 +276,8 @@ namespace { } return Status::OK(); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { if (column_family_id == 0) { seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; } else { @@ -288,22 +286,27 @@ namespace { } return Status::OK(); } - virtual void LogData(const Slice& blob) override { + void LogData(const Slice& blob) override { seen += "LogData(" + blob.ToString() + ")"; } - virtual Status MarkBeginPrepare() override { - seen += "MarkBeginPrepare()"; + Status MarkBeginPrepare(bool unprepare) override { + seen += + "MarkBeginPrepare(" + std::string(unprepare ? 
"true" : "false") + ")"; return Status::OK(); } - virtual Status MarkEndPrepare(const Slice& xid) override { + Status MarkEndPrepare(const Slice& xid) override { seen += "MarkEndPrepare(" + xid.ToString() + ")"; return Status::OK(); } - virtual Status MarkCommit(const Slice& xid) override { + Status MarkNoop(bool empty_batch) override { + seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")"; + return Status::OK(); + } + Status MarkCommit(const Slice& xid) override { seen += "MarkCommit(" + xid.ToString() + ")"; return Status::OK(); } - virtual Status MarkRollback(const Slice& xid) override { + Status MarkRollback(const Slice& xid) override { seen += "MarkRollback(" + xid.ToString() + ")"; return Status::OK(); } @@ -400,7 +403,7 @@ TEST_F(WriteBatchTest, PrepareCommit) { TestHandler handler; batch.Iterate(&handler); ASSERT_EQ( - "MarkBeginPrepare()" + "MarkBeginPrepare(false)" "Put(k1, v1)" "Put(k2, v2)" "MarkEndPrepare(xid1)" @@ -415,7 +418,7 @@ TEST_F(WriteBatchTest, PrepareCommit) { TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { // Insert key and value of 3GB and push total batch size to 12GB. static const size_t kKeyValueSize = 4u; - static const uint32_t kNumUpdates = 3 << 30; + static const uint32_t kNumUpdates = uint32_t(3 << 30); std::string raw(kKeyValueSize, 'A'); WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); char c = 'A'; @@ -434,8 +437,8 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { struct NoopHandler : public WriteBatch::Handler { uint32_t num_seen = 0; char expected_char = 'A'; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value) override { EXPECT_EQ(kKeyValueSize, key.size()); EXPECT_EQ(kKeyValueSize, value.size()); EXPECT_EQ(expected_char, key[0]); @@ -449,23 +452,23 @@ TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { ++num_seen; return Status::OK(); } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { ADD_FAILURE(); return Status::OK(); } - virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } - virtual bool Continue() override { return num_seen < kNumUpdates; } + void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } + bool Continue() override { return num_seen < kNumUpdates; } } handler; batch.Iterate(&handler); @@ -489,8 +492,8 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { struct NoopHandler : public WriteBatch::Handler { int num_seen = 0; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t /*column_family_id*/, const Slice& key, + const Slice& value) override { EXPECT_EQ(kKeyValueSize, key.size()); EXPECT_EQ(kKeyValueSize, value.size()); EXPECT_EQ('A' + num_seen, key[0]); @@ -500,23 +503,23 @@ TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { ++num_seen; return Status::OK(); } - virtual Status DeleteCF(uint32_t column_family_id, 
- const Slice& key) override { + Status DeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status SingleDeleteCF(uint32_t /*column_family_id*/, + const Slice& /*key*/) override { ADD_FAILURE(); return Status::OK(); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/, + const Slice& /*value*/) override { ADD_FAILURE(); return Status::OK(); } - virtual void LogData(const Slice& blob) override { ADD_FAILURE(); } - virtual bool Continue() override { return num_seen < 2; } + void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); } + bool Continue() override { return num_seen < 2; } } handler; batch.Iterate(&handler); @@ -528,31 +531,30 @@ TEST_F(WriteBatchTest, Continue) { struct Handler : public TestHandler { int num_seen = 0; - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { ++num_seen; return TestHandler::PutCF(column_family_id, key, value); } - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status DeleteCF(uint32_t column_family_id, const Slice& key) override { ++num_seen; return TestHandler::DeleteCF(column_family_id, key); } - virtual Status SingleDeleteCF(uint32_t column_family_id, - const Slice& key) override { + Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { ++num_seen; return TestHandler::SingleDeleteCF(column_family_id, key); } - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { ++num_seen; return TestHandler::MergeCF(column_family_id, key, value); } - virtual void LogData(const Slice& blob) override { + void LogData(const Slice& blob) override { ++num_seen; TestHandler::LogData(blob); } - virtual bool Continue() override { return num_seen < 5; } + bool Continue() override { return num_seen < 5; } } handler; batch.Put(Slice("k1"), Slice("v1")); diff --git a/thirdparty/rocksdb/db/write_callback_test.cc b/thirdparty/rocksdb/db/write_callback_test.cc index d2bf30a093..cb880560ef 100644 --- a/thirdparty/rocksdb/db/write_callback_test.cc +++ b/thirdparty/rocksdb/db/write_callback_test.cc @@ -29,7 +29,7 @@ class WriteCallbackTest : public testing::Test { string dbname; WriteCallbackTest() { - dbname = test::TmpDir() + "/write_callback_testdb"; + dbname = test::PerThreadDBPath("write_callback_testdb"); } }; @@ -54,9 +54,7 @@ class WriteCallbackTestWriteCallback1 : public WriteCallback { class WriteCallbackTestWriteCallback2 : public WriteCallback { public: - Status Callback(DB *db) override { - return Status::Busy(); - } + Status Callback(DB* /*db*/) override { return Status::Busy(); } bool AllowWriteBatching() override { return true; } }; @@ -74,7 +72,7 @@ class MockWriteCallback : public WriteCallback { was_called_.store(other.was_called_.load()); } - Status Callback(DB* db) override { + Status Callback(DB* /*db*/) override { was_called_.store(true); if (should_fail_) { return Status::Busy(); @@ -126,6 +124,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { {false, false, true, false, true}, }; + for (auto& seq_per_batch : {true, false}) { for (auto& 
two_queues : {true, false}) { for (auto& allow_parallel : {true, false}) { for (auto& allow_batching : {true, false}) { @@ -136,14 +135,34 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { options.create_if_missing = true; options.allow_concurrent_memtable_write = allow_parallel; options.enable_pipelined_write = enable_pipelined_write; - options.concurrent_prepare = two_queues; + options.two_write_queues = two_queues; + if (options.enable_pipelined_write && seq_per_batch) { + // This combination is not supported + continue; + } + if (options.enable_pipelined_write && options.two_write_queues) { + // This combination is not supported + continue; + } ReadOptions read_options; DB* db; DBImpl* db_impl; DestroyDB(dbname, options); - ASSERT_OK(DB::Open(options, dbname, &db)); + + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector<ColumnFamilyDescriptor> column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector<ColumnFamilyHandle*> handles; + auto open_s = + DBImpl::Open(db_options, dbname, column_families, &handles, + &db, seq_per_batch, true /* batch_per_txn */); + ASSERT_OK(open_s); + assert(handles.size() == 1); + delete handles[0]; db_impl = dynamic_cast<DBImpl*>(db); ASSERT_TRUE(db_impl); @@ -259,16 +278,41 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { string sval(10, my_key); write_op.Put(skey, sval); - if (!write_op.callback_.should_fail_) { + if (!write_op.callback_.should_fail_ && !seq_per_batch) { seq.fetch_add(1); } } + if (!write_op.callback_.should_fail_ && seq_per_batch) { + seq.fetch_add(1); + } WriteOptions woptions; woptions.disableWAL = !enable_WAL; woptions.sync = enable_WAL; - Status s = db_impl->WriteWithCallback( - woptions, &write_op.write_batch_, &write_op.callback_); + Status s; + if (seq_per_batch) { + class PublishSeqCallback : public PreReleaseCallback { + public: + PublishSeqCallback(DBImpl* db_impl_in) + : db_impl_(db_impl_in) {} + Status Callback(SequenceNumber last_seq, bool /*not used*/, + uint64_t) override { + db_impl_->SetLastPublishedSequence(last_seq); + return Status::OK(); + } + DBImpl* db_impl_; + } publish_seq_callback(db_impl); + // seq_per_batch requires a natural batch separator or Noop + WriteBatchInternal::InsertNoop(&write_op.write_batch_); + const size_t ONE_BATCH = 1; + s = db_impl->WriteImpl( + woptions, &write_op.write_batch_, &write_op.callback_, + nullptr, 0, false, nullptr, ONE_BATCH, + two_queues ?
&publish_seq_callback : nullptr); + } else { + s = db_impl->WriteWithCallback( + woptions, &write_op.write_batch_, &write_op.callback_); + } if (write_op.callback_.should_fail_) { ASSERT_TRUE(s.IsBusy()); @@ -305,7 +349,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } - ASSERT_EQ(seq.load(), db_impl->GetLatestSequenceNumber()); + ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence()); delete db; DestroyDB(dbname, options); @@ -316,6 +360,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { } } } +} TEST_F(WriteCallbackTest, WriteCallBackTest) { Options options; @@ -388,7 +433,7 @@ int main(int argc, char** argv) { #else #include <stdio.h> -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n"); return 0; diff --git a/thirdparty/rocksdb/db/write_controller_test.cc b/thirdparty/rocksdb/db/write_controller_test.cc index a1fe3fa27e..55feb00a33 100644 --- a/thirdparty/rocksdb/db/write_controller_test.cc +++ b/thirdparty/rocksdb/db/write_controller_test.cc @@ -18,7 +18,7 @@ class TimeSetEnv : public EnvWrapper { public: explicit TimeSetEnv() : EnvWrapper(nullptr) {} uint64_t now_micros_ = 6666; - virtual uint64_t NowNanos() override { return now_micros_ * std::milli::den; } + uint64_t NowNanos() override { return now_micros_ * std::milli::den; } }; TEST_F(WriteControllerTest, ChangeDelayRateTest) { diff --git a/thirdparty/rocksdb/db/write_thread.cc b/thirdparty/rocksdb/db/write_thread.cc index 2d3b34602c..835992c8fc 100644 --- a/thirdparty/rocksdb/db/write_thread.cc +++ b/thirdparty/rocksdb/db/write_thread.cc @@ -7,6 +7,7 @@ #include <chrono> #include <thread> #include "db/column_family.h" +#include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/random.h" #include "util/sync_point.h" @@ -23,7 +24,10 @@ WriteThread::WriteThread(const ImmutableDBOptions& db_options) enable_pipelined_write_(db_options.enable_pipelined_write), newest_writer_(nullptr), newest_memtable_writer_(nullptr), - last_sequence_(0) {} + last_sequence_(0), + write_stall_dummy_(), + stall_mu_(), + stall_cv_(&stall_mu_) {} uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) { // We're going to block. Lazily create the mutex. We guarantee @@ -73,6 +77,10 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, port::AsmVolatilePause(); } + // This is below the fast path, so that the stat is zero when all writes are + // from the same thread. + PERF_TIMER_GUARD(write_thread_wait_nanos); + // If we're only going to end up waiting a short period of time, // it can be a lot more efficient to call std::this_thread::yield() // in a loop than to block in StateMutex(). For reference, on my 4.0 @@ -173,6 +181,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask, } if ((state & goal_mask) == 0) { + TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w); state = BlockingAwaitState(w, goal_mask); } @@ -214,6 +223,28 @@ bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) { assert(w->state == STATE_INIT); Writer* writers = newest_writer->load(std::memory_order_relaxed); while (true) { + // If a write stall is in effect and w->no_slowdown is not true, + // block here until the stall is cleared.
If it's true, then return + // immediately + if (writers == &write_stall_dummy_) { + if (w->no_slowdown) { + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + return false; + } + // Since no_slowdown is false, wait here to be notified of the write + // stall clearing + { + MutexLock lock(&stall_mu_); + writers = newest_writer->load(std::memory_order_relaxed); + if (writers == &write_stall_dummy_) { + stall_cv_.Wait(); + // Load newest_writers_ again since it may have changed + writers = newest_writer->load(std::memory_order_relaxed); + continue; + } + } + } w->link_older = writers; if (newest_writer->compare_exchange_weak(writers, w)) { return (writers == nullptr); @@ -258,6 +289,17 @@ void WriteThread::CreateMissingNewerLinks(Writer* head) { } } +WriteThread::Writer* WriteThread::FindNextLeader(Writer* from, + Writer* boundary) { + assert(from != nullptr && from != boundary); + Writer* current = from; + while (current->link_older != boundary) { + current = current->link_older; + assert(current != nullptr); + } + return current; +} + void WriteThread::CompleteLeader(WriteGroup& write_group) { assert(write_group.size > 0); Writer* leader = write_group.leader; @@ -287,12 +329,44 @@ void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) { SetState(w, STATE_COMPLETED); } +void WriteThread::BeginWriteStall() { + LinkOne(&write_stall_dummy_, &newest_writer_); + + // Walk writer list until w->write_group != nullptr. The current write group + // will not have a mix of slowdown/no_slowdown, so it's ok to stop at that + // point + Writer* w = write_stall_dummy_.link_older; + Writer* prev = &write_stall_dummy_; + while (w != nullptr && w->write_group == nullptr) { + if (w->no_slowdown) { + prev->link_older = w->link_older; + w->status = Status::Incomplete("Write stall"); + SetState(w, STATE_COMPLETED); + w = prev->link_older; + } else { + prev = w; + w = w->link_older; + } + } +} + +void WriteThread::EndWriteStall() { + MutexLock lock(&stall_mu_); + + assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_); + newest_writer_.exchange(write_stall_dummy_.link_older); + + // Wake up writers + stall_cv_.SignalAll(); +} + static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup"); void WriteThread::JoinBatchGroup(Writer* w) { TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w); assert(w->batch != nullptr); bool linked_as_leader = LinkOne(w, &newest_writer_); + if (linked_as_leader) { SetState(w, STATE_GROUP_LEADER); } @@ -313,6 +387,7 @@ void WriteThread::JoinBatchGroup(Writer* w) { * 3.2) an existing memtable writer group leader tell us to finish memtable * writes in parallel.
*/ + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w); AwaitState(w, STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED, &jbg_ctx); @@ -394,6 +469,7 @@ size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader, write_group->last_writer = w; write_group->size++; } + TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w); return size; } @@ -455,7 +531,8 @@ void WriteThread::EnterAsMemTableWriter(Writer* leader, last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1; } -void WriteThread::ExitAsMemTableWriter(Writer* self, WriteGroup& write_group) { +void WriteThread::ExitAsMemTableWriter(Writer* /*self*/, + WriteGroup& write_group) { Writer* leader = write_group.leader; Writer* last_writer = write_group.last_writer; @@ -532,6 +609,11 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, Writer* last_writer = write_group.last_writer; assert(leader->link_older == nullptr); + // Propagate memtable write error to the whole group. + if (status.ok() && !write_group.status.ok()) { + status = write_group.status; + } + if (enable_pipelined_write_) { // Notify writers don't write to memtable to exit. for (Writer* w = last_writer; w != leader;) { @@ -545,21 +627,49 @@ void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group, if (!leader->ShouldWriteToMemtable()) { CompleteLeader(write_group); } + + Writer* next_leader = nullptr; + + // Look for the next leader before we call LinkGroup. If there are no + // pending writers, place a dummy writer at the tail of the queue + // so we know the boundary of the current write group. + Writer dummy; + Writer* expected = last_writer; + bool has_dummy = newest_writer_.compare_exchange_strong(expected, &dummy); + if (!has_dummy) { + // We found at least one pending writer when we inserted the dummy. We + // search for the next leader from there. + next_leader = FindNextLeader(expected, last_writer); + assert(next_leader != nullptr && next_leader != last_writer); + } + // Link the remainder of the group to the memtable writer list. + // + // We have to link our group to the memtable writer queue before waking up + // the next leader or setting newest_writer_ to null, otherwise the next + // leader can run ahead of us and link to the memtable writer queue before + // we do. if (write_group.size > 0) { if (LinkGroup(write_group, &newest_memtable_writer_)) { // The leader can now be different from current writer. SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER); } } - // Reset newest_writer_ and wake up the next leader. - Writer* newest_writer = last_writer; - if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) { - Writer* next_leader = newest_writer; - while (next_leader->link_older != last_writer) { - next_leader = next_leader->link_older; - assert(next_leader != nullptr); + + // If we have inserted the dummy in the queue, remove it now and check if + // any pending writers joined the queue since we inserted it. If so, + // look for the next leader again.
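The has_dummy handling this comment describes follows next. The dummy-writer dance exists so the departing leader can mark where its group ends; FindNextLeader, added earlier in this diff, then walks the singly linked newest-to-oldest list back to that boundary. A standalone sketch of the walk:

```cpp
// Standalone sketch mirroring the FindNextLeader helper in write_thread.cc:
// starting from the newest pending writer, follow link_older until reaching
// the writer whose link_older is the old group's boundary; that writer
// becomes the next group leader.
#include <cassert>

struct Writer {
  Writer* link_older = nullptr;  // next-older writer in the queue
};

Writer* FindNextLeader(Writer* from, Writer* boundary) {
  assert(from != nullptr && from != boundary);
  Writer* current = from;
  while (current->link_older != boundary) {
    current = current->link_older;
    assert(current != nullptr);  // boundary must be reachable from `from`
  }
  return current;
}
```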
+ if (has_dummy) { + assert(next_leader == nullptr); + expected = &dummy; + bool has_pending_writer = + !newest_writer_.compare_exchange_strong(expected, nullptr); + if (has_pending_writer) { + next_leader = FindNextLeader(expected, &dummy); + assert(next_leader != nullptr && next_leader != &dummy); } + } + + if (next_leader != nullptr) { next_leader->link_older = nullptr; SetState(next_leader, STATE_GROUP_LEADER); } diff --git a/thirdparty/rocksdb/db/write_thread.h b/thirdparty/rocksdb/db/write_thread.h index 57ce71e08f..dc9c22ff87 100644 --- a/thirdparty/rocksdb/db/write_thread.h +++ b/thirdparty/rocksdb/db/write_thread.h @@ -14,6 +14,8 @@ #include <type_traits> #include <vector> +#include "db/dbformat.h" +#include "db/pre_release_callback.h" #include "db/write_callback.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/options.h" @@ -116,6 +118,8 @@ class WriteThread { bool no_slowdown; bool disable_wal; bool disable_memtable; + size_t batch_cnt; // if non-zero, number of sub-batches in the write batch + PreReleaseCallback* pre_release_callback; uint64_t log_used; // log number that this batch was inserted into uint64_t log_ref; // log number that memtable insert should reference WriteCallback* callback; @@ -123,8 +127,9 @@ class WriteThread { std::atomic<uint8_t> state; // write under StateMutex() or pre-link WriteGroup* write_group; SequenceNumber sequence; // the sequence number to use for the first key - Status status; // status of memtable inserter + Status status; Status callback_status; // status returned by callback->Callback() + std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes; std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes; Writer* link_older; // read/write only before linking, or as leader @@ -136,28 +141,36 @@ class WriteThread { no_slowdown(false), disable_wal(false), disable_memtable(false), + batch_cnt(0), + pre_release_callback(nullptr), log_used(0), log_ref(0), callback(nullptr), made_waitable(false), state(STATE_INIT), write_group(nullptr), + sequence(kMaxSequenceNumber), link_older(nullptr), link_newer(nullptr) {} Writer(const WriteOptions& write_options, WriteBatch* _batch, - WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable) + WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable, + size_t _batch_cnt = 0, + PreReleaseCallback* _pre_release_callback = nullptr) : batch(_batch), sync(write_options.sync), no_slowdown(write_options.no_slowdown), disable_wal(write_options.disableWAL), disable_memtable(_disable_memtable), + batch_cnt(_batch_cnt), + pre_release_callback(_pre_release_callback), log_used(0), log_ref(_log_ref), callback(_callback), made_waitable(false), state(STATE_INIT), write_group(nullptr), + sequence(kMaxSequenceNumber), link_older(nullptr), link_newer(nullptr) {} @@ -329,6 +342,13 @@ class WriteThread { return last_sequence_; } + // Insert a dummy writer at the tail of the write queue to indicate a write + // stall, and fail any writers in the queue with no_slowdown set to true + void BeginWriteStall(); + + // Remove the dummy writer and wake up waiting writers + void EndWriteStall(); + private: // See AwaitState. const uint64_t max_yield_usec_; @@ -352,6 +372,17 @@ class WriteThread { // is not necessary visible to reads because the writer can be ongoing. SequenceNumber last_sequence_; + // A dummy writer to indicate a write stall condition.
+  // A dummy writer to indicate a write stall condition. This will be inserted
+  // at the tail of the writer queue by the leader, so newer writers can just
+  // check for this and bail.
+  Writer write_stall_dummy_;
+
+  // Mutex and condvar for writers to block on a write stall. During a write
+  // stall, writers with no_slowdown set to false will wait on this rather
+  // than on the writer queue.
+  port::Mutex stall_mu_;
+  port::CondVar stall_cv_;
+
  // Waits for w->state & goal_mask using w->StateMutex(). Returns
  // the state that satisfies goal_mask.
  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
@@ -379,6 +410,10 @@
  // concurrently with itself.
  void CreateMissingNewerLinks(Writer* head);
+  // Starting from a pending writer, follow link_older to search for the next
+  // leader, until we hit the boundary.
+  Writer* FindNextLeader(Writer* pending_writer, Writer* boundary);
+
  // Set the leader in write_group to completed state and remove it from the
  // write group.
  void CompleteLeader(WriteGroup& write_group);
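The leader hand-off above is the subtlest part of this patch. Here is a minimal, self-contained sketch (our illustration, not RocksDB code; `Writer`, `Link`, and `FindNextLeader` below are simplified stand-ins for the real members) of the underlying queue discipline: writers push themselves onto an atomic stack head, the analogue of `newest_writer_`; the oldest linked writer acts as the group leader; and an exiting leader uses a compare-exchange plus a backward walk over `link_older` to find the next leader.

```c++
#include <atomic>
#include <cassert>

// Stripped-down writer node; the real Writer also carries the batch, state,
// and synchronization members.
struct Writer {
  Writer* link_older = nullptr;  // next writer towards the front of the queue
};

std::atomic<Writer*> newest_writer{nullptr};

// Writers join the queue by swapping themselves in as the newest element
// (the spirit of WriteThread::LinkOne).
void Link(Writer* w) {
  Writer* head = newest_writer.load(std::memory_order_relaxed);
  do {
    w->link_older = head;
  } while (!newest_writer.compare_exchange_weak(head, w));
}

// Walk backwards from a newer pending writer until we reach the writer
// directly behind `boundary` (the spirit of FindNextLeader above).
Writer* FindNextLeader(Writer* from, Writer* boundary) {
  while (from->link_older != boundary) {
    from = from->link_older;
    assert(from != nullptr);
  }
  return from;
}

int main() {
  Writer a, b, c;
  Link(&a);  // a is the oldest writer, i.e. the group leader
  Link(&b);
  Link(&c);

  // Leader a exits its (single-writer) group: if the head is still a, the
  // queue becomes empty; otherwise the writer right behind a takes over.
  Writer* expected = &a;
  if (!newest_writer.compare_exchange_strong(expected, nullptr)) {
    Writer* next_leader = FindNextLeader(expected, &a);
    assert(next_leader == &b);
  }
  return 0;
}
```

The dummy writer in the patch exists because, with pipelined writes, the group has to be linked onto the memtable writer queue before the next leader is woken up, so the exiting leader needs a way to mark its group's boundary even when no pending writer has arrived yet.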
diff --git a/thirdparty/rocksdb/defs.bzl b/thirdparty/rocksdb/defs.bzl
new file mode 100644
index 0000000000..4468cebdd2
--- /dev/null
+++ b/thirdparty/rocksdb/defs.bzl
@@ -0,0 +1,31 @@
+load("@fbcode_macros//build_defs:cpp_binary.bzl", "cpp_binary")
+load("@fbcode_macros//build_defs:custom_unittest.bzl", "custom_unittest")
+
+def test_binary(
+        test_name,
+        test_cc,
+        parallelism,
+        rocksdb_arch_preprocessor_flags,
+        rocksdb_compiler_flags,
+        rocksdb_preprocessor_flags,
+        rocksdb_external_deps):
+    TEST_RUNNER = native.package_name() + "/buckifier/rocks_test_runner.sh"
+
+    ttype = "gtest" if parallelism == "parallel" else "simple"
+    test_bin = test_name + "_bin"
+
+    cpp_binary(
+        name = test_bin,
+        srcs = [test_cc],
+        arch_preprocessor_flags = rocksdb_arch_preprocessor_flags,
+        compiler_flags = rocksdb_compiler_flags,
+        preprocessor_flags = rocksdb_preprocessor_flags,
+        deps = [":rocksdb_test_lib"],
+        external_deps = rocksdb_external_deps,
+    )
+
+    custom_unittest(
+        name = test_name,
+        command = [TEST_RUNNER, "$(location :{})".format(test_bin)],
+        type = ttype,
+    )
diff --git a/thirdparty/rocksdb/docs/.gitignore b/thirdparty/rocksdb/docs/.gitignore
new file mode 100644
index 0000000000..e48dc98be8
--- /dev/null
+++ b/thirdparty/rocksdb/docs/.gitignore
@@ -0,0 +1,9 @@
+.DS_STORE
+_site/
+*.swo
+*.swp
+_site
+.sass-cache
+*.psd
+*~
+
diff --git a/thirdparty/rocksdb/docs/CNAME b/thirdparty/rocksdb/docs/CNAME
new file mode 100644
index 0000000000..827d1c0ed1
--- /dev/null
+++ b/thirdparty/rocksdb/docs/CNAME
@@ -0,0 +1 @@
+rocksdb.org
\ No newline at end of file
diff --git a/thirdparty/rocksdb/docs/CONTRIBUTING.md b/thirdparty/rocksdb/docs/CONTRIBUTING.md
new file mode 100644
index 0000000000..2c5842fb45
--- /dev/null
+++ b/thirdparty/rocksdb/docs/CONTRIBUTING.md
@@ -0,0 +1,115 @@
+This provides guidance on how to contribute various content to `rocksdb.org`.
+
+## Getting started
+
+You should only have to do these steps once.
+
+- Rename this file to `CONTRIBUTING.md`.
+- Rename `EXAMPLE-README-FOR-RUNNING-DOCS.md` to `README.md` (replacing the existing `README.md` that came with the template).
+- Rename `EXAMPLE-LICENSE` to `LICENSE`.
+- Review the [template information](./TEMPLATE-INFORMATION.md).
+- Review `./_config.yml`.
+- Make sure you update `title`, `description`, `tagline` and `gacode` (Google Analytics) in `./_config.yml`.
+
+## Basic Structure
+
+Most content is written in markdown. You name the file `something.md`, then have a header that looks like this:
+
+```
+---
+docid: getting-started
+title: Getting started with ProjectName
+layout: docs
+permalink: /docs/getting-started.html
+---
+```
+
+Customize these values for each document, blog post, etc.
+
+> The filename of the `.md` file doesn't actually matter; what is important is that the `docid` is unique and that the `permalink` is correct and unique too.
+
+## Landing page
+
+Modify `index.md` with your new or updated content.
+
+If you want a `GridBlock` as part of your content, you can do so directly with HTML:
+
+```
+<div class="gridBlock">
+  <div class="blockElement twoByGridBlock alignLeft">
+    <div class="blockContent">
+      <h3>More information</h3>
+      <p>
+        Stuff here
+      </p>
+    </div>
+  </div>
+</div>
+```
+
+or with a combination of changing `./_data/features.yml` and adding some Liquid to `index.md`, such as:
+
+```
+{% include content/gridblocks.html data_source=site.data.features imagealign="bottom"%}
+```
+
+## Blog
+
+To modify a blog post, edit the appropriate markdown file in `./_posts/`.
+
+Adding a new blog post is a four-step process.
+
+> Some posts have a `permalink` and `comments` in the blog post YAML header. You will not need these for new blog posts. These are an artifact of migrating the blog from Wordpress to gh-pages.
+
+1. Create your blog post in `./_posts/` in markdown (file extension `.md` or `.markdown`). See current posts in that folder or `./doc-type-examples/2016-04-07-blog-post-example.md` for an example of the YAML format. **If the `./_posts` directory does not exist, create it**.
+   - You can add a `<!--truncate-->` tag in the middle of your post such that you show only the excerpt above that tag in the main `/blog` index on your page.
+1. If you have not authored a blog post before, modify the `./_data/authors.yml` file with the `author` id you used in your blog post, along with your full name and Facebook ID to get your profile picture.
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/blog/your-new-blog-post-title.html`
+1. Push your changes to GitHub.
+
+## Docs
+
+To modify docs, edit the appropriate markdown file in `./_docs/`.
+
+To add docs to the site....
+
+1. Add your markdown file to the `./_docs/` folder. See `./doc-type-examples/docs-hello-world.md` for an example of the YAML header format. **If the `./_docs/` directory does not exist, create it**.
+   - You can use folders in the `./_docs/` directory to organize your content if you want.
+1. Update `_data/nav_docs.yml` to add your new document to the navigation bar. Use the `docid` you put in your doc markdown as the `id` in the `_data/nav_docs.yml` file.
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/docs/your-new-doc-permalink.html`
+1. Push your changes to GitHub.
+
+## Header Bar
+
+To modify the header bar, change `./_data/nav.yml`.
+
+## Top Level Page
+
+To modify a top-level page, edit the appropriate markdown file in `./top-level/`
+
+If you want a top-level page (e.g., http://your-site.com/top-level.html) -- not in `/blog/` or `/docs/`....
+
+1. Create a markdown file in the root `./top-level/`. See `./doc-type-examples/top-level-example.md` for more information.
+1. If you want a visible link to that file, update `_data/nav.yml` to add a link to your new top-level document in the header bar.
+
+   > This is not necessary if you just want to have a page that is linked to from another page, but not exposed as a direct link to the user.
+
+1. [Run the site locally](./README.md) to test your changes. It will be at `http://127.0.0.1/your-top-level-page-permalink.html`
+1. Push your changes to GitHub.
+
+## Other Changes
+
+- CSS: `./css/main.css` or `./_sass/*.scss`.
+- Images: `./static/images/[docs | posts]/....` +- Main Blog post HTML: `./_includes/post.html` +- Main Docs HTML: `./_includes/doc.html` diff --git a/thirdparty/rocksdb/docs/Gemfile b/thirdparty/rocksdb/docs/Gemfile new file mode 100644 index 0000000000..93dc8b0d7f --- /dev/null +++ b/thirdparty/rocksdb/docs/Gemfile @@ -0,0 +1,2 @@ +source 'https://rubygems.org' +gem 'github-pages', '~> 104' diff --git a/thirdparty/rocksdb/docs/Gemfile.lock b/thirdparty/rocksdb/docs/Gemfile.lock new file mode 100644 index 0000000000..78dc919a98 --- /dev/null +++ b/thirdparty/rocksdb/docs/Gemfile.lock @@ -0,0 +1,146 @@ +GEM + remote: https://rubygems.org/ + specs: + activesupport (4.2.7) + i18n (~> 0.7) + json (~> 1.7, >= 1.7.7) + minitest (~> 5.1) + thread_safe (~> 0.3, >= 0.3.4) + tzinfo (~> 1.1) + addressable (2.4.0) + coffee-script (2.4.1) + coffee-script-source + execjs + coffee-script-source (1.12.2) + colorator (1.1.0) + concurrent-ruby (1.0.5) + ethon (0.11.0) + ffi (>= 1.3.0) + execjs (2.7.0) + faraday (0.15.2) + multipart-post (>= 1.2, < 3) + ffi (1.9.25) + forwardable-extended (2.6.0) + gemoji (2.1.0) + github-pages (104) + activesupport (= 4.2.7) + github-pages-health-check (= 1.2.0) + jekyll (>= 3.8.4) + jekyll-avatar (= 0.4.2) + jekyll-coffeescript (= 1.0.1) + jekyll-feed (= 0.8.0) + jekyll-gist (= 1.4.0) + jekyll-github-metadata (= 2.2.0) + jekyll-mentions (= 1.2.0) + jekyll-paginate (= 1.1.0) + jekyll-redirect-from (= 0.11.0) + jekyll-sass-converter (= 1.3.0) + jekyll-seo-tag (= 2.1.0) + jekyll-sitemap (= 0.12.0) + jekyll-swiss (= 0.4.0) + jemoji (= 0.7.0) + kramdown (= 1.11.1) + liquid (= 3.0.6) + listen (= 3.0.6) + mercenary (~> 0.3) + minima (= 2.0.0) + rouge (= 1.11.1) + terminal-table (~> 1.4) + github-pages-health-check (1.2.0) + addressable (~> 2.3) + net-dns (~> 0.8) + octokit (~> 4.0) + public_suffix (~> 1.4) + typhoeus (~> 0.7) + html-pipeline (2.4.2) + activesupport (>= 2) + nokogiri (~> 1.8.2) + i18n (0.7.0) + jekyll (3.8.4) + addressable (~> 2.4) + colorator (~> 1.0) + jekyll-sass-converter (~> 1.0) + jekyll-watch (~> 1.1) + kramdown (~> 1.3) + liquid (~> 3.0) + mercenary (~> 0.3.3) + pathutil (~> 0.9) + rouge (~> 1.7) + safe_yaml (~> 1.0) + jekyll-avatar (0.4.2) + jekyll (~> 3.0) + jekyll-coffeescript (1.0.1) + coffee-script (~> 2.2) + jekyll-feed (0.8.0) + jekyll (~> 3.3) + jekyll-gist (1.4.0) + octokit (~> 4.2) + jekyll-github-metadata (2.2.0) + jekyll (~> 3.1) + octokit (~> 4.0, != 4.4.0) + jekyll-mentions (1.2.0) + activesupport (~> 4.0) + html-pipeline (~> 2.3) + jekyll (~> 3.0) + jekyll-paginate (1.1.0) + jekyll-redirect-from (0.11.0) + jekyll (>= 2.0) + jekyll-sass-converter (1.3.0) + sass (~> 3.2) + jekyll-seo-tag (2.1.0) + jekyll (~> 3.3) + jekyll-sitemap (0.12.0) + jekyll (~> 3.3) + jekyll-swiss (0.4.0) + jekyll-watch (1.5.0) + listen (~> 3.0, < 3.1) + jemoji (0.7.0) + activesupport (~> 4.0) + gemoji (~> 2.0) + html-pipeline (~> 2.2) + jekyll (>= 3.0) + json (1.8.3) + kramdown (1.11.1) + liquid (3.0.6) + listen (3.0.6) + rb-fsevent (>= 0.9.3) + rb-inotify (>= 0.9.7) + mercenary (0.3.6) + mini_portile2 (2.3.0) + minima (2.0.0) + minitest (5.9.1) + multipart-post (2.0.0) + net-dns (0.8.0) + nokogiri (~> 1.8.2) + mini_portile2 (~> 2.3.0) + octokit (4.4.1) + sawyer (~> 0.7.0, >= 0.5.3) + pathutil (0.14.0) + forwardable-extended (~> 2.6) + public_suffix (1.5.3) + rb-fsevent (0.9.8) + rb-inotify (0.9.7) + ffi (>= 0.5.0) + rouge (1.11.1) + safe_yaml (1.0.4) + sass (3.4.22) + sawyer (0.7.0) + addressable (>= 2.3.5, < 2.5) + faraday (~> 0.8, < 0.10) + terminal-table (1.7.3) 
+ unicode-display_width (~> 1.1.1) + thread_safe (0.3.5) + typhoeus (0.8.0) + ethon (>= 0.8.0) + tzinfo (1.2.2) + thread_safe (~> 0.1) + unicode-display_width (1.1.1) + +PLATFORMS + ruby + +DEPENDENCIES + github-pages (~> 104) + +BUNDLED WITH + 1.13.1 diff --git a/thirdparty/rocksdb/docs/LICENSE-DOCUMENTATION b/thirdparty/rocksdb/docs/LICENSE-DOCUMENTATION new file mode 100644 index 0000000000..1f255c9f37 --- /dev/null +++ b/thirdparty/rocksdb/docs/LICENSE-DOCUMENTATION @@ -0,0 +1,385 @@ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. 
More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + +b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + +c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + +d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + +e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + +f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + +g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + +h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + +i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + +j. 
Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + +k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + +a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + +b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. 
In all other cases the Licensor expressly + reserves any right to collect such royalties. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + +a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + +a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + +b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + +c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + +a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + +b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + +c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + +a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + +b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + +c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + +d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + +a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + +b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + +a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + +b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + +c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + +d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public licenses. +Notwithstanding, Creative Commons may elect to apply one of its public +licenses to material it publishes and in those instances will be +considered the "Licensor." 
+Except for the limited purpose of indicating
+that material is shared under a Creative Commons public license or as
+otherwise permitted by the Creative Commons policies published at
+creativecommons.org/policies, Creative Commons does not authorize the
+use of the trademark "Creative Commons" or any other trademark or logo
+of Creative Commons without its prior written consent including,
+without limitation, in connection with any unauthorized modifications
+to any of its public licenses or any other arrangements,
+understandings, or agreements concerning use of licensed material. For
+the avoidance of doubt, this paragraph does not form part of the public
+licenses.
+
+Creative Commons may be contacted at creativecommons.org.
+
diff --git a/thirdparty/rocksdb/docs/README.md b/thirdparty/rocksdb/docs/README.md
new file mode 100644
index 0000000000..0ae8978bcb
--- /dev/null
+++ b/thirdparty/rocksdb/docs/README.md
@@ -0,0 +1,80 @@
+## User Documentation for rocksdb.org
+
+This directory will contain the user and feature documentation for RocksDB. The documentation will be hosted on GitHub pages.
+
+### Contributing
+
+See [CONTRIBUTING.md](./CONTRIBUTING.md) for details on how to add or modify content.
+
+### Run the Site Locally
+
+The requirements for running a GitHub pages site locally are described in [GitHub help](https://help.github.com/articles/setting-up-your-github-pages-site-locally-with-jekyll/#requirements). The steps below summarize them.
+
+> If you have run the site before, you can start with step 1 and then move on to step 5.
+
+1. Ensure that you are in the `/docs` directory in your local RocksDB clone (i.e., the same directory where this `README.md` exists). The RubyGems commands below, etc., must be run from there.
+
+1. Make sure you have Ruby and [RubyGems](https://rubygems.org/) installed.
+
+   > Ruby >= 2.2 is required for the gems. On the latest versions of Mac OS X, Ruby 2.0 is the
+   > default. Use `brew install ruby` (or your preferred upgrade mechanism) to install a newer
+   > version of Ruby for your Mac OS X system.
+
+1. Make sure you have [Bundler](http://bundler.io/) installed.
+
+    ```
+    # may require sudo
+    gem install bundler
+    ```
+1. Install the project's dependencies
+
+    ```
+    # run this in the 'docs' directory
+    bundle install
+    ```
+
+    > If you get an error when installing `nokogiri`, you may be running into the problem described
+    > in [this nokogiri issue](https://github.com/sparklemotion/nokogiri/issues/1483). You can
+    > either `brew uninstall xz` (and then `brew install xz` after the bundle is installed) or
+    > `xcode-select --install` (although this may not work if you have already installed command
+    > line tools).
+
+1. Run Jekyll's server.
+
+   - On first runs or for structural changes to the documentation (e.g., new sidebar menu item), do a full build.
+
+   ```
+   bundle exec jekyll serve
+   ```
+
+   - For content changes only, you can use `--incremental` for faster builds.
+
+   ```
+   bundle exec jekyll serve --incremental
+   ```
+
+   > We use `bundle exec` instead of running straight `jekyll` because `bundle exec` will always use the version of Jekyll from our `Gemfile`. Just running `jekyll` will use the system version and may not necessarily be compatible.
+
+   - To run using an actual IP address, you can use `--host=0.0.0.0`
+
+   ```
+   bundle exec jekyll serve --host=0.0.0.0
+   ```
+
+   This will allow you to use the IP address associated with your machine in the URL. That way you could share it with other people.
+
+   e.g., on a Mac, you can get your IP address with something like `ifconfig | grep "inet " | grep -v 127.0.0.1`.
+
+1. Either of the commands in the previous step will serve up the site on your local device at http://127.0.0.1:4000/ or http://localhost:4000.
+
+### Updating the Bundle
+
+The site depends on GitHub Pages and the installed bundle is based on the `github-pages` gem.
+Occasionally that gem might get updated with new or changed functionality. If that is the case,
+you can run:
+
+```
+bundle update
+```
+
+to get the latest packages for the installation.
diff --git a/thirdparty/rocksdb/docs/TEMPLATE-INFORMATION.md b/thirdparty/rocksdb/docs/TEMPLATE-INFORMATION.md
new file mode 100644
index 0000000000..9175bc0c29
--- /dev/null
+++ b/thirdparty/rocksdb/docs/TEMPLATE-INFORMATION.md
@@ -0,0 +1,17 @@
+## Template Details
+
+First, go through `_config.yml` and adjust the available settings to your project's standard. When you make changes here, you'll have to kill the `jekyll serve` instance and restart it to see those changes, but that's only the case with the config file.
+
+Next, update some image assets - you'll want to update `favicon.png`, `logo.svg`, and `og_image.png` (used for Like button stories and Shares on Facebook) in the `static` folder with your own logos.
+
+Next, if you're going to have docs on your site, keep the `_docs` and `docs` folders; if not, you can safely remove them (or you can safely leave them and not include them in your navigation - Jekyll renders all of this before a client views the site anyway, so there's no performance hit from just leaving it there for a future expansion).
+
+Same thing with a blog section: either keep or delete the `_posts` and `blog` folders.
+
+You can customize your homepage in three parts - the first is in the homepage header, which is mostly automatically derived from the elements you insert into your config file. However, you can also specify a series of 'promotional' elements in `_data/promo.yml`. You can read that file for more information.
+
+The second place for your homepage is in `index.md`, which contains the bulk of the main content below the header. This is all markdown if you want, but you can use HTML and Jekyll's template tags (called Liquid) in there too. Check out this folder's index.md for an example of one common template tag that we use on our sites called gridblocks.
+
+The third and last place is in the `_data/powered_by.yml` and `_data/powered_by_highlight.yml` files. Both these files combine to create a section on the homepage that is intended to show a list of companies or apps that are using your project. The `powered_by_highlight` file is a list of curated companies/apps that you want to show as a highlight at the top of this section, including their logos in whatever format you want. The `powered_by` file is a more open list that is just text links to the companies/apps and can be updated via Pull Request by the community. If you don't want these sections on your homepage, just empty out both files and leave them blank.
+
+The last thing you'll want to do is to set up your top-level navigation bar. You can do this by editing `nav.yml` and keeping the existing title/href/category structure used there. Although the nav is responsive and fairly flexible design-wise, no more than 5 or 6 nav items are recommended.
diff --git a/thirdparty/rocksdb/docs/_config.yml b/thirdparty/rocksdb/docs/_config.yml
new file mode 100644
index 0000000000..2e5cee097f
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_config.yml
@@ -0,0 +1,85 @@
+# Site settings
+permalink: /blog/:year/:month/:day/:title.html
+title: RocksDB
+tagline: A persistent key-value store for fast storage environments
+description: >
+  RocksDB is an embeddable persistent key-value store for fast storage.
+fbappid: "1615782811974223"
+gacode: "UA-49459723-1"
+# baseurl determines the subpath of your site. For example if you're using an
+# organisation.github.io/reponame/ basic site URL, then baseurl would be set
+# as "/reponame" but leave blank if you have a top-level domain URL as it is
+# now set to "" by default as discussed in:
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+baseurl: ""
+
+# the base hostname & protocol for your site
+# If baseurl is set, then the absolute url for your site would be url/baseurl
+# This will also be set to the right thing automatically for local development
+# https://github.com/blog/2277-what-s-new-in-github-pages-with-jekyll-3-3
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+url: "http://rocksdb.org"
+
+# Note: There are new filters in Jekyll 3.3 to help with absolute and relative urls
+# absolute_url
+# relative_url
+# So you will see these used throughout the Jekyll code in this template.
+# no more need for | prepend: site.url | prepend: site.baseurl
+# http://jekyllrb.com/news/2016/10/06/jekyll-3-3-is-here/
+#https://github.com/blog/2277-what-s-new-in-github-pages-with-jekyll-3-3
+
+# The GitHub repo for your project
+ghrepo: "facebook/rocksdb"
+
+# Use these color settings to determine your colour scheme for the site.
+color:
+  # primary should be a vivid color that reflects the project's brand
+  primary: "#2a2a2a"
+  # secondary should be a subtle light or dark color used on page backgrounds
+  secondary: "#f9f9f9"
+  # Use the following to specify whether the previous two colours are 'light'
+  # or 'dark' and therefore what colors can be overlaid on them
+  primary-overlay: "dark"
+  secondary-overlay: "light"
+
+#Uncomment this if you want to enable Algolia doc search with your own values
+#searchconfig:
+#  apikey: ""
+#  indexname: ""
+
+# Blog posts are built into Jekyll by default, with the `_posts` directory.
+# Here you can specify other types of documentation. The names here are `docs`
+# and `top-level`. This means their content will be in `_docs` and `_top-level`.
+# The permalink format is also given.
+# http://ben.balter.com/2015/02/20/jekyll-collections/
+collections:
+  docs:
+    output: true
+    permalink: /docs/:name/
+  top-level:
+    output: true
+    permalink: :name.html
+
+# DO NOT ADJUST BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE CHANGING
+
+markdown: kramdown
+kramdown:
+  input: GFM
+  syntax_highlighter: rouge
+
+  syntax_highlighter_opts:
+    css_class: 'rougeHighlight'
+    span:
+      line_numbers: false
+    block:
+      line_numbers: true
+      start_line: 1
+
+sass:
+  style: :compressed
+
+redcarpet:
+  extensions: [with_toc_data]
+
+gems:
+  - jekyll-redirect-from
diff --git a/thirdparty/rocksdb/docs/_data/authors.yml b/thirdparty/rocksdb/docs/_data/authors.yml
new file mode 100644
index 0000000000..13225be9df
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/authors.yml
@@ -0,0 +1,70 @@
+icanadi:
+  full_name: Igor Canadi
+  fbid: 706165749
+
+xjin:
+  full_name: Xing Jin
+  fbid: 100000739847320
+
+leijin:
+  full_name: Lei Jin
+  fbid: 634570164
+
+yhciang:
+  full_name: Yueh-Hsuan Chiang
+  fbid: 1619020986
+
+radheshyam:
+  full_name: Radheshyam Balasundaram
+  fbid: 800837305
+
+zagfox:
+  full_name: Feng Zhu
+  fbid: 100006493823622
+
+lgalanis:
+  full_name: Leonidas Galanis
+  fbid: 8649950
+
+sdong:
+  full_name: Siying Dong
+  fbid: 9805119
+
+dmitrism:
+  full_name: Dmitri Smirnov
+
+rven2:
+  full_name: Venkatesh Radhakrishnan
+  fbid: 100008352697325
+
+yiwu:
+  full_name: Yi Wu
+  fbid: 100000476362039
+
+maysamyabandeh:
+  full_name: Maysam Yabandeh
+  fbid: 100003482360101
+
+IslamAbdelRahman:
+  full_name: Islam AbdelRahman
+  fbid: 642759407
+
+ajkr:
+  full_name: Andrew Kryczka
+  fbid: 568694102
+
+abhimadan:
+  full_name: Abhishek Madan
+  fbid: 1850247869
+
+sagar0:
+  full_name: Sagar Vemuri
+  fbid: 2419111
+
+lightmark:
+  full_name: Aaron Gao
+  fbid: 1351549072
+
+fgwu:
+  full_name: Fenggang Wu
+  fbid: 100002297362180
diff --git a/thirdparty/rocksdb/docs/_data/features.yml b/thirdparty/rocksdb/docs/_data/features.yml
new file mode 100644
index 0000000000..d692c1849d
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/features.yml
@@ -0,0 +1,19 @@
+- title: High Performance
+  text: |
+    RocksDB uses a log structured database engine, written entirely in C++, for maximum performance. Keys and values are just arbitrarily-sized byte streams.
+  image: images/promo-performance.svg
+
+- title: Optimized for Fast Storage
+  text: |
+    RocksDB is optimized for fast, low latency storage such as flash drives and high-speed disk drives. RocksDB exploits the full potential of high read/write rates offered by flash or RAM.
+  image: images/promo-flash.svg
+
+- title: Adaptable
+  text: |
+    RocksDB is adaptable to different workloads. From database storage engines such as [MyRocks](https://github.com/facebook/mysql-5.6) to [application data caching](http://techblog.netflix.com/2016/05/application-data-caching-using-ssds.html) to embedded workloads, RocksDB can be used for a variety of data needs.
+  image: images/promo-adapt.svg
+
+- title: Basic and Advanced Database Operations
+  text: |
+    RocksDB provides basic operations such as opening and closing a database and reading and writing, as well as more advanced operations such as merging and compaction filters.
+  image: images/promo-operations.svg
diff --git a/thirdparty/rocksdb/docs/_data/nav.yml b/thirdparty/rocksdb/docs/_data/nav.yml
new file mode 100644
index 0000000000..108de02545
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/nav.yml
@@ -0,0 +1,30 @@
+- title: Docs
+  href: /docs/
+  category: docs
+
+- title: GitHub
+  href: https://github.com/facebook/rocksdb/
+  category: external
+
+- title: API (C++)
+  href: https://github.com/facebook/rocksdb/tree/master/include/rocksdb
+  category: external
+
+- title: API (Java)
+  href: https://github.com/facebook/rocksdb/tree/master/java/src/main/java/org/rocksdb
+  category: external
+
+- title: Support
+  href: /support.html
+  category: support
+
+- title: Blog
+  href: /blog/
+  category: blog
+
+- title: Facebook
+  href: https://www.facebook.com/groups/rocksdb.dev/
+  category: external
+
+# Use external for external links not associated with the paths of the current site.
+# If a category is external, site URLs, for example, are not prepended to the href, etc.
diff --git a/thirdparty/rocksdb/docs/_data/nav_docs.yml b/thirdparty/rocksdb/docs/_data/nav_docs.yml
new file mode 100644
index 0000000000..8cdfd2d04d
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/nav_docs.yml
@@ -0,0 +1,3 @@
+- title: Quick Start
+  items:
+  - id: getting-started
diff --git a/thirdparty/rocksdb/docs/_data/powered_by.yml b/thirdparty/rocksdb/docs/_data/powered_by.yml
new file mode 100644
index 0000000000..a780cfe401
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/powered_by.yml
@@ -0,0 +1 @@
+# Fill in later if desired
diff --git a/thirdparty/rocksdb/docs/_data/powered_by_highlight.yml b/thirdparty/rocksdb/docs/_data/powered_by_highlight.yml
new file mode 100644
index 0000000000..a780cfe401
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/powered_by_highlight.yml
@@ -0,0 +1 @@
+# Fill in later if desired
diff --git a/thirdparty/rocksdb/docs/_data/promo.yml b/thirdparty/rocksdb/docs/_data/promo.yml
new file mode 100644
index 0000000000..9a72aa844c
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_data/promo.yml
@@ -0,0 +1,6 @@
+# This file determines the list of promotional elements added to the header of \
+# your site's homepage. A full list of plugins is shown.
+
+- type: button
+  href: docs/getting-started.html
+  text: Get Started
diff --git a/thirdparty/rocksdb/docs/_docs/faq.md b/thirdparty/rocksdb/docs/_docs/faq.md
new file mode 100644
index 0000000000..0887a0987f
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_docs/faq.md
@@ -0,0 +1,48 @@
+---
+docid: support-faq
+title: FAQ
+layout: docs
+permalink: /docs/support/faq.html
+---
+
+Here is an ever-growing list of frequently asked questions about RocksDB.
+
+## What is RocksDB?
+
+RocksDB is an embeddable persistent key-value store for fast storage. RocksDB can also be the foundation for a client-server database, but our current focus is on embedded workloads.
+
+RocksDB builds on [LevelDB](https://code.google.com/p/leveldb/) to be scalable to run on servers with many CPU cores, to efficiently use fast storage, to support IO-bound, in-memory and write-once workloads, and to be flexible to allow for innovation.
+
+For the latest details, watch [Mark Callaghan’s and Igor Canadi’s talk at CMU on 10/2015](https://scs.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=f4e0eb37-ae18-468f-9248-cb73edad3e56). [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference provides some perspective about how RocksDB has evolved.
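To make "embeddable" concrete, here is a minimal sketch of a program that links RocksDB directly into the process (our illustration, not part of the FAQ; the database path is arbitrary). The calls below are the library's public C++ API, as described in the getting-started guide; there is no server to deploy:

```c++
#include <cassert>
#include <string>

#include "rocksdb/db.h"

int main() {
  rocksdb::DB* db;
  rocksdb::Options options;
  options.create_if_missing = true;

  // The store lives entirely inside this process; opening it is a library
  // call, not a connection to a server. "/tmp/rocksdb_faq_example" is just an
  // arbitrary directory chosen for this sketch.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/rocksdb_faq_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key", "value");
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok() && value == "value");

  delete db;  // closing the database is just deleting the object
  return 0;
}
```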
+
+## How does performance compare?
+
+We benchmarked LevelDB and found that it was unsuitable for our server workloads. The [benchmark results](http://leveldb.googlecode.com/svn/trunk/doc/benchmark.html) look awesome at first sight, but we quickly realized that those results were for a database whose size was smaller than the size of RAM on the test machine – where the entire database could fit in the OS page cache. When we performed the same benchmarks on a database that was at least 5 times larger than main memory, the performance results were dismal.
+
+By contrast, we’ve published the [RocksDB benchmark results](https://github.com/facebook/rocksdb/wiki/Performance-Benchmarks) for server-side workloads on Flash. We also measured the performance of LevelDB on these server-workload benchmarks and found that RocksDB solidly outperforms LevelDB for these IO-bound workloads. We found that LevelDB’s single-threaded compaction process was insufficient to drive server workloads. We saw frequent write-stalls with LevelDB that caused 99-percentile latency to be tremendously large. We found that mmap-ing a file into the OS cache introduced performance bottlenecks for reads. We could not make LevelDB consume all the IOs offered by the underlying Flash storage.
+
+## What is RocksDB suitable for?
+
+RocksDB can be used by applications that need low latency database accesses. Possibilities include:
+
+* A user-facing application that stores the viewing history and state of users of a website.
+* A spam detection application that needs fast access to big data sets.
+* A graph-search application that needs to scan a data set in realtime.
+* A cache for data from Hadoop, allowing applications to query Hadoop data in realtime.
+* A message-queue that supports a high number of inserts and deletes.
+
+## How big is RocksDB adoption?
+
+RocksDB is an embedded storage engine that is used in a number of backend systems at Facebook. In the Facebook newsfeed’s backend, it replaced another internal storage engine called Centrifuge and is one of the many components used. ZippyDB, a distributed key-value store service used by Facebook products, relies on RocksDB. Details on ZippyDB are in [Muthu Annamalai’s talk at Data@Scale in Seattle](https://youtu.be/DfiN7pG0D0k). Dragon, a distributed graph query engine that is part of the social graph infrastructure, uses RocksDB to store data. Parse has been running [MongoDB on RocksDB in production](http://blog.parse.com/announcements/mongodb-rocksdb-parse/) since early 2015.
+
+RocksDB is proving to be a useful component for a lot of other groups in the industry. For a list of projects currently using RocksDB, take a look at our USERS.md list on GitHub.
+
+## How good is RocksDB as a database storage engine?
+
+Our engineering team at Facebook firmly believes that RocksDB has great potential as a storage engine for databases. It has been proven in production with MongoDB: [MongoRocks](https://github.com/mongodb-partners/mongo-rocks) is the RocksDB-based storage engine for MongoDB.
+
+[MyRocks](https://code.facebook.com/posts/190251048047090/myrocks-a-space-and-write-optimized-mysql-database/) is the RocksDB-based storage engine for MySQL. Using RocksDB we have managed to achieve 2x better compression and 10x less write amplification for our benchmarks compared to our existing MySQL setup. Given our current results, work is currently underway to develop MyRocks into a production-ready solution for web-scale MySQL workloads.
Follow along on [GitHub](https://github.com/facebook/mysql-5.6)!
+
+## Why is RocksDB open sourced?
+
+We are open sourcing this project on [GitHub](http://github.com/facebook/rocksdb) because we think it will be useful beyond Facebook. We are hoping that software programmers and database developers will use, enhance, and customize RocksDB for their use-cases. We would also like to engage with the academic community on topics related to efficiency for modern database algorithms.
diff --git a/thirdparty/rocksdb/docs/_docs/getting-started.md b/thirdparty/rocksdb/docs/_docs/getting-started.md
new file mode 100644
index 0000000000..8b01dfefd4
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_docs/getting-started.md
@@ -0,0 +1,78 @@
+---
+docid: getting-started
+title: Getting started
+layout: docs
+permalink: /docs/getting-started.html
+---
+
+## Overview
+
+The RocksDB library provides a persistent key value store. Keys and values are arbitrary byte arrays. The keys are ordered within the key value store according to a user-specified comparator function.
+
+The library is maintained by the Facebook Database Engineering Team, and is based on [LevelDB](https://github.com/google/leveldb), by Sanjay Ghemawat and Jeff Dean at Google.
+
+This overview gives some simple examples of how RocksDB is used. For the story of why RocksDB was created in the first place, see [Dhruba Borthakur’s introductory talk](https://github.com/facebook/rocksdb/blob/gh-pages-old/intro.pdf?raw=true) from the Data @ Scale 2013 conference.
+
+## Opening A Database
+
+A rocksdb database has a name which corresponds to a file system directory. All of the contents of the database are stored in this directory. The following example shows how to open a database, creating it if necessary:
+
+```c++
+#include <cassert>
+#include "rocksdb/db.h"
+
+rocksdb::DB* db;
+rocksdb::Options options;
+options.create_if_missing = true;
+rocksdb::Status status =
+  rocksdb::DB::Open(options, "/tmp/testdb", &db);
+assert(status.ok());
+...
+```
+
+If you want to raise an error if the database already exists, add the following line before the rocksdb::DB::Open call:
+
+```c++
+options.error_if_exists = true;
+```
+
+## Status
+
+You may have noticed the `rocksdb::Status` type above. Values of this type are returned by most functions in RocksDB that may encounter
+an error. You can check if such a result is ok, and also print an associated error message:
+
+```c++
+rocksdb::Status s = ...;
+if (!s.ok()) cerr << s.ToString() << endl;
+```
+
+## Closing A Database
+
+When you are done with a database, just delete the database object. For example:
+
+```c++
+/* open the db as described above */
+/* do something with db */
+delete db;
+```
+
+## Reads And Writes
+
+The database provides Put, Delete, and Get methods to modify/query the database. For example, the following code moves the value stored under `key1` to `key2`.
+
+```c++
+std::string value;
+rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
+if (s.ok()) s = db->Put(rocksdb::WriteOptions(), key2, value);
+if (s.ok()) s = db->Delete(rocksdb::WriteOptions(), key1);
+```
+
+## Further documentation
+
+These are just simple examples of how RocksDB is used. The full documentation is currently on the [GitHub wiki](https://github.com/facebook/rocksdb/wiki).
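One caveat worth noting about the Reads And Writes example above: the Get/Put/Delete sequence is not atomic, since the process can die after the Put but before the Delete, leaving the value under both keys. RocksDB's `WriteBatch`, declared in `rocksdb/write_batch.h`, applies a group of updates atomically. The sketch below is ours, not part of the original guide; `MoveValue` is a hypothetical helper name:

```c++
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Move the value stored under key1 to key2 atomically: after Write() returns,
// either both the Delete and the Put are applied, or neither is.
rocksdb::Status MoveValue(rocksdb::DB* db, const std::string& key1,
                          const std::string& key2) {
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key1, &value);
  if (!s.ok()) {
    return s;  // includes the not-found case
  }
  rocksdb::WriteBatch batch;
  batch.Delete(key1);
  batch.Put(key2, value);
  return db->Write(rocksdb::WriteOptions(), &batch);
}
```

Beyond atomicity, batching consecutive updates into one `WriteBatch` also amortizes per-write overhead, which typically speeds up bulk loads.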
+ +Here are some specific details about the RocksDB implementation: + +- [Architecture Guide](https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide) +- [Format of an immutable Table file](https://github.com/facebook/rocksdb/wiki/Rocksdb-Table-Format) +- [Format of a log file](https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format) diff --git a/thirdparty/rocksdb/docs/_includes/blog_pagination.html b/thirdparty/rocksdb/docs/_includes/blog_pagination.html new file mode 100644 index 0000000000..6a1f33436e --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/blog_pagination.html @@ -0,0 +1,28 @@ + +{% if paginator.total_pages > 1 %} +
+ +
+{% endif %} diff --git a/thirdparty/rocksdb/docs/_includes/content/gridblocks.html b/thirdparty/rocksdb/docs/_includes/content/gridblocks.html new file mode 100644 index 0000000000..49c5e5917d --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/content/gridblocks.html @@ -0,0 +1,5 @@ +
+{% for item in {{include.data_source}} %} + {% include content/items/gridblock.html item=item layout=include.layout imagealign=include.imagealign align=include.align %} +{% endfor %} +
\ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/content/items/gridblock.html b/thirdparty/rocksdb/docs/_includes/content/items/gridblock.html new file mode 100644 index 0000000000..58c9e7fdaf --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/content/items/gridblock.html @@ -0,0 +1,37 @@ +{% if include.layout == "fourColumn" %} + {% assign layout = "fourByGridBlock" %} +{% else %} + {% assign layout = "twoByGridBlock" %} +{% endif %} + +{% if include.imagealign == "side" %} + {% assign imagealign = "imageAlignSide" %} +{% else %} + {% if item.image %} + {% assign imagealign = "imageAlignTop" %} + {% else %} + {% assign imagealign = "" %} + {% endif %} +{% endif %} + +{% if include.align == "right" %} + {% assign align = "alignRight" %} +{% elsif include.align == "center" %} + {% assign align = "alignCenter" %} +{% else %} + {% assign align = "alignLeft" %} +{% endif %} + +
+ {% if item.image %} +
+ {{ item.title }} +
+ {% endif %} +
+

{{ item.title }}

+ {% if item.text %} + {{ item.text | markdownify }} + {% endif %} +
+
diff --git a/thirdparty/rocksdb/docs/_includes/doc.html b/thirdparty/rocksdb/docs/_includes/doc.html new file mode 100644 index 0000000000..a7950004ec --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/doc.html @@ -0,0 +1,25 @@ +
+
+

{% if include.truncate %}{{ page.title }}{% else %}{{ page.title }}{% endif %}

+
+ +
+ {% if include.truncate %} + {% if page.content contains '' %} + {{ page.content | split:'' | first }} + + {% else %} + {{ page.content }} + {% endif %} + {% else %} + {{ content }} + +

Edit on GitHub

+ {% endif %} +
+ {% include doc_paging.html %} +
diff --git a/thirdparty/rocksdb/docs/_includes/doc_paging.html b/thirdparty/rocksdb/docs/_includes/doc_paging.html new file mode 100644 index 0000000000..e69de29bb2 diff --git a/thirdparty/rocksdb/docs/_includes/footer.html b/thirdparty/rocksdb/docs/_includes/footer.html new file mode 100644 index 0000000000..dd9494aeb5 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/footer.html @@ -0,0 +1,33 @@ +
+ +
+ diff --git a/thirdparty/rocksdb/docs/_includes/head.html b/thirdparty/rocksdb/docs/_includes/head.html new file mode 100644 index 0000000000..10845ec1d5 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/head.html @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + {% if site.searchconfig %} + + {% endif %} + + {% if page.title %}{{ page.title }} | {{ site.title }}{% else %}{{ site.title }}{% endif %} + + + + + diff --git a/thirdparty/rocksdb/docs/_includes/header.html b/thirdparty/rocksdb/docs/_includes/header.html new file mode 100644 index 0000000000..8108d222be --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/header.html @@ -0,0 +1,19 @@ +
+
+
+ +

{{ site.title }}

+

{{ site.tagline }}

+ +
+

{% if page.excerpt %}{{ page.excerpt | strip_html }}{% else %}{{ site.description }}{% endif %}

+
+
+ {% for promo in site.data.promo %} + {% include plugins/{{promo.type}}.html button_href=promo.href button_text=promo.text %} +
+ {% endfor %} +
+
+
+
diff --git a/thirdparty/rocksdb/docs/_includes/hero.html b/thirdparty/rocksdb/docs/_includes/hero.html new file mode 100644 index 0000000000..e69de29bb2 diff --git a/thirdparty/rocksdb/docs/_includes/home_header.html b/thirdparty/rocksdb/docs/_includes/home_header.html new file mode 100644 index 0000000000..90880d17cf --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/home_header.html @@ -0,0 +1,22 @@ +
+
+
+
+

{{ site.tagline }}

+
+

{% if page.excerpt %}{{ page.excerpt | strip_html }}{% else %}{{ site.description }}{% endif %}

+
+
+ {% for promo in site.data.promo %} +
+ {% include plugins/{{promo.type}}.html href=promo.href text=promo.text children=promo.children %} +
+ {% endfor %} +
+
+ +
+
+
diff --git a/thirdparty/rocksdb/docs/_includes/katex_import.html b/thirdparty/rocksdb/docs/_includes/katex_import.html new file mode 100644 index 0000000000..6d6b7cf44a --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/katex_import.html @@ -0,0 +1,3 @@ + + + diff --git a/thirdparty/rocksdb/docs/_includes/katex_render.html b/thirdparty/rocksdb/docs/_includes/katex_render.html new file mode 100644 index 0000000000..56e2e89743 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/katex_render.html @@ -0,0 +1,210 @@ + diff --git a/thirdparty/rocksdb/docs/_includes/nav.html b/thirdparty/rocksdb/docs/_includes/nav.html new file mode 100644 index 0000000000..9c6fed06b1 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav.html @@ -0,0 +1,37 @@ +
+
+
+ + +

{{ site.title }}

+
+ + + +
+
+
diff --git a/thirdparty/rocksdb/docs/_includes/nav/collection_nav.html b/thirdparty/rocksdb/docs/_includes/nav/collection_nav.html new file mode 100644 index 0000000000..a3c7a2dd35 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav/collection_nav.html @@ -0,0 +1,64 @@ +
+ +
+ diff --git a/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group.html b/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group.html new file mode 100644 index 0000000000..b236ac5e3f --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group.html @@ -0,0 +1,19 @@ + \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group_item.html b/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group_item.html new file mode 100644 index 0000000000..fbb063deb7 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav/collection_nav_group_item.html @@ -0,0 +1 @@ + diff --git a/thirdparty/rocksdb/docs/_includes/nav/header_nav.html b/thirdparty/rocksdb/docs/_includes/nav/header_nav.html new file mode 100644 index 0000000000..0fe945cdcd --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav/header_nav.html @@ -0,0 +1,30 @@ +
+ + +
+ \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/nav_search.html b/thirdparty/rocksdb/docs/_includes/nav_search.html new file mode 100644 index 0000000000..84956b9f78 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/nav_search.html @@ -0,0 +1,15 @@ + + + \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/all_share.html b/thirdparty/rocksdb/docs/_includes/plugins/all_share.html new file mode 100644 index 0000000000..59b00d615f --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/all_share.html @@ -0,0 +1,3 @@ +
+ {% include plugins/like_button.html %}{% include plugins/twitter_share.html %}{% include plugins/google_share.html %} +
\ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/ascii_cinema.html b/thirdparty/rocksdb/docs/_includes/plugins/ascii_cinema.html new file mode 100644 index 0000000000..7d3f971480 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/ascii_cinema.html @@ -0,0 +1,2 @@ +
+ \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/button.html b/thirdparty/rocksdb/docs/_includes/plugins/button.html new file mode 100644 index 0000000000..9e499fe3f3 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/button.html @@ -0,0 +1,6 @@ + \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/github_star.html b/thirdparty/rocksdb/docs/_includes/plugins/github_star.html new file mode 100644 index 0000000000..6aea70fc73 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/github_star.html @@ -0,0 +1,4 @@ +
+ Star +
+ \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/github_watch.html b/thirdparty/rocksdb/docs/_includes/plugins/github_watch.html new file mode 100644 index 0000000000..64233b57b6 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/github_watch.html @@ -0,0 +1,4 @@ +
+ Watch +
+ \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/google_share.html b/thirdparty/rocksdb/docs/_includes/plugins/google_share.html new file mode 100644 index 0000000000..1b557db86c --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/google_share.html @@ -0,0 +1,5 @@ +
+
+
+ + diff --git a/thirdparty/rocksdb/docs/_includes/plugins/iframe.html b/thirdparty/rocksdb/docs/_includes/plugins/iframe.html new file mode 100644 index 0000000000..525b59f227 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/iframe.html @@ -0,0 +1,6 @@ +
+ +
+
+ {% include plugins/button.html href=include.href text=include.text %} +
\ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/like_button.html b/thirdparty/rocksdb/docs/_includes/plugins/like_button.html new file mode 100644 index 0000000000..bcb8a7beef --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/like_button.html @@ -0,0 +1,18 @@ +
+ \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/plugin_row.html b/thirdparty/rocksdb/docs/_includes/plugins/plugin_row.html new file mode 100644 index 0000000000..800f50b821 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/plugin_row.html @@ -0,0 +1,5 @@ +
+{% for child in include.children %} + {% include plugins/{{child.type}}.html href=child.href text=child.text %} +{% endfor %} +
\ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/post_social_plugins.html b/thirdparty/rocksdb/docs/_includes/plugins/post_social_plugins.html new file mode 100644 index 0000000000..a2ecb90eeb --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/post_social_plugins.html @@ -0,0 +1,41 @@ +
+ +
+
+ + + diff --git a/thirdparty/rocksdb/docs/_includes/plugins/slideshow.html b/thirdparty/rocksdb/docs/_includes/plugins/slideshow.html new file mode 100644 index 0000000000..69fa2b300e --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/slideshow.html @@ -0,0 +1,88 @@ +
+ + + \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_includes/plugins/twitter_follow.html b/thirdparty/rocksdb/docs/_includes/plugins/twitter_follow.html new file mode 100644 index 0000000000..b0f25dc605 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/twitter_follow.html @@ -0,0 +1,12 @@ + + + diff --git a/thirdparty/rocksdb/docs/_includes/plugins/twitter_share.html b/thirdparty/rocksdb/docs/_includes/plugins/twitter_share.html new file mode 100644 index 0000000000..a60f2a8dff --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/plugins/twitter_share.html @@ -0,0 +1,11 @@ +
+ +
+ diff --git a/thirdparty/rocksdb/docs/_includes/post.html b/thirdparty/rocksdb/docs/_includes/post.html new file mode 100644 index 0000000000..3ae0a2a808 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/post.html @@ -0,0 +1,40 @@ +
+
+
+ {% for author_idx in page.author %} +
+ {% assign author = site.data.authors[author_idx] %} + {% if author.fbid %} +
+ {{ author.fullname }} +
+ {% endif %} + {% if author.full_name %} + + {% endif %} +
+ {% endfor %} +
+

{% if include.truncate %}{{ page.title }}{% else %}{{ page.title }}{% endif %}

+ +
+
+ {% if include.truncate %} + {% if page.content contains '' %} + {{ page.content | split:'' | first | markdownify }} + + {% else %} + {{ page.content | markdownify }} + {% endif %} + {% else %} + {{ content }} + {% endif %} + {% unless include.truncate %} + {% include plugins/like_button.html %} + {% endunless %} +
+
diff --git a/thirdparty/rocksdb/docs/_includes/powered_by.html b/thirdparty/rocksdb/docs/_includes/powered_by.html new file mode 100644 index 0000000000..c629429cd0 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/powered_by.html @@ -0,0 +1,28 @@ +{% if site.data.powered_by.first.items or site.data.powered_by_highlight.first.items %} +
+
+ {% if site.data.powered_by_highlight.first.title %} +

{{ site.data.powered_by_highlight.first.title }}

+ {% else %} +

{{ site.data.powered_by.first.title }}

+ {% endif %} + {% if site.data.powered_by_highlight.first.items %} +
+ {% for item in site.data.powered_by_highlight.first.items %} +
+ {{ item.name }} +
+ {% endfor %} +
+ {% endif %} +
+ {% for item in site.data.powered_by.first.items %} + + {% endfor %} +
+
Does your app use {{ site.title }}? Add it to this list with a pull request!
+
+
+{% endif %} diff --git a/thirdparty/rocksdb/docs/_includes/social_plugins.html b/thirdparty/rocksdb/docs/_includes/social_plugins.html new file mode 100644 index 0000000000..9b36580dc0 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/social_plugins.html @@ -0,0 +1,31 @@ + +
+ +
+ + + diff --git a/thirdparty/rocksdb/docs/_includes/ui/button.html b/thirdparty/rocksdb/docs/_includes/ui/button.html new file mode 100644 index 0000000000..729ccc33b9 --- /dev/null +++ b/thirdparty/rocksdb/docs/_includes/ui/button.html @@ -0,0 +1 @@ +{{ include.button_text }} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_layouts/basic.html b/thirdparty/rocksdb/docs/_layouts/basic.html new file mode 100644 index 0000000000..65bd21060c --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/basic.html @@ -0,0 +1,12 @@ +--- +layout: doc_default +--- + +
+
+
+ {{ content }} +
+
+
+ diff --git a/thirdparty/rocksdb/docs/_layouts/blog.html b/thirdparty/rocksdb/docs/_layouts/blog.html new file mode 100644 index 0000000000..1b0da41359 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/blog.html @@ -0,0 +1,11 @@ +--- +category: blog +layout: blog_default +--- + +
+
+ {{ content }} +
+
+ diff --git a/thirdparty/rocksdb/docs/_layouts/blog_default.html b/thirdparty/rocksdb/docs/_layouts/blog_default.html new file mode 100644 index 0000000000..a29d58d3dd --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/blog_default.html @@ -0,0 +1,14 @@ + + + {% include head.html %} + + {% include nav.html alwayson=true %} + + + diff --git a/thirdparty/rocksdb/docs/_layouts/default.html b/thirdparty/rocksdb/docs/_layouts/default.html new file mode 100644 index 0000000000..0167d9fd91 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/default.html @@ -0,0 +1,12 @@ + + + {% include head.html %} + + {% include nav.html alwayson=true %} + + + + diff --git a/thirdparty/rocksdb/docs/_layouts/doc_default.html b/thirdparty/rocksdb/docs/_layouts/doc_default.html new file mode 100644 index 0000000000..4a4139247c --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/doc_default.html @@ -0,0 +1,14 @@ + + + {% include head.html %} + + {% include nav.html alwayson=true %} + + + diff --git a/thirdparty/rocksdb/docs/_layouts/doc_page.html b/thirdparty/rocksdb/docs/_layouts/doc_page.html new file mode 100644 index 0000000000..dba761e7d7 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/doc_page.html @@ -0,0 +1,10 @@ +--- +layout: doc_default +--- + +
+
+ {{ content }} +
+
+ diff --git a/thirdparty/rocksdb/docs/_layouts/docs.html b/thirdparty/rocksdb/docs/_layouts/docs.html new file mode 100644 index 0000000000..749dafabbe --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/docs.html @@ -0,0 +1,5 @@ +--- +layout: doc_page +--- + +{% include doc.html %} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_layouts/home.html b/thirdparty/rocksdb/docs/_layouts/home.html new file mode 100644 index 0000000000..e3c320f55c --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/home.html @@ -0,0 +1,17 @@ + + + {% include head.html %} + + {% include nav.html alwayson=true %} + + + diff --git a/thirdparty/rocksdb/docs/_layouts/page.html b/thirdparty/rocksdb/docs/_layouts/page.html new file mode 100644 index 0000000000..bec36805b2 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/page.html @@ -0,0 +1,3 @@ +--- +layout: blog +--- diff --git a/thirdparty/rocksdb/docs/_layouts/plain.html b/thirdparty/rocksdb/docs/_layouts/plain.html new file mode 100644 index 0000000000..fccc02ce17 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/plain.html @@ -0,0 +1,10 @@ +--- +layout: default +--- + +
+
+ {{ content }} +
+
+ diff --git a/thirdparty/rocksdb/docs/_layouts/post.html b/thirdparty/rocksdb/docs/_layouts/post.html new file mode 100644 index 0000000000..4c92cf214c --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/post.html @@ -0,0 +1,8 @@ +--- +collection: blog +layout: blog +--- + +
+{% include post.html %} +
\ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_layouts/redirect.html b/thirdparty/rocksdb/docs/_layouts/redirect.html new file mode 100644 index 0000000000..c24f817484 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/redirect.html @@ -0,0 +1,6 @@ + + + + + + diff --git a/thirdparty/rocksdb/docs/_layouts/top-level.html b/thirdparty/rocksdb/docs/_layouts/top-level.html new file mode 100644 index 0000000000..fccc02ce17 --- /dev/null +++ b/thirdparty/rocksdb/docs/_layouts/top-level.html @@ -0,0 +1,10 @@ +--- +layout: default +--- + +
+
+ {{ content }} +
+
+ diff --git a/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown b/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown new file mode 100644 index 0000000000..f9e4a54447 --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-backup-rocksdb.markdown @@ -0,0 +1,135 @@ +--- +title: How to backup RocksDB? +layout: post +author: icanadi +category: blog +redirect_from: + - /blog/191/how-to-backup-rocksdb/ +--- + +In RocksDB, we have implemented an easy way to backup your DB. Here is a simple example: + + + + #include "rocksdb/db.h" + #include "utilities/backupable_db.h" + using namespace rocksdb; + + DB* db; + DB::Open(Options(), "/tmp/rocksdb", &db); + BackupableDB* backupable_db = new BackupableDB(db, BackupableDBOptions("/tmp/rocksdb_backup")); + backupable_db->Put(...); // do your thing + backupable_db->CreateNewBackup(); + delete backupable_db; // no need to also delete db + + + + +This simple example will create a backup of your DB in "/tmp/rocksdb_backup". Creating new BackupableDB consumes DB* and you should be calling all the DB methods on object `backupable_db` going forward. + +Restoring is also easy: + + + + RestoreBackupableDB* restore = new RestoreBackupableDB(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup")); + restore->RestoreDBFromLatestBackup("/tmp/rocksdb", "/tmp/rocksdb"); + delete restore; + + + + +This code will restore the backup back to "/tmp/rocksdb". The second parameter is the location of log files (In some DBs they are different from DB directory, but usually they are the same. See Options::wal_dir for more info). + +An alternative API for backups is to use BackupEngine directly: + + + + #include "rocksdb/db.h" + #include "utilities/backupable_db.h" + using namespace rocksdb; + + DB* db; + DB::Open(Options(), "/tmp/rocksdb", &db); + db->Put(...); // do your thing + BackupEngine* backup_engine = BackupEngine::NewBackupEngine(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup")); + backup_engine->CreateNewBackup(db); + delete db; + delete backup_engine; + + + + +Restoring with BackupEngine is similar to RestoreBackupableDB: + + + + BackupEngine* backup_engine = BackupEngine::NewBackupEngine(Env::Default(), BackupableDBOptions("/tmp/rocksdb_backup")); + backup_engine->RestoreDBFromLatestBackup("/tmp/rocksdb", "/tmp/rocksdb"); + delete backup_engine; + + + + +Backups are incremental. You can create a new backup with `CreateNewBackup()` and only the new data will be copied to backup directory (for more details on what gets copied, see "Under the hood"). Checksum is always calculated for any backuped file (including sst, log, and etc). It is used to make sure files are kept sound in the file system. Checksum is also verified for files from the previous backups even though they do not need to be copied. A checksum mismatch aborts the current backup (see "Under the hood" for more details). Once you have more backups saved, you can issue `GetBackupInfo()` call to get a list of all backups together with information on timestamp of the backup and the size (please note that sum of all backups' sizes is bigger than the actual size of the backup directory because some data is shared by multiple backups). Backups are identified by their always-increasing IDs. `GetBackupInfo()` is available both in `BackupableDB` and `RestoreBackupableDB`. + +You probably want to keep around only small number of backups. To delete old backups, just call `PurgeOldBackups(N)`, where N is how many backups you'd like to keep. 
+All backups except the N newest ones will be deleted. You can also delete an arbitrary backup by calling `DeleteBackup(id)`.
+
+`RestoreDBFromLatestBackup()` will restore the DB from the latest consistent backup. An alternative is `RestoreDBFromBackup()`, which takes a backup ID and restores that particular backup. A checksum is calculated for every restored file and compared against the one stored at backup time. If a checksum mismatch is detected, the restore process is aborted and `Status::Corruption` is returned. One very important thing to note here: let's say you have backups 1, 2, 3, 4. If you restore from backup 2 and start writing more data to your database, the newly created backup will delete the old backups 3 and 4 and create a new backup 3 on top of 2.
+
+## Advanced usage
+
+Let's say you want to back up your DB to HDFS. `BackupableDBOptions` has an option `backup_env`, which is used for all file I/O related to the backup directory (writes when backing up, reads when restoring). If you set it to an HDFS Env, all backups will be stored in HDFS.
+
+`BackupableDBOptions::info_log` is a Logger object that is used to print out LOG messages if it is not nullptr.
+
+If `BackupableDBOptions::sync` is true, we sync data to disk after every file write, guaranteeing that backups will be consistent after a reboot or a machine crash. Setting it to false will speed things up a bit, but some (newer) backups might be inconsistent. In most cases, everything should be fine, though.
+
+If you set `BackupableDBOptions::destroy_old_data` to true, creating a new `BackupableDB` will delete all the old backups in the backup directory.
+
+The `BackupableDB::CreateNewBackup()` method takes a parameter `flush_before_backup`, which is false by default. When `flush_before_backup` is true, `BackupableDB` will first issue a memtable flush and only then copy the DB files to the backup directory. Doing so prevents log files from being copied to the backup directory (since the flush will delete them). If `flush_before_backup` is false, the backup will not issue a flush before starting. In that case, the backup will also include the log files corresponding to live memtables. The backup will be consistent with the current state of the database regardless of the `flush_before_backup` parameter.
+
+## Under the hood
+
+`BackupableDB` implements the `DB` interface and adds four methods to it: `CreateNewBackup()`, `GetBackupInfo()`, `PurgeOldBackups()`, `DeleteBackup()`. Any `DB` interface calls are forwarded to the underlying `DB` object. A sketch of these bookkeeping calls follows the list below.
+
+When you call `BackupableDB::CreateNewBackup()`, it does the following:
+
+ 1. Disable file deletions.
+
+ 2. Get live files (this includes table files and the current and manifest files).
+
+ 3. Copy live files to the backup directory. Since table files are immutable and filenames are unique, we don't copy a table file that is already present in the backup directory. For example, if the file `00050.sst` is already backed up and `GetLiveFiles()` returns `00050.sst`, we will not copy that file to the backup directory. However, a checksum is calculated for all files, regardless of whether a file needs to be copied. If a file is already present, the calculated checksum is compared against the previously calculated checksum to make sure nothing crazy happened between backups. If a mismatch is detected, the backup is aborted and the system is restored to the state before `BackupableDB::CreateNewBackup()` was called. One thing to note is that an aborted backup could indicate corruption of a file in the backup directory or of the corresponding live file in the current DB. Both the manifest and current files are copied, since they are not immutable.
+
+ 4. If `flush_before_backup` was set to false, we also need to copy log files to the backup directory. We call `GetSortedWalFiles()` and copy all live files to the backup directory.
+
+ 5. Enable file deletions.
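+As a concrete illustration of the bookkeeping calls above, here is a minimal sketch (class and method names are the ones described in this post; the pruning rule is a made-up example):
+
+    #include <vector>
+    #include "rocksdb/db.h"
+    #include "utilities/backupable_db.h"
+    using namespace rocksdb;
+
+    void TrimBackups(BackupableDB* backupable_db) {
+      std::vector<BackupInfo> backups;
+      backupable_db->GetBackupInfo(&backups);  // ID, timestamp and size of each backup
+      for (const BackupInfo& info : backups) {
+        if (info.size == 0) {                  // hypothetical app-specific rule
+          backupable_db->DeleteBackup(info.backup_id);
+        }
+      }
+      backupable_db->PurgeOldBackups(5);       // keep only the 5 newest backups
+    }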
+Backup IDs are always increasing, and we have a file `LATEST_BACKUP` that contains the ID of the latest backup. If we crash in the middle of backing up, on restart we will detect that there are newer backup files than `LATEST_BACKUP` claims there are. In that case, we will delete any backup newer than `LATEST_BACKUP` and clean up all the files, since some of the table files might be corrupted. Having corrupted table files in the backup directory is dangerous because of our deduplication strategy.
+
+## Further reading
+
+For the API details, see `include/utilities/backupable_db.h`. For the implementation, see `utilities/backupable/backupable_db.cc`.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown b/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown
new file mode 100644
index 0000000000..89ffb2d97e
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-03-27-how-to-persist-in-memory-rocksdb-database.markdown
@@ -0,0 +1,54 @@
+---
+title: How to persist in-memory RocksDB database?
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+ - /blog/245/how-to-persist-in-memory-rocksdb-database/
+---
+
+In recent months, we have focused on optimizing RocksDB for in-memory workloads. With growing RAM sizes and strict low-latency requirements, lots of applications decide to keep their entire data in memory. Running an in-memory database with RocksDB is easy -- just mount your RocksDB directory on tmpfs or ramfs [1]. Even if the process crashes, RocksDB can recover all of your data from the in-memory filesystem. However, what happens if the machine reboots?
+
+In this article we will explain how you can recover your in-memory RocksDB database even after a machine reboot.
+
+Every update to RocksDB is written to two places -- one is an in-memory data structure called the memtable, and the second is the write-ahead log. The write-ahead log can be used to completely recover the data in the memtable. By default, when we flush the memtable to a table file, we also delete the current log, since we don't need it anymore for recovery (the data from the log is "persisted" in the table file -- we say that the log file is obsolete). However, if your table file is stored in an in-memory file system, you may need the obsolete write-ahead log to recover the data after the machine reboots. Here's how you can do that.
+
+Options::wal_dir is the directory where RocksDB stores write-ahead log files. If you configure this directory to be on flash or disk, you will not lose the current log file on a machine reboot.
+Options::WAL_ttl_seconds is the timeout after which archived log files are deleted. If the timeout is non-zero, obsolete log files will be moved to the `archive/` directory under Options::wal_dir. Those archived log files will only be deleted after the specified timeout.
+
+Let's assume Options::wal_dir is a directory on persistent storage and Options::WAL_ttl_seconds is set to one day.
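+A minimal sketch of that configuration (the paths are examples; everything else is the API described above):
+
+    #include "rocksdb/db.h"
+    using namespace rocksdb;
+
+    Options options;
+    options.create_if_missing = true;
+    options.wal_dir = "/hard_disk/rocksdb_wal";  // current WAL survives reboots
+    options.WAL_ttl_seconds = 60 * 60 * 24;      // keep archived WALs for one day
+
+    DB* db;
+    Status s = DB::Open(options, "/tmpfs/rocksdb", &db);  // data files in memory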
+To fully recover the DB, we also need to back up the current snapshot of the database (containing the table and metadata files) more often than once a day. RocksDB provides a utility that enables you to easily back up a snapshot of your database. You can learn more about it here: [How to backup RocksDB?](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
+
+You should configure the backup process to avoid backing up log files, since they are already stored in persistent storage. To do that, set BackupableDBOptions::backup_log_files to false.
+
+The restore process by default cleans up the entire DB and WAL directory. Since we didn't include log files in the backup, we need to make sure that restoring the database doesn't delete the log files in the WAL directory. When restoring, set RestoreOptions::keep_log_file to true. That option will also move any archived log files back to the WAL directory, enabling RocksDB to replay all archived log files and rebuild the in-memory database state.
+
+To reiterate, here's what you have to do (a short sketch follows below):
+
+ * Set the DB directory to a tmpfs- or ramfs-mounted drive
+
+ * Set Options::wal_dir to a directory on persistent storage
+
+ * Set Options::WAL_ttl_seconds to T seconds
+
+ * Back up RocksDB every T/2 seconds, with BackupableDBOptions::backup_log_files = false
+
+ * When you lose data, restore from backup with RestoreOptions::keep_log_file = true
+
+[1] You might also want to consider using [PlainTable format](https://github.com/facebook/rocksdb/wiki/PlainTable-Format) for table files
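+Here is the sketch promised above, putting the backup and restore settings together (paths are examples; note that the header in this tree spells the restore flag `keep_log_files`):
+
+    #include "rocksdb/db.h"
+    #include "utilities/backupable_db.h"
+    using namespace rocksdb;
+
+    BackupableDBOptions backup_options("/hard_disk/rocksdb_backup");
+    backup_options.backup_log_files = false;  // WAL is already on persistent storage
+
+    RestoreOptions restore_options;
+    restore_options.keep_log_files = true;    // keep and replay archived WALs
+
+    RestoreBackupableDB restore(Env::Default(), backup_options);
+    restore.RestoreDBFromLatestBackup("/tmpfs/rocksdb", "/hard_disk/rocksdb_wal",
+                                      restore_options);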
diff --git a/thirdparty/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown b/thirdparty/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown
new file mode 100644
index 0000000000..7ccbdbaada
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-04-02-the-1st-rocksdb-local-meetup-held-on-march-27-2014.markdown
@@ -0,0 +1,53 @@
+---
+title: The 1st RocksDB Local Meetup Held on March 27, 2014
+layout: post
+author: xjin
+category: blog
+redirect_from:
+ - /blog/323/the-1st-rocksdb-local-meetup-held-on-march-27-2014/
+---
+
+On Mar 27, 2014, the RocksDB team @ Facebook held the 1st RocksDB local meetup at FB HQ (Menlo Park, California). We invited around 80 guests from 20+ local companies, including LinkedIn, Twitter, Dropbox, Square, Pinterest, MapR, Microsoft and IBM. In the end, around 50 guests showed up, a roughly 60% turnout.
+
+[![Resize of 20140327_200754](/static/images/Resize-of-20140327_200754-300x225.jpg)](/static/images/Resize-of-20140327_200754-300x225.jpg)
+
+The RocksDB team @ Facebook gave four talks about the latest progress of and experience with RocksDB:
+
+ * [Supporting a 1PB In-Memory Workload](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Haobo-RocksDB-In-Memory.pdf)
+
+ * [Column Families in RocksDB](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Igor-Column-Families.pdf)
+
+ * ["Lockless" Get() in RocksDB?](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf)
+
+ * [Prefix Hashing in RocksDB](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Siying-Prefix-Hash.pdf)
+
+A very interesting question asked by many guests was: does RocksDB plan to provide replication functionality? Obviously, many applications need a resilient and distributed storage solution, not just single-node storage. We are considering how to approach this issue.
+
+When will the next meetup be? We haven't decided yet. We will see whether the community is interested in it and how it can help RocksDB grow.
+
+If you have any questions or feedback for the meetup or RocksDB, please let us know in [our Facebook group](https://www.facebook.com/groups/rocksdb.dev/).
+
+### Comments
+
+**[Rajiv](geetasen@gmail.com)**
+
+Have any of these talks been recorded and if so will they be published?
+
+**[Igor Canadi](icanadi@fb.com)**
+
+Yes, I think we plan to publish them soon.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown
new file mode 100644
index 0000000000..7be7842a5f
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-04-07-rocksdb-2-8-release.markdown
@@ -0,0 +1,40 @@
+---
+title: RocksDB 2.8 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+ - /blog/371/rocksdb-2-8-release/
+---
+
+Check out the new RocksDB 2.8 release on [Github](https://github.com/facebook/rocksdb/releases/tag/2.8.fb).
+
+RocksDB 2.8 is mostly focused on improving performance for in-memory workloads. We are seeing read QPS as high as 5M (we will write a separate blog post on this).
+
+Here is the summary of new features:
+
+ * Added a new table format called PlainTable, which is optimized for RAM storage (ramfs or tmpfs). You can read more details about it on [our wiki](https://github.com/facebook/rocksdb/wiki/PlainTable-Format).
+
+ * New prefixed memtable format HashLinkedList, which is optimized for cases where there are only a few keys for each prefix.
+
+ * The merge operator supports a new function PartialMergeMulti() that allows users to do partial merges against multiple operands. This function enables big speedups for workloads that use merge operators.
+
+ * Added a V2 compaction filter interface. It buffers the kv-pairs sharing the same key prefix, processes them in batches, and returns the batched results back to the DB.
+
+ * Geo-spatial support for locations and radial search.
+
+ * Improved read performance using a thread-local cache for frequently accessed data.
+
+ * Stability improvements -- we now ignore partially written trailing records in MANIFEST or WAL files.
+
+We have also introduced small incompatible API changes (mostly for advanced users). You can see the full release notes in our [HISTORY.md](https://github.com/facebook/rocksdb/blob/2.8.fb/HISTORY.md) file.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown b/thirdparty/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown
new file mode 100644
index 0000000000..368055d2c2
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-04-21-indexing-sst-files-for-better-lookup-performance.markdown
@@ -0,0 +1,28 @@
+---
+title: Indexing SST Files for Better Lookup Performance
+layout: post
+author: leijin
+category: blog
+redirect_from:
+ - /blog/431/indexing-sst-files-for-better-lookup-performance/
+---
+
+For a `Get()` request, RocksDB goes through the mutable memtable, the list of immutable memtables, and the SST files to look up the target key. SST files are organized in levels.
+
+On level 0, files are sorted based on the time they are flushed.
+Their key ranges (as defined by FileMetaData.smallest and FileMetaData.largest) mostly overlap with each other, so a lookup needs to check every L0 file.
+
+Compaction is scheduled periodically to pick up files from an upper level and merge them with files from a lower level. As a result, key/values gradually move from L0 down the LSM tree. Compaction sorts key/values and splits them into files. From level 1 down, SST files are sorted based on key, and their key ranges are mutually exclusive. Instead of scanning through each SST file and checking whether a key falls into its range, RocksDB performs a binary search based on FileMetaData.largest to locate a candidate file that can potentially contain the target key. This reduces complexity from O(N) to O(log(N)). However, log(N) can still be large for bottom levels. With a fan-out ratio of 10, level 3 can have 1000 files, which requires 10 comparisons to locate a candidate file. This is a significant cost for an in-memory database when you can do [several million gets per second](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks).
+
+One observation about this problem is that once the LSM tree is built, an SST file's position in its level is fixed. Furthermore, its order relative to files in the next level is also fixed. Based on this idea, we can perform a [fractional cascading](http://en.wikipedia.org/wiki/Fractional_cascading) style of optimization to narrow down the binary search range. Here is an example:
+
+[![tree_example](/static/images/tree_example1.png)](/static/images/tree_example1.png)
+
+Level 1 has 2 files and level 2 has 8 files. Now, we want to look up key 80. A binary search based on FileMetaData.largest tells you file 1 is the candidate. Then key 80 is compared with its FileMetaData.smallest and FileMetaData.largest to decide whether it falls into the range. The comparison shows 80 is less than FileMetaData.smallest (100), so file 1 cannot contain key 80. We proceed to check level 2. Usually, we would need to do a binary search among all 8 files on level 2. But since we already know the target key 80 is less than 100, and only files 1 to 3 can contain keys less than 100, we can safely exclude the other files from the search. As a result, we cut the search space down from 8 files to 3 files.
+
+Let's look at another example. We want to get key 230. A binary search on level 1 locates file 2 (this also implies key 230 is larger than file 1's FileMetaData.largest, 200). A comparison with file 2's range shows the target key is smaller than file 2's FileMetaData.smallest, 300. Even though we couldn't find the key on level 1, we have derived the hint that the target key is in the range between 200 and 300. Any files on level 2 that cannot overlap with [200, 300] can be safely excluded. As a result, we only need to look at files 5 and 6 on level 2.
+
+Inspired by this concept, we pre-build pointers at compaction time on level 1 files that point to a range of files on level 2. For example, file 1 on level 1 points to file 3 (on level 2) on the left and file 4 on the right. File 2 points to level 2 files 6 and 7. At query time, these pointers are used to determine the actual binary search range based on the comparison result.
+
+Our benchmark shows that this optimization improves lookup QPS by ~5% for a setup similar to the one mentioned [here](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks).
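+A simplified sketch of the pointer-narrowed search (FileMeta and the left/right fields are illustrative stand-ins, not RocksDB's actual FileMetaData/FileIndexer code, and only the bracketing file's pointers are used):
+
+    #include <algorithm>
+    #include <cstdint>
+    #include <vector>
+
+    struct FileMeta {
+      uint64_t smallest = 0, largest = 0;
+      size_t next_left = 0, next_right = 0;  // next-level range, set at compaction
+    };
+
+    // First file in [lo, hi) whose largest >= key; returns hi if there is none.
+    size_t LowerBoundFile(const std::vector<FileMeta>& level, uint64_t key,
+                          size_t lo, size_t hi) {
+      return std::lower_bound(level.begin() + lo, level.begin() + hi, key,
+                              [](const FileMeta& f, uint64_t k) { return f.largest < k; })
+             - level.begin();
+    }
+
+    // Two-level lookup: the binary search on level 2 is restricted to the
+    // range the level-1 candidate recorded at compaction time.
+    const FileMeta* FindCandidate(const std::vector<FileMeta>& level1,
+                                  const std::vector<FileMeta>& level2, uint64_t key) {
+      size_t i1 = LowerBoundFile(level1, key, 0, level1.size());
+      if (i1 < level1.size() && key >= level1[i1].smallest) return &level1[i1];
+      size_t lo = 0, hi = level2.size();  // fall back to the whole level
+      if (i1 < level1.size()) { lo = level1[i1].next_left; hi = level1[i1].next_right; }
+      size_t i2 = LowerBoundFile(level2, key, lo, hi);
+      if (i2 < hi && key >= level2[i2].smallest) return &level2[i2];
+      return nullptr;
+    }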
diff --git a/thirdparty/rocksdb/docs/_posts/2014-05-14-lock.markdown b/thirdparty/rocksdb/docs/_posts/2014-05-14-lock.markdown
new file mode 100644
index 0000000000..12009cc88c
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-05-14-lock.markdown
@@ -0,0 +1,88 @@
+---
+title: Reducing Lock Contention in RocksDB
+layout: post
+author: sdong
+category: blog
+redirect_from:
+ - /blog/521/lock/
+---
+
+In this post, we briefly introduce recent improvements we made to RocksDB to reduce the cost of lock contention.
+
+RocksDB has a simple thread synchronization mechanism (see the [RocksDB Architecture Guide](https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide) to understand terms used below, like SST tables or mem tables). SST tables are immutable after being written, and mem tables are lock-free data structures supporting a single writer and multiple readers. There is only one major lock, the DB mutex (DBImpl.mutex_), protecting all the meta operations, including:
+
+ * Increasing or decreasing reference counters of mem tables and SST tables
+
+ * Changing and checking metadata structures, before and after finishing compactions, flushes and new mem table creations
+
+ * Coordinating writers
+
+This DB mutex used to be a scalability bottleneck preventing us from scaling to more than 16 threads. To address the issue, we improved RocksDB in several ways.
+
+1. Consolidate reference counters and introduce the "super version". For every read operation, the mutex was acquired, and the reference counters for each mem table and each SST table were increased. One such operation is not expensive, but if you are building a high-throughput server with lots of reads, the lock contention becomes the bottleneck. This is especially true if you store all your data in RAM.
+
+To solve this problem, we created a meta-metadata structure called the "[super version](https://reviews.facebook.net/rROCKSDB1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c)", which holds reference counters to all those mem tables and SST tables, so that readers only need to increase the reference counter for this single data structure. In RocksDB, the list of live mem tables and SST tables changes only infrequently, namely when new mem tables are created or a flush/compaction happens. At those times, a new super version is created with the reference counters of its tables increased. A super version lists the live mem tables and SST tables, so a reader only needs to acquire the lock in order to find the latest super version and increase its reference counter. From the super version, the reader can find all the mem and SST tables, which are safely accessible as long as the reader holds the reference count for the super version.
+
+2. We replaced some reference counters with std::atomic objects, so that decreasing the reference count of an object usually doesn't need to happen inside the mutex any more. A sketch of this pattern follows below.
+
+3. Make fetching the super version and reference counting lock-free in read queries. After consolidating reference counting into one single super version and removing the locking for decreasing reference counts, in the read case we only acquire the mutex for one thing: fetching the latest super version and increasing its reference count (dereferencing the counter is done with an atomic decrement). We designed and implemented a (mostly) lock-free approach to do it. See [details](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf). We will write a separate blog post about it.
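+A minimal sketch of the idea in item 2 (illustrative only, not RocksDB's actual reference-counting code):
+
+    #include <atomic>
+    #include <cstdint>
+
+    struct RefCounted {
+      std::atomic<uint32_t> refs{1};
+
+      void Ref() { refs.fetch_add(1, std::memory_order_relaxed); }
+
+      // Returns true when the last reference was dropped and the caller
+      // must free the object; the decrement itself needs no mutex.
+      bool Unref() { return refs.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+    };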
+4. Avoid disk I/O inside the mutex. Each disk I/O to a hard drive takes several milliseconds, and it can take even longer if file system journaling is involved or I/Os are queued. Even occasional disk I/O within the mutex can cause huge performance outliers.
+We identified two situations in which we might do disk I/O inside the mutex, and we removed them:
+(1) Opening and closing transactional log files. We moved those operations out of the mutex.
+(2) Information logging. In multiple places we wrote to logs within the mutex. There is a chance that a file write will wait for disk I/O to finish, even if fsync() is not issued, especially on EXT file systems. We occasionally saw 100+ millisecond write() latency on EXT. Instead of removing that logging, we came up with a delayed-logging solution: while inside the mutex, instead of writing directly to the log file, we write to a log buffer, along with timing information. As soon as the mutex is released, we flush the log buffer to the log files.
+
+5. Reduce object creation inside the mutex.
+Object creation can be slow because it involves malloc (in our case). Malloc is sometimes slow because it needs to lock some shared data structures. Allocation can also be slow because we sometimes do expensive operations in some of our classes' constructors. For these reasons, we try to reduce object creation inside the mutex. Here are two examples:
+
+(1) std::vector uses malloc internally. We introduced the "[autovector](https://reviews.facebook.net/rROCKSDBc01676e46d3be08c3c140361ef1f5884f47d3b3c)" data structure, in which memory for the first few elements is pre-allocated as members of the autovector class (see the sketch below). When an autovector is used as a stack variable, no malloc is needed unless the pre-allocated buffer is used up. This autovector is quite useful for manipulating the metadata structures, whose operations are often locked inside the DB mutex.
+
+(2) When building an iterator, we used to create an iterator for every live mem table and SST table within the mutex, plus a merging iterator on top of them. Besides the mallocs, some of those iterators can be quite expensive to create, due to operations like sorting. Now, instead of doing that, we simply increase their reference counters and release the mutex before creating any iterator.
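+A sketch of the autovector idea (illustrative only; RocksDB's real autovector is more complete):
+
+    #include <cstddef>
+    #include <vector>
+
+    template <typename T, size_t kInline = 8>
+    class AutoVectorSketch {
+     public:
+      void push_back(const T& v) {
+        if (size_ < kInline) {
+          inline_[size_] = v;       // no malloc on this path
+        } else {
+          overflow_.push_back(v);   // heap storage only after the buffer fills
+        }
+        ++size_;
+      }
+      const T& operator[](size_t i) const {
+        return i < kInline ? inline_[i] : overflow_[i - kInline];
+      }
+      size_t size() const { return size_; }
+
+     private:
+      T inline_[kInline] = {};      // pre-allocated member storage
+      std::vector<T> overflow_;
+      size_t size_ = 0;
+    };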
+6. Deal with mutexes in LRU caches.
+When I said there was only one major lock, I was lying. In RocksDB, all LRU caches had exclusive mutexes within them to protect writes to the LRU lists, which happen in both read and write operations. LRU caches are used in the block cache and the table cache. Both of them are accessed more frequently than the DB data structures, and contention on these two locks is as intense as on the DB mutex. Even though an LRU cache is sharded into a ShardedLRUCache, we could still see lock contention, especially in table caches. We further addressed this issue in two ways:
+(1) Bypassing table caches. A table cache maintains a list of SST tables' read handlers. Those handlers contain the SST files' descriptors, table metadata, and possibly data indexes, as well as bloom filters. When a table handler needs to be evicted based on LRU, this information is cleared. When an SST table needs to be read and its table handler is not in the LRU cache, the table is opened and the metadata is loaded. In some cases, users want to tune the system so that table handler evictions never happen, which is common for high-throughput, low-latency servers. We introduced a mode in which the table cache is bypassed in read queries. In this mode, all table handlers are cached and accessed directly, so there is no need to query and adjust table caches when reading the database. It is the users' responsibility to reserve enough resources for it. This mode can be turned on by setting options.max_open_files=-1.
+
+(2) The [new PlainTable format](//github.com/facebook/rocksdb/wiki/PlainTable-Format) (optimized for SSTs in ramfs/tmpfs) does not organize data by blocks. Data is located by memory address, so no block cache is needed.
+
+With all of these improvements, lock contention is no longer a bottleneck, as shown in our [memory-only benchmark](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks). Furthermore, lock contention no longer causes the huge (50+ millisecond) latency outliers it used to cause.
+
+### Comments
+
+**[Lee Hounshell](lee@apsalar.com)**
+
+Please post an example of reading the same rocksdb concurrently.
+
+We are using the latest 3.0 rocksdb; however, when two separate processes
+try and open the same rocksdb for reading, only one of the open requests
+succeed. The other open always fails with "db/LOCK: Resource temporarily unavailable" So far we have not found an option that allows sharing the rocksdb for reads. An example would be most appreciated.
+
+**[Siying Dong](siying.d@fb.com)**
+
+Sorry for the delay. We don't have feature support for this scenario yet. Here is how you can work around this problem. You can build a snapshot of the DB by doing this:
+
+1. create a separate directory on the same host for a snapshot of the DB.
+1. call `DB::DisableFileDeletions()`
+1. call `DB::GetLiveFiles()` to get a full list of the files.
+1. for all the files except the manifest, add a hardlink file in your new directory pointing to the original file
+1. copy the manifest file and truncate the size (you can read the comments of `DB::GetLiveFiles()` for more information)
+1. call `DB::EnableFileDeletions()`
+1. now you can open the snapshot directory in another process to access those files. Please remember to delete the directory after reading the data to allow those files to be recycled.
+
+By the way, the best way to ask these questions is in our [facebook group](https://www.facebook.com/groups/rocksdb.dev/). Let us know if you need any further help.
+
+**[Darshan](darshan.ghumare@gmail.com)**
+
+Will this consistency problem of RocksDB all occurs in case of single put/write?
+What all ACID properties is supported by RocksDB, only durability irrespective of single or batch write?
+
+**[Siying Dong](siying.d@fb.com)**
+
+We recently [introduced optimistic transactions](https://reviews.facebook.net/D33435), which can help you ensure all of ACID.
+
+This blog post is mainly about optimizations in the implementation. The RocksDB consistency semantics are not changed.
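+To make the workaround from the comments above concrete, here is a rough sketch (error handling is omitted, the manifest copy-and-truncate step is skipped, and the exact signatures should be checked against your headers):
+
+    #include <string>
+    #include <vector>
+    #include <unistd.h>  // link()
+    #include "rocksdb/db.h"
+    using namespace rocksdb;
+
+    Status HardlinkSnapshot(DB* db, const std::string& db_dir,
+                            const std::string& snap_dir) {
+      db->DisableFileDeletions();
+      std::vector<std::string> files;  // paths relative to db_dir, with leading '/'
+      uint64_t manifest_size = 0;
+      db->GetLiveFiles(files, &manifest_size, /*flush_memtable=*/true);
+      for (const std::string& f : files) {
+        // Hard-link each live file; the manifest would instead be copied
+        // and truncated to manifest_size (see DB::GetLiveFiles() comments).
+        link((db_dir + f).c_str(), (snap_dir + f).c_str());
+      }
+      return db->EnableFileDeletions();
+    }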
diff --git a/thirdparty/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown
new file mode 100644
index 0000000000..61c90dc936
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-05-19-rocksdb-3-0-release.markdown
@@ -0,0 +1,24 @@
+---
+title: RocksDB 3.0 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+ - /blog/557/rocksdb-3-0-release/
+---
+
+Check out the new RocksDB release on [Github](https://github.com/facebook/rocksdb/releases/tag/3.0.fb)!
+
+New features in RocksDB 3.0:
+
+ * [Column Family support](https://github.com/facebook/rocksdb/wiki/Column-Families)
+
+ * [Ability to choose a different checksum function](https://github.com/facebook/rocksdb/commit/0afc8bc29a5800e3212388c327c750d32e31f3d6)
+
+ * Deprecated ReadOptions::prefix_seek and ReadOptions::prefix
+
+Check out the full [change log](https://github.com/facebook/rocksdb/blob/3.0.fb/HISTORY.md).
diff --git a/thirdparty/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown
new file mode 100644
index 0000000000..30156742b2
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-05-22-rocksdb-3-1-release.markdown
@@ -0,0 +1,20 @@
+---
+title: RocksDB 3.1 release
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+ - /blog/575/rocksdb-3-1-release/
+---
+
+Check out the new release on [Github](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.1)!
+
+New features in RocksDB 3.1:
+
+ * [Materialized hash index](https://github.com/facebook/rocksdb/commit/0b3d03d026a7248e438341264b4c6df339edc1d7)
+
+ * [FIFO compaction style](https://github.com/facebook/rocksdb/wiki/FIFO-compaction-style)
+
+We released 3.1 so soon after 3.0 because one of our internal customers needed the materialized hash index.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown b/thirdparty/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
new file mode 100644
index 0000000000..6a641f2335
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-06-23-plaintable-a-new-file-format.markdown
@@ -0,0 +1,47 @@
+---
+title: PlainTable — A New File Format
+layout: post
+author: sdong
+category: blog
+redirect_from:
+ - /blog/599/plaintable-a-new-file-format/
+---
+
+In this post, we are introducing "PlainTable" -- a file format we designed for RocksDB, initially to satisfy a production use case at Facebook.
+
+Design goals:
+
+1. All data stored in memory, in files stored in tmpfs/ramfs. Support DBs larger than 100GB (may be sharded across multiple RocksDB instances).
+1. Optimize for [prefix hashing](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Siying-Prefix-Hash.pdf)
+1. Less than or around 1 microsecond average latency for a single Get() or Seek().
+1. Minimize memory consumption.
+1. Queries efficiently return empty results
+
+Notice that our priority was not to maximize query performance, but to strike a balance between query performance and memory consumption. PlainTable query performance is not as good as you would see with a nicely designed hash table, but it is of the same order of magnitude, while keeping memory overhead to a minimum.
+
+Since we are targeting microsecond latency, the budget is on the level of the number of CPU cache misses (if they cannot be parallelized, which is usually the case for index lookups). On our target hardware, multi-socket Intel CPUs with NUMA, we can only allow 4-5 CPU cache misses (including the costs of data TLB misses).
+
+To meet our requirements, given that only hash prefix iterating is needed, we made two decisions:
+
+1. to use a hash index, which is
+1. directly addressed to rows, with no block structure.
+
+Having addressed our latency goal, the next task was to design a very compact hash index to minimize memory consumption. Some tricks we used to meet this goal:
+1. We only use 32-bit integers for data and index offsets. The first bit serves as a flag, so we can avoid using 8-byte pointers.
+1. We never copy keys or parts of keys into the index search structures. We store only the offsets from which keys can be retrieved, to make comparisons with search keys.
+1. Since our file is immutable, we can accurately estimate the number of hash buckets needed.
+
+To make sure the format works efficiently for empty queries, we added a bloom filter check before the query. This adds only one cache miss for non-empty cases [1], but avoids multiple cache misses for most queries with empty results. This is a good trade-off for use cases with a large percentage of empty results.
+
+These are the design goals and basic ideas of the PlainTable file format. For detailed information, see [this wiki page](https://github.com/facebook/rocksdb/wiki/PlainTable-Format).
+
+[1] Bloom filter checks typically require multiple memory accesses. However, because they are independent, they usually do not stall the CPU pipeline. In any case, we improved the bloom filter to improve data locality -- we may cover this further in a future blog post.
+
+### Comments
+
+**[Siying Dong](siying.d@fb.com)**
+
+Does [http://rocksdb.org/feed/](http://rocksdb.org/feed/) work?
diff --git a/thirdparty/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown b/thirdparty/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown
new file mode 100644
index 0000000000..4411c7ae31
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-06-27-avoid-expensive-locks-in-get.markdown
@@ -0,0 +1,89 @@
+---
+title: Avoid Expensive Locks in Get()
+layout: post
+author: leijin
+category: blog
+redirect_from:
+ - /blog/677/avoid-expensive-locks-in-get/
+---
+
+As promised in the previous [blog post](blog/2014/05/14/lock.html)!
+
+RocksDB employs a multi-version concurrency control strategy. Before reading data, it needs to grab the current version, which is encapsulated in a data structure called [SuperVersion](https://reviews.facebook.net/rROCKSDB1fdb3f7dc60e96394e3e5b69a46ede5d67fb976c).
+
+At the beginning of `GetImpl()`, it used to do this:
+
+    mutex_.Lock();
+    auto* s = super_version_->Ref();
+    mutex_.Unlock();
+
+The lock is necessary because the pointer super_version_ may be updated, and the corresponding SuperVersion deleted, while Ref() is in progress.
+
+`Ref()` simply increases the reference counter and returns the "this" pointer. However, this simple operation posed big challenges for in-memory workloads and stopped RocksDB from scaling read throughput beyond 8 cores. Running 32 read threads on a 32-core CPU leads to [70% system CPU usage](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf). This is outrageous!
+
+Luckily, we found a way to circumvent this problem by using [thread-local storage](http://en.wikipedia.org/wiki/Thread-local_storage). A version change is a rare event compared to millions of read requests. On the very first Get() request, each thread pays the mutex cost to acquire a reference to the new super version. Instead of releasing the reference after use, the reference is cached in the thread's local storage. An atomic variable tracks the global super version number. Subsequent reads simply compare the local super version number against the global super version number. If they are the same, the cached super version reference may be used directly, at no cost.
+If a version change is detected, the mutex must be acquired to update the reference. The cost of the mutex lock is amortized over millions of reads and becomes negligible.
+
+The code looks something like this:
+
+    SuperVersion* s = thread_local_->Get();
+    if (s->version_number != super_version_number_.load()) {
+      // slow path, cleanup of current super version is omitted
+      mutex_.Lock();
+      s = super_version_->Ref();
+      mutex_.Unlock();
+    }
+
+The result is quite amazing. RocksDB can nicely [scale to 32 cores](https://github.com/facebook/rocksdb/raw/gh-pages/talks/2014-03-27-RocksDB-Meetup-Lei-Lockless-Get.pdf) and most CPU time is spent in user land.
+
+Daryl Grove gives a pretty good [comparison between mutex and atomic](https://blogs.oracle.com/d/entry/the_cost_of_mutexes). However, the real cost difference lies beyond what is shown in the assembly code. A mutex can keep threads spinning on the CPU or even trigger thread context switches, in which all readers compete to access the critical area. Our approach prevents this mutual competition by directing threads to check against a global version which does not change at high frequency, and is therefore much more cache-friendly.
+
+The new approach entails one issue: a thread can visit GetImpl() once and never come back again. The SuperVersion it referenced stays cached in its thread-local storage, and all resources (e.g., memtables, files) which belong to that version are frozen. A "supervisor" is required to visit each thread's local storage and free the resources without incurring a lock. We designed a lock-free sweep using CAS (the compare-and-swap instruction). Here is how it works (a sketch of the acquire/release handoff follows the steps):
+
+(1) A reader thread uses CAS to acquire the SuperVersion from its local storage, leaving a special flag (SuperVersion::kSVInUse) in its place.
+
+(2) Upon completion of GetImpl(), the reader thread tries to return the SuperVersion to local storage via CAS, expecting the special flag (SuperVersion::kSVInUse) in its local storage. If it does not see SuperVersion::kSVInUse, that means a "sweep" was done and the reader thread is responsible for cleanup (this is expensive, but does not happen often on the hot path).
+
+(3) After any flush/compaction, the background thread performs a sweep (CAS) across all threads' local storage and frees each SuperVersion it encounters. A reader thread must re-acquire a new SuperVersion reference on its next visit.
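+Here is the sketch mentioned above of the per-thread acquire/release handoff (sentinel values and helper names are illustrative, not the actual RocksDB code):
+
+    #include <atomic>
+
+    struct SuperVersion {
+      std::atomic<int> refs{1};
+      bool Unref() { return refs.fetch_sub(1) == 1; }  // true: caller cleans up
+    };
+
+    static SuperVersion* const kSVInUse = reinterpret_cast<SuperVersion*>(0x1);
+    static SuperVersion* const kSVObsolete = nullptr;
+
+    SuperVersion* Acquire(std::atomic<SuperVersion*>& slot) {
+      // Take the cached reference, leaving the "in use" flag behind.
+      SuperVersion* sv = slot.exchange(kSVInUse);
+      if (sv == kSVObsolete) {
+        // Slow path: a sweep invalidated the cache; fall back to taking the
+        // DB mutex and referencing the current super version (omitted).
+      }
+      return sv;
+    }
+
+    void Release(std::atomic<SuperVersion*>& slot, SuperVersion* sv) {
+      SuperVersion* expected = kSVInUse;
+      // Put the reference back only if no sweep replaced the flag meanwhile;
+      // otherwise this thread is responsible for the (rare) cleanup.
+      if (!slot.compare_exchange_strong(expected, sv) && sv->Unref()) {
+        delete sv;  // free the frozen resources
+      }
+    }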
+### Comments
+
+**[David Barbour](dmbarbour@gmail.com)**
+
+Please post an example of reading the same rocksdb concurrently.
+
+We are using the latest 3.0 rocksdb; however, when two separate processes
+try and open the same rocksdb for reading, only one of the open requests
+succeed. The other open always fails with "db/LOCK: Resource temporarily unavailable" So far we have not found an option that allows sharing the rocksdb for reads. An example would be most appreciated.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown
new file mode 100644
index 0000000000..e4eba6af4b
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-06-27-rocksdb-3-2-release.markdown
@@ -0,0 +1,30 @@
+---
+title: RocksDB 3.2 release
+layout: post
+author: leijin
+category: blog
+redirect_from:
+ - /blog/647/rocksdb-3-2-release/
+---
+
+Check out the new RocksDB release on [GitHub](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.2)!
+
+New Features in RocksDB 3.2:
+
+ * PlainTable now supports a new key encoding: for keys with the same prefix, the prefix is only written once. It can be enabled through the encoding_type parameter of NewPlainTableFactory()
+
+ * Added AdaptiveTableFactory, which is used to convert a DB from PlainTable to BlockBasedTable, or vice versa. It can be created using NewAdaptiveTableFactory()
+
+Public API changes:
+
+ * We removed seek compaction as a concept from RocksDB
+
+ * Added two parameters to NewHashLinkListRepFactory() for logging when there are too many entries in a hash bucket during flushing
+
+ * Added a new option BlockBasedTableOptions::hash_index_allow_collision. When enabled, the prefix hash index for block-based tables will not store the prefix and will allow hash collisions, reducing memory consumption
diff --git a/thirdparty/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown
new file mode 100644
index 0000000000..d858e4fafe
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-07-29-rocksdb-3-3-release.markdown
@@ -0,0 +1,34 @@
+---
+title: RocksDB 3.3 Release
+layout: post
+author: yhciang
+category: blog
+redirect_from:
+ - /blog/1301/rocksdb-3-3-release/
+---
+
+Check out the new RocksDB release on [GitHub](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.3)!
+
+New Features in RocksDB 3.3:
+
+ * **JSON API prototype**.
+
+ * **Performance improvement on HashLinkList**: We addressed a performance outlier in HashLinkList, caused by skewed buckets, by switching the data in a bucket from a linked list to a skip list. Added the parameter threshold_use_skiplist to NewHashLinkListRepFactory().
+
+ * **More effective storage space reclamation**: RocksDB is now able to reclaim storage space more effectively during the compaction process. This is done by compensating the size of each deletion entry with 2X the average value size, which makes compaction more easily triggered by deletion entries.
+
+ * **Timeout API for writes**: WriteOptions now have a variable called timeout_hint_us. When timeout_hint_us is set to non-zero, any write associated with it may be aborted when it runs longer than the specified timeout_hint_us, and it is guaranteed that any write completing earlier than the specified timeout will not be aborted due to the timeout condition.
+
+ * **rate_limiter option**: We added an option that controls the total throughput of flushes and compactions. The throughput is specified in bytes/sec. Flushes always have precedence over compactions when the available bandwidth is constrained.
+
+Public API changes:
+
+ * Removed NewTotalOrderPlainTableFactory because it was not used and was implemented semantically incorrectly.
diff --git a/thirdparty/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown b/thirdparty/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown
new file mode 100644
index 0000000000..22178f7cac
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-09-12-cuckoo.markdown
@@ -0,0 +1,74 @@
+---
+title: Cuckoo Hashing Table Format
+layout: post
+author: radheshyam
+category: blog
+redirect_from:
+ - /blog/1427/new-bloom-filter-format/
+---
+
+## Introduction
+
+We recently introduced a new [Cuckoo Hashing](http://en.wikipedia.org/wiki/Cuckoo_hashing)-based SST file format which is optimized for fast point lookups. The new format was built for applications which require very high point lookup rates (~4 Mqps) in read-only mode but do not use operations like range scan, merge operator, etc.
+However, the existing RocksDB file formats were built to support range scans and other operations, and the current best point lookup rate in RocksDB is 1.2 Mqps, achieved by the [PlainTable format](https://github.com/facebook/rocksdb/wiki/PlainTable-Format). This prompted a hashing-based file format, which we present here. The new table format uses a cache-friendly version of the Cuckoo Hashing algorithm with only 1 or 2 memory accesses per lookup.
+
+Goals:
+
+ * Reduce memory accesses per lookup to 1 or 2
+
+ * Achieve an end-to-end point lookup rate of at least 4 Mqps
+
+ * Minimize database size
+
+Assumptions:
+
+ * Key length and value length are fixed
+
+ * The database is operated in read-only mode
+
+Non-goals:
+
+ * While optimizing the performance of the Get() operation was our primary goal, compaction and build times were secondary. We may work on improving them in the future.
+
+Details for setting up the table format can be found on [GitHub](https://github.com/facebook/rocksdb/wiki/CuckooTable-Format).
+
+## Cuckoo Hashing Algorithm
+
+In order to achieve high lookup speeds, we made multiple optimizations, including a cache-friendly cuckoo hashing algorithm. Cuckoo Hashing uses multiple hash functions, h1, ..., hn.
+
+### Original Cuckoo Hashing
+
+To insert any new key k, we compute the hashes of the key, h1(k), ..., hn(k). We insert the key into the first hash location that is free. If all the locations are occupied, we try to move one of the colliding keys to a different location by re-inserting it.
+
+Finding the smallest set of keys to displace in order to accommodate the new key is naturally a shortest-path problem in a directed graph, where nodes are buckets of the hash table and there is an edge from bucket A to bucket B if the element stored in bucket A can be accommodated in bucket B using one of the hash functions. The source nodes are the possible hash locations for the given key k, and the destination is any one of the empty buckets. We use this algorithm to handle collisions.
+
+To retrieve a key k, we compute the hashes h1(k), ..., hn(k), and the key must be present in one of these locations.
+
+Our goal is to minimize the average (and maximum) number of hash functions required, and hence the number of memory accesses. In our experiments, with a hash utilization of 90%, we found that the average number of lookups is 1.8 and the maximum is 3. Around 44% of keys are accommodated in the first hash location and 33% in the second.
+
+### Cache-Friendly Cuckoo Hashing
+
+We noticed the following two sub-optimal properties in the original Cuckoo implementation:
+
+ * If the key is not present in the first hash location, we jump to the second hash location, which may not be in cache. This results in many cache misses.
+
+ * Because only 44% of keys are located in the first cuckoo block, we couldn't have an optimal prefetching strategy - prefetching all hash locations for a key is wasteful, while prefetching only the first hash location helps in only 44% of cases.
+
+The solution is to insert more keys near the first location. In case of a collision in the first hash location, h1(k), we try to insert the key in the next few buckets, h1(k)+1, h1(k)+2, ..., h1(k)+t-1. If all of these t locations are occupied, we skip over to the next hash function h2 and repeat the process. We call the set of t buckets a Cuckoo Block. We choose t such that the size of a block is not bigger than a cache line, and we prefetch the first cuckoo block.
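+A toy sketch of block-based probing (two hash functions and a block of t buckets; the names and the missing displacement path are simplifications, not the RocksDB implementation):
+
+    #include <cstdint>
+    #include <functional>
+    #include <string>
+    #include <vector>
+
+    struct CuckooBlockTable {
+      static const uint32_t kBlockSize = 4;  // t: buckets probed per hash function
+      static const uint32_t kNumHashes = 2;  // h1, h2
+      std::vector<std::string> buckets;
+
+      explicit CuckooBlockTable(uint32_t n) : buckets(n) {}
+
+      uint32_t Hash(const std::string& key, uint32_t i) const {
+        return std::hash<std::string>{}(key + static_cast<char>('0' + i)) % buckets.size();
+      }
+
+      bool Insert(const std::string& key) {
+        for (uint32_t i = 0; i < kNumHashes; ++i) {
+          uint32_t base = Hash(key, i);
+          for (uint32_t j = 0; j < kBlockSize; ++j) {  // fill the cuckoo block first
+            std::string& slot = buckets[(base + j) % buckets.size()];
+            if (slot.empty()) { slot = key; return true; }
+          }
+        }
+        return false;  // a real implementation would displace colliding keys here
+      }
+
+      bool Lookup(const std::string& key) const {
+        for (uint32_t i = 0; i < kNumHashes; ++i) {
+          uint32_t base = Hash(key, i);                // the first block is prefetched
+          for (uint32_t j = 0; j < kBlockSize; ++j) {
+            if (buckets[(base + j) % buckets.size()] == key) return true;
+          }
+        }
+        return false;
+      }
+    };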
+
+
+With the new algorithm, for 90% hash utilization, we found that 85% of keys are accommodated in the first Cuckoo Block. Prefetching the first cuckoo block yields the best results. For a database of 100 million keys with key length 8 and value length 4, the hash algorithm alone can achieve 9.6 Mqps, and we are working on improving it further. End-to-end RocksDB performance results can be found [here](https://github.com/facebook/rocksdb/wiki/CuckooTable-Format).
diff --git a/thirdparty/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown b/thirdparty/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown
new file mode 100644
index 0000000000..96fa50a401
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-09-12-new-bloom-filter-format.markdown
@@ -0,0 +1,52 @@
+---
+title: New Bloom Filter Format
+layout: post
+author: zagfox
+category: blog
+redirect_from:
+  - /blog/1367/cuckoo/
+---
+
+## Introduction
+
+In this post, we are introducing "full filter block" --- a new bloom filter format for [block based table](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format). This can bring about a 40% improvement for key queries under an in-memory workload (all data stored in memory, files stored in tmpfs/ramfs; see an [example](https://github.com/facebook/rocksdb/wiki/RocksDB-In-Memory-Workload-Performance-Benchmarks) workload). The main idea behind it is to generate one big filter that covers all the keys in the SST file, to avoid lots of unnecessary memory lookups.
+
+
+
+
+## What is Bloom Filter
+
+In brief, a [bloom filter](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter) is a bit array generated for a set of keys that can tell whether an arbitrary key may exist in that set.
+
+In RocksDB, we generate such a bloom filter for each SST file. When we conduct a query for a key, we first go to the bloom filter block of the SST file. If the key may exist according to the filter, we go into the data block of the SST file to search for the key. If not, we return directly. So it can speed up point lookup operations a lot.
+
+## Original Bloom Filter Format
+
+The original bloom filter format creates a filter for each individual data block in the SST file. It has a complex structure (ref [here](https://github.com/facebook/rocksdb/wiki/Rocksdb-BlockBasedTable-Format#filter-meta-block)) which results in a lot of non-adjacent memory lookups.
+
+Here's the workflow for checking the original bloom filter in a block based table:
+
+1. Given the target key, we go to the index block to get the "data block ID" where this key may reside.
+1. Using the "data block ID", we go to the filter block and get the correct "offset of filter".
+1. Using the "offset of filter", we go to the actual filter and do the checking.
+
+## New Bloom Filter Format
+
+The new bloom filter format creates one filter for all keys in the SST file, and we name it "full filter". The data structure of the full filter is very simple; there is just one big filter:
+
+    [ full filter ]
+
+In this way, the workflow of bloom filter checking is much simplified.
+
+(1) Given the target key, we go directly to the filter block and do the filter checking.
+
+To be specific, there is no checking of the index block and no address jumping inside the filter block.
+
+Though it is one big filter, the total filter size would be the same as with the original format.
+
+One little drawback is that the new bloom filter introduces more memory consumption when building an SST file, because we need to buffer the keys (or their hashes) before generating the filter. The original format just creates a bunch of small filters, so it only buffers a small number of keys at a time. For the full filter, we buffer the hashes of all keys, which takes more memory as the SST file size increases.
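+
+As a rough illustration of how a user would opt into the full filter, here is a sketch using the public option names from `rocksdb/filter_policy.h` and `rocksdb/table.h` (the 10 bits per key is just an example value):
+
+```c++
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+rocksdb::Options MakeOptionsWithFullFilter() {
+  rocksdb::BlockBasedTableOptions table_options;
+  // The second argument selects the filter builder: false asks for the
+  // full filter instead of the original per-block filters.
+  table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false));
+
+  rocksdb::Options options;
+  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
+  return options;
+}
+```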
+
+
+## Usage & Customization
+
+You can refer to the documentation here for [usage](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#usage-of-new-bloom-filter) and [customization](https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#customize-your-own-filterpolicy).
diff --git a/thirdparty/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown b/thirdparty/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown
new file mode 100644
index 0000000000..1878a5a567
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2014-09-15-rocksdb-3-5-release.markdown
@@ -0,0 +1,38 @@
+---
+title: RocksDB 3.5 Release!
+layout: post
+author: leijin
+category: blog
+redirect_from:
+  - /blog/1547/rocksdb-3-5-release/
+---
+
+New RocksDB release - 3.5!
+
+
+**New Features**
+
+
+ 1. Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of a WriteBatch while building it.
+
+
+ 2. New ReadOptions.total_order_seek option to force a total order seek when a block-based table is built with a hash index.
+
+
+
+**Public API changes**
+
+
+ 1. The prefix extractor used with V2 compaction filters is now passed the user key to SliceTransform::Transform, instead of the unparsed RocksDB key.
+
+
+ 2. Moved BlockBasedTable-related options from Options to BlockBasedTableOptions and changed the corresponding JNI interface. Options affected include: no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to a shared_ptr from a raw pointer.
+
+
+ 3. Removed deprecated options: disable_seek_compaction and db_stats_log_interval
+
+
+ 4. OptimizeForPointLookup() takes one parameter for block cache size. It now builds a hash index, a bloom filter, and a block cache.
+
+
+[https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.5](https://github.com/facebook/rocksdb/releases/tag/rocksdb-3.5)
diff --git a/thirdparty/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown b/thirdparty/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown
new file mode 100644
index 0000000000..f18de0bbc3
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-01-16-migrating-from-leveldb-to-rocksdb-2.markdown
@@ -0,0 +1,112 @@
+---
+title: Migrating from LevelDB to RocksDB
+layout: post
+author: lgalanis
+category: blog
+redirect_from:
+  - /blog/1811/migrating-from-leveldb-to-rocksdb-2/
+---
+
+If you have an existing application that uses LevelDB and would like to migrate to RocksDB, one problem you need to overcome is mapping the options for LevelDB to proper options for RocksDB. As of release 3.9, this can be done automatically by using our option conversion utility found in rocksdb/utilities/leveldb_options.h. What is needed is to first replace `leveldb::Options` with `rocksdb::LevelDBOptions`. Then, use `rocksdb::ConvertOptions()` to convert the `LevelDBOptions` struct into appropriate RocksDB options.
Here is an example:
+
+
+
+LevelDB code:
+
+```c++
+#include "leveldb/db.h"
+
+using namespace leveldb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  Options opt;
+  opt.create_if_missing = true;
+  opt.max_open_files = 1000;
+  opt.block_size = 4096;
+
+  Status s = DB::Open(opt, "/tmp/mydb", &db);
+
+  delete db;
+}
+```
+
+RocksDB code:
+
+```c++
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/leveldb_options.h"
+
+using namespace rocksdb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  LevelDBOptions opt;
+  opt.create_if_missing = true;
+  opt.max_open_files = 1000;
+  opt.block_size = 4096;
+
+  Options rocksdb_options = ConvertOptions(opt);
+  // add rocksdb specific options here
+
+  Status s = DB::Open(rocksdb_options, "/tmp/mydb_rocks", &db);
+
+  delete db;
+}
+```
+
+The difference is:
+
+```diff
+-#include "leveldb/db.h"
++#include "rocksdb/db.h"
++#include "rocksdb/utilities/leveldb_options.h"
+
+-using namespace leveldb;
++using namespace rocksdb;
+
+- Options opt;
++ LevelDBOptions opt;
+
+- Status s = DB::Open(opt, "/tmp/mydb", &db);
++ Options rocksdb_options = ConvertOptions(opt);
++ // add rocksdb specific options here
++
++ Status s = DB::Open(rocksdb_options, "/tmp/mydb_rocks", &db);
+```
+
+Once you get up and running with RocksDB, you can then focus on tuning RocksDB further by modifying the converted options struct.
+
+The reason why ConvertOptions is handy is that a lot of individual options in RocksDB have moved to other structures in different components. For example, block_size is not available in struct rocksdb::Options. It resides in struct rocksdb::BlockBasedTableOptions, which is used to create a TableFactory object that RocksDB uses internally to create the proper TableBuilder objects. If you were to write your application from scratch, it would look like this:
+
+RocksDB code from scratch:
+
+```c++
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+
+using namespace rocksdb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  Options opt;
+  opt.create_if_missing = true;
+  opt.max_open_files = 1000;
+
+  BlockBasedTableOptions topt;
+  topt.block_size = 4096;
+  opt.table_factory.reset(NewBlockBasedTableFactory(topt));
+
+  Status s = DB::Open(opt, "/tmp/mydb_rocks", &db);
+
+  delete db;
+}
+```
+
+The LevelDBOptions utility can ease migration to RocksDB from LevelDB and allows us to break down the various options across classes as needed.
diff --git a/thirdparty/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown b/thirdparty/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown
new file mode 100644
index 0000000000..cddc0dd01f
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-02-24-reading-rocksdb-options-from-a-file.markdown
@@ -0,0 +1,41 @@
+---
+title: Reading RocksDB options from a file
+layout: post
+author: lgalanis
+category: blog
+redirect_from:
+  - /blog/1883/reading-rocksdb-options-from-a-file/
+---
+
+RocksDB options can be provided to RocksDB using a file or any string. The format is straightforward: `write_buffer_size=1024;max_write_buffer_number=2`. Any whitespace around `=` and `;` is OK. Moreover, options can be nested as necessary. For example, `BlockBasedTableOptions` can be nested as follows: `write_buffer_size=1024; max_write_buffer_number=2; block_based_table_factory={block_size=4k};`. Similarly, any whitespace around `{` or `}` is OK.
Here is what it looks like in code:
+
+
+
+```c++
+#include <string>
+#include "rocksdb/db.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/convenience.h"
+
+using namespace rocksdb;
+
+int main(int argc, char** argv) {
+  DB *db;
+
+  Options opt;
+
+  std::string options_string =
+      "create_if_missing=true;max_open_files=1000;"
+      "block_based_table_factory={block_size=4096}";
+
+  Status s = GetOptionsFromString(opt, options_string, &opt);
+
+  s = DB::Open(opt, "/tmp/mydb_rocks", &db);
+
+  // use db
+
+  delete db;
+}
+```
+
+Using `GetOptionsFromString` is a convenient way of changing options for your RocksDB application without needing to resort to recompilation or tedious command line parsing.
diff --git a/thirdparty/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown b/thirdparty/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown
new file mode 100644
index 0000000000..7f9f776536
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-02-27-write-batch-with-index.markdown
@@ -0,0 +1,20 @@
+---
+title: 'WriteBatchWithIndex: Utility for Implementing Read-Your-Own-Writes'
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/1901/write-batch-with-index/
+---
+
+RocksDB can be used as a storage engine of a higher level database. In fact, we are currently plugging RocksDB into MySQL and MongoDB as one of their storage engines. RocksDB can help with guaranteeing some of the ACID properties: durability is guaranteed by RocksDB by design; consistency and isolation need to be enforced by concurrency controls on top of RocksDB; atomicity can be implemented by committing a transaction's writes with one write batch to RocksDB at the end.
+
+
+
+However, if we enforce atomicity by committing all writes in one batch only at the end of the transaction, you cannot read back from RocksDB an updated value previously written by the same transaction (read-your-own-writes). To read the updated value, the database on top of RocksDB needs to maintain an internal buffer for all the written keys, and when a read happens it needs to merge the results from RocksDB and from this buffer. This is a problem we faced when building the RocksDB storage engine in MongoDB. We solved it by creating a utility class, WriteBatchWithIndex (a write batch with a searchable index), and made it part of the public API so that the community can also benefit from it.
+
+Before talking about the index part, let me introduce the write batch first. The write batch class, `WriteBatch`, is a RocksDB data structure for atomic writes of multiple keys. Users can buffer their updates to a `WriteBatch` by calling `write_batch.Put("key1", "value1")` or `write_batch.Delete("key2")`, similar to calling RocksDB's functions of the same names. In the end, they call `db->Write(write_batch)` to atomically apply all those batched operations to the DB. This is how a database can guarantee atomicity, as shown above. Adding a searchable index to `WriteBatch`, we now have `WriteBatchWithIndex`. Users can put updates into a `WriteBatchWithIndex` in the same way as into a `WriteBatch`. In the end, users can get a `WriteBatch` object from it and issue `db->Write()`. Additionally, users can create an iterator over a WriteBatchWithIndex, seek to any key location and iterate from there.
+
+To implement read-your-own-writes using `WriteBatchWithIndex`, every time the user creates a transaction, we create a `WriteBatchWithIndex` attached to it. All the writes of the transaction go to the `WriteBatchWithIndex` first. When we commit the transaction, we atomically write the batch to RocksDB. When the user wants to call `Get()`, we first check if the value exists in the `WriteBatchWithIndex` and return it if so, by seeking and reading from an iterator of the write batch, before checking data in RocksDB. For example, here is how we implement it in MongoDB's RocksDB storage engine: [link](https://github.com/mongodb/mongo/blob/a31cc114a89a3645e97645805ba77db32c433dce/src/mongo/db/storage/rocks/rocks_recovery_unit.cpp#L245-L260). If a range query comes, we pass a DB iterator to the `WriteBatchWithIndex`, which creates a super iterator that combines the results from the DB iterator with the batch's iterator. Using this super iterator, we can iterate over the DB together with the transaction's own writes. Here is the iterator creation code in MongoDB's RocksDB storage engine: [link](https://github.com/mongodb/mongo/blob/a31cc114a89a3645e97645805ba77db32c433dce/src/mongo/db/storage/rocks/rocks_recovery_unit.cpp#L266-L269). In this way, the database can solve the read-your-own-writes problem by using RocksDB to handle a transaction's uncommitted writes.
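+
+Below is a minimal, self-contained sketch of this pattern (the path, the keys, and the omitted error handling are placeholders for illustration):
+
+```c++
+#include <memory>
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+
+using namespace rocksdb;
+
+int main() {
+  DB* db;
+  Options opt;
+  opt.create_if_missing = true;
+  Status s = DB::Open(opt, "/tmp/wbwi_example", &db);
+
+  // Buffer a transaction's writes in an indexed batch.
+  WriteBatchWithIndex batch;
+  batch.Put("key1", "value1");
+  batch.Delete("key2");
+
+  // Read-your-own-writes for a point lookup: seek in the batch first.
+  std::unique_ptr<WBWIIterator> batch_iter(batch.NewIterator());
+  batch_iter->Seek("key1");
+  // If batch_iter is valid and positioned on "key1", serve the read from
+  // the batch; otherwise fall back to db->Get().
+
+  // For range queries, combine the batch with a DB iterator into a
+  // "super iterator" that sees committed data plus the pending writes.
+  std::unique_ptr<Iterator> super_iter(
+      batch.NewIteratorWithBase(db->NewIterator(ReadOptions())));
+  for (super_iter->SeekToFirst(); super_iter->Valid(); super_iter->Next()) {
+    // use super_iter->key() / super_iter->value()
+  }
+
+  // Commit the transaction atomically.
+  s = db->Write(WriteOptions(), batch.GetWriteBatch());
+
+  delete db;
+}
+```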
+
+Using `WriteBatchWithIndex`, we successfully implemented read-your-own-writes in the RocksDB storage engine of MongoDB. If you also have a read-your-own-writes problem, `WriteBatchWithIndex` can help you implement it quickly and correctly.
diff --git a/thirdparty/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown b/thirdparty/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown
new file mode 100644
index 0000000000..1ffe2c532e
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-04-22-integrating-rocksdb-with-mongodb-2.markdown
@@ -0,0 +1,16 @@
+---
+title: Integrating RocksDB with MongoDB
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/1967/integrating-rocksdb-with-mongodb-2/
+---
+
+Over the last couple of years, we have been busy integrating RocksDB with various services here at Facebook that needed to store key-value pairs locally. We have also seen other companies using RocksDB as the local storage component of their distributed systems.
+
+
+
+The next big challenge for us is to bring the RocksDB storage engine to general purpose databases. Today we have an exciting milestone to share with our community! We're running MongoDB with RocksDB in production and seeing great results! You can read more about it here: [http://blog.parse.com/announcements/mongodb-rocksdb-parse/](http://blog.parse.com/announcements/mongodb-rocksdb-parse/)
+
+Stay tuned for benchmarks and more stability and performance improvements.
diff --git a/thirdparty/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown b/thirdparty/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown
new file mode 100644
index 0000000000..f3a55faae1
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-06-12-rocksdb-in-osquery.markdown
@@ -0,0 +1,10 @@
+---
+title: RocksDB in osquery
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/1997/rocksdb-in-osquery/
+---
+
+Check out [this](https://code.facebook.com/posts/1411870269134471/how-rocksdb-is-used-in-osquery/) blog post by [Mike Arpaia](https://www.facebook.com/mike.arpaia) and [Ted Reed](https://www.facebook.com/treeded) about how osquery leverages RocksDB to build an embedded pub-sub system. This article is a great read and contains insights on how to properly use RocksDB.
diff --git a/thirdparty/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown b/thirdparty/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown
new file mode 100644
index 0000000000..b3e2703fc6
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-07-15-rocksdb-2015-h2-roadmap.markdown
@@ -0,0 +1,92 @@
+---
+title: RocksDB 2015 H2 roadmap
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/2015/rocksdb-2015-h2-roadmap/
+---
+
+Every 6 months, the RocksDB team gets together to prioritize the work ahead of us. We just went through this exercise and we wanted to share the results with the community. Here's what the RocksDB team will be focusing on for the next 6 months:
+
+
+
+**MyRocks**
+
+As you might know, we're working hard to integrate RocksDB as a storage engine for MySQL. This project is pretty important for us because we're heavy users of MySQL. We're already getting pretty good performance results, but there is more work to be done. We need to focus on both performance and stability. The highest priority items on our list are:
+
+
+
+
+ 1. Reduce CPU costs of RocksDB as a MySQL storage engine
+
+
+ 2. Implement pessimistic concurrency control to support the repeatable read isolation level in MyRocks
+
+
+ 3. Reduce P99 read latency, which is high mostly because of lingering tombstones
+
+
+ 4. Port ZSTD compression
+
+
+**MongoRocks**
+
+Another database that we're working on is MongoDB. The project of integrating MongoDB with the RocksDB storage engine is called MongoRocks. It's already running in production at Parse [1] and we're seeing surprisingly few issues. Our plans for the next half:
+
+
+
+
+ 1. Keep improving performance and stability, possibly reusing work done on MyRocks (the workloads are pretty similar).
+
+
+ 2. Increase internal and external adoption.
+
+
+ 3. Support the new MongoDB 3.2.
+
+
+**RocksDB on cheaper storage media**
+
+Up to now, our mission was to build the best key-value store "for fast storage" (flash and in-memory). However, there are some use-cases at Facebook that don't need expensive high-end storage. In the next six months, we plan to deploy RocksDB on cheaper storage media. We will optimize RocksDB's performance for either or both of:
+
+
+
+
+ 1. Hard drive storage arrays.
+
+
+ 2. Tiered storage.
+
+
+**Quality of Service**
+
+When talking to our customers, there are a couple of issues that keep recurring. We need to fix them to make our customers happy. We will improve RocksDB to provide better assurance of performance and resource usage. A non-exhaustive list includes:
+
+
+
+
+ 1. Iterator P99 latency can be high due to the presence of tombstones.
+
+
+ 2. Write stalls can happen during high write loads.
+
+
+ 3. Better control of memory and disk usage.
+
+
+ 4. Service quality and performance of the backup engine.
+
+
+**Operational user experience**
+
+As we increase deployment of RocksDB, engineers are spending more time on debugging RocksDB issues. We plan to improve the user experience of running RocksDB. The goal is to reduce TTD (time-to-debug). The work includes monitoring, visualizations and documentation.
+
+[1] [http://blog.parse.com/announcements/mongodb-rocksdb-parse/](http://blog.parse.com/announcements/mongodb-rocksdb-parse/)
+
+
+### Comments
+
+**[Mike](allspace2012@outlook.com)**
+
+What's the status of this roadmap? "RocksDB on cheaper storage media", has this been implemented?
diff --git a/thirdparty/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown b/thirdparty/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown
new file mode 100644
index 0000000000..fe7b7b2681
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-07-17-spatial-indexing-in-rocksdb.markdown
@@ -0,0 +1,78 @@
+---
+title: Spatial indexing in RocksDB
+layout: post
+author: icanadi
+category: blog
+redirect_from:
+  - /blog/2039/spatial-indexing-in-rocksdb/
+---
+
+About a year ago, there was a need to develop a spatial database at Facebook. We needed to store and index Earth's map data. Before building our own, we looked at the existing spatial databases. They were all very good technology, but also general purpose. Since we could sacrifice a general-purpose API, we thought we could build a more performant database specifically designed for our use-case. Furthermore, we decided to build the spatial database on top of RocksDB, because we have a lot of operational experience with running and tuning RocksDB at a large scale.
+
+
+
+When we started looking at this project, the first thing that surprised us was that our planet is not that big. Earth's entire map data can fit in memory on a reasonably high-end machine. Thus, we also decided to build a spatial database optimized for memory-resident datasets.
+
+The first use-case of our spatial database was an experimental map renderer. As part of our project, we successfully loaded the [Open Street Maps](https://www.openstreetmap.org/) dataset and hooked it up with [Mapnik](http://mapnik.org/), a map rendering engine.
+
+The usual Mapnik workflow is to load the map data into a SQL-based database and then define map layers with SQL statements. To render a tile, Mapnik needs to execute a couple of SQL queries. The benefit of this approach is that you don't need to reload your database when you change your map style. You can just change your SQL query and Mapnik picks it up. In our model, we decided to precompute the features we need for each tile. We need to know the map style before we create the database. However, when rendering the map tile, we only fetch the features that we need to render.
+
+We haven't open sourced the RocksDB Mapnik plugin or the database loading pipeline. However, the spatial indexing is available in RocksDB under the name [SpatialDB](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/utilities/spatial_db.h). The API is focused on the map rendering use-case, but we hope that it can also be used for other spatial-based applications.
+
+Let's take a tour of the API. When you create a spatial database, you specify the spatial indexes that need to be built. Each spatial index is defined by a bounding box and granularity. For map rendering, we create a spatial index for each zoom level. Higher zoom levels have more granularity.
+
+
+
+    SpatialDB::Create(
+      SpatialDBOptions(),
+      "/data/map", {
+        SpatialIndexOptions("zoom10", BoundingBox(0, 0, 100, 100), 10),
+        SpatialIndexOptions("zoom16", BoundingBox(0, 0, 100, 100), 16)
+      }
+    );
+
+
+
+
+When you insert a feature (building, street, country border) into SpatialDB, you need to specify the list of spatial indexes that will index the feature. In the loading phase, we process the map style to determine the list of zoom levels on which we'll render each feature. For example, we will not render a building at a zoom level that shows an entire country. A building will only be indexed at higher zoom levels.
Country borders will be indexed on all zoom levels.
+
+
+
+    FeatureSet feature;
+    feature.Set("type", "building");
+    feature.Set("height", 6);
+    db->Insert(WriteOptions(), BoundingBox(5, 5, 10, 10),
+               well_known_binary_blob, feature, {"zoom16"});
+
+
+
+
+The indexing part is pretty simple. For each feature, we first find the list of index tiles that it intersects. Then, we add a link from the tile's [quad key](https://msdn.microsoft.com/en-us/library/bb259689.aspx) to the feature's primary key. Using quad keys improves data locality, i.e. features closer together geographically will have similar quad keys. Even though we're optimizing for a memory-resident dataset, data locality is still very important due to different caching effects.
+
+After you're done inserting all the features, you can call the Compact() API, which will compact the dataset and speed up read queries.
+
+
+
+    db->Compact();
+
+
+
+
+A SpatialDB query specifies: 1) the bounding box we're interested in, and 2) a zoom level. We find all tiles that intersect with the query's bounding box and return all features in those tiles.
+
+
+
+
+    Cursor* c = db_->Query(ReadOptions(), BoundingBox(1, 1, 7, 7), "zoom16");
+    for (; c->Valid(); c->Next()) {
+      Render(c->blob(), c->feature_set());
+    }
+
+
+
+
+Note: the `Render()` function is not part of RocksDB. You will need to use one of many open source map renderers, for example check out [Mapnik](http://mapnik.org/).
+
+TL;DR If you need an embedded spatial database, check out RocksDB's SpatialDB. [Let us know](https://www.facebook.com/groups/rocksdb.dev/) how we can make it better.
+
+If you're interested in learning more, check out this [talk](https://www.youtube.com/watch?v=T1jWsDMONM8).
diff --git a/thirdparty/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown b/thirdparty/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown
new file mode 100644
index 0000000000..b6bb47d533
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-07-22-rocksdb-is-now-available-in-windows-platform.markdown
@@ -0,0 +1,30 @@
+---
+title: RocksDB is now available on the Windows Platform
+layout: post
+author: dmitrism
+category: blog
+redirect_from:
+  - /blog/2033/rocksdb-is-now-available-in-windows-platform/
+---
+
+Over the past 6 months we have seen a number of use cases where RocksDB is successfully used by the community and various companies to achieve high throughput and volume in a modern server environment.
+
+We at Microsoft Bing could not be left behind. As a result, we are happy to [announce](http://bit.ly/1OmWBT9) the availability of the Windows port created here at Microsoft, which we intend to use as a storage option for one of our key/value data stores.
+
+
+
+We are happy to make this available to the community. Stay tuned for more announcements to come.
+
+### Comments
+
+**[Siying Dong](siying.d@fb.com)**
+
+Appreciate your contributions to RocksDB project! I believe it will benefits many users!
+
+**[empresas sevilla](oxofkx@gmail.com)**
+
+Magnificent article, a pleasure to read this blog
+
+**[jak usunac](tomogedac@o2.pl)**
+
+I believe it will benefits too
diff --git a/thirdparty/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown b/thirdparty/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown
new file mode 100644
index 0000000000..0ff3a0542f
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-07-23-dynamic-level.markdown
@@ -0,0 +1,29 @@
+---
+title: Dynamic Level Size for Level-Based Compaction
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2207/dynamic-level/
+---
+
+In this article, we follow up on the first part of an answer to one of the questions in our [AMA](https://www.reddit.com/r/IAmA/comments/3de3cv/we_are_rocksdb_engineering_team_ask_us_anything/ct4a8tb): the dynamic level size in level-based compaction.
+
+
+
+Level-based compaction is the original LevelDB compaction style and one of the two major compaction styles in RocksDB (see [our wiki](https://github.com/facebook/rocksdb/wiki/RocksDB-Basics#multi-threaded-compactions)). In RocksDB we introduced parallelism and more configurable options to it, but the main algorithm stayed the same, until we recently introduced the dynamic level size mode.
+
+
+In level-based compaction, we organize data into different sorted runs, called levels. Each level has a target size. Usually the target size of the levels increases by the same size multiplier. For example, you can set the target size of level 1 to be 1GB with a size multiplier of 10, and the target sizes of levels 1, 2, 3 and 4 will be 1GB, 10GB, 100GB and 1000GB. Before level 1, there are some staging files flushed from memtables, called level 0 files, which will later be merged into level 1. Compactions are triggered as soon as the actual size of a level exceeds its target size. We then merge a subset of the data of that level into the next level, to reduce the size of the level. More compactions are triggered until the sizes of all the levels are lower than their target sizes. In a steady state, the size of each level will be around its target size.
+
+
+Level-based compaction's advantage is its good space efficiency. We usually use the metric space amplification to measure space efficiency. In this article we ignore the effects of data compression, so space amplification = size_on_file_system / size_of_user_data.
+
+
+How do we estimate the space amplification of level-based compaction? We focus specifically on databases in a steady state, which means the database size is stable or grows slowly over time. This means updates will add roughly the same amount of data as (or a little more than) what is removed by deletes. Given that, if we compacted all the data down to the last level, the size of that level would be roughly the same as the size of the last level before the compaction, and it would approximately equal the size of the user data. So the size of the last level is a good estimate of the user data size, and the total size of the DB divided by the size of the last level is a good estimate of space amplification.
+
+
+Applying the equation, if we have four non-zero levels with sizes 1GB, 10GB, 100GB and 1000GB, the space amplification will be approximately (1000GB + 100GB + 10GB + 1GB) / 1000GB = 1.111, which is a very good number. However, there is a catch here: how do we make sure the last level's size is 1000GB, the same as the level's size target?
A user has to fine-tune the level sizes to achieve this number and will need to re-tune them if the DB size changes. The theoretical number 1.11 is hard to achieve in practice. In a worse case, if the target size of the last level is 1000GB but the user data is only 200GB, then the actual space amplification will be (200GB + 100GB + 10GB + 1GB) / 200GB = 1.555, a much worse number.
+
+
+To solve this problem, my colleague Igor Kabiljo came up with a solution: dynamic level size target mode. You can enable it by setting options.level_compaction_dynamic_level_bytes=true. In this mode, the size targets of the levels are adjusted dynamically based on the size of the last level. Suppose the level size multiplier is 10 and the DB size is 200GB. The target size of the last level is automatically set to the actual size of the level, 200GB; the second-to-last level's size target is automatically set to size_last_level / 10 = 20GB; the third-to-last level's to size_last_level / 100 = 2GB; and the next level's to size_last_level / 1000 = 200MB. We stop here because 200MB is within the range of the first level. In this way, we can achieve the 1.111 space amplification without fine-tuning the level size targets, as in the sketch below. More details can be found in the [code comments of the option](https://github.com/facebook/rocksdb/blob/v3.11/include/rocksdb/options.h#L366-L423) in the header file.
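+
+A minimal sketch of turning the mode on (the multiplier value is just an example):
+
+```c++
+#include "rocksdb/options.h"
+
+rocksdb::Options MakeDynamicLevelOptions() {
+  rocksdb::Options options;
+  // Let the per-level size targets track the actual size of the last level.
+  options.level_compaction_dynamic_level_bytes = true;
+  options.max_bytes_for_level_multiplier = 10;
+  return options;
+}
+```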
diff --git a/thirdparty/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown b/thirdparty/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown
new file mode 100644
index 0000000000..332a29f020
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-10-27-getthreadlist.markdown
@@ -0,0 +1,193 @@
+---
+title: GetThreadList
+layout: post
+author: yhciang
+category: blog
+redirect_from:
+  - /blog/2261/getthreadlist/
+---
+
+We recently added a new API, called `GetThreadList()`, that exposes the RocksDB background thread activity. With this feature, developers will be able to obtain real-time information about currently running compactions and flushes, such as the input / output size, elapsed time, and the number of bytes written. Below is an example output of `GetThreadList`. To better illustrate the example, we have put the sample output into a table where each column represents a thread status:
+
+| | Thread 1 | Thread 2 |
+| --- | --- | --- |
+| ThreadID | 140716395198208 | 140716416169728 |
+| DB | db1 | db2 |
+| CF | default | picachu |
+| ThreadType | High Pri | Low Pri |
+| Operation | Flush | Compaction |
+| ElapsedTime | 143.459 ms | 607.538 ms |
+| Stage | FlushJob::WriteLevel0Table | CompactionJob::Install |
+| OperationProperties | BytesMemtables 4092938<br/>BytesWritten 1050701 | BaseInputLevel 1<br/>BytesRead 4876417<br/>BytesWritten 4140109<br/>IsDeletion 0<br/>IsManual 0<br/>IsTrivialMove 0<br/>JobID 146<br/>OutputLevel 2<br/>TotalInputBytes 4883044 |
+
+In the above output, we can see that `GetThreadList()` reports the activity of two threads: one thread running a flush job (middle column) and the other thread running a compaction job (right-most column). Each thread status shows basic information about the thread such as the thread id, its target db / column family, the job it is currently doing, and the current status of the job. For instance, we can see that thread 140716416169728 is doing compaction on the `picachu` column family in database `db2`. In addition, we can see the compaction has been running for 600 ms, and it has read 4876417 bytes out of 4883044 bytes. This indicates the compaction is about to complete. The stage property indicates which code block the thread is currently executing. For instance, thread 140716416169728 is currently running `CompactionJob::Install`, which further indicates the compaction job is almost done.
+
+Below we briefly describe the API.
+
+
+## How to Enable it?
+
+
+To enable thread-tracking of a rocksdb instance, simply set `enable_thread_tracking` to true in its DBOptions:
+
+```c++
+// If true, then the status of the threads involved in this DB will
+// be tracked and available via GetThreadList() API.
+//
+// Default: false
+bool enable_thread_tracking;
+```
+
+
+
+## The API
+
+
+The GetThreadList API is defined in [include/rocksdb/env.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/env.h#L317-L318) as an Env
+function:
+
+```c++
+virtual Status GetThreadList(std::vector<ThreadStatus>* thread_list);
+```
+
+Since an Env can be shared across multiple rocksdb instances, the output of
+`GetThreadList()` includes the background activity of all the rocksdb instances
+that use the same Env.
+
+The `GetThreadList()` API simply returns a vector of `ThreadStatus`, each of which describes
+the current status of a thread. The `ThreadStatus` structure, defined in
+[include/rocksdb/thread_status.h](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/thread_status.h), contains the following information:
+
+```c++
+// A unique ID for the thread.
+const uint64_t thread_id;
+
+// The type of the thread: it could be HIGH_PRIORITY,
+// LOW_PRIORITY, or USER.
+const ThreadType thread_type;
+
+// The name of the DB instance the thread is currently
+// involved with. It is set to an empty string if the thread
+// is not involved in any DB operation.
+const std::string db_name;
+
+// The name of the column family the thread is currently
+// involved with. It is set to an empty string if the thread
+// is not involved in any column family.
+const std::string cf_name;
+
+// The operation (high-level action) that the current thread is involved in.
+const OperationType operation_type;
+
+// The elapsed time in micros of the current thread operation.
+const uint64_t op_elapsed_micros;
+
+// An integer showing the current stage where the thread is involved
+// in the current operation.
+const OperationStage operation_stage;
+
+// A list of properties that describe some details about the current
+// operation. The same field in op_properties[] might have different
+// meanings for different operations.
+uint64_t op_properties[kNumOperationProperties];
+
+// The state (lower-level action) that the current thread is involved in.
+const StateType state_type;
+```
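+
+As a rough usage sketch (the printing and the choice of `Env::Default()` are assumptions for illustration):
+
+```c++
+#include <cinttypes>
+#include <cstdio>
+#include <vector>
+
+#include "rocksdb/env.h"
+#include "rocksdb/thread_status.h"
+
+void DumpBackgroundThreads() {
+  std::vector<rocksdb::ThreadStatus> thread_list;
+  // Collects the status of every background thread of all DBs on this Env.
+  rocksdb::Status s = rocksdb::Env::Default()->GetThreadList(&thread_list);
+  if (!s.ok()) return;
+  for (const auto& ts : thread_list) {
+    std::fprintf(stdout, "thread %" PRIu64 ": db=%s cf=%s\n",
+                 ts.thread_id, ts.db_name.c_str(), ts.cf_name.c_str());
+  }
+}
+```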
+
+If you are interested in the background thread activity of your RocksDB application, please feel free to give `GetThreadList()` a try :)
diff --git a/thirdparty/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown b/thirdparty/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown
new file mode 100644
index 0000000000..6852b8ffa3
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-11-10-use-checkpoints-for-efficient-snapshots.markdown
@@ -0,0 +1,45 @@
+---
+title: Use Checkpoints for Efficient Snapshots
+layout: post
+author: rven2
+category: blog
+redirect_from:
+  - /blog/2609/use-checkpoints-for-efficient-snapshots/
+---
+
+**Checkpoint** is a feature in RocksDB which provides the ability to take a snapshot of a running RocksDB database in a separate directory. A checkpoint can be used as a point-in-time snapshot, which can be opened read-only to query rows as of that point in time, or as a writeable snapshot by opening it read-write. Checkpoints can be used for both full and incremental backups.
+
+
+
+
+The Checkpoint feature enables RocksDB to create a consistent snapshot of a given RocksDB database in the specified directory. If the snapshot is on the same filesystem as the original database, the SST files will be hard-linked; otherwise, the SST files will be copied. The manifest and CURRENT files will be copied. In addition, if there are multiple column families, log files will be copied for the period covering the start and end of the checkpoint, in order to provide a consistent snapshot across column families.
+
+
+
+
+A Checkpoint object needs to be created for a database before checkpoints are created. The API is as follows:
+
+
+
+
+`Status Create(DB* db, Checkpoint** checkpoint_ptr);`
+
+
+
+
+Given a checkpoint object and a directory, the CreateCheckpoint function creates a consistent snapshot of the database in the given directory.
+
+
+
+
+`Status CreateCheckpoint(const std::string& checkpoint_dir);`
+
+
+
+
+The directory should not already exist; it will be created by this API. The directory will be an absolute path. The checkpoint can be used as a read-only copy of the DB or can be opened as a standalone DB. When opened read/write, the SST files continue to be hard links, and these links are removed when the files become obsolete. When the user is done with the snapshot, the user can delete the directory to remove the snapshot.
+
+
+
+
+Checkpoints are used for online backup in MyRocks, which is MySQL using RocksDB as the storage engine ([MySQL on RocksDB](https://github.com/facebook/mysql-5.6)).
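+
+Putting the two calls together, here is a minimal sketch (error handling shortened; the directory name is a placeholder):
+
+```c++
+#include <string>
+#include "rocksdb/db.h"
+#include "rocksdb/utilities/checkpoint.h"
+
+rocksdb::Status TakeSnapshot(rocksdb::DB* db, const std::string& dir) {
+  rocksdb::Checkpoint* checkpoint = nullptr;
+  rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
+  if (s.ok()) {
+    s = checkpoint->CreateCheckpoint(dir);  // dir must not exist yet
+  }
+  delete checkpoint;
+  return s;
+}
+```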
diff --git a/thirdparty/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown b/thirdparty/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
new file mode 100644
index 0000000000..b21b04fe38
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2015-11-16-analysis-file-read-latency-by-level.markdown
@@ -0,0 +1,244 @@
+---
+title: Analysis of File Read Latency by Level
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2537/analysis-file-read-latency-by-level/
+---
+
+In many use cases of RocksDB, people rely on the OS page cache for caching compressed data. With this approach, verifying the effectiveness of the OS page caching is challenging, because the file system is a black box to users.
+
+As an example, a user can tune the DB as follows: use level-based compaction, with L1 - L4 sizes of 1GB, 10GB, 100GB and 1TB, and reserve about 20GB of memory as OS page cache, expecting levels 0, 1 and 2 to be mostly cached in memory, leaving only reads from levels 3 and 4 requiring disk I/Os. However, in practice, it's not easy to verify whether the OS page cache does exactly what we expect. For example, if we end up doing 4 instead of 2 I/Os per query, it's not easy for users to figure out whether that's because of the efficiency of the OS page cache or because multiple blocks are read for a level. Analysis like this is especially important if users run RocksDB on hard disk drives, because the latency gap between hard drives and memory is much larger than for flash-based SSDs.
+
+
+
+In order to make tuning easier, we added new instrumentation to help users analyze the latency distribution of file reads at different levels. If users turn DB statistics on, we always keep track of the distribution of file read latency for each level. Users can retrieve the information by querying the DB property "rocksdb.stats" ( [https://github.com/facebook/rocksdb/blob/v3.13.1/include/rocksdb/db.h#L315-L316](https://github.com/facebook/rocksdb/blob/v3.13.1/include/rocksdb/db.h#L315-L316) ). It is also printed out as a part of the compaction summary in info logs periodically.
+
+The output looks like this:
+
+
+```
+** Level 0 read latency histogram (micros):
+Count: 696 Average: 489.8118 StdDev: 222.40
+Min: 3.0000 Median: 452.3077 Max: 1896.0000
+Percentiles: P50: 452.31 P75: 641.30 P99: 1068.00 P99.9: 1860.80 P99.99: 1896.00
+------------------------------------------------------
+[ 2, 3 ) 1 0.144% 0.144%
+[ 18, 20 ) 1 0.144% 0.287%
+[ 45, 50 ) 5 0.718% 1.006%
+[ 50, 60 ) 26 3.736% 4.741% #
+[ 60, 70 ) 6 0.862% 5.603%
+[ 90, 100 ) 1 0.144% 5.747%
+[ 120, 140 ) 2 0.287% 6.034%
+[ 140, 160 ) 1 0.144% 6.178%
+[ 160, 180 ) 1 0.144% 6.322%
+[ 200, 250 ) 9 1.293% 7.615%
+[ 250, 300 ) 45 6.466% 14.080% #
+[ 300, 350 ) 88 12.644% 26.724% ###
+[ 350, 400 ) 88 12.644% 39.368% ###
+[ 400, 450 ) 71 10.201% 49.569% ##
+[ 450, 500 ) 65 9.339% 58.908% ##
+[ 500, 600 ) 74 10.632% 69.540% ##
+[ 600, 700 ) 92 13.218% 82.759% ###
+[ 700, 800 ) 64 9.195% 91.954% ##
+[ 800, 900 ) 35 5.029% 96.983% #
+[ 900, 1000 ) 12 1.724% 98.707%
+[ 1000, 1200 ) 6 0.862% 99.569%
+[ 1200, 1400 ) 2 0.287% 99.856%
+[ 1800, 2000 ) 1 0.144% 100.000%
+
+** Level 1 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 2 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 3 read latency histogram (micros):
+(......not pasted.....)
+
+** Level 4 read latency histogram (micros):
+(......not pasted.....)
+ +** Level 5 read latency histogram (micros): +Count: 25583746 Average: 421.1326 StdDev: 385.11 +Min: 1.0000 Median: 376.0011 Max: 202444.0000 +Percentiles: P50: 376.00 P75: 438.00 P99: 1421.68 P99.9: 4164.43 P99.99: 9056.52 +------------------------------------------------------ +[ 0, 1 ) 2351 0.009% 0.009% +[ 1, 2 ) 6077 0.024% 0.033% +[ 2, 3 ) 8471 0.033% 0.066% +[ 3, 4 ) 788 0.003% 0.069% +[ 4, 5 ) 393 0.002% 0.071% +[ 5, 6 ) 786 0.003% 0.074% +[ 6, 7 ) 1709 0.007% 0.080% +[ 7, 8 ) 1769 0.007% 0.087% +[ 8, 9 ) 1573 0.006% 0.093% +[ 9, 10 ) 1495 0.006% 0.099% +[ 10, 12 ) 3043 0.012% 0.111% +[ 12, 14 ) 2259 0.009% 0.120% +[ 14, 16 ) 1233 0.005% 0.125% +[ 16, 18 ) 762 0.003% 0.128% +[ 18, 20 ) 451 0.002% 0.130% +[ 20, 25 ) 794 0.003% 0.133% +[ 25, 30 ) 1279 0.005% 0.138% +[ 30, 35 ) 1172 0.005% 0.142% +[ 35, 40 ) 1363 0.005% 0.148% +[ 40, 45 ) 409 0.002% 0.149% +[ 45, 50 ) 105 0.000% 0.150% +[ 50, 60 ) 80 0.000% 0.150% +[ 60, 70 ) 280 0.001% 0.151% +[ 70, 80 ) 1583 0.006% 0.157% +[ 80, 90 ) 4245 0.017% 0.174% +[ 90, 100 ) 6572 0.026% 0.200% +[ 100, 120 ) 9724 0.038% 0.238% +[ 120, 140 ) 3713 0.015% 0.252% +[ 140, 160 ) 2383 0.009% 0.261% +[ 160, 180 ) 18344 0.072% 0.333% +[ 180, 200 ) 51873 0.203% 0.536% +[ 200, 250 ) 631722 2.469% 3.005% +[ 250, 300 ) 2721970 10.639% 13.644% ## +[ 300, 350 ) 5909249 23.098% 36.742% ##### +[ 350, 400 ) 6522507 25.495% 62.237% ##### +[ 400, 450 ) 4296332 16.793% 79.030% ### +[ 450, 500 ) 2130323 8.327% 87.357% ## +[ 500, 600 ) 1553208 6.071% 93.428% # +[ 600, 700 ) 642129 2.510% 95.938% # +[ 700, 800 ) 372428 1.456% 97.394% +[ 800, 900 ) 187561 0.733% 98.127% +[ 900, 1000 ) 85858 0.336% 98.462% +[ 1000, 1200 ) 82730 0.323% 98.786% +[ 1200, 1400 ) 50691 0.198% 98.984% +[ 1400, 1600 ) 38026 0.149% 99.133% +[ 1600, 1800 ) 32991 0.129% 99.261% +[ 1800, 2000 ) 30200 0.118% 99.380% +[ 2000, 2500 ) 62195 0.243% 99.623% +[ 2500, 3000 ) 36684 0.143% 99.766% +[ 3000, 3500 ) 21317 0.083% 99.849% +[ 3500, 4000 ) 10216 0.040% 99.889% +[ 4000, 4500 ) 8351 0.033% 99.922% +[ 4500, 5000 ) 4152 0.016% 99.938% +[ 5000, 6000 ) 6328 0.025% 99.963% +[ 6000, 7000 ) 3253 0.013% 99.976% +[ 7000, 8000 ) 2082 0.008% 99.984% +[ 8000, 9000 ) 1546 0.006% 99.990% +[ 9000, 10000 ) 1055 0.004% 99.994% +[ 10000, 12000 ) 1566 0.006% 100.000% +[ 12000, 14000 ) 761 0.003% 100.003% +[ 14000, 16000 ) 462 0.002% 100.005% +[ 16000, 18000 ) 226 0.001% 100.006% +[ 18000, 20000 ) 126 0.000% 100.006% +[ 20000, 25000 ) 107 0.000% 100.007% +[ 25000, 30000 ) 43 0.000% 100.007% +[ 30000, 35000 ) 15 0.000% 100.007% +[ 35000, 40000 ) 14 0.000% 100.007% +[ 40000, 45000 ) 16 0.000% 100.007% +[ 45000, 50000 ) 1 0.000% 100.007% +[ 50000, 60000 ) 22 0.000% 100.007% +[ 60000, 70000 ) 10 0.000% 100.007% +[ 70000, 80000 ) 5 0.000% 100.007% +[ 80000, 90000 ) 14 0.000% 100.007% +[ 90000, 100000 ) 11 0.000% 100.007% +[ 100000, 120000 ) 33 0.000% 100.007% +[ 120000, 140000 ) 6 0.000% 100.007% +[ 140000, 160000 ) 3 0.000% 100.007% +[ 160000, 180000 ) 7 0.000% 100.007% +[ 200000, 250000 ) 2 0.000% 100.007% +``` + + +In this example, you can see we only issued 696 reads from level 0 while issued 25 million reads from level 5. The latency distribution is also clearly shown among those reads. This will be helpful for users to analysis OS page cache efficiency. + +Currently the read latency per level includes reads from data blocks, index blocks, as well as bloom filter blocks. We are also working on a feature to break down those three type of blocks. 
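+
+As a small sketch of pulling these histograms programmatically (assuming statistics are enabled and `db` is an open `DB*`):
+
+```c++
+#include <cstdio>
+#include <string>
+#include "rocksdb/db.h"
+
+void PrintReadLatencyStats(rocksdb::DB* db) {
+  std::string stats;
+  // "rocksdb.stats" includes the per-level read latency histograms
+  // when statistics are turned on.
+  if (db->GetProperty("rocksdb.stats", &stats)) {
+    std::fprintf(stdout, "%s\n", stats.c_str());
+  }
+}
+```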
+
+### Comments
+
+**[Tao Feng](fengtao04@gmail.com)**
+
+Is this feature also included in RocksJava?
+
+**[Siying Dong](siying.d@fb.com)**
+
+Should be. As long as you enable statistics, you should be able to get the value from `RocksDB.getProperty()` with property `rocksdb.dbstats`. Let me know if you can't find it.
+
+**[chiddu](cnbscience@gmail.com)**
+
+> In this example, you can see we only issued 696 reads from level 0 while issued 256K reads from level 5.
+
+Isn't it 2.5 M of reads instead of 256K?
+
+Also could anyone please provide more description on the histogram? especially
+
+> Count: 25583746 Average: 421.1326 StdDev: 385.11
+> Min: 1.0000 Median: 376.0011 Max: 202444.0000
+> Percentiles: P50: 376.00 P75: 438.00 P99: 1421.68 P99.9: 4164.43 P99.99: 9056.52
+
+and
+
+> [ 0, 1 ) 2351 0.009% 0.009%
+> [ 1, 2 ) 6077 0.024% 0.033%
+> [ 2, 3 ) 8471 0.033% 0.066%
+> [ 3, 4 ) 788 0.003% 0.069%
+
+thanks in advance
+
+**[Siying Dong](siying.d@fb.com)**
+
+Thank you for pointing out the mistake. I fixed it now.
+
+In this output, there are 2.5 million samples, average latency is 421 micro seconds, with standard deviation 385. Median is 376, max value is 202 milliseconds. 0.009% has value of 1, 0.024% has value of 1, 0.033% has value of 2. Accumulated value from 0 to 2 is 0.066%.
+
+Hope it helps.
+
+**[chiddu](cnbscience@gmail.com)**
+
+Thank you Siying for the quick reply, I was running couple of benchmark testing to check the performance of rocksdb on SSD. One of the test is similar to what is mentioned in the wiki, TEST 4: Random read, except the key_size is 10 and value_size is 20. I am inserting 1 billion hashes and reading 1 billion hashes with 32 threads. The histogram shows something like this
+
+```
+Level 5 read latency histogram (micros):
+Count: 7133903059 Average: 480.4357 StdDev: 309.18
+Min: 0.0000 Median: 551.1491 Max: 224142.0000
+Percentiles: P50: 551.15 P75: 651.44 P99: 996.52 P99.9: 2073.07 P99.99: 3196.32
+------------------------------------------------------
+[ 0, 1 ) 28587385 0.401% 0.401%
+[ 1, 2 ) 686572516 9.624% 10.025% ##
+[ 2, 3 ) 567317522 7.952% 17.977% ##
+[ 3, 4 ) 44979472 0.631% 18.608%
+[ 4, 5 ) 50379685 0.706% 19.314%
+[ 5, 6 ) 64930061 0.910% 20.224%
+[ 6, 7 ) 22613561 0.317% 20.541%
+............more............
+```
+
+If I understand your previous comment correctly,
+
+1. How is it that the count is around 7 billion when I have only inserted 1 billion hashes? is the stat broken?
+1. What does the percentiles and the numbers signify?
+1. [ 0, 1 ) 28587385 0.401% 0.401% what does this "28587385" stand for in the histogram row?
+
+**[Siying Dong](siying.d@fb.com)**
+
+If I remember correctly, with db_bench, if you specify --num=1000000000 --threads=32, it is every thread reading one billion keys, total of 32 billions. Is it the case you ran into?
+
+28,587,385 means that number of data points take the value [0,1)
+28,587,385 / 7,133,903,058 = 0.401% provides percentage.
+
+**[chiddu](cnbscience@gmail.com)**
+
+I do have `num=1000000000` and `t=32`. The script says reading 1 billion hashes and not 32 billion hashes.
+
+this is the script which I have used
+
+```
+echo "Load 1B keys sequentially into database....."
+bpl=10485760;overlap=10;mcz=2;del=300000000;levels=6;ctrig=4; delay=8; stop=12; wbn=3; mbc=20; mb=67108864;wbs=134217728; dds=1; sync=0; r=1000000000; t=1; vs=20; bs=4096; cs=1048576; of=500000; si=1000000; ./db_bench --benchmarks=fillseq --disable_seek_compaction=1 --mmap_read=0 --statistics=1 --histogram=1 --num=$r --threads=$t --value_size=$vs --block_size=$bs --cache_size=$cs --bloom_bits=10 --cache_numshardbits=6 --open_files=$of --verify_checksum=1 --db=/data/mysql/leveldb/test --sync=$sync --disable_wal=1 --compression_type=none --stats_interval=$si --compression_ratio=0.5 --disable_data_sync=$dds --write_buffer_size=$wbs --target_file_size_base=$mb --max_write_buffer_number=$wbn --max_background_compactions=$mbc --level0_file_num_compaction_trigger=$ctrig --level0_slowdown_writes_trigger=$delay --level0_stop_writes_trigger=$stop --num_levels=$levels --delete_obsolete_files_period_micros=$del --min_level_to_compress=$mcz --max_grandparent_overlap_factor=$overlap --stats_per_interval=1 --max_bytes_for_level_base=$bpl --use_existing_db=0 --key_size=10
+
+echo "Reading 1B keys in database in random order...."
+bpl=10485760;overlap=10;mcz=2;del=300000000;levels=6;ctrig=4; delay=8; stop=12; wbn=3; mbc=20; mb=67108864;wbs=134217728; dds=0; sync=0; r=1000000000; t=32; vs=20; bs=4096; cs=1048576; of=500000; si=1000000; ./db_bench --benchmarks=readrandom --disable_seek_compaction=1 --mmap_read=0 --statistics=1 --histogram=1 --num=$r --threads=$t --value_size=$vs --block_size=$bs --cache_size=$cs --bloom_bits=10 --cache_numshardbits=6 --open_files=$of --verify_checksum=1 --db=/some_data_base --sync=$sync --disable_wal=1 --compression_type=none --stats_interval=$si --compression_ratio=0.5 --disable_data_sync=$dds --write_buffer_size=$wbs --target_file_size_base=$mb --max_write_buffer_number=$wbn --max_background_compactions=$mbc --level0_file_num_compaction_trigger=$ctrig --level0_slowdown_writes_trigger=$delay --level0_stop_writes_trigger=$stop --num_levels=$levels --delete_obsolete_files_period_micros=$del --min_level_to_compress=$mcz --max_grandparent_overlap_factor=$overlap --stats_per_interval=1 --max_bytes_for_level_base=$bpl --use_existing_db=1 --key_size=10
+```
+
+After running this script, there were no issues wrt loading a billion hashes, but when it came to the reading part, it's been almost 4 days and still I have only read 7 billion hashes, and have read 200 million hashes in the last 2 and a half days. Is there something which is missing in db_bench or something which I am missing?
+
+**[Siying Dong](siying.d@fb.com)**
+
+It's a printing error then. If you have `num=1000000000` and `t=32`, it will be 32 threads, and each reads 1 billion keys.
diff --git a/thirdparty/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown b/thirdparty/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown
new file mode 100644
index 0000000000..ba9ee627c9
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-01-29-compaction_pri.markdown
@@ -0,0 +1,51 @@
+---
+title: Option of Compaction Priority
+layout: post
+author: sdong
+category: blog
+redirect_from:
+  - /blog/2921/compaction_pri/
+---
+
+The most popular compaction style of RocksDB is level-based compaction, which is an improved version of LevelDB's compaction algorithm. Pages 9-16 of these [slides](https://github.com/facebook/rocksdb/blob/gh-pages/talks/2015-09-29-HPTS-Siying-RocksDB.pdf) give an illustrated introduction to this compaction style.
The basic idea is that data is organized into multiple levels with exponentially increasing target sizes. Except for a special level 0, every level is key-range partitioned into many files. When the size of a level exceeds its target size, we pick one or more of its files and merge them into the next level.
+
+
+
+Which file to pick to compact is an interesting question. LevelDB only uses one thread for compaction, and it always picks files in a round-robin manner. We implemented multi-threaded compaction in RocksDB by picking multiple files from the same level and compacting them in parallel, so we had to move away from LevelDB's file picking approach. Recently, we created an option [options.compaction_pri](https://github.com/facebook/rocksdb/blob/d6c838f1e130d8860407bc771fa6d4ac238859ba/include/rocksdb/options.h#L83-L93), which indicates three different algorithms to pick files to compact.
+
+Why do we need multiple algorithms to choose from? Because there are different factors to consider when picking the files, and we don't yet know how to balance them automatically, so we expose the choice to users. Here are the factors to consider:
+
+**Write amplification**
+
+When we estimate write amplification, we usually simplify the problem by assuming keys are uniformly distributed inside each level. In reality, that is not the case, even if user updates are uniformly distributed across the whole key range. For instance, when we compact one file of a level to the next level, it creates a hole. Over time, incoming compactions will fill data into the hole, but the density will still be lower for a while. Picking a file whose keys are least densely populated makes it more expensive to push the file to the next level, because there will be more overlapping files in the next level, so we need to rewrite more data. For example, assume a file is 100MB: if an L2 file overlaps with 8 L3 files, we need to rewrite about 800MB of data to get the file to L3. If the file overlaps with 12 L3 files, we'll need to rewrite about 1200MB to get a file of the same size out of L2. That uses 50% more writes. (This analysis ignores the key density of the next level, because the range covers N times as many files in that level, so one hole only impacts write amplification by 1/N.)
+
+If all the updates are uniformly distributed, LevelDB's approach optimizes write amplification, because the file being picked covers a range whose last compaction to the next level is the oldest, so the range will have accumulated keys from incoming compactions for the longest time and its density is the highest.
+
+We created a compaction priority **kOldestSmallestSeqFirst** for the same effect. With this mode, we always pick the file that covers the oldest updates in the level, which usually contains the densest key range. If you have a use case where writes are uniformly distributed across the key space and you want to reduce write amplification, you should set options.compaction_pri=kOldestSmallestSeqFirst.
+
+**Optimize for small working set**
+
+The previous analysis assumed updates are uniformly distributed across the whole key space. However, in many use cases there is a subset of keys that are frequently updated while other key ranges are very cold. In this case, keeping hot key ranges from compacting to deeper levels benefits write amplification as well as space amplification. For example, suppose in a DB only keys 150-160 are updated and other keys are seldom updated. If level 1 contains 20 keys, we want keys 150-160 to all stay in level 1.
This is because when the next level 0 -> 1 compaction comes, it will simply overwrite the existing keys, so the size of level 1 doesn't increase and there is no need to schedule a further level 1 -> 2 compaction. On the other hand, if we compact keys 150-155 to level 2, the next incoming compaction increases the size of level 1 beyond its target size, so more compactions will be needed, which generates more writes.
+
+The compaction priority **kOldestLargestSeqFirst** optimizes this use case. In this mode, we pick the file whose latest update is the oldest, which means there has been no incoming data for that range for the longest time. Usually it is the coldest range. By compacting the coldest range first, we leave the hot ranges in the level. If your use case is to overwrite existing keys in a small range, try options.compaction_pri=kOldestLargestSeqFirst.
+
+**Drop delete markers sooner**
+
+If one file contains a lot of delete markers, it may slow down iterating over this area, because we still need to iterate over those deleted keys just to ignore them. Furthermore, the sooner we compact deleted keys into the last level, the sooner the disk space is reclaimed, so it is good for space efficiency.
+
+Our default compaction priority **kByCompensatedSize** considers this case. If the number of deletes in a file exceeds the number of inserts, the file is more likely to be picked for compaction. The more the deletes exceed the inserts, the more likely the file is to be compacted. This optimization is added to avoid the worst space efficiency and query performance when a large percentage of the DB is deleted.
+
+**Efficiency of compaction filter**
+
+Usually people use [compaction filters](https://github.com/facebook/rocksdb/blob/v4.1/include/rocksdb/options.h#L201-L226) to clean up old data to free up space. Picking files to compact may impact space efficiency. We don't yet have a compaction priority to optimize this case. In some of our use cases, we solved the problem in a different way: we have an external service checking the modification time of all SST files. If any of the files is too old, we force a compaction of that single file by calling DB::CompactFiles() on it. In this way, we can provide a time bound for data passing through compaction filters.
+
+
+In all, there are three compaction priority modes optimizing different scenarios. If you have a new use case, we suggest you start with `options.compaction_pri=kOldestSmallestSeqFirst` (note it is not the default, for backward compatibility reasons). If you want to optimize further, you can try the other two modes if they apply to your use case, as in the sketch below.
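+
+A minimal sketch of setting the mode (the enum values are the ones listed in options.h):
+
+```c++
+#include "rocksdb/options.h"
+
+rocksdb::Options MakeOptionsForUniformWrites() {
+  rocksdb::Options options;
+  // Suggested starting point for uniformly distributed writes:
+  options.compaction_pri = rocksdb::kOldestSmallestSeqFirst;
+  // Alternatives: rocksdb::kOldestLargestSeqFirst for small hot write
+  // ranges, or rocksdb::kByCompensatedSize (the default) for
+  // delete-heavy workloads.
+  return options;
+}
+```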
In all, there are three compaction priority modes optimizing different scenarios. If you have a new use case, we suggest you start with `options.compaction_pri=kOldestSmallestSeqFirst` (note it is not the default, for backward compatibility reasons). If you want to optimize further, you can try the other two modes if they apply to your use case.

If you have a good idea for a better compaction-picking approach, you are welcome to implement and benchmark it. We'll be glad to review and merge your pull requests.

### Comments

**[Mark Callaghan](mdcallag@gmail.com)**

Performance results for compaction_pri values and linkbench are explained at [http://smalldatum.blogspot.com/2016/02/compaction-priority-in-rocksdb.html](http://smalldatum.blogspot.com/2016/02/compaction-priority-in-rocksdb.html)

diff --git a/thirdparty/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown b/thirdparty/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
new file mode 100644
index 0000000000..409015cc8c
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-02-24-rocksdb-4-2-release.markdown
@@ -0,0 +1,41 @@
---
title: RocksDB 4.2 Release!
layout: post
author: sdong
category: blog
redirect_from:
  - /blog/3017/rocksdb-4-2-release/
---

New RocksDB release - 4.2!

**New Features**

  1. Introduce CreateLoggerFromOptions(); this function creates a Logger from the provided DBOptions.

  2. Add GetAggregatedIntProperty(), which returns the sum of GetIntProperty over all the column families.

  3. Add MemoryUtil in rocksdb/utilities/memory.h. It currently offers a way to get the memory usage by type from a list of RocksDB instances.

**Public API changes**

  1. CompactionFilter::Context now includes the column family ID.

  2. The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower.

  3. TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes a Context option containing the column family ID for the file being written.

  4. Remove DefaultCompactionFilterFactory.

[https://github.com/facebook/rocksdb/releases/tag/v4.2](https://github.com/facebook/rocksdb/releases/tag/v4.2)

diff --git a/thirdparty/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown b/thirdparty/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown
new file mode 100644
index 0000000000..2ba04f39a1
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-02-25-rocksdb-ama.markdown
@@ -0,0 +1,20 @@
---
title: RocksDB AMA
layout: post
author: yhchiang
category: blog
redirect_from:
  - /blog/3065/rocksdb-ama/
---

RocksDB developers are doing a Reddit Ask-Me-Anything now at 10AM – 11AM PDT! We welcome you to stop by and ask any RocksDB-related questions, including existing / upcoming features, tuning tips, or database design.

Here are some enhancements that we'd like to focus on over the next six months:

* 2-Phase Commit
* Lua support in some custom functions
* Backup and repair tools
* Direct I/O to bypass OS cache
* RocksDB Java API

[https://www.reddit.com/r/IAmA/comments/47k1si/we_are_rocksdb_developers_ask_us_anything/](https://www.reddit.com/r/IAmA/comments/47k1si/we_are_rocksdb_developers_ask_us_anything/)

diff --git a/thirdparty/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown b/thirdparty/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown
new file mode 100644
index 0000000000..703449b01a
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-03-07-rocksdb-options-file.markdown
@@ -0,0 +1,24 @@
---
title: RocksDB Options File
layout: post
author: yhchiang
category: blog
redirect_from:
  - /blog/3089/rocksdb-options-file/
---

In RocksDB 4.3, we added a new set of features that makes managing RocksDB options easier. Specifically:

  * **Persisting Options Automatically**: Each RocksDB database will now automatically persist its current set of options into an INI file on every successful call of DB::Open(), SetOptions(), and CreateColumnFamily() / DropColumnFamily().

  * **Load Options from File**: We added [LoadLatestOptions() / LoadOptionsFromFile()](https://github.com/facebook/rocksdb/blob/4.3.fb/include/rocksdb/utilities/options_util.h#L48-L58), which enable developers to construct a RocksDB options object from an options file.
  * **Sanity Check Options**: We added [CheckOptionsCompatibility](https://github.com/facebook/rocksdb/blob/4.3.fb/include/rocksdb/utilities/options_util.h#L64-L77), which performs a compatibility check on two sets of RocksDB options.

Want to know more about how to use these new features? Check out the [RocksDB Options File wiki page](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) and start using them today!
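A minimal sketch of the load-then-check workflow (the DB path is illustrative; the function signatures are those in rocksdb/utilities/options_util.h linked above):

```cpp
#include <cassert>
#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/options_util.h"

int main() {
  const std::string kDBPath = "/tmp/options_file_db";  // illustrative path

  // Reconstruct the options the DB was last successfully opened with.
  rocksdb::DBOptions db_opts;
  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
  rocksdb::Status s = rocksdb::LoadLatestOptions(
      kDBPath, rocksdb::Env::Default(), &db_opts, &cf_descs);
  assert(s.ok());

  // Tweak something, then verify the DB can still be opened with the result.
  db_opts.max_open_files = 512;
  s = rocksdb::CheckOptionsCompatibility(kDBPath, rocksdb::Env::Default(),
                                         db_opts, cf_descs);
  assert(s.ok());
  return 0;
}
```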
diff --git a/thirdparty/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown b/thirdparty/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
new file mode 100644
index 0000000000..247768d307
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-04-26-rocksdb-4-5-1-released.markdown
@@ -0,0 +1,60 @@
---
title: RocksDB 4.5.1 Released!
layout: post
author: sdong
category: blog
redirect_from:
  - /blog/3179/rocksdb-4-5-1-released/
---

## 4.5.1 (3/25/2016)

### Bug Fixes

  * Fix failures caused by the destroying order of singleton objects.

## 4.5.0 (2/5/2016)

### Public API Changes

  * Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes.
  * Statistics of mutex operation durations will not be measured by default. If you want them enabled, you need to set Statistics::stats_level_ to kAll.
  * DBOptions::delete_scheduler and NewDeleteScheduler() are removed; please use DBOptions::sst_file_manager and NewSstFileManager() instead.

### New Features

  * The ldb tool now supports operations on non-default column families.
  * Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persisted data and skip mem-tables if writes were done with disableWAL = true.
  * Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create an SstFileManager that can be used to track the total size of SST files and control the SST file deletion rate.
## 4.4.0 (1/14/2016)

### Public API Changes

  * Change names in CompactionPri and add a new one.
  * Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit.
  * If options.max_write_buffer_number > 3, writes will be slowed down when writing to the last write buffer, to delay a full stop.
  * Introduce CompactionJobInfo::compaction_reason; this field includes the reason the compaction was triggered.
  * After a slowdown is triggered, if the estimated pending compaction bytes keep increasing, slow down more.
  * Increase default options.delayed_write_rate to 2MB/s.
  * Added a new parameter --path to the ldb tool. --path accepts the name of either a MANIFEST, SST, or WAL file. Either --db or --path can be used when calling ldb.
## 4.3.0 (12/8/2015)

### New Features

  * CompactionFilter has a new member function called IgnoreSnapshots, which allows the CompactionFilter to be called even if there are snapshots later than the key.
  * RocksDB will now persist options under the same directory as the RocksDB database on successful DB::Open, CreateColumnFamily, DropColumnFamily, and SetOptions.
  * Introduce LoadLatestOptions() in rocksdb/utilities/options_util.h. This function can construct the latest DBOptions / ColumnFamilyOptions used by the specified RocksDB instance.
  * Introduce CheckOptionsCompatibility() in rocksdb/utilities/options_util.h. This function checks whether the input set of options is able to open the specified DB successfully.

### Public API Changes

  * When options.db_write_buffer_size triggers, only the column family with the largest size will be flushed, not all the column families.

diff --git a/thirdparty/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown b/thirdparty/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown
new file mode 100644
index 0000000000..b42a66e301
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-07-26-rocksdb-4-8-released.markdown
@@ -0,0 +1,48 @@
---
title: RocksDB 4.8 Released!
layout: post
author: yiwu
category: blog
redirect_from:
  - /blog/3239/rocksdb-4-8-released/
---

## 4.8.0 (5/2/2016)

### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-1)Public API Change

  * Allow a preset compression dictionary for improved compression of block-based tables. This is supported for zlib, zstd, and lz4. The compression dictionary's size is configurable via CompressionOptions::max_dict_bytes.
  * Delete the deprecated classes for creating backups (BackupableDB) and restoring from backups (RestoreBackupableDB). Now BackupEngine should be used for creating backups, and BackupEngineReadOnly should be used for restorations. For more details, see [https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F](https://github.com/facebook/rocksdb/wiki/How-to-backup-RocksDB%3F)
  * Expose an estimate of the per-level compression ratio via the DB property "rocksdb.compression-ratio-at-levelN".
  * Added EventListener::OnTableFileCreationStarted. EventListener::OnTableFileCreated will also be called in failure cases; users can check the creation status via TableFileCreationInfo::status.

### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-2)New Features

  * Add ReadOptions::readahead_size. If non-zero, NewIterator will create a new table reader which performs reads of the given size.
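For example, a short sketch of a scan using this option (assumes an already-open `rocksdb::DB* db`; the 2MB figure is an arbitrary choice):

```cpp
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void ScanWithReadahead(rocksdb::DB* db) {
  rocksdb::ReadOptions read_opts;
  // Non-zero readahead_size makes NewIterator use a readahead table reader.
  read_opts.readahead_size = 2 * 1024 * 1024;  // 2MB
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_opts));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // process it->key() / it->value()
  }
}
```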
## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#470-482016)4.7.0 (4/8/2016)

### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-change-2)Public API Change

  * Rename the option compaction_measure_io_stats to report_bg_io_stats, and include flushes as well.
  * Change some default options. The default options now optimize for server workloads, and the slowdown and full-stop triggers for pending compaction bytes are enabled. These changes may cause sub-optimal performance or a significant increase in resource usage. To avoid these risks, users can open an existing RocksDB with options extracted from a RocksDB options file. See [https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File](https://github.com/facebook/rocksdb/wiki/RocksDB-Options-File) for how to use RocksDB options files. Or you can call Options.OldDefaults() to recover the old defaults. DEFAULT_OPTIONS_HISTORY.md will track the change history of default options.
## [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#460-3102016)4.6.0 (3/10/2016)

### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#public-api-changes-1)Public API Changes

  * Change the default of BlockBasedTableOptions.format_version to 2. This means a DB created with the defaults by 4.6 or later cannot be opened by RocksDB version 3.9 or earlier.
  * Added the strict_capacity_limit option to NewLRUCache. If the flag is set to true, inserts into the cache will fail if not enough capacity can be freed. The signature of Cache::Insert() is updated accordingly.
  * Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately; they are updated when the Iterator is deleted.
  * Add a monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree.

### [](https://github.com/facebook/rocksdb/blob/master/HISTORY.md#new-features-3)New Features

  * Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification.
  * Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned".

diff --git a/thirdparty/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown b/thirdparty/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
new file mode 100644
index 0000000000..87c20eb47d
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2016-09-28-rocksdb-4-11-2-released.markdown
@@ -0,0 +1,49 @@
---
title: RocksDB 4.11.2 Released!
layout: post
author: sdong
category: blog
---
We abandoned the 4.10.x release candidates and went directly from 4.9 to 4.11.2, to make sure the latest release is stable. In 4.11.2, we fixed several data-corruption-related bugs introduced in 4.9.0.

## 4.11.2 (9/15/2016)

### Bug fixes

  * Fix a segfault when failing to open an SST file for read-ahead iterators.
  * Fix WAL without data for all CFs not being deleted after recovery.

## 4.11.1 (8/30/2016)

### Bug Fixes

  * Mitigate the regression deadlock condition during recovery when options.max_successive_merges is hit.
  * Fix a data race condition related to the hash index in block-based tables when putting indexes in the block cache.

## 4.11.0 (8/1/2016)

### Public API Change

  * options.memtable_prefix_bloom_huge_page_tlb_size => memtable_huge_page_size. When it is set, RocksDB will try to allocate memory from huge pages for the memtable too, rather than just the memtable bloom filter.

### New Features

  * A tool to migrate a DB after an options change. See include/rocksdb/utilities/option_change_migration.h.
  * Add ReadOptions.background_purge_on_iterator_cleanup. If true, we avoid file deletion when destroying iterators.

## 4.10.0 (7/5/2016)

### Public API Change

  * options.memtable_prefix_bloom_bits changes to options.memtable_prefix_bloom_bits_ratio, and options.memtable_prefix_bloom_probes is deprecated.
  * The enum types CompressionType and PerfLevel change from char to unsigned char. The value of every PerfLevel shifts by one.
  * Deprecate options.filter_deletes.

### New Features

  * Add the avoid_flush_during_recovery option.
  * Add a read option background_purge_on_iterator_cleanup to avoid deleting files in the foreground when destroying iterators. Instead, a job is scheduled in the high-priority queue and executed in a separate background thread.
  * RepairDB support for column families. RepairDB now associates data with non-default column families using information embedded in the SST/WAL files (4.7 or later). For data written by 4.6 or earlier, RepairDB associates it with the default column family.
  * Add options.write_buffer_manager, which allows users to control total memtable sizes across multiple DB instances.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown
new file mode 100644
index 0000000000..fb0413055d
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-01-06-rocksdb-5-0-1-released.markdown
@@ -0,0 +1,26 @@
---
title: RocksDB 5.0.1 Released!
layout: post
author: yiwu
category: blog
---

### Public API Change

  * Options::max_bytes_for_level_multiplier is now a double, along with all getters and setters.
  * Support dynamically changing the `delayed_write_rate` and `max_total_wal_size` options via SetDBOptions().
  * Introduce DB::DeleteRange for optimized deletion of large ranges of contiguous keys.
  * Options::allow_concurrent_memtable_write and Options::enable_write_thread_adaptive_yield are now true by default.
  * Remove Tickers::SEQUENCE_NUMBER to avoid confusion if the statistics object is shared among RocksDB instances. Alternatively, DB::GetLatestSequenceNumber() can be used to get the same value.
  * Options.level0_stop_writes_trigger default value changes from 24 to 32.
  * New compaction filter API: CompactionFilter::FilterV2(). Allows dropping ranges of keys.
  * Removed flashcache support.
  * DB::AddFile() is deprecated and replaced with DB::IngestExternalFile(), which removes all the restrictions that existed for DB::AddFile.

### New Features

  * Add the avoid_flush_during_shutdown option, which speeds up DB shutdown by not flushing unpersisted data (i.e. data written with disableWAL = true). Unpersisted data will be lost. The option is dynamically changeable via SetDBOptions().
  * Add the memtable_insert_with_hint_prefix_extractor option. The option is meant to reduce CPU usage for inserting keys into the memtable when keys can be grouped by prefix and the inserts for each prefix are sequential or almost sequential. See include/rocksdb/options.h for more details.
  * Add LuaCompactionFilter in utilities. This allows developers to write compaction filters in Lua. To use this feature, LUA_PATH needs to be set to the root directory of Lua.
  * No longer populate the "LATEST_BACKUP" file in the backup directory, which formerly contained the number of the latest backup. The latest backup can be determined by finding the highest numbered file in the "meta/" subdirectory.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown
new file mode 100644
index 0000000000..35bafb219c
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-02-07-rocksdb-5-1-2-released.markdown
@@ -0,0 +1,15 @@
---
title: RocksDB 5.1.2 Released!
layout: post
author: maysamyabandeh
category: blog
---

### Public API Change
* Support dynamically changing the `delete_obsolete_files_period_micros` option via SetDBOptions().
* Added EventListener::OnExternalFileIngested, which will be called when IngestExternalFile() adds a file successfully.
* BackupEngine::Open and BackupEngineReadOnly::Open now always return error statuses matching those of the backup Env.
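Several of the options mentioned in these notes are dynamically changeable; a minimal sketch of adjusting them at runtime (option values are arbitrary; assumes an already-open `rocksdb::DB* db`):

```cpp
#include <cassert>

#include "rocksdb/db.h"

void TuneAtRuntime(rocksdb::DB* db) {
  // SetDBOptions() takes string option names and values.
  rocksdb::Status s = db->SetDBOptions({
      {"delayed_write_rate", "4194304"},                       // 4MB/s
      {"delete_obsolete_files_period_micros", "21600000000"},  // 6 hours
  });
  assert(s.ok());
}
```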
### Bug Fixes
* Fix a bug where, if 2PC is enabled, checkpoints may lose some recent transactions.
* When file copying is needed while creating checkpoints or bulk loading files, fsync the file after it is copied.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown b/thirdparty/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown
new file mode 100644
index 0000000000..9a43a846a4
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-02-17-bulkoad-ingest-sst-file.markdown
@@ -0,0 +1,50 @@
---
title: Bulkloading by ingesting external SST files
layout: post
author: IslamAbdelRahman
category: blog
---

## Introduction

One of the basic operations of RocksDB is writing to it. Writes happen when the user calls DB::Put, DB::Write, DB::Delete, etc., but what actually happens when you write to RocksDB? Here is a brief description:
- The user inserts a new key/value by calling DB::Put() (or DB::Write()).
- We create a new entry for the new key/value in our in-memory structure (memtable / SkipList by default) and assign it a new sequence number.
- When the memtable exceeds a specific size (64 MB for example), we convert it to an SST file and put that file in level 0 of our LSM tree.
- Later, compaction kicks in and moves data from level 0 to level 1, then from level 1 to level 2, and so on.

But what if we could skip these steps and add data to the lowest possible level directly? This is what bulk-loading does:

- Write all of our keys and values into an SST file outside of the DB
- Add the SST file into the LSM directly

This is bulk-loading, and in specific use cases it allows users to achieve faster data loading and better write amplification.

Doing it is as simple as:
```cpp
Options options;
SstFileWriter sst_file_writer(EnvOptions(), options, options.comparator);
Status s = sst_file_writer.Open(file_path);
assert(s.ok());

// Insert rows into the SST file, note that inserted keys must be
// strictly increasing (based on options.comparator)
for (...) {
  s = sst_file_writer.Add(key, value);
  assert(s.ok());
}

// Finalize and close the SST file before ingesting it
s = sst_file_writer.Finish();
assert(s.ok());

// Ingest the external SST file into the DB
s = db_->IngestExternalFile({"/home/usr/file1.sst"}, IngestExternalFileOptions());
assert(s.ok());
```

You can find more details about generating SST files and ingesting them into RocksDB in this [wiki page](https://github.com/facebook/rocksdb/wiki/Creating-and-Ingesting-SST-files).

## Use cases
There are multiple use cases where bulkloading can be useful, for example:
- Generating SST files in offline jobs in Hadoop, then downloading and ingesting the SST files into RocksDB
- Migrating shards between machines by dumping a key-range into an SST file and loading the file on a different machine
- Migrating from different storage (InnoDB to RocksDB migration in MyRocks)

diff --git a/thirdparty/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
new file mode 100644
index 0000000000..c6ce27d64d
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-03-02-rocksdb-5-2-1-released.markdown
@@ -0,0 +1,22 @@
---
title: RocksDB 5.2.1 Released!
layout: post
author: sdong
category: blog
---

### Public API Change
* NewLRUCache() will determine the number of shard bits automatically based on capacity if the user doesn't pass one. This also impacts the default block cache when the user doesn't explicitly provide one.
* Change the default delayed-write slowdown value to 16MB/s and further increase the L0 stop condition to 36 files.

### New Features
* Added a new overloaded function GetApproximateSizes that allows specifying whether only memtable stats should be computed, without computing SST files' stats approximations.
* Added a new function GetApproximateMemTableStats that approximates both the number of records and the size of memtables.
* (Experimental) Two-level indexing that partitions the index and creates a 2nd-level index on the partitions. The feature can be enabled by setting kTwoLevelIndexSearch as the IndexType and configuring index_per_partition.

### Bug Fixes
* RangeSync() should work if ROCKSDB_FALLOCATE_PRESENT is not set.
* Fix wrong results in a data-race case in Get().
* Some fixes related to 2PC.
* Fix several bugs in Direct I/O support.
* Fix a regression bug which can cause Seek() to miss some keys if the returned key has been updated many times after the snapshot which is used by the iterator.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown b/thirdparty/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown
new file mode 100644
index 0000000000..a537feb0c7
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-05-12-partitioned-index-filter.markdown
@@ -0,0 +1,34 @@
---
title: Partitioned Index/Filters
layout: post
author: maysamyabandeh
category: blog
---

As the DB/memory ratio gets larger, the memory footprint of filter/index blocks becomes non-trivial. Although `cache_index_and_filter_blocks` allows storing only a subset of them in the block cache, their relatively large size negatively affects performance by i) occupying block cache space that could otherwise be used for caching data, and ii) increasing the load on disk storage by loading them into the cache after a miss. Here we illustrate these problems in more detail and explain how partitioning index/filters alleviates the overhead.

### How large are the index/filter blocks?

RocksDB has by default one index/filter block per SST file. The size of the index/filter varies based on the configuration, but for an SST of size 256MB an index/filter block of size 0.5/5MB is typical, which is much larger than the typical data block size of 4-32KB. That is fine when all index/filters fit perfectly into memory and hence are read once per SST lifetime, but not so much when they compete with data blocks for block cache space and are also likely to be re-read many times from disk.

### What is the big deal with large index/filter blocks?

When index/filter blocks are stored in the block cache, they are effectively competing with data blocks (as well as with each other) for this scarce resource. A filter of size 5MB occupies space that could otherwise be used to cache thousands of data blocks (of size 4KB). This results in more cache misses for data blocks. The large index/filters also kick each other out of the block cache more often and exacerbate their own cache miss rate too. This is while only a small part of the index/filter block might have actually been used during its lifetime in the cache.

After a cache miss on an index/filter, it has to be reloaded from disk, and its large size does not help in reducing the IO cost. While a simple point lookup might need at most a couple of data block reads (of size 4KB), one from each layer of the LSM, it might end up also loading multiple megabytes of index/filter blocks.
If that happens often, then the disk spends more time serving index/filters than the actual data blocks.

### What are partitioned index/filters?

With partitioning, the index/filter of an SST file is partitioned into smaller blocks with an additional top-level index on them. When reading an index/filter, only the top-level index is loaded into memory. The partitioned index/filter then uses the top-level index to load into the block cache, on demand, only the partitions that are required to perform the index/filter query. The top-level index, which has a much smaller memory footprint, can be stored in the heap or block cache depending on the `cache_index_and_filter_blocks` setting.
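A minimal sketch of enabling partitioned index/filters (option names from this era of RocksDB; the 4KB partition size and bloom-filter bits are arbitrary choices):

```cpp
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options PartitionedIndexFilterOptions() {
  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.index_type =
      rocksdb::BlockBasedTableOptions::kTwoLevelIndexSearch;  // partitioned index
  table_opts.partition_filters = true;                        // partitioned filters
  table_opts.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10, false));  // full (not block-based) filter
  table_opts.metadata_block_size = 4096;  // target size of each partition
  table_opts.cache_index_and_filter_blocks = true;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_opts));
  return options;
}
```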
### Success stories

#### HDD, 100TB DB

In this example we have a DB of size 86G on HDD, and we emulate the small memory available to a node with 100TB of data by using direct IO (skipping the OS file cache) and a very small block cache of size 60MB. Partitioning improves throughput by 11x, from 5 op/s to 55 op/s.

#### SSD, Linkbench

In this example we have a DB of size 300G on SSD, and we emulate the small memory that would be available in the presence of other DBs on the same node by using direct IO (skipping the OS file cache) and block caches of size 6G and 2G. Without partitioning, the linkbench throughput drops from 38k tps to 23k when reducing the block cache size from 6G to 2G. With partitioning, the throughput drops from 38k to only 30k.

Learn more [here](https://github.com/facebook/rocksdb/wiki/Partitioned-Index-Filters).

diff --git a/thirdparty/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown b/thirdparty/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown
new file mode 100644
index 0000000000..a806541fca
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-05-14-core-local-stats.markdown
@@ -0,0 +1,106 @@
---
title: Core-local Statistics
layout: post
author: ajkr
category: blog
---

## Origins: Global Atomics

Until RocksDB 4.12, ticker/histogram statistics were implemented with std::atomic values shared across the entire program. A ticker consists of a single atomic, while a histogram consists of several atomics to represent things like min/max/per-bucket counters. These statistics could be updated by all user/background threads.

For concurrent/high-throughput workloads, cache line bouncing of atomics caused high CPU utilization. For example, we have tickers that count block cache hits and misses. Almost every user read increments these tickers a few times. Many concurrent user reads would cause the cache lines containing these atomics to bounce between cores.

### Performance

Here are perf results for 32 reader threads where most reads (99%+) are served by the uncompressed block cache. Such a scenario stresses the statistics code heavily.

Benchmark command: `TEST_TMPDIR=/dev/shm/ perf record -g ./db_bench -statistics -use_existing_db=true -benchmarks=readrandom -threads=32 -cache_size=1048576000 -num=1000000 -reads=1000000 && perf report -g --children`

Perf snippet for "cycles" event:

```
  Children      Self  Command   Shared Object  Symbol
+   30.33%    30.17%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+    3.65%     0.98%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```

Perf snippet for "cache-misses" event:

```
  Children      Self  Command   Shared Object  Symbol
+   19.54%    19.50%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+    3.44%     0.57%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```

The high CPU overhead for updating tickers and histograms corresponds well to the high cache misses.

## Thread-locals: Faster Updates

Since RocksDB 4.12, ticker/histogram statistics use thread-local storage. Each thread has a local set of atomic values that no other thread can update. This prevents the cache line bouncing problem described above. Even though updates to a given value are always made by the same thread, atomics are still useful to synchronize with aggregations for querying statistics.

Implementing this approach involved a couple of challenges. First, each query for a statistic's global value must aggregate all threads' local values. This adds some overhead, which may pass unnoticed if statistics are queried infrequently. Second, exited threads' local values are still needed to provide accurate statistics. We handle this by merging a thread's local values into process-wide variables upon thread exit.

### Performance

The update benchmark setup is the same as before. CPU overhead improved 7.8x compared to global atomics, corresponding to a 17.8x reduction in cache-misses overhead.

Perf snippet for "cycles" event:

```
  Children      Self  Command   Shared Object  Symbol
+    2.96%     0.87%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+    1.37%     0.10%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```

Perf snippet for "cache-misses" event:

```
  Children      Self  Command   Shared Object  Symbol
+    1.21%     0.65%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
     0.08%     0.00%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```

To measure statistics query latency, we ran sysbench with 4K OLTP clients concurrently with one client that queries statistics repeatedly. Times shown are in milliseconds.

```
    min: 18.45
    avg: 27.91
    max: 231.65
    95th percentile: 55.82
```

## Core-locals: Faster Querying

The thread-local approach works well for applications calling RocksDB from only a few threads, or polling statistics infrequently. Eventually, though, we found use cases where those assumptions do not hold. For example, one application has per-connection threads and typically runs into performance issues when connection count grows very high. For debugging such issues, they want high-frequency statistics polling to correlate issues in their application with changes in RocksDB's state.

Once [PR #2258](https://github.com/facebook/rocksdb/pull/2258) lands, ticker/histogram statistics will be local to each CPU core. Similarly to thread-local, each core updates only its local values, thus avoiding cache line bouncing. Local values are still atomics to make aggregation possible. With this change, query work depends only on the number of cores, not the number of threads. So, applications with many more threads than cores can no longer impact statistics query latency.

### Performance

The update benchmark setup is the same as before. CPU overhead worsened ~23% compared to thread-local, while cache performance was unchanged.

Perf snippet for "cycles" event:

```
  Children      Self  Command   Shared Object  Symbol
+    2.96%     0.87%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
+    1.37%     0.10%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```

Perf snippet for "cache-misses" event:

```
  Children      Self  Command   Shared Object  Symbol
+    1.21%     0.65%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::recordTick
     0.08%     0.00%  db_bench  db_bench       [.] rocksdb::StatisticsImpl::measureTime
```
Query latency is measured the same way as before, with times in milliseconds. Average latency improved 6.3x compared to thread-local.

```
    min: 2.47
    avg: 4.45
    max: 91.13
    95th percentile: 7.56
```

diff --git a/thirdparty/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown
new file mode 100644
index 0000000000..561dab4c20
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-05-26-rocksdb-5-4-5-released.markdown
@@ -0,0 +1,39 @@
---
title: RocksDB 5.4.5 Released!
layout: post
author: sagar0
category: blog
---

### Public API Change
* Support dynamically changing the `stats_dump_period_sec` option via SetDBOptions().
* Added ReadOptions::max_skippable_internal_keys to set a threshold to fail a request as incomplete when too many keys are being skipped while using iterators.
* DB::Get accepts PinnableSlice in place of std::string, which avoids the extra memcpy of the value into a std::string in most cases.
  * PinnableSlice releases the pinned resources that contain the value when it is destructed or when ::Reset() is called on it.
  * The old API that accepts std::string, although discouraged, is still supported.
* Replace Options::use_direct_writes with Options::use_direct_io_for_flush_and_compaction. See the Direct IO wiki for details.

### New Features
* Memtable flush can be avoided during checkpoint creation if the total log file size is smaller than a threshold specified by the user.
* Introduce level-based L0->L0 compactions to reduce file count, so write delays are incurred less often.
* (Experimental) Partitioned filters, which create an index on the filter partitions. The feature can be enabled by setting partition_filters when using kFullFilter. Currently the feature also requires two-level indexing to be enabled. The number of partitions is the same as for indexes, which is controlled by metadata_block_size.
* DB::ResetStats() to reset internal stats.
* Added the CompactionEventListener and EventListener::OnFlushBegin interfaces.
* Added DB::CreateColumnFamilies() and DB::DropColumnFamilies() to bulk create/drop column families.
* Facility for cross-building RocksJava using Docker.

### Bug Fixes
* Fix a use-after-scope error in WriteBatchWithIndex.
* Fix the WritableFile buffer size in direct IO.
* Add prefetch to PosixRandomAccessFile in buffered IO.
* Fix PinnableSlice accessing an invalid address when the row cache is enabled.
* Fix huge fallocate calls failing and making XFS unhappy.
* Fix memory alignment with logical sector size.
* Fix alignment in ReadaheadRandomAccessFile.
* Fix bias in read amplification stats (READ_AMP_ESTIMATE_USEFUL_BYTES and READ_AMP_TOTAL_READ_BYTES).
* Fix a manual / auto compaction data race.
* Fix CentOS 5 cross-building of RocksJava.
* Build and link with ZStd when creating the static RocksJava build.
* Fix snprintf usage to be cross-platform.
* Fix build errors with blob DB.
* Fix readamp test type inconsistency.
diff --git a/thirdparty/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown b/thirdparty/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown
new file mode 100644
index 0000000000..9e838eb7f2
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-06-26-17-level-based-changes.markdown
@@ -0,0 +1,60 @@
---
title: Level-based Compaction Changes
layout: post
author: ajkr
category: blog
---

### Introduction

RocksDB provides an option to limit the number of L0 files, which bounds read-amplification. Since L0 files (unlike files at lower levels) can span the entire key-range, a key might be in any file, thus reads need to check them one-by-one. Users often wish to configure a low limit to improve their read latency.

However, the mechanism with which we enforce L0's file count limit may be unappealing. When the limit is reached, RocksDB intentionally delays user writes. This slows down the accumulation of files in L0 and frees up resources for compacting files down to lower levels. But adding delays significantly increases user-visible write latency jitter.

Also, because L0 files can span the entire key-range, compaction parallelization is limited. Files at L0 or L1 may be locked due to involvement in pending L0->L1 or L1->L2 compactions. We can only schedule a parallel L0->L1 compaction if it does not require any of the locked files, which is typically not the case.

To handle these constraints better, we added a new type of compaction, L0->L0. It quickly reduces the file count in L0 and can be scheduled even when L1 files are locked, unlike L0->L1. We also changed the L0->L1 picking algorithm to increase opportunities for parallelism.

### Old L0->L1 Picking Logic

Previously, our logic for picking which L0 file to compact was the same as for every other level: pick the largest file in the level. One special property of L0->L1 compaction is that files can overlap in the input level, so those overlapping files must be pulled in as well. For example, a compaction may look like this:

![full-range.png](/static/images/compaction/full-range.png)

This compaction pulls in every L0 and L1 file. This happens regardless of which L0 file is initially chosen, as each file overlaps with every other file.

Users may insert their data less uniformly in the key-range. For example, a database may look like this during L0->L1 compaction:

![part-range-old.png](/static/images/compaction/part-range-old.png)

Let's say the third file from the top is the largest, and the top two files were created after the compaction started. When the compaction is picked, the fourth L0 file and the six rightmost L1 files are pulled in due to overlap. Notice this leaves the database in a state where we might not be able to schedule parallel compactions. For example, if the sixth file from the top is the next largest, we can't compact it because it overlaps with the top two files, which overlap with the locked L0 files.

We can now see the high-level problems with this approach more clearly. First, locked files in L0 or L1 prevent us from parallelizing compactions. When locked files block L0->L1 compaction, there is nothing we can do to eliminate L0 files. Second, L0->L1 compactions are relatively slow. As we saw, when keys are uniformly distributed, L0->L1 compacts two entire levels. While this is happening, new files are being flushed to L0, advancing towards the file count limit.
### New L0->L0 Algorithm

We introduced compaction within L0 to improve both parallelization and the speed of reducing the L0 file count. An L0->L0 compaction may look like this:

![l1-l2-contend.png](/static/images/compaction/l1-l2-contend.png)

Say the L1->L2 compaction started first. Now L0->L1 is prevented by the locked L1 file. In this case, we compact files within L0. This allows us to start the work of eliminating L0 files earlier. It also lets us do less work, since we don't pull in any L1 files, whereas L0->L1 compaction would've pulled in all of them. This lets us quickly reduce the L0 file count to keep read-amp low while sustaining large bursts of writes (i.e., fast accumulation of L0 files).

The tradeoff is that this increases total compaction work, as we're now compacting files without contributing towards our eventual goal of moving them towards lower levels. Our benchmarks, though, consistently show fewer compaction stalls and improved write throughput. One justification is that L0 file data is highly likely to be in the page cache and/or block cache due to being recently written and frequently accessed. So, this type of compaction is relatively cheap compared to compactions at lower levels.

This feature is available since RocksDB 5.4.

### New L0->L1 Picking Logic

Recall how the old L0->L1 picking algorithm chose the largest L0 file for compaction. This didn't fit well with L0->L0 compaction, which operates on a span of files. That span begins at the newest L0 file and expands towards older files as long as they're not being compacted. Since the largest file may be anywhere, the old L0->L1 picking logic could arbitrarily prevent us from getting a long span of files. See the second illustration in this post for a scenario where this would happen.

So, we changed the L0->L1 picking algorithm to start from the oldest file and expand towards newer files as long as they're not being compacted. For example:

![l0-l1-contend.png](/static/images/compaction/l0-l1-contend.png)

Now, there can never be L0 files unreachable for L0->L0 due to L0->L1 selecting files in the middle. When longer spans of files are available for L0->L0, we perform less compaction work per deleted L0 file, thus improving efficiency.

This feature will be available in RocksDB 5.7.

### Performance Changes

Mark Callaghan did the most extensive benchmarking of this feature's impact on MyRocks. See his results [here](http://smalldatum.blogspot.com/2017/05/innodb-myrocks-and-tokudb-on-insert.html). Note the primary change between his March 17 and April 14 builds is that the latter performs L0->L0 compaction.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown
new file mode 100644
index 0000000000..d7856088bf
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-06-29-rocksdb-5-5-1-released.markdown
@@ -0,0 +1,22 @@
---
title: RocksDB 5.5.1 Released!
layout: post
author: lightmark
category: blog
---

### New Features
* FIFO compaction now supports intra-L0 compaction as well, with CompactionOptionsFIFO.allow_compaction=true.
* Statistics::Reset() to reset user stats.
* ldb adds the option --try_load_options, which opens the DB with its own options file.
* Introduce WriteBatch::PopSavePoint to pop the most recent save point explicitly.
* Support dynamically changing the `max_open_files` option via SetDBOptions().
* Added DB::CreateColumnFamilies() and DB::DropColumnFamilies() to bulk create/drop column families.
* Add the debugging function `GetAllKeyVersions` to see internal versions of a range of keys.
* Support file ingestion with universal compaction style.
* Support file ingestion behind the existing data with the option `allow_ingest_behind`.
* New option enable_pipelined_write, which may improve write throughput when writing from multiple threads with the WAL enabled.

### Bug Fixes
* Fix a bug where Direct I/O used direct reads for non-SST files.
* Fix a bug where flush didn't respond to the fsync result.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown
new file mode 100644
index 0000000000..3b54ffd5ad
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-07-25-rocksdb-5-6-1-released.markdown
@@ -0,0 +1,22 @@
---
title: RocksDB 5.6.1 Released!
layout: post
author: yiwu
category: blog
---

### Public API Change
* Scheduling flushes and compactions in the same thread pool is no longer supported by setting `max_background_flushes=0`. Instead, users can achieve this by configuring their high-pri thread pool to have zero threads. See https://github.com/facebook/rocksdb/wiki/Thread-Pool for more details.
* Replace `Options::max_background_flushes`, `Options::max_background_compactions`, and `Options::base_background_compactions` with `Options::max_background_jobs`, which automatically decides how many threads to allocate towards flush/compaction.
* options.delayed_write_rate by default takes the value of the options.rate_limiter rate.
* Replace the global variable `IOStatsContext iostats_context` with `IOStatsContext* get_iostats_context()`; replace the global variable `PerfContext perf_context` with `PerfContext* get_perf_context()`.

### New Features
* Change ticker/histogram statistics implementations to use core-local storage. This improves aggregation speed compared to our previous thread-local approach, particularly for applications with many threads. See http://rocksdb.org/blog/2017/05/14/core-local-stats.html for more details.
* Users can pass a cache object to the write buffer manager, so that they can cap memory usage for memtables and the block cache using one single limit.
* Flush will be triggered when 7/8 of the limit introduced by write_buffer_manager or db_write_buffer_size is reached, so that the hard threshold is rarely hit. See https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager for more details.
* Introduce WriteOptions.low_pri. If it is true, low-priority writes will be throttled if compaction is behind (see the sketch after these notes). See https://github.com/facebook/rocksdb/wiki/Low-Priority-Write for more details.
* `DB::IngestExternalFile()` now supports ingesting files into a database containing range deletions.

### Bug Fixes
* Shouldn't ignore the return value of fsync() in flush.
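A minimal sketch of the low-priority write referenced above (assumes an already-open `rocksdb::DB* db`):

```cpp
#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Issue a best-effort background write that RocksDB may throttle
// when compaction falls behind.
rocksdb::Status LowPriPut(rocksdb::DB* db, const rocksdb::Slice& key,
                          const rocksdb::Slice& value) {
  rocksdb::WriteOptions write_opts;
  write_opts.low_pri = true;  // throttle this write if compaction is behind
  return db->Put(write_opts, key, value);
}
```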
diff --git a/thirdparty/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown b/thirdparty/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown
new file mode 100644
index 0000000000..7ac2fec34b
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-08-24-pinnableslice.markdown
@@ -0,0 +1,37 @@
---
title: PinnableSlice; less memcpy with point lookups
layout: post
author: maysamyabandeh
category: blog
---

The classic API for [DB::Get](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L310) receives a std::string as argument, to which it will copy the value. The memcpy overhead can be non-trivial when the value is large. The [new API](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L322) receives a PinnableSlice instead, which avoids memcpy in most cases.

### What is PinnableSlice?

Similarly to Slice, PinnableSlice refers to some in-memory data, so it does not incur the memcpy cost. To ensure that the data will not be erased while it is being processed by the user, PinnableSlice, as its name suggests, has the data pinned in memory. The pinned data are released when the PinnableSlice object is destructed or when ::Reset is invoked explicitly on it.

### How good is it?

Here are the improvements in throughput for an [in-memory benchmark](https://github.com/facebook/rocksdb/pull/1756#issuecomment-286201693):
* value 1k byte: 14%
* value 10k byte: 34%

### Any limitations?

PinnableSlice tries to avoid memcpy as much as possible. The primary gain is when reading large values from the block cache. There are, however, cases where it would still have to copy the data into its internal buffer. The reason is mainly the complexity of implementation; if there is enough motivation on the application side, the scope of PinnableSlice could be extended to such cases too. These include:
* Merged values
* Reads from memtables

### How to use it?

```cpp
PinnableSlice pinnable_val;
while (!stopped) {
  auto s = db->Get(opt, cf, key, &pinnable_val);
  // ... use it
  pinnable_val.Reset(); // then release it immediately
}
```

You can also [initialize the internal buffer](https://github.com/facebook/rocksdb/blob/9e583711144f580390ce21a49a8ceacca338fcd5/include/rocksdb/db.h#L314) of PinnableSlice by passing your own string to the constructor. [simple_example.cc](https://github.com/facebook/rocksdb/blob/master/examples/simple_example.cc) demonstrates that with more examples.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-08-25-flushwal.markdown b/thirdparty/rocksdb/docs/_posts/2017-08-25-flushwal.markdown
new file mode 100644
index 0000000000..2dc5626ad4
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-08-25-flushwal.markdown
@@ -0,0 +1,26 @@
---
title: FlushWAL; less fwrite, faster writes
layout: post
author: maysamyabandeh
category: blog
---

When `DB::Put` is called, the data is written to both the memtable (to be flushed to SST files later) and the WAL (write-ahead log) if it is enabled. In the case of a crash, RocksDB can recover as much of the memtable state as is reflected in the WAL. By default, RocksDB automatically flushes the WAL from the application memory to the OS buffer after each `::Put`. It can, however, be configured to perform the flush manually after an explicit call to `::FlushWAL`. Not performing an fwrite call after each `::Put` offers a tradeoff between reliability and write latency for the general case.
As we explain below, some applications such as MyRocks benefit from this API to gain higher write throughput with no compromise in reliability.

### How much is the gain?

Using the `::FlushWAL` API along with setting `DBOptions.concurrent_prepare`, MyRocks achieves 40% higher throughput in Sysbench's [update-nonindex](https://github.com/akopytov/sysbench/blob/master/src/lua/oltp_update_non_index.lua) benchmark.

### Write, Flush, and Sync

The write to the WAL is first written to the application memory buffer. In the next step, the buffer is "flushed" to the OS buffer by calling fwrite. The OS buffer is later "synced" to persistent storage. The data in the OS buffer, although not persisted yet, will survive an application crash. By default, the flush occurs automatically upon each call to `DB::Put` or `DB::Write`. The user can additionally request a sync after each write by setting `WriteOptions::sync`.

### FlushWAL API

The user can turn off the automatic flush of the WAL by setting `DBOptions::manual_wal_flush`. In that case, the WAL buffer is flushed when it is either full or `DB::FlushWAL` is called by the user. The API also accepts a boolean argument if we want to sync right after the flush: `::FlushWAL(true)`.

### Success story: MyRocks

Some applications that use RocksDB already have other mechanisms in place to provide reliability. MySQL, for example, uses 2PC (two-phase commit) to write to both the binlog and the storage engine, such as InnoDB or MyRocks. The group commit logic in MySQL allows the 1st phase (Prepare) to run in parallel, but after a commit group is formed it performs the 2nd phase (Commit) in a serial manner. This makes low commit latency in the storage engine essential for achieving high throughput. The commit in MyRocks includes writing to the RocksDB WAL, which, as explained above, by default incurs the latency of flushing the new WAL appends to the OS buffer.

Since the binlog helps in recovering from some failure scenarios, MySQL can provide reliability without needing a storage WAL flush after each individual commit. MyRocks benefits from this property, disables automatic WAL flush in RocksDB, and manually calls `::FlushWAL` when requested by MySQL.
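A minimal sketch of the manual-flush mode described above (the DB path and batching loop are illustrative):

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.manual_wal_flush = true;  // WAL buffer is flushed only on FlushWAL()

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/flushwal_db", &db);
  assert(s.ok());

  // Accumulate several writes in the application-side WAL buffer...
  for (int i = 0; i < 100; i++) {
    db->Put(rocksdb::WriteOptions(), "key" + std::to_string(i), "value");
  }
  // ...then flush them to the OS buffer, and sync because of the `true` argument.
  s = db->FlushWAL(true /* sync */);
  assert(s.ok());
  delete db;
  return 0;
}
```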
diff --git a/thirdparty/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown b/thirdparty/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown
new file mode 100644
index 0000000000..a22dcaa1cc
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-09-28-rocksdb-5-8-released.markdown
@@ -0,0 +1,25 @@
---
title: RocksDB 5.8 Released!
layout: post
author: maysamyabandeh
category: blog
---

### Public API Change
* Users of `Statistics::getHistogramString()` will see fewer histogram buckets and different bucket endpoints.
* `Slice::compare` and BytewiseComparator `Compare` no longer accept `Slice`s containing nullptr.
* `Transaction::Get` and `Transaction::GetForUpdate` variants with `PinnableSlice` added.

### New Features
* Add Iterator::Refresh(), which allows users to update the iterator state so that they can avoid some initialization costs of recreating iterators.
* Replace dynamic_cast<> (except in unit tests) so people can choose to build with RTTI off. With make, release mode is by default built with -fno-rtti and debug mode is built without it. Users can override this by setting USE_RTTI=0 or 1.
* Universal compactions including the bottom level can be executed in a dedicated thread pool. This alleviates head-of-line blocking in the compaction queue, which causes write stalling, particularly in multi-instance use cases. Users can enable this feature via `Env::SetBackgroundThreads(N, Env::Priority::BOTTOM)`, where `N > 0`.
* Allow the merge operator to be called even with a single merge operand during compactions, by appropriately overriding `MergeOperator::AllowSingleOperand`.
* Add `DB::VerifyChecksum()`, which verifies the checksums in all SST files of a running DB.
* Block-based table support for disabling checksums by setting `BlockBasedTableOptions::checksum = kNoChecksum`.

### Bug Fixes
* Fix wrong latencies in `rocksdb.db.get.micros`, `rocksdb.db.write.micros`, and `rocksdb.sst.read.micros`.
* Fix incorrect dropping of deletions during intra-L0 compaction.
* Fix transient reappearance of keys covered by range deletions when the memtable prefix bloom filter is enabled.
* Fix potentially wrong file smallest key when range deletions separated by snapshot are written together.

diff --git a/thirdparty/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown b/thirdparty/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown
new file mode 100644
index 0000000000..d2e6204e1e
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-12-18-17-auto-tuned-rate-limiter.markdown
@@ -0,0 +1,28 @@
---
title: Auto-tuned Rate Limiter
layout: post
author: ajkr
category: blog
---

### Introduction

Our rate limiter has been hard to configure since users need to pick a value that is low enough to prevent background I/O spikes, which can impact user-visible read/write latencies. Meanwhile, picking too low a value can cause memtables and L0 files to pile up, eventually leading to writes stalling. Tuning the rate limiter has been especially difficult for users whose DB instances have different workloads, or have workloads that vary over time, or commonly both.

To address this, in RocksDB 5.9 we released a dynamic rate limiter that adjusts itself over time according to demand for background I/O. It can be enabled simply by passing `auto_tuned=true` in the `NewGenericRateLimiter()` call, as sketched below. In this case `rate_bytes_per_sec` will indicate the upper bound of the window within which a rate limit will be picked dynamically. The chosen rate limit will be much lower unless absolutely necessary, so setting this to the device's maximum throughput is a reasonable choice on dedicated hosts.

### Algorithm

We use a simple multiplicative-increase, multiplicative-decrease algorithm. We measure demand for background I/O as the ratio of intervals where the rate limiter is drained. There are low and high watermarks for this ratio, which trigger a change in rate limit when breached. The rate limit can move within a window bounded by the user-specified upper bound, and a lower bound that we derive internally. Users can expect this lower bound to be 1-2 orders of magnitude less than the provided upper bound (so don't provide INT64_MAX as your upper bound), although it's subject to change.
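A minimal sketch of creating the auto-tuned limiter (the 1000MB/s upper bound matches the benchmark below; the middle arguments are the documented defaults):

```cpp
#include "rocksdb/options.h"
#include "rocksdb/rate_limiter.h"

rocksdb::Options AutoTunedRateLimitedOptions() {
  rocksdb::Options options;
  // The actual limit is tuned dynamically below this upper bound
  // according to demand for background I/O.
  options.rate_limiter.reset(rocksdb::NewGenericRateLimiter(
      1000 << 20 /* rate_bytes_per_sec upper bound: 1000MB/s */,
      100 * 1000 /* refill_period_us */, 10 /* fairness */,
      rocksdb::RateLimiter::Mode::kWritesOnly, true /* auto_tuned */));
  return options;
}
```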
### Benchmark Results

Data is ingested at 10MB/s and the rate limiter was created with 1000MB/s as its upper bound. The dynamically chosen rate limit hovers around 125MB/s. The other clustering of points at 50MB/s is due to the number of compaction threads being reduced to one when there's no compaction pressure.

![](/static/images/rate-limiter/write-KBps-series.png)

![](/static/images/rate-limiter/auto-tuned-write-KBps-series.png)

The following graph summarizes the above two time series graphs in CDF form. In particular, notice that the p90 - p100 for background write rate are significantly lower with the auto-tuned rate limiter enabled.

![](/static/images/rate-limiter/write-KBps-cdf.png)

diff --git a/thirdparty/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown b/thirdparty/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown
new file mode 100644
index 0000000000..439b3f83cc
--- /dev/null
+++ b/thirdparty/rocksdb/docs/_posts/2017-12-19-write-prepared-txn.markdown
@@ -0,0 +1,41 @@
---
title: WritePrepared Transactions
layout: post
author: maysamyabandeh
category: blog
---

RocksDB supports both optimistic and pessimistic concurrency control. Pessimistic transactions make use of locks to provide isolation between transactions. The default write policy in pessimistic transactions is _WriteCommitted_, which means that the data is written to the DB, i.e., the memtable, only after the transaction is committed. This policy simplified the implementation but came with some limitations in throughput, transaction size, and variety of supported isolation levels. Below, we explain these in detail and present the other write policies, _WritePrepared_ and _WriteUnprepared_. We then dive into the design of _WritePrepared_ transactions.

### WriteCommitted, Pros and Cons

With the _WriteCommitted_ write policy, the data is written to the memtable only after the transaction commits. This greatly simplifies the read path, as any data read by other transactions can be assumed to be committed. This write policy, however, implies that the writes are buffered in memory in the meanwhile. This makes memory a bottleneck for large transactions. The delay of the commit phase in 2PC (two-phase commit) also becomes noticeable, since most of the work, i.e., writing to the memtable, is done at the commit phase. When the commits of multiple transactions are done serially, as in the 2PC implementation of MySQL, the lengthy commit latency becomes a major contributor to lower throughput. Moreover, this write policy cannot provide weaker isolation levels, such as READ UNCOMMITTED, that could potentially provide higher throughput for some applications.

### Alternatives: _WritePrepared_ and _WriteUnprepared_

To tackle the lengthy commit issue, we should do the memtable writes at earlier phases of 2PC so that the commit phase becomes lightweight and fast. 2PC is composed of the write stage, where the transaction's `::Put` is invoked; the prepare phase, where `::Prepare` is invoked (upon which the DB promises to commit the transaction if that is later requested); and the commit phase, where `::Commit` is invoked and the transaction's writes become visible to all readers. To make the commit phase lightweight, the memtable write could be done at either the `::Prepare` or `::Put` stage, resulting in the _WritePrepared_ and _WriteUnprepared_ write policies respectively. The downside is that when another transaction is reading data, it needs a way to tell which data is committed, and if it is, whether it was committed before the transaction's start, i.e., whether it is in the read snapshot of the transaction. _WritePrepared_ would still have the issue of buffering the data, which makes memory the bottleneck for large transactions. It does, however, provide a good milestone for transitioning from _WriteCommitted_ to the _WriteUnprepared_ write policy. Here we explain the design of the _WritePrepared_ policy. We will cover the changes that extend the design to also support _WriteUnprepared_ in an upcoming post.
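A minimal sketch of opening a TransactionDB with this write policy (the `WRITE_PREPARED` enum value lives in rocksdb/utilities/transaction_db.h; the DB path is illustrative):

```cpp
#include <cassert>

#include "rocksdb/options.h"
#include "rocksdb/utilities/transaction_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::TransactionDBOptions txn_db_options;
  // Write to the memtable at ::Prepare rather than at ::Commit.
  txn_db_options.write_policy = rocksdb::TxnDBWritePolicy::WRITE_PREPARED;

  rocksdb::TransactionDB* txn_db = nullptr;
  rocksdb::Status s = rocksdb::TransactionDB::Open(
      options, txn_db_options, "/tmp/write_prepared_db", &txn_db);
  assert(s.ok());
  // ... begin/prepare/commit transactions as usual ...
  delete txn_db;
  return 0;
}
```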
It does, however, provide a good milestone for transitioning from _WriteCommitted_ to the _WriteUnprepared_ write policy. Here we explain the design of the _WritePrepared_ policy. We will cover the changes that make the design also support _WriteUnprepared_ in an upcoming post. + +### _WritePrepared_ in a nutshell + +These are the primary design questions that need to be addressed: +1) How do we associate the key/values in the DB with the transactions that wrote them? +2) How do we determine whether a key/value written by transaction Txn_w is in the read snapshot of the reading transaction Txn_r? +3) How do we roll back the data written by aborted transactions? + +With _WritePrepared_, a transaction still buffers the writes in a write batch object in memory. When 2PC `::Prepare` is called, it writes the in-memory write batch to the WAL (write-ahead log) as well as to the memtable(s) (one memtable per column family). We reuse the existing notion of sequence numbers in RocksDB to tag all the key/values in the same write batch with the same sequence number, `prepare_seq`, which is also used as the identifier for the transaction. At commit time, it writes a commit marker to the WAL, whose sequence number, `commit_seq`, will be used as the commit timestamp of the transaction. Before releasing the commit sequence number to the readers, it stores a mapping from `prepare_seq` to `commit_seq` in an in-memory data structure that we call the _CommitCache_. When a transaction reads a value from the DB (tagged with `prepare_seq`), it uses the _CommitCache_ to determine whether the `commit_seq` of the value is in its read snapshot. To roll back an aborted transaction, we restore the state from before the transaction by making another write that cancels out the writes of the aborted transaction. + +The _CommitCache_ is a lock-free data structure that caches the recent commit entries. Looking up the entries in the cache should suffice for almost all the transactions that commit in a timely manner. When older entries are evicted from the cache, some other data structures are maintained to cover the corner cases of transactions that take abnormally long to finish. We will cover them in the design details below.
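+
+To make the visibility rule concrete, here is a simplified sketch of the read-side check (illustrative only; the actual implementation uses a fixed-size lock-free array rather than a hash map, plus the auxiliary structures mentioned above):
+
+```
+#include <cstdint>
+#include <unordered_map>
+
+// Simplified CommitCache: maps prepare_seq -> commit_seq for recent commits.
+using CommitCache = std::unordered_map<uint64_t, uint64_t>;
+
+// A value tagged with prepare_seq is visible to a reader whose snapshot is
+// snapshot_seq iff the writing transaction committed with
+// commit_seq <= snapshot_seq.
+bool IsInReadSnapshot(uint64_t prepare_seq, uint64_t snapshot_seq,
+                      const CommitCache& commit_cache) {
+  auto it = commit_cache.find(prepare_seq);
+  if (it == commit_cache.end()) {
+    // Not in the cache: the write is still pending, aborted, or its entry was
+    // evicted; the real design consults auxiliary structures in that case.
+    return false;
+  }
+  return it->second <= snapshot_seq;
+}
+```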
+ +### Benchmark Results +Here we present the improvements observed in MyRocks with sysbench and linkbench: + +|benchmark |tps |p95 latency |cpu/query | +|----------|----|------------|----------| +|insert |68% | | | +|update-noindex |30% |38% | | +|update-index |61% |28% | | +|read-write |6% |3.5% | | +|read-only |-1.2% |-1.8% | | +|linkbench |1.9% | |0.6% (overall) | + +Here are also the detailed results for [In-Memory Sysbench](https://gist.github.com/maysamyabandeh/bdb868091b2929a6d938615fdcf58424) and [SSD Sysbench](https://gist.github.com/maysamyabandeh/ff94f378ab48925025c34c47eff99306) courtesy of [@mdcallag](https://github.com/mdcallag). + +Learn more [here](https://github.com/facebook/rocksdb/wiki/WritePrepared-Transactions). diff --git a/thirdparty/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown b/thirdparty/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown new file mode 100644 index 0000000000..9f32d3f94c --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2018-02-05-rocksdb-5-10-2-released.markdown @@ -0,0 +1,22 @@ +--- +title: RocksDB 5.10.2 Released! +layout: post +author: siying +category: blog +--- + +### Public API Change +* When running `make` with the environment variable `USE_SSE` set and `PORTABLE` unset, the build will use all machine features available locally. Previously this combination only compiled SSE-related features. + +### New Features +* CRC32C now uses the 3-way pipelined SSE algorithm `crc32c_3way` on supported platforms to improve performance. The system will choose to use this algorithm on supported platforms automatically whenever possible. If PCLMULQDQ is not supported, it will fall back to the old Fast_CRC32 algorithm. +* Provide lifetime hints when writing files on Linux. This reduces hardware write-amp on storage devices supporting multiple streams. +* Add a DB stat, `NUMBER_ITER_SKIP`, which returns how many internal keys were skipped during iterations (e.g., due to being tombstones or duplicate versions of a key). +* Add PerfContext counters, `key_lock_wait_count` and `key_lock_wait_time`, which measure the number of times transactions wait on key locks and the total amount of time spent waiting. + +### Bug Fixes +* Fix an issue where an `IOError` on a WAL write did not propagate to write group followers. +* Make iterator invalid on merge error. +* Fix a performance issue in `IngestExternalFile()` affecting databases with a large number of SST files. +* Fix possible corruption to the LSM structure when `DeleteFilesInRange()` deletes a subset of files spanned by a `DeleteRange()` marker. +* Fix `DB::Flush()` continuing to wait after the flush finishes under certain conditions. diff --git a/thirdparty/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown b/thirdparty/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown new file mode 100644 index 0000000000..c0e8c44258 --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2018-08-01-rocksdb-tuning-advisor.markdown @@ -0,0 +1,58 @@ +--- +title: Rocksdb Tuning Advisor +layout: post +author: poojam23 +category: blog +--- + +The performance of Rocksdb is contingent on its tuning. However, because of the complexity of its underlying technology and a large number of configurable parameters, a good configuration is sometimes hard to obtain. The aim of the python command-line tool, Rocksdb Advisor, is to automate the process of suggesting improvements in the configuration based on advice from Rocksdb experts. + +### Overview + +Experts share their wisdom as rules comprising conditions and suggestions in the INI format (see [rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). Users provide the Rocksdb configuration that they want to improve upon (as the familiar Rocksdb OPTIONS file — [example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) and the path of the file which contains Rocksdb logs and statistics. The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) creates appropriate DataSource objects (for Rocksdb [logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), [options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), [statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). The Rules Engine uses the experts' rules to parse the data sources and triggers the appropriate rules.
+The Advisor's output gives information about which rules were triggered, why they were triggered, and what each of them suggests. Each suggestion provided by a triggered rule advises some action on a Rocksdb configuration option, for example, increase `CFOptions.write_buffer_size`, set `bloom_bits` to 2, etc. + +### Usage + +An example command to run the tool: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20 +``` + +Sample output where a Rocksdb log-based rule has been triggered: + +```shell +Rule: stall-too-many-memtables +LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ +Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2'] +Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase +scope: col_fam: +{'default'} +``` + +### Read more + +For more information, refer to [advisor](https://github.com/facebook/rocksdb/tree/master/tools/advisor/README.md). diff --git a/thirdparty/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown b/thirdparty/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown new file mode 100644 index 0000000000..c4b24ec2ac --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2018-08-23-data-block-hash-index.markdown @@ -0,0 +1,118 @@ +--- +title: Improving Point-Lookup Using Data Block Hash Index +layout: post +author: fgwu +category: blog +--- +We've designed and implemented a _data block hash index_ in RocksDB that both reduces CPU utilization and increases throughput for point lookup queries, with a reasonable and tunable space overhead. + +Specifically, we append a compact hash table to the end of the data block for efficient indexing. It is backward compatible with databases created without this feature. After the hash index feature is turned on, existing data will be gradually converted to the hash index format. + +Benchmarks with `db_bench` show the CPU utilization of one of the main functions in the point lookup code path, `DataBlockIter::Seek()`, is reduced by 21.8%, and the overall RocksDB throughput is increased by 10% under purely cached workloads, at an overhead of 4.6% more space. Shadow testing with Facebook production traffic shows good CPU improvements too. + + +### How to use it +Two new options are added as part of this feature: `BlockBasedTableOptions::data_block_index_type` and `BlockBasedTableOptions::data_block_hash_table_util_ratio`. + +The hash index is disabled by default unless `BlockBasedTableOptions::data_block_index_type` is set to `kDataBlockBinaryAndHash`. The hash table utilization ratio is adjustable using `BlockBasedTableOptions::data_block_hash_table_util_ratio`, which is valid only if `data_block_index_type = kDataBlockBinaryAndHash`. + + +``` +// the definitions can be found in include/rocksdb/table.h + +// The index type that will be used for the data block. +enum DataBlockIndexType : char { +  kDataBlockBinarySearch = 0, // traditional block type +  kDataBlockBinaryAndHash = 1, // additional hash index +}; + +// Set to kDataBlockBinaryAndHash to enable hash index +DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + +// #entries/#buckets. It is valid only when data_block_hash_index_type is +// kDataBlockBinaryAndHash. +double data_block_hash_table_util_ratio = 0.75; + +```
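+
+A minimal configuration sketch putting these two options together (assuming a RocksDB version that ships this feature):
+
+```
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+rocksdb::BlockBasedTableOptions table_options;
+// Enable the hash index; keep the default utilization ratio of 0.75.
+table_options.data_block_index_type =
+    rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
+table_options.data_block_hash_table_util_ratio = 0.75;
+
+rocksdb::Options options;
+options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
+```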
+ + +### Data Block Hash Index Design + +The current data block format groups adjacent keys together as a restart interval. One block consists of multiple restart intervals. The byte offset of the beginning of each restart interval, i.e. a restart point, is stored in an array called the restart interval index or binary seek index. RocksDB does a binary search when performing a point lookup for keys in data blocks, to find the right restart interval in which the key may reside. We will use binary seek and binary search interchangeably in this post. + +In order to find the right location where the key may reside using binary search, multiple key parsings and comparisons are needed. Each binary search branching triggers a CPU cache miss, causing high CPU utilization. We have seen that this binary search takes up considerable CPU in production use-cases. + +![](/static/images/data-block-hash-index/block-format-binary-seek.png) + +We implemented a hash map at the end of the block to index the keys, reducing the CPU overhead of the binary search. The hash index is just an array of pointers pointing into the binary seek index. + +![](/static/images/data-block-hash-index/block-format-hash-index.png) + + +Each array element is considered a hash bucket when storing the location of a key (or more precisely, the restart index of the restart interval where the key resides). When multiple keys happen to hash into the same bucket (hash collision), we just mark the bucket as “collision”, so that when that key is later queried, the hash table lookup knows a collision happened and can fall back to the traditional binary search to find the location of the key. + +We define the hash table utilization ratio as #keys/#buckets. If the utilization ratio is 0.5 and there are 100 buckets, 50 keys are stored in the buckets. The lower the util ratio, the fewer the hash collisions, and the lower the chance that a point lookup falls back to binary seek (the fallback ratio) due to a collision. So a small util ratio reduces CPU time more, but introduces more space overhead. + +Space overhead depends on the util ratio. Each bucket is a `uint8_t` (i.e. one byte). For a util ratio of 1, the space overhead is one byte per key, and the observed fallback ratio is ~52%. + +![](/static/images/data-block-hash-index/hash-index-data-structure.png) + +### Things that Need Attention + +**Customized Comparator** + +The hash index hashes different keys (keys with different content, or byte sequences) into different hash values. This assumes the comparator will not treat different keys as equal if they have different content. + +The default bytewise comparator orders the keys in alphabetical order and works well with the hash index, as different keys will never be regarded as equal. However, some specially crafted comparators will treat different keys as equal. For example, say a `StringToIntComparator` converts a string into an integer and uses the integer to perform the comparison. The key strings “16” and “0x10” are equal to each other as seen by this `StringToIntComparator`, but they probably hash to different values. Later queries to one form of the key will not be able to find the existing key stored in the other form. A sketch of such a comparator follows.
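+
+Here is what such a problematic comparator might look like (`StringToIntComparator` is the post's hypothetical example, not a real RocksDB class; this sketch parses both decimal and `0x` hexadecimal forms):
+
+```
+#include <cstdlib>
+#include <string>
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+
+// Compares keys by their parsed integer value, so "16" and "0x10" compare as
+// equal even though their bytes differ: exactly the case that breaks the
+// data block hash index.
+class StringToIntComparator : public rocksdb::Comparator {
+ public:
+  int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const override {
+    long va = std::strtol(a.ToString().c_str(), nullptr, /*base=*/0);
+    long vb = std::strtol(b.ToString().c_str(), nullptr, /*base=*/0);
+    return va < vb ? -1 : (va > vb ? 1 : 0);
+  }
+  const char* Name() const override { return "StringToIntComparator"; }
+  // No-op implementations; these optional optimizations are not needed here.
+  void FindShortestSeparator(std::string*, const rocksdb::Slice&) const override {}
+  void FindShortSuccessor(std::string*) const override {}
+};
+```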
+ +We add a new function member to the comparator interface: + +``` +virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; } +``` + + +Every comparator implementation should override this function and specify the behavior of the comparator. If a comparator can regard different keys as equal, the function returns `true`, and as a result the hash index feature will not be enabled, and vice versa. + +NOTE: to use the hash index feature, one should 1) have a comparator that can never treat different keys as equal; and 2) override the `CanKeysWithDifferentByteContentsBeEqual()` function to return `false`, so the hash index can be enabled. + + +**Util Ratio's Impact on Data Block Cache** + +Adding the hash index to the end of the data block essentially takes up data block cache space, making the effective data block cache size smaller and increasing the data block cache miss ratio. Therefore, a very small util ratio will result in a large data block cache miss ratio, and the extra I/O may drag down the throughput gain achieved by the hash index lookup. Besides, when compression is enabled, a cache miss also incurs data block decompression, which is CPU-consuming. Therefore, CPU usage may even increase if the util ratio is too small. The best util ratio depends on workloads, cache-to-data ratio, disk bandwidth/latency, etc. In our experiments, we found util ratio = 0.5 ~ 1 to be a good range to explore, bringing both CPU and throughput gains. + + +### Limitations + +As we use `uint8_t` to store the binary seek index, i.e. the restart interval index, the total number of restart intervals cannot be more than 253 (we reserved 255 and 254 as special flags). For blocks having a larger number of restart intervals, the hash index will not be created and the point lookup will be done by traditional binary seek. + +Data block hash index only supports point lookup. We do not support range lookup. Range lookup requests will fall back to binary seek. + +RocksDB supports many types of records, such as `Put`, `Delete`, `Merge`, etc. (visit [here](https://github.com/facebook/rocksdb/wiki/rocksdb-basics) for more information). Currently we only support `Put` and `Delete`, but not `Merge`. Internally we have a limited set of supported record types: + + +``` +kPutRecord, <=== supported +kDeleteRecord, <=== supported +kSingleDeleteRecord, <=== supported +kTypeBlobIndex, <=== supported +``` + +For records not supported, the searching process will fall back to the traditional binary seek. + + + +### Evaluation +To evaluate the CPU util reduction and isolate other factors such as disk I/O and block decompression, we first evaluate the hash index in a purely cached workload. We observe that the CPU utilization of one of the main functions in the point lookup code path, DataBlockIter::Seek(), is reduced by 21.8% and the overall throughput is increased by 10% at an overhead of 4.6% more space. + +However, the general workload is not always purely cached. So we also evaluate the performance under different levels of cache space pressure. In the following test, we use `db_bench` with RocksDB deployed on SSDs. The total DB size is 5~6GB, and it is about 14GB if decompressed. Different block cache sizes are used, ranging from 14GB down to 2GB, with an increasing cache miss ratio. + +Orange bars represent our hash index performance. We use a hash util ratio of 1.0 in this test. Block size is set to 16KiB with a restart interval of 16.
+ +![](/static/images/data-block-hash-index/perf-throughput.png) +![](/static/images/data-block-hash-index/perf-cache-miss.png) + +We can see that if the cache size is greater than 8GB, the hash index brings a throughput gain. A cache size greater than 8GB translates to a cache miss ratio smaller than 40%. So if the workload has a cache miss ratio smaller than 40%, the hash index is able to increase the throughput. + +Besides, shadow testing with Facebook production traffic shows good CPU improvements too. + diff --git a/thirdparty/rocksdb/docs/_posts/2018-11-21-delete-range.markdown b/thirdparty/rocksdb/docs/_posts/2018-11-21-delete-range.markdown new file mode 100644 index 0000000000..96fc3562d1 --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2018-11-21-delete-range.markdown @@ -0,0 +1,292 @@ +--- +title: "DeleteRange: A New Native RocksDB Operation" +layout: post +author: +- abhimadan +- ajkr +category: blog +--- +## Motivation + +### Deletion patterns in LSM + +Deleting a range of keys is a common pattern in RocksDB. Most systems built on top of RocksDB have multi-component key schemas, where keys sharing a common prefix are logically related. Here are some examples. + +MyRocks is a MySQL fork using RocksDB as its storage engine. Each key's first four bytes identify the table or index to which that key belongs. Thus dropping a table or index involves deleting all the keys with that prefix. + +Rockssandra is a Cassandra variant that uses RocksDB as its storage engine. One of its admin tool commands, `nodetool cleanup`, removes key-ranges that have been migrated to other nodes in the cluster. + +Marketplace uses RocksDB to store product data. Its keys begin with a product ID, and it stores various data associated with the product in separate keys. When a product is removed, all these keys must be deleted. + +When we decide what to improve, we try to find a use case that's common across users, since we want to build a generally useful system, not one that has many one-off features for individual users. The range deletion pattern is common as illustrated above, so from this perspective it's a good target for optimization. + +### Existing mechanisms: challenges and opportunities + +The most common pattern we see is scan-and-delete, i.e., advance an iterator through the to-be-deleted range, and issue a `Delete` for each key. This is slow (it involves read I/O), so it cannot be done on any critical path. Additionally, it creates many tombstones, which slow down iterators and don't offer a deadline for space reclamation. + +Another common pattern is using a custom compaction filter that drops keys in the deleted range(s). This deletes the range asynchronously, so it cannot be used in cases where readers must not see keys in deleted ranges. Further, it has the disadvantage of outputting tombstones to all but the bottom level. That's because compaction cannot detect whether dropping a key would cause an older version at a lower level to reappear. + +If space reclamation time is important, or it is important that the deleted range not affect iterators, the user can trigger `CompactRange` on the deleted range. This can involve arbitrarily long waits in the compaction queue, and increases write-amp. By the time it's finished, however, the range is completely gone from the LSM. + +`DeleteFilesInRange` can be used prior to compacting the deleted range as long as snapshot readers do not need to access those files. It drops files that are completely contained in the deleted range.
That saves write-amp because, in `CompactRange`, the file data would have to be rewritten several times before it reaches the bottom of the LSM, where tombstones can finally be dropped. + +In addition to the above approaches having various drawbacks, they are quite complicated to reason about and implement. In an ideal world, deleting a range of keys would be (1) simple, i.e., a single API call; (2) synchronous, i.e., when the call finishes, the keys are guaranteed to be wiped from the DB; (3) low latency so it can be used in critical paths; and (4) a first-class operation with all the guarantees of any other write, like atomicity, crash-recovery, etc.
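+
+At the API level, this is the single call the rest of this post describes (a usage sketch; the key names and the open `db` handle are assumed):
+
+```
+#include <cassert>
+#include "rocksdb/db.h"
+
+// Atomically delete every key in ["user123_start", "user123_end"),
+// with the same guarantees as any other write.
+rocksdb::Status s = db->DeleteRange(rocksdb::WriteOptions(),
+                                    db->DefaultColumnFamily(),
+                                    "user123_start", "user123_end");
+assert(s.ok());
+```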
+ +## v1: Getting it to work + +### Where to persist them? + +The first place we thought about storing range tombstones is inline with the data blocks. We could not think of a good way to do it, however, since the start of a range tombstone covering a key could be anywhere, making binary search impossible. So, we decided to investigate segregated storage. + +A second solution we considered is appending to the manifest. This file is append-only, periodically compacted, and stores metadata like the level to which each SST belongs. This is tempting because it leverages an existing file, which is maintained in the background and fully read when the DB is opened. However, it conceptually violates the manifest's purpose, which is to store metadata. It also has no way to detect when a range tombstone no longer covers anything and is droppable. Further, it'd be possible for keys above a range tombstone to disappear when they have their seqnums zeroed upon compaction to the bottommost level. + +A third candidate is using a separate column family. This has similar problems to the manifest approach. That is, we cannot easily detect when a range tombstone is obsolete, and seqnum zeroing can cause a key to go from above a range tombstone to below, i.e., disappearing. The upside is we can reuse logic for memory buffering, consistent reads/writes, etc. + +The problems with the second and third solutions indicate a need for range tombstones to be aware of flush/compaction. An easy way to achieve this is to put them in the SST files themselves - but not in the data blocks, as explained for the first solution. So, we introduced a separate meta-block for range tombstones. This resolved the problem of when to obsolete range tombstones, as it's simple: when they're compacted to the bottom level. We also reused the LSM invariant that newer versions of a key are always in a higher level to prevent the seqnum zeroing problem. This approach has the side benefit of constraining the range tombstones seen during reads to ones in a similar key-range. + +![](/static/images/delrange/delrange_sst_blocks.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +*When there are range tombstones in an SST, they are segregated in a separate meta-block* +{: style="text-align: center"} + +![](/static/images/delrange/delrange_key_schema.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 80%"} + +*Logical range tombstones (left) and their corresponding physical key-value representation (right)* +{: style="text-align: center"} + +### Write path + +`WriteBatch` stores range tombstones in its buffer, which are logged to the WAL and then applied to a dedicated range tombstone memtable during `Write`. Later, in the background, the range tombstone memtable and its corresponding data memtable are flushed together into a single SST with a range tombstone meta-block. SSTs periodically undergo compaction, which rewrites SSTs with point data and range tombstones dropped or merged wherever possible. + +We chose to use a dedicated memtable for range tombstones. The memtable representation is always skiplist in order to minimize overhead in the usual case, in which the memtable contains zero or a small number of range tombstones. The range tombstones are segregated to a separate memtable for the same reason we segregated range tombstones in SSTs. That is, we did not know how to interleave the range tombstones with point data in a way that would let us find them for arbitrary keys they cover. + +![](/static/images/delrange/delrange_write_path.png) +{: style="display: block; margin-left: auto; margin-right: auto; width: 70%"} + +*Lifetime of point keys and range tombstones in RocksDB* +{: style="text-align: center"} + +During flush and compaction, we chose to write out all non-obsolete range tombstones unsorted. Sorting by a single dimension is easy to implement, but doesn't bring asymptotic improvement to queries over range data. Ideally, we want to store skylines (see the “Read path” subsection below) computed over our ranges so we can binary search. However, a couple of concerns made doing this in flush and compaction unsatisfactory: (1) we need to store multiple skylines, one for each snapshot, which further complicates the range tombstone meta-block encoding; and (2) even if we implement this, the range tombstone memtable still needs to be linearly scanned. Given these concerns, we decided to defer the collapsing work to the read side, hoping a good caching strategy could optimize this at some future point. + + +### Read path + +In point lookups, we aggregate range tombstones in an unordered vector as we search through the live memtable, immutable memtables, and then SSTs. When a key is found that matches the lookup key, we do a scan through the vector, checking whether the key is deleted. + +In iterators, we aggregate range tombstones into a skyline as we visit the live memtable, immutable memtables, and SSTs. The skyline is expensive to construct but fast at determining whether a key is covered. The skyline keeps track of the most recent range tombstone found to optimize `Next` and `Prev`. + +|![](/static/images/delrange/delrange_uncollapsed.png) |![](/static/images/delrange/delrange_collapsed.png) | + +*([Image source: Leetcode](https://leetcode.com/problems/the-skyline-problem/description/)) The skyline problem involves taking building location/height data in the unsearchable form of A and converting it to the form of B, which is binary-searchable. With overlapping range tombstones, to achieve efficient searching we need to solve an analogous problem, where the x-axis is the key-space and the y-axis is the sequence number.* +{: style="text-align: center"} + +### Performance characteristics + +For the v1 implementation, writes are much faster compared to the scan-and-delete (optionally within a transaction) pattern. `DeleteRange` only logs to the WAL and applies to the memtable. Logging to the WAL always `fflush`es, and optionally `fsync`s or `fdatasync`s. Applying to the memtable is always an in-memory operation. Since range tombstones have a dedicated skiplist memtable, the complexity of inserting is O(log(T)), where T is the number of existing buffered range tombstones.
+ +Reading in the presence of v1 range tombstones, however, is much slower than reads in a database where scan-and-delete has happened, due to the linear scan over range tombstone memtables/meta-blocks. + +Iterating in a database with v1 range tombstones is usually slower than in a scan-and-delete database, although the gap lessens as iterations grow longer. When an iterator is first created and seeked, we construct a skyline over its tombstones. This operation is O(T\*log(T)) where T is the number of tombstones found across the live memtable, immutable memtable, L0 files, and one file from each of the L1+ levels. However, moving the iterator forwards or backwards is simply a constant-time operation (excluding edge cases, e.g., many range tombstones between consecutive point keys). + +## v2: Making it fast + +`DeleteRange`’s negative impact on read perf is a barrier to its adoption. The root cause is that range tombstones are not stored or cached in a format that can be efficiently searched. We needed to design DeleteRange so that we could maintain write performance while making read performance competitive with workarounds used in production (e.g., scan-and-delete). + +### Representations + +The key idea of the redesign is that, instead of globally collapsing range tombstones, we can locally “fragment” them for each SST file and memtable to guarantee that: + +* no range tombstones overlap; and +* range tombstones are ordered by start key. + +Combined, these properties make range tombstones binary searchable. This fragmentation will happen on the read path, but unlike the previous design, we can easily cache many of these range tombstone fragments on the read path. + +### Write path + +The write path remains unchanged. + +### Read path + +When an SST file is opened, its range tombstones are fragmented and cached. For point lookups, we binary search each file's fragmented range tombstones for one that covers the lookup key. Unlike the old design, once we find a tombstone, we no longer need to search for the key in lower levels, since we know that any keys on those levels will be covered (though we do still check the current level since there may be keys written after the range tombstone). + +For range scans, we create iterators over all the fragmented range tombstones and store them in a list, seeking each one to cover the start key of the range scan (if possible), and query each encountered key in this structure as in the old design, advancing range tombstone iterators as necessary. In effect, we implicitly create a skyline. This requires significantly less work on iterator creation, but since each memtable/SST has its own range tombstone iterator, querying range tombstones requires key comparisons (and possibly iterator increments) for several iterators (as opposed to v1, where we had a global collapsed representation of all range tombstones). As a result, very long range scans may become slower than before, but short range scans, which are the more common class of range scan, become an order of magnitude faster. + +## Benchmarks + +To understand the performance of this new design, we used `db_bench` to compare point lookup, short range scan, and long range scan performance across: + +* the v1 DeleteRange design, +* the scan-and-delete workaround, and +* the v2 DeleteRange design.
+ +In these benchmarks, we used a database with 5 million data keys, and 10000 range tombstones (ignoring those dropped during compaction) that were written at regular intervals after 4.5 million data keys were written. Writing the range tombstones ensures that most of them are not compacted away, and that we have more tombstones in higher levels that cover keys in lower levels, which allows the benchmarks to exercise more interesting behavior when reading deleted keys. + +Point lookup benchmarks read 100000 keys from a database using `readwhilewriting`. Range scan benchmarks used `seekrandomwhilewriting` and seeked 100000 times, advancing up to 10 keys away from the seek position for short range scans, and up to 1000 keys away for long range scans. + +The results are summarized in the tables below, averaged over 10 runs (note that the different SHAs for the v1 benchmarks are due to a new `db_bench` flag that was added in order to compare performance with databases with no tombstones; for brevity, those results are not reported here). Also note that the block cache was large enough to hold the entire db, so the high throughput is due to limited I/Os and little time spent on decompression. The range tombstone blocks are always pinned uncompressed in memory. We believe these setup details should not affect relative performance between versions. + +### Point Lookups + +|Name |SHA |avg micros/op |avg ops/sec | +|---------|----------|--------------|------------| +|v1 |35cd754a6 |1.3179 |759,830.90 | +|scan-del |7528130e3 |0.6036 |1,667,237.70 | +|v2 |7528130e3 |0.6128 |1,634,633.40 | + +### Short Range Scans + +|Name |SHA |avg micros/op |avg ops/sec | +|---------|----------|--------------|------------| +|v1 |0ed738fdd |6.23 |176,562.00 | +|scan-del |PR 4677 |2.6844 |377,313.00 | +|v2 |PR 4677 |2.8226 |361,249.70 | + +### Long Range Scans + +|Name |SHA |avg micros/op |avg ops/sec | +|---------|----------|--------------|------------| +|v1 |0ed738fdd |52.7066 |19,074.00 | +|scan-del |PR 4677 |38.0325 |26,648.60 | +|v2 |PR 4677 |41.2882 |24,714.70 | + +## Future Work + +Note that memtable range tombstones are fragmented on every read; for now this is acceptable, since we expect there to be relatively few range tombstones in memtables (and users can enforce this by keeping track of the number of memtable range deletions and manually flushing after it passes a threshold). In the future, a specialized data structure can be used for storing range tombstones in memory to avoid this work. + +Another future optimization is to create a new format version that requires range tombstones to be stored in a fragmented form. This would save time when opening SST files, and when `max_open_files` is not -1 (i.e., files may be opened several times). + +## Acknowledgements + +Special thanks to Peter Mattis and Nikhil Benesch from Cockroach Labs, who were early users of DeleteRange v1 in production, contributed the cleanest/most efficient v1 aggregation implementation, found and fixed bugs, and provided the initial DeleteRange v2 design and continued help. + +Thanks to Huachao Huang and Jinpeng Zhang from PingCAP for early DeleteRange v1 adoption, bug reports, and fixes.
diff --git a/thirdparty/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown b/thirdparty/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown new file mode 100644 index 0000000000..ce657696c7 --- /dev/null +++ b/thirdparty/rocksdb/docs/_posts/2019-03-08-format-version-4.markdown @@ -0,0 +1,36 @@ +--- +title: format_version 4 +layout: post +author: maysamyabandeh +category: blog +--- + +The data blocks in RocksDB consist of a sequence of key/value pairs sorted by key, where the pairs are grouped into _restart intervals_ specified by `block_restart_interval`. Up to RocksDB version 5.14, where the latest and default value of `BlockBasedTableOptions::format_version` is 2, the format of index and data blocks is the same: index blocks use the same key format of <`user_key`,`seq`> and encode the pointers to data blocks, <`offset`,`size`>, into a byte string used as the value. The only difference is that the index blocks use `index_block_restart_interval` for the size of their _restart intervals_. `format_version=`3,4 offer more optimized, backward-compatible, yet forward-incompatible formats for index blocks. + +### Pros + +Using `format_version`=4 significantly reduces the index block size, in some cases around 4-5x. This frees more space in the block cache, which results in a higher hit rate for data and filter blocks, or offers the same performance with a smaller block cache size. + +### Cons + +Being _forward-incompatible_ means that if you enable `format_version=`4 you cannot downgrade to a RocksDB version lower than 5.16. + +### How to use it? + +- `BlockBasedTableOptions::format_version` = 4 +- `BlockBasedTableOptions::index_block_restart_interval` = 16 + +(A configuration sketch follows at the end of this post.) + +### What is format_version 3? +(Since RocksDB 5.15) In most cases, the sequence number `seq` is not necessary for keys in the index blocks. In such cases, `format_version`=3 skips encoding the sequence number and sets `index_key_is_user_key` in TableProperties, which is used by the reader to know how to decode the index block. + +### What is format_version 4? +(Since RocksDB 5.16) Changes the format of index blocks by delta encoding the index values, which are the block handles. This saves the encoding of `BlockHandle::offset` for the non-head index entries in each restart interval. If used, `TableProperties::index_value_is_delta_encoded` is set, which is used by the reader to know how to decode the index block. The format of each key is (shared_size, non_shared_size, shared, non_shared). The format of each value, i.e., block handle, is (offset, size) whenever the shared_size is 0, which includes the first entry in each restart point. Otherwise the format is delta-size = block handle size - size of last block handle. + +The index format in `format_version=4` would be as follows: + + restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) + restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) + ... + restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) + where k is a key, v is a value, and its encoding is in parentheses.
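+
+A configuration sketch for the settings listed under “How to use it?” (assuming RocksDB 5.16 or later, where `format_version=4` is available):
+
+```
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+
+rocksdb::BlockBasedTableOptions table_options;
+table_options.format_version = 4;                 // requires RocksDB >= 5.16
+table_options.index_block_restart_interval = 16;  // larger interval, smaller index
+
+rocksdb::Options options;
+options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
+```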
+ diff --git a/thirdparty/rocksdb/docs/_sass/_base.scss b/thirdparty/rocksdb/docs/_sass/_base.scss new file mode 100644 index 0000000000..6d26d9feba --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_base.scss @@ -0,0 +1,492 @@ +body { + background: $secondary-bg; + color: $text; + font: normal #{$base-font-size}/#{$base-line-height} $base-font-family; + height: 100vh; + text-align: left; + text-rendering: optimizeLegibility; +} + +img { + max-width: 100%; +} + +article { + p { + img { + max-width: 100%; + display:block; + margin-left: auto; + margin-right: auto; + } + } +} + +a { + border-bottom: 1px dotted $primary-bg; + color: $text; + text-decoration: none; + -webkit-transition: background 0.3s, color 0.3s; + transition: background 0.3s, color 0.3s; +} + +blockquote { + padding: 15px 30px 15px 15px; + margin: 20px 0 0 10px; + background-color: rgba(204, 122, 111, 0.1); + border-left: 10px solid rgba(191, 87, 73, 0.2); +} + +#fb_oss a { + border: 0; +} + +h1, h2, h3, h4 { + font-family: $header-font-family; + font-weight: 900; +} + +.navPusher { + border-top: $header-height + $header-ptop + $header-pbot solid $primary-bg; + height: 100%; + left: 0; + position: relative; + z-index: 99; +} + +.homeContainer { + background: $primary-bg; + color: $primary-overlay; + + a { + color: $primary-overlay; + } + + .homeSplashFade { + color: white; + } + + .homeWrapper { + padding: 2em 10px; + text-align: left; + + .wrapper { + margin: 0px auto; + max-width: $content-width; + padding: 0 20px; + } + + .projectLogo { + img { + height: 100px; + margin-bottom: 0px; + } + } + + h1#project_title { + font-family: $header-font-family; + font-size: 300%; + letter-spacing: -0.08em; + line-height: 1em; + margin-bottom: 80px; + } + + h2#project_tagline { + font-family: $header-font-family; + font-size: 200%; + letter-spacing: -0.04em; + line-height: 1em; + } + } +} + +.wrapper { + margin: 0px auto; + max-width: $content-width; + padding: 0 10px; +} + +.projectLogo { + display: none; + + img { + height: 100px; + margin-bottom: 0px; + } +} + +section#intro { + margin: 40px 0; +} + +.fbossFontLight { + font-family: $base-font-family; + font-weight: 300; + font-style: normal; +} + +.fb-like { + display: block; + margin-bottom: 20px; + width: 100%; +} + +.center { + display: block; + text-align: center; +} + +.mainContainer { + background: $secondary-bg; + overflow: auto; + + .mainWrapper { + padding: 4vh 10px; + text-align: left; + + .allShareBlock { + padding: 10px 0; + + .pluginBlock { + margin: 12px 0; + padding: 0; + } + } + + a { + &:hover, + &:focus { + background: $primary-bg; + color: $primary-overlay; + } + } + + em, i { + font-style: italic; + } + + strong, b { + font-weight: bold; + } + + h1 { + font-size: 300%; + line-height: 1em; + padding: 1.4em 0 1em; + text-align: left; + } + + h2 { + font-size: 250%; + line-height: 1em; + margin-bottom: 20px; + padding: 1.4em 0 20px; + text-align: left; + + & { + border-bottom: 1px solid darken($primary-bg, 10%); + color: darken($primary-bg, 10%); + font-size: 22px; + padding: 10px 0; + } + + &.blockHeader { + border-bottom: 1px solid white; + color: white; + font-size: 22px; + margin-bottom: 20px; + padding: 10px 0; + } + } + + h3 { + font-size: 150%; + line-height: 1.2em; + padding: 1em 0 0.8em; + } + + h4 { + font-size: 130%; + line-height: 1.2em; + padding: 1em 0 0.8em; + } + + p { + padding: 0.8em 0; + } + + ul { + list-style: disc; + } + + ol, ul { + padding-left: 24px; + li { + padding-bottom: 4px; + padding-left: 6px; + } + } + + strong { + font-weight: 
bold; + } + + .post { + position: relative; + + .katex { + font-weight: 700; + } + + &.basicPost { + margin-top: 30px; + } + + a { + color: $primary-bg; + + &:hover, + &:focus { + color: #fff; + } + } + + h2 { + border-bottom: 4px solid $primary-bg; + font-size: 130%; + } + + h3 { + border-bottom: 1px solid $primary-bg; + font-size: 110%; + } + + ol { + list-style: decimal outside none; + } + + .post-header { + padding: 1em 0; + + h1 { + font-size: 150%; + line-height: 1em; + padding: 0.4em 0 0; + + a { + border: none; + } + } + + .post-meta { + color: $primary-bg; + font-family: $header-font-family; + text-align: center; + } + } + + .postSocialPlugins { + padding-top: 1em; + } + + .docPagination { + background: $primary-bg; + bottom: 0px; + left: 0px; + position: absolute; + right: 0px; + + .pager { + display: inline-block; + width: 50%; + } + + .pagingNext { + float: right; + text-align: right; + } + + a { + border: none; + color: $primary-overlay; + display: block; + padding: 4px 12px; + + &:hover { + background-color: $secondary-bg; + color: $text; + } + + .pagerLabel { + display: inline; + } + + .pagerTitle { + display: none; + } + } + } + } + + .posts { + .post { + margin-bottom: 6vh; + } + } + } +} + +#integrations_title { + font-size: 250%; + margin: 80px 0; +} + +.ytVideo { + height: 0; + overflow: hidden; + padding-bottom: 53.4%; /* 16:9 */ + padding-top: 25px; + position: relative; +} + +.ytVideo iframe, +.ytVideo object, +.ytVideo embed { + height: 100%; + left: 0; + position: absolute; + top: 0; + width: 100%; +} + +@media only screen and (min-width: 480px) { + h1#project_title { + font-size: 500%; + } + + h2#project_tagline { + font-size: 250%; + } + + .projectLogo { + img { + margin-bottom: 10px; + height: 200px; + } + } + + .homeContainer .homeWrapper { + padding-left: 10px; + padding-right: 10px; + } + + .mainContainer { + .mainWrapper { + .post { + h2 { + font-size: 180%; + } + + h3 { + font-size: 120%; + } + + .docPagination { + a { + .pagerLabel { + display: none; + } + .pagerTitle { + display: inline; + } + } + } + } + } + } +} + +@media only screen and (min-width: 900px) { + .homeContainer { + .homeWrapper { + position: relative; + + #inner { + box-sizing: border-box; + max-width: 600px; + padding-right: 40px; + } + + .projectLogo { + align-items: center; + bottom: 0; + display: flex; + justify-content: flex-end; + left: 0; + padding: 2em 20px 4em; + position: absolute; + right: 20px; + top: 0; + + img { + height: 100%; + max-height: 250px; + } + } + } + } +} + +@media only screen and (min-width: 1024px) { + .mainContainer { + .mainWrapper { + .post { + box-sizing: border-box; + display: block; + + .post-header { + h1 { + font-size: 250%; + } + } + } + + .posts { + .post { + margin-bottom: 4vh; + width: 100%; + } + } + } + } +} + +@media only screen and (min-width: 1200px) { + .homeContainer { + .homeWrapper { + #inner { + max-width: 750px; + } + } + } + + .wrapper { + max-width: 1100px; + } +} + +@media only screen and (min-width: 1500px) { + .homeContainer { + .homeWrapper { + #inner { + max-width: 1100px; + padding-bottom: 40px; + padding-top: 40px; + } + } + } + + .wrapper { + max-width: 1400px; + } +} diff --git a/thirdparty/rocksdb/docs/_sass/_blog.scss b/thirdparty/rocksdb/docs/_sass/_blog.scss new file mode 100644 index 0000000000..12a73c1fcd --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_blog.scss @@ -0,0 +1,47 @@ +.blogContainer { + .posts { + margin-top: 60px; + + .post { + border: 1px solid $primary-bg; + border-radius: 3px; + padding: 10px 20px 20px; 
+ } + } + + .lonePost { + margin-top: 60px; + + .post { + padding: 10px 0px 0px; + } + } + + .post-header { + h1 { + text-align: center; + } + + .post-authorName { + color: rgba($text, 0.7); + font-size: 14px; + font-weight: 900; + margin-top: 0; + padding: 0; + text-align: center; + } + + .authorPhoto { + border-radius: 50%; + height: 50px; + left: 50%; + margin-left: auto; + margin-right: auto; + display: inline-block; + overflow: hidden; + position: static; + top: -25px; + width: 50px; + } + } +} diff --git a/thirdparty/rocksdb/docs/_sass/_buttons.scss b/thirdparty/rocksdb/docs/_sass/_buttons.scss new file mode 100644 index 0000000000..a0371618fc --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_buttons.scss @@ -0,0 +1,47 @@ +.button { + border: 1px solid $primary-bg; + border-radius: 3px; + color: $primary-bg; + display: inline-block; + font-size: 14px; + font-weight: 900; + line-height: 1.2em; + padding: 10px; + text-transform: uppercase; + transition: background 0.3s, color 0.3s; + + &:hover { + background: $primary-bg; + color: $primary-overlay; + } +} + +.homeContainer { + .button { + border-color: $primary-overlay; + border-width: 1px; + color: $primary-overlay; + + &:hover { + background: $primary-overlay; + color: $primary-bg; + } + } +} + +.blockButton { + display: block; +} + +.edit-page-link { + float: right; + font-size: 14px; + font-weight: normal; + line-height: 20px; + opacity: 0.6; + transition: opacity 0.5s; +} + +.edit-page-link:hover { + opacity: 1; +} diff --git a/thirdparty/rocksdb/docs/_sass/_footer.scss b/thirdparty/rocksdb/docs/_sass/_footer.scss new file mode 100644 index 0000000000..5b74395179 --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_footer.scss @@ -0,0 +1,82 @@ +.footerContainer { + background: $secondary-bg; + color: $primary-bg; + overflow: hidden; + padding: 0 10px; + text-align: left; + + .footerWrapper { + border-top: 1px solid $primary-bg; + padding: 0; + + .footerBlocks { + align-items: center; + align-content: center; + display: flex; + flex-flow: row wrap; + margin: 0 -20px; + padding: 10px 0; + } + + .footerSection { + box-sizing: border-box; + flex: 1 1 25%; + font-size: 14px; + min-width: 275px; + padding: 0px 20px; + + a { + border: 0; + color: inherit; + display: inline-block; + line-height: 1.2em; + } + + .footerLink { + padding-right: 20px; + } + } + + .fbOpenSourceFooter { + align-items: center; + display: flex; + flex-flow: row nowrap; + max-width: 25%; + + .facebookOSSLogoSvg { + flex: 0 0 31px; + height: 30px; + margin-right: 10px; + width: 31px; + + path { + fill: $primary-bg; + } + + .middleRing { + opacity: 0.7; + } + + .innerRing { + opacity: 0.45; + } + } + + h2 { + display: block; + font-weight: 900; + line-height: 1em; + } + } + } +} + +@media only screen and (min-width: 900px) { + .footerSection { + &.rightAlign { + margin-left: auto; + max-width: 25%; + text-align: right; + } + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_gridBlock.scss b/thirdparty/rocksdb/docs/_sass/_gridBlock.scss new file mode 100644 index 0000000000..679b31c14c --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_gridBlock.scss @@ -0,0 +1,115 @@ +.gridBlock { + margin: -5px 0; + padding: 0; + padding-bottom: 20px; + + .blockElement { + padding: 5px 0; + + img { + max-width: 100%; + } + + h3 { + border-bottom: 1px solid rgba($primary-bg, 0.5); + color: $primary-bg; + font-size: 18px; + margin: 0; + padding: 10px 0; + } + } + + .gridClear { + clear: both; + } + +} + +.gridBlock .alignCenter { + text-align: center; +} 
+.gridBlock .alignRight { + text-align: right; +} +.gridBlock .imageAlignSide { + align-items: center; + display: flex; + flex-flow: row wrap; +} +.blockImage { + max-width: 150px; + width: 50%; +} +.imageAlignTop .blockImage { + margin-bottom: 20px; +} +.imageAlignTop.alignCenter .blockImage { + margin-left: auto; + margin-right: auto; +} +.imageAlignSide .blockImage { + flex: 0 1 100px; + margin-right: 20px; +} +.imageAlignSide .blockContent { + flex: 1 1; +} + +@media only screen and (max-width: 1023px) { + .responsiveList .blockContent { + position: relative; + } + .responsiveList .blockContent > div { + padding-left: 20px; + } + .responsiveList .blockContent::before { + content: "\2022"; + position: absolute; + } +} + +@media only screen and (min-width: 1024px) { + .gridBlock { + display: flex; + flex-direction: row; + flex-wrap: wrap; + margin: -10px -10px 10px -10px; + + .twoByGridBlock { + box-sizing: border-box; + flex: 1 0 50%; + padding: 10px; + } + + .fourByGridBlock { + box-sizing: border-box; + flex: 1 0 25%; + padding: 10px; + } + } + + h2 + .gridBlock { + padding-top: 20px; + } +} + +@media only screen and (min-width: 1400px) { + .gridBlock { + display: flex; + flex-direction: row; + flex-wrap: wrap; + margin: -10px -20px 10px -20px; + + .twoByGridBlock { + box-sizing: border-box; + flex: 1 0 50%; + padding: 10px 20px; + } + + .fourByGridBlock { + box-sizing: border-box; + flex: 1 0 25%; + padding: 10px 20px; + } + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_header.scss b/thirdparty/rocksdb/docs/_sass/_header.scss new file mode 100644 index 0000000000..b4cd071139 --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_header.scss @@ -0,0 +1,138 @@ +.fixedHeaderContainer { + background: $primary-bg; + color: $primary-overlay; + height: $header-height; + padding: $header-ptop 0 $header-pbot; + position: fixed; + width: 100%; + z-index: 9999; + + a { + align-items: center; + border: 0; + color: $primary-overlay; + display: flex; + flex-flow: row nowrap; + height: $header-height; + } + + header { + display: flex; + flex-flow: row nowrap; + position: relative; + text-align: left; + + img { + height: 24px; + margin-right: 10px; + } + + h2 { + display: block; + font-family: $header-font-family; + font-weight: 900; + line-height: 18px; + position: relative; + } + } +} + +.navigationFull { + height: 34px; + margin-left: auto; + + nav { + position: relative; + + ul { + display: flex; + flex-flow: row nowrap; + margin: 0 -10px; + + li { + padding: 0 10px; + display: block; + + a { + border: 0; + color: $primary-overlay-special; + font-size: 16px; + font-weight: 400; + line-height: 1.2em; + + &:hover { + border-bottom: 2px solid $primary-overlay; + color: $primary-overlay; + } + } + + &.navItemActive { + a { + color: $primary-overlay; + } + } + } + } + } +} + +/* 900px + + + .fixedHeaderContainer { + .navigationWrapper { + nav { + padding: 0 1em; + position: relative; + top: -9px; + + ul { + margin: 0 -0.4em; + li { + display: inline-block; + + a { + padding: 14px 0.4em; + border: 0; + color: $primary-overlay-special; + display: inline-block; + + &:hover { + color: $primary-overlay; + } + } + + &.navItemActive { + a { + color: $primary-overlay; + } + } + } + } + } + + &.navigationFull { + display: inline-block; + } + + &.navigationSlider { + display: none; + } + } + } + + 1200px + + .fixedHeaderContainer { + header { + max-width: 1100px; + } + } + + 1500px + .fixedHeaderContainer { + header { + max-width: 1400px; + } + } + */ \ No newline at end of file diff 
--git a/thirdparty/rocksdb/docs/_sass/_poweredby.scss b/thirdparty/rocksdb/docs/_sass/_poweredby.scss new file mode 100644 index 0000000000..4155b60536 --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_poweredby.scss @@ -0,0 +1,69 @@ +.poweredByContainer { + background: $primary-bg; + color: $primary-overlay; + margin-bottom: 20px; + + a { + color: $primary-overlay; + } + + .poweredByWrapper { + h2 { + border-color: $primary-overlay-special; + color: $primary-overlay-special; + } + } + + .poweredByMessage { + color: $primary-overlay-special; + font-size: 14px; + padding-top: 20px; + } +} + +.poweredByItems { + display: flex; + flex-flow: row wrap; + margin: 0 -10px; +} + +.poweredByItem { + box-sizing: border-box; + flex: 1 0 50%; + line-height: 1.1em; + padding: 5px 10px; + + &.itemLarge { + flex-basis: 100%; + padding: 10px; + text-align: center; + + &:nth-child(4) { + padding-bottom: 20px; + } + + img { + max-height: 30px; + } + } +} + +@media only screen and (min-width: 480px) { + .itemLarge { + flex-basis: 50%; + max-width: 50%; + } +} + +@media only screen and (min-width: 1024px) { + .poweredByItem { + flex-basis: 25%; + max-width: 25%; + + &.itemLarge { + padding-bottom: 20px; + text-align: left; + } + } +} + diff --git a/thirdparty/rocksdb/docs/_sass/_promo.scss b/thirdparty/rocksdb/docs/_sass/_promo.scss new file mode 100644 index 0000000000..8c9a809dcb --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_promo.scss @@ -0,0 +1,55 @@ +.promoSection { + display: flex; + flex-flow: column wrap; + font-size: 125%; + line-height: 1.6em; + margin: -10px 0; + position: relative; + z-index: 99; + + .promoRow { + padding: 10px 0; + + .pluginWrapper { + display: block; + + &.ghWatchWrapper, &.ghStarWrapper { + height: 28px; + } + } + + .pluginRowBlock { + display: flex; + flex-flow: row wrap; + margin: 0 -2px; + + .pluginWrapper { + padding: 0 2px; + } + } + } +} + +iframe.pluginIframe { + height: 500px; + margin-top: 20px; + width: 100%; +} + +.iframeContent { + display: none; +} + +.iframePreview { + display: inline-block; + margin-top: 20px; +} + +@media only screen and (min-width: 1024px) { + .iframeContent { + display: block; + } + .iframePreview { + display: none; + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_react_docs_nav.scss b/thirdparty/rocksdb/docs/_sass/_react_docs_nav.scss new file mode 100644 index 0000000000..f0a651e7fa --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_react_docs_nav.scss @@ -0,0 +1,332 @@ +.docsNavContainer { + background: $sidenav; + height: 35px; + left: 0; + position: fixed; + width: 100%; + z-index: 100; +} + +.docMainWrapper { + .wrapper { + &.mainWrapper { + padding-left: 0; + padding-right: 0; + padding-top: 10px; + } + } +} + +.docsSliderActive { + .docsNavContainer { + box-sizing: border-box; + height: 100%; + overflow-y: auto; + -webkit-overflow-scrolling: touch; + padding-bottom: 50px; + } + + .mainContainer { + display: none; + } +} + +.navBreadcrumb { + box-sizing: border-box; + display: flex; + flex-flow: row nowrap; + font-size: 12px; + height: 35px; + overflow: hidden; + padding: 5px 10px; + + a, span { + border: 0; + color: $sidenav-text; + } + + i { + padding: 0 3px; + } +} + +nav.toc { + position: relative; + + section { + padding: 0px; + position: relative; + + .navGroups { + display: none; + padding: 40px 10px 10px; + } + } + + .toggleNav { + background: $sidenav; + color: $sidenav-text; + position: relative; + transition: background-color 0.3s, color 0.3s; + + .navToggle { + cursor: pointer; + height: 
24px; + margin-right: 10px; + position: relative; + text-align: left; + width: 18px; + + &::before, &::after { + content: ""; + position: absolute; + top: 50%; + left: 0; + left: 8px; + width: 3px; + height: 6px; + border: 5px solid $sidenav-text; + border-width: 5px 0; + margin-top: -8px; + transform: rotate(45deg); + z-index: 1; + } + + &::after { + transform: rotate(-45deg); + } + + i { + &::before, &::after { + content: ""; + position: absolute; + top: 50%; + left: 2px; + background: transparent; + border-width: 0 5px 5px; + border-style: solid; + border-color: transparent $sidenav-text; + height: 0; + margin-top: -7px; + opacity: 1; + width: 5px; + z-index: 10; + } + + &::after { + border-width: 5px 5px 0; + margin-top: 2px; + } + } + } + + .navGroup { + background: $sidenav-overlay; + margin: 1px 0; + + ul { + display: none; + } + + h3 { + background: $sidenav-overlay; + color: $sidenav-text; + cursor: pointer; + font-size: 14px; + font-weight: 400; + line-height: 1.2em; + padding: 10px; + transition: color 0.2s; + + i:not(:empty) { + width: 16px; + height: 16px; + display: inline-block; + box-sizing: border-box; + text-align: center; + color: rgba($sidenav-text, 0.5); + margin-right: 10px; + transition: color 0.2s; + } + + &:hover { + color: $primary-bg; + + i:not(:empty) { + color: $primary-bg; + } + } + } + + &.navGroupActive { + background: $sidenav-active; + color: $sidenav-text; + + ul { + display: block; + padding-bottom: 10px; + padding-top: 10px; + } + + h3 { + background: $primary-bg; + color: $primary-overlay; + + i { + display: none; + } + } + } + } + + ul { + padding-left: 0; + padding-right: 24px; + + li { + list-style-type: none; + padding-bottom: 0; + padding-left: 0; + + a { + border: none; + color: $sidenav-text; + display: inline-block; + font-size: 14px; + line-height: 1.1em; + margin: 2px 10px 5px; + padding: 5px 0 2px; + transition: color 0.3s; + + &:hover, + &:focus { + color: $primary-bg; + } + + &.navItemActive { + color: $primary-bg; + font-weight: 900; + } + } + } + } + } + + .toggleNavActive { + .navBreadcrumb { + background: $sidenav; + margin-bottom: 20px; + position: fixed; + width: 100%; + } + + section { + .navGroups { + display: block; + } + } + + + .navToggle { + &::before, &::after { + border-width: 6px 0; + height: 0px; + margin-top: -6px; + } + + i { + opacity: 0; + } + } + } +} + +.docsNavVisible { + .navPusher { + .mainContainer { + padding-top: 35px; + } + } +} + +@media only screen and (min-width: 900px) { + .navBreadcrumb { + padding: 5px 0; + } + + nav.toc { + section { + .navGroups { + padding: 40px 0 0; + } + } + } +} + +@media only screen and (min-width: 1024px) { + .navToggle { + display: none; + } + + .docsSliderActive { + .mainContainer { + display: block; + } + } + + .docsNavVisible { + .navPusher { + .mainContainer { + padding-top: 0; + } + } + } + + .docsNavContainer { + background: none; + box-sizing: border-box; + height: auto; + margin: 40px 40px 0 0; + overflow-y: auto; + position: relative; + width: 300px; + } + + nav.toc { + section { + .navGroups { + display: block; + padding-top: 0px; + } + } + + .toggleNavActive { + .navBreadcrumb { + margin-bottom: 0; + position: relative; + } + } + } + + .docMainWrapper { + display: flex; + flex-flow: row nowrap; + margin-bottom: 40px; + + .wrapper { + padding-left: 0; + padding-right: 0; + + &.mainWrapper { + padding-top: 0; + } + } + } + + .navBreadcrumb { + display: none; + h2 { + padding: 0 10px; + } + } +} \ No newline at end of file diff --git 
a/thirdparty/rocksdb/docs/_sass/_react_header_nav.scss b/thirdparty/rocksdb/docs/_sass/_react_header_nav.scss new file mode 100644 index 0000000000..13c0e562b7 --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_react_header_nav.scss @@ -0,0 +1,141 @@ +.navigationFull { + display: none; +} + +.navigationSlider { + position: absolute; + right: 0px; + + .navSlideout { + cursor: pointer; + padding-top: 4px; + position: absolute; + right: 10px; + top: 0; + transition: top 0.3s; + z-index: 101; + } + + .slidingNav { + background: $secondary-bg; + box-sizing: border-box; + height: 0px; + overflow-x: hidden; + padding: 0; + position: absolute; + right: 0px; + top: 0; + transition: height 0.3s cubic-bezier(0.68, -0.55, 0.265, 1.55), width 0.3s cubic-bezier(0.68, -0.55, 0.265, 1.55); + width: 0; + + ul { + flex-flow: column nowrap; + list-style: none; + padding: 10px; + + li { + margin: 0; + padding: 2px 0; + + a { + color: $primary-bg; + display: inline; + margin: 3px 5px; + padding: 2px 0px; + transition: background-color 0.3s; + + &:focus, + &:hover { + border-bottom: 2px solid $primary-bg; + } + } + } + } + } + + .navSlideoutActive { + .slidingNav { + height: auto; + padding-top: $header-height + $header-pbot; + width: 300px; + } + + .navSlideout { + top: -2px; + .menuExpand { + span:nth-child(1) { + background-color: $text; + top: 16px; + transform: rotate(45deg); + } + span:nth-child(2) { + opacity: 0; + } + span:nth-child(3) { + background-color: $text; + transform: rotate(-45deg); + } + } + } + } +} + +.menuExpand { + display: flex; + flex-flow: column nowrap; + height: 20px; + justify-content: space-between; + + span { + background: $primary-overlay; + border-radius: 3px; + display: block; + flex: 0 0 4px; + height: 4px; + position: relative; + top: 0; + transition: background-color 0.3s, top 0.3s, opacity 0.3s, transform 0.3s; + width: 20px; + } +} + +.navPusher { + border-top: $header-height + $header-ptop + $header-pbot solid $primary-bg; + position: relative; + left: 0; + z-index: 99; + height: 100%; + + &::after { + position: absolute; + top: 0; + right: 0; + width: 0; + height: 0; + background: rgba(0,0,0,0.4); + content: ''; + opacity: 0; + -webkit-transition: opacity 0.5s, width 0.1s 0.5s, height 0.1s 0.5s; + transition: opacity 0.5s, width 0.1s 0.5s, height 0.1s 0.5s; + } + + .sliderActive &::after { + width: 100%; + height: 100%; + opacity: 1; + -webkit-transition: opacity 0.5s; + transition: opacity 0.5s; + z-index: 100; + } +} + + +@media only screen and (min-width: 1024px) { + .navigationFull { + display: block; + } + + .navigationSlider { + display: none; + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_reset.scss b/thirdparty/rocksdb/docs/_sass/_reset.scss new file mode 100644 index 0000000000..0e5f2e0c1d --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_reset.scss @@ -0,0 +1,43 @@ +html, body, div, span, applet, object, iframe, +h1, h2, h3, h4, h5, h6, p, blockquote, pre, +a, abbr, acronym, address, big, cite, code, +del, dfn, em, img, ins, kbd, q, s, samp, +small, strike, strong, sub, sup, tt, var, +b, u, i, center, +dl, dt, dd, ol, ul, li, +fieldset, form, label, legend, +table, caption, tbody, tfoot, thead, tr, th, td, +article, aside, canvas, details, embed, +figure, figcaption, footer, header, hgroup, +menu, nav, output, ruby, section, summary, +time, mark, audio, video { + margin: 0; + padding: 0; + border: 0; + font-size: 100%; + font: inherit; + vertical-align: baseline; +} +/* HTML5 display-role reset for older browsers */ +article, 
aside, details, figcaption, figure, +footer, header, hgroup, menu, nav, section { + display: block; +} +body { + line-height: 1; +} +ol, ul { + list-style: none; +} +blockquote, q { + quotes: none; +} +blockquote:before, blockquote:after, +q:before, q:after { + content: ''; + content: none; +} +table { + border-collapse: collapse; + border-spacing: 0; +} diff --git a/thirdparty/rocksdb/docs/_sass/_search.scss b/thirdparty/rocksdb/docs/_sass/_search.scss new file mode 100644 index 0000000000..eadfa11d1e --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_search.scss @@ -0,0 +1,142 @@ +input[type="search"] { + -moz-appearance: none; + -webkit-appearance: none; +} + +.navSearchWrapper { + align-self: center; + position: relative; + + &::before { + border: 3px solid $primary-overlay-special; + border-radius: 50%; + content: " "; + display: block; + height: 6px; + left: 15px; + width: 6px; + position: absolute; + top: 4px; + z-index: 1; + } + + &::after { + background: $primary-overlay-special; + content: " "; + height: 7px; + left: 24px; + position: absolute; + transform: rotate(-45deg); + top: 12px; + width: 3px; + z-index: 1; + } + + .aa-dropdown-menu { + background: $secondary-bg; + border: 3px solid rgba($text, 0.25); + color: $text; + font-size: 14px; + left: auto !important; + line-height: 1.2em; + right: 0 !important; + + .algolia-docsearch-suggestion--category-header { + background: $primary-overlay-special; + color: $primary-bg; + + .algolia-docsearch-suggestion--highlight { + background-color: $primary-bg; + color: $primary-overlay; + } + } + + .algolia-docsearch-suggestion--title .algolia-docsearch-suggestion--highlight, + .algolia-docsearch-suggestion--subcategory-column .algolia-docsearch-suggestion--highlight { + color: $primary-bg; + } + + .algolia-docsearch-suggestion__secondary, + .algolia-docsearch-suggestion--subcategory-column { + border-color: rgba($text, 0.3); + } + } +} + +input#search_input { + padding-left: 25px; + font-size: 14px; + line-height: 20px; + border-radius: 20px; + background-color: rgba($primary-overlay-special, 0.25); + border: none; + color: rgba($primary-overlay-special, 0); + outline: none; + position: relative; + transition: background-color .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), width .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), color .2s ease; + width: 60px; + + &:focus, &:active { + background-color: $secondary-bg; + color: $text; + width: 240px; + } +} + +.navigationSlider { + .navSearchWrapper { + &::before { + left: 6px; + top: 6px; + } + + &::after { + left: 15px; + top: 14px; + } + } + + input#search_input_react { + box-sizing: border-box; + padding-left: 25px; + font-size: 14px; + line-height: 20px; + border-radius: 20px; + background-color: rgba($primary-overlay-special, 0.25); + border: none; + color: $text; + outline: none; + position: relative; + transition: background-color .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), width .2s cubic-bezier(0.68, -0.55, 0.265, 1.55), color .2s ease; + width: 100%; + + &:focus, &:active { + background-color: $primary-bg; + color: $primary-overlay; + } + } + + .algolia-docsearch-suggestion--subcategory-inline { + display: none; + } + + & > span { + width: 100%; + } + + .aa-dropdown-menu { + background: $secondary-bg; + border: 0px solid $secondary-bg; + color: $text; + font-size: 12px; + line-height: 2em; + max-height: 140px; + min-width: auto; + overflow-y: scroll; + -webkit-overflow-scrolling: touch; + padding: 0; + border-radius: 0; + position: relative !important; + width: 100%; + } +} \ No newline 
at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_slideshow.scss b/thirdparty/rocksdb/docs/_sass/_slideshow.scss new file mode 100644 index 0000000000..cd98a6cdba --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_slideshow.scss @@ -0,0 +1,48 @@ +.slideshow { + position: relative; + + .slide { + display: none; + + img { + display: block; + margin: 0 auto; + } + + &.slideActive { + display: block; + } + + a { + border: none; + display: block; + } + } + + .pagination { + display: block; + margin: -10px; + padding: 1em 0; + text-align: center; + width: 100%; + + .pager { + background: transparent; + border: 2px solid rgba(255, 255, 255, 0.5); + border-radius: 50%; + cursor: pointer; + display: inline-block; + height: 12px; + margin: 10px; + transition: background-color 0.3s, border-color 0.3s; + width: 12px; + + &.pagerActive { + background: rgba(255, 255, 255, 0.5); + border-width: 4px; + height: 8px; + width: 8px; + } + } + } +} diff --git a/thirdparty/rocksdb/docs/_sass/_syntax-highlighting.scss b/thirdparty/rocksdb/docs/_sass/_syntax-highlighting.scss new file mode 100644 index 0000000000..e55c88a2ea --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_syntax-highlighting.scss @@ -0,0 +1,129 @@ + + +.rougeHighlight { background-color: $code-bg; color: #93a1a1 } +.rougeHighlight .c { color: #586e75 } /* Comment */ +.rougeHighlight .err { color: #93a1a1 } /* Error */ +.rougeHighlight .g { color: #93a1a1 } /* Generic */ +.rougeHighlight .k { color: #859900 } /* Keyword */ +.rougeHighlight .l { color: #93a1a1 } /* Literal */ +.rougeHighlight .n { color: #93a1a1 } /* Name */ +.rougeHighlight .o { color: #859900 } /* Operator */ +.rougeHighlight .x { color: #cb4b16 } /* Other */ +.rougeHighlight .p { color: #93a1a1 } /* Punctuation */ +.rougeHighlight .cm { color: #586e75 } /* Comment.Multiline */ +.rougeHighlight .cp { color: #859900 } /* Comment.Preproc */ +.rougeHighlight .c1 { color: #72c02c; } /* Comment.Single */ +.rougeHighlight .cs { color: #859900 } /* Comment.Special */ +.rougeHighlight .gd { color: #2aa198 } /* Generic.Deleted */ +.rougeHighlight .ge { color: #93a1a1; font-style: italic } /* Generic.Emph */ +.rougeHighlight .gr { color: #dc322f } /* Generic.Error */ +.rougeHighlight .gh { color: #cb4b16 } /* Generic.Heading */ +.rougeHighlight .gi { color: #859900 } /* Generic.Inserted */ +.rougeHighlight .go { color: #93a1a1 } /* Generic.Output */ +.rougeHighlight .gp { color: #93a1a1 } /* Generic.Prompt */ +.rougeHighlight .gs { color: #93a1a1; font-weight: bold } /* Generic.Strong */ +.rougeHighlight .gu { color: #cb4b16 } /* Generic.Subheading */ +.rougeHighlight .gt { color: #93a1a1 } /* Generic.Traceback */ +.rougeHighlight .kc { color: #cb4b16 } /* Keyword.Constant */ +.rougeHighlight .kd { color: #268bd2 } /* Keyword.Declaration */ +.rougeHighlight .kn { color: #859900 } /* Keyword.Namespace */ +.rougeHighlight .kp { color: #859900 } /* Keyword.Pseudo */ +.rougeHighlight .kr { color: #268bd2 } /* Keyword.Reserved */ +.rougeHighlight .kt { color: #dc322f } /* Keyword.Type */ +.rougeHighlight .ld { color: #93a1a1 } /* Literal.Date */ +.rougeHighlight .m { color: #2aa198 } /* Literal.Number */ +.rougeHighlight .s { color: #2aa198 } /* Literal.String */ +.rougeHighlight .na { color: #93a1a1 } /* Name.Attribute */ +.rougeHighlight .nb { color: #B58900 } /* Name.Builtin */ +.rougeHighlight .nc { color: #268bd2 } /* Name.Class */ +.rougeHighlight .no { color: #cb4b16 } /* Name.Constant */ +.rougeHighlight .nd { color: #268bd2 } /* Name.Decorator */ +.rougeHighlight .ni { 
color: #cb4b16 } /* Name.Entity */ +.rougeHighlight .ne { color: #cb4b16 } /* Name.Exception */ +.rougeHighlight .nf { color: #268bd2 } /* Name.Function */ +.rougeHighlight .nl { color: #93a1a1 } /* Name.Label */ +.rougeHighlight .nn { color: #93a1a1 } /* Name.Namespace */ +.rougeHighlight .nx { color: #93a1a1 } /* Name.Other */ +.rougeHighlight .py { color: #93a1a1 } /* Name.Property */ +.rougeHighlight .nt { color: #268bd2 } /* Name.Tag */ +.rougeHighlight .nv { color: #268bd2 } /* Name.Variable */ +.rougeHighlight .ow { color: #859900 } /* Operator.Word */ +.rougeHighlight .w { color: #93a1a1 } /* Text.Whitespace */ +.rougeHighlight .mf { color: #2aa198 } /* Literal.Number.Float */ +.rougeHighlight .mh { color: #2aa198 } /* Literal.Number.Hex */ +.rougeHighlight .mi { color: #2aa198 } /* Literal.Number.Integer */ +.rougeHighlight .mo { color: #2aa198 } /* Literal.Number.Oct */ +.rougeHighlight .sb { color: #586e75 } /* Literal.String.Backtick */ +.rougeHighlight .sc { color: #2aa198 } /* Literal.String.Char */ +.rougeHighlight .sd { color: #93a1a1 } /* Literal.String.Doc */ +.rougeHighlight .s2 { color: #2aa198 } /* Literal.String.Double */ +.rougeHighlight .se { color: #cb4b16 } /* Literal.String.Escape */ +.rougeHighlight .sh { color: #93a1a1 } /* Literal.String.Heredoc */ +.rougeHighlight .si { color: #2aa198 } /* Literal.String.Interpol */ +.rougeHighlight .sx { color: #2aa198 } /* Literal.String.Other */ +.rougeHighlight .sr { color: #dc322f } /* Literal.String.Regex */ +.rougeHighlight .s1 { color: #2aa198 } /* Literal.String.Single */ +.rougeHighlight .ss { color: #2aa198 } /* Literal.String.Symbol */ +.rougeHighlight .bp { color: #268bd2 } /* Name.Builtin.Pseudo */ +.rougeHighlight .vc { color: #268bd2 } /* Name.Variable.Class */ +.rougeHighlight .vg { color: #268bd2 } /* Name.Variable.Global */ +.rougeHighlight .vi { color: #268bd2 } /* Name.Variable.Instance */ +.rougeHighlight .il { color: #2aa198 } /* Literal.Number.Integer.Long */ + +.highlighter-rouge { + color: darken(#72c02c, 8%); + font: 800 12px/1.5em Hack, monospace; + max-width: 100%; + + .rougeHighlight { + border-radius: 3px; + margin: 20px 0; + padding: 0px; + overflow-x: scroll; + -webkit-overflow-scrolling: touch; + + table { + background: none; + border: none; + + tbody { + tr { + background: none; + display: flex; + flex-flow: row nowrap; + + td { + display: block; + flex: 1 1; + + &.gutter { + border-right: 1px solid lighten($code-bg, 10%); + color: lighten($code-bg, 15%); + margin-right: 10px; + max-width: 40px; + padding-right: 10px; + + pre { + max-width: 20px; + } + } + } + } + } + } + } +} + +p > .highlighter-rouge, +li > .highlighter-rouge, +a > .highlighter-rouge { + font-size: 16px; + font-weight: 400; + line-height: inherit; +} + +a:hover { + .highlighter-rouge { + color: white; + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_sass/_tables.scss b/thirdparty/rocksdb/docs/_sass/_tables.scss new file mode 100644 index 0000000000..f847c70137 --- /dev/null +++ b/thirdparty/rocksdb/docs/_sass/_tables.scss @@ -0,0 +1,47 @@ +table { + background: $lightergrey; + border: 1px solid $lightgrey; + border-collapse: collapse; + display:table; + margin: 20px 0; + + thead { + border-bottom: 1px solid $lightgrey; + display: table-header-group; + } + tbody { + display: table-row-group; + } + tr { + display: table-row; + &:nth-of-type(odd) { + background: $greyish; + } + + th, td { + border-right: 1px dotted $lightgrey; + display: table-cell; + font-size: 14px; + line-height: 1.3em; + padding: 
10px; + text-align: left; + + &:last-of-type { + border-right: 0; + } + + code { + color: $green; + display: inline-block; + font-size: 12px; + } + } + + th { + color: #000000; + font-weight: bold; + font-family: $header-font-family; + text-transform: uppercase; + } + } +} \ No newline at end of file diff --git a/thirdparty/rocksdb/docs/_top-level/support.md b/thirdparty/rocksdb/docs/_top-level/support.md new file mode 100644 index 0000000000..64165751fe --- /dev/null +++ b/thirdparty/rocksdb/docs/_top-level/support.md @@ -0,0 +1,22 @@ +--- +layout: top-level +title: Support +id: support +category: support +--- + +## Need help? + +Do not hesitate to ask questions if you are having trouble with RocksDB. + +### GitHub issues + +Use [GitHub issues](https://github.com/facebook/rocksdb/issues) to report bugs, issues and feature requests for the RocksDB codebase. + +### Facebook Group + +Use the [RocksDB Facebook group](https://www.facebook.com/groups/rocksdb.dev/) for general questions and discussion about RocksDB. + +### FAQ + +Check out a list of [commonly asked questions](/docs/support/faq) about RocksDB. diff --git a/thirdparty/rocksdb/docs/blog/all.html b/thirdparty/rocksdb/docs/blog/all.html new file mode 100644 index 0000000000..3be2d3bff2 --- /dev/null +++ b/thirdparty/rocksdb/docs/blog/all.html @@ -0,0 +1,20 @@ +--- +id: all +layout: blog +category: blog +--- + +
+<div class="blogContainer">
+  <div class="wrapper">
+    <h1>All Posts</h1>
+    {% for post in site.posts %}
+    {% assign author = site.data.authors[post.author] %}
+    <p>
+      <a href="{{ site.baseurl }}{{ post.url }}">{{ post.title }}</a>
+      on {{ post.date | date: "%B %e, %Y" }} by {{ author.display_name }}
+    </p>
+    {% endfor %}
+  </div>
+</div>
diff --git a/thirdparty/rocksdb/docs/blog/index.html b/thirdparty/rocksdb/docs/blog/index.html new file mode 100644 index 0000000000..9f6b25d03c --- /dev/null +++ b/thirdparty/rocksdb/docs/blog/index.html @@ -0,0 +1,12 @@ +--- +id: blog +title: Blog +layout: blog +category: blog +--- + +
+ {% for page in site.posts %} + {% include post.html truncate=true %} + {% endfor %} +
diff --git a/thirdparty/rocksdb/docs/css/main.scss b/thirdparty/rocksdb/docs/css/main.scss new file mode 100644 index 0000000000..48a3e14ef9 --- /dev/null +++ b/thirdparty/rocksdb/docs/css/main.scss @@ -0,0 +1,149 @@ +--- +# Only the main Sass file needs front matter (the dashes are enough) +--- +@charset "utf-8"; + +@font-face { + font-family: 'Lato'; + src: url("{{ '/static/fonts/LatoLatin-Italic.woff2' }}") format('woff2'), + url("{{ '/static/fonts/LatoLatin-Italic.woff' }}") format('woff'); + font-weight: normal; + font-style: italic; +} + +@font-face { + font-family: 'Lato'; + src: url("{{ '/static/fonts/LatoLatin-Black.woff2' }}") format('woff2'), + url("{{ '/static/fonts/LatoLatin-Black.woff' }}") format('woff'); + font-weight: 900; + font-style: normal; +} + +@font-face { + font-family: 'Lato'; + src: url("{{ '/static/fonts/LatoLatin-BlackItalic.woff2' }}") format('woff2'), + url("{{ '/static/fonts/LatoLatin-BlackItalic.woff' }}") format('woff'); + font-weight: 900; + font-style: italic; +} + +@font-face { + font-family: 'Lato'; + src: url("{{ '/static/fonts/LatoLatin-Light.woff2' }}") format('woff2'), + url("{{ '/static/fonts/LatoLatin-Light.woff' }}") format('woff'); + font-weight: 300; + font-style: normal; +} + +@font-face { + font-family: 'Lato'; + src: url("{{ '/static/fonts/LatoLatin-Regular.woff2' }}") format('woff2'), + url("{{ '/static/fonts/LatoLatin-Regular.woff' }}") format('woff'); + font-weight: normal; + font-style: normal; +} + +// Our variables +$base-font-family: 'Lato', Calibri, Arial, sans-serif; +$header-font-family: 'Lato', 'Helvetica Neue', Arial, sans-serif; +$base-font-size: 18px; +$small-font-size: $base-font-size * 0.875; +$base-line-height: 1.4em; + +$spacing-unit: 12px; + +// Two configured colors (see _config.yml) +$primary-bg: {{ site.color.primary }}; +$secondary-bg: {{ site.color.secondary }}; + +// $primary-bg overlays +{% if site.color.primary-overlay == 'light' %} +$primary-overlay: darken($primary-bg, 70%); +$primary-overlay-special: darken($primary-bg, 40%); +{% else %} +$primary-overlay: #fff; +$primary-overlay-special: lighten($primary-bg, 30%); +{% endif %} + +// $secondary-bg overlays +{% if site.color.secondary-overlay == 'light' %} +$text: #393939; +$sidenav: darken($secondary-bg, 20%); +$sidenav-text: $text; +$sidenav-overlay: darken($sidenav, 10%); +$sidenav-active: lighten($sidenav, 10%); +{% else %} +$text: #fff; +$sidenav: lighten($secondary-bg, 20%); +$sidenav-text: $text; +$sidenav-overlay: lighten($sidenav, 10%); +$sidenav-active: darken($sidenav, 10%); +{% endif %} + +$code-bg: #002b36; + +$header-height: 34px; +$header-ptop: 10px; +$header-pbot: 8px; + +// Width of the content area +$content-width: 900px; + +// Table setting variables +$lightergrey: #F8F8F8; +$greyish: #E8E8E8; +$lightgrey: #B0B0B0; +$green: #2db04b; + +// Using media queries with like this: +// @include media-query($on-palm) { +// .wrapper { +// padding-right: $spacing-unit / 2; +// padding-left: $spacing-unit / 2; +// } +// } +@mixin media-query($device) { + @media screen and (max-width: $device) { + @content; + } +} + + + +// Import partials from `sass_dir` (defaults to `_sass`) +@import + "reset", + "base", + "header", + "search", + "syntax-highlighting", + "promo", + "buttons", + "gridBlock", + "poweredby", + "footer", + "react_header_nav", + "react_docs_nav", + "tables", + "blog" +; + +// Anchor links +// http://ben.balter.com/2014/03/13/pages-anchor-links/ +.header-link { + position: absolute; + margin-left: 0.2em; + opacity: 0; + + -webkit-transition: 
opacity 0.2s ease-in-out 0.1s; + -moz-transition: opacity 0.2s ease-in-out 0.1s; + -ms-transition: opacity 0.2s ease-in-out 0.1s; +} + +h2:hover .header-link, +h3:hover .header-link, +h4:hover .header-link, +h5:hover .header-link, +h6:hover .header-link { + opacity: 1; +} diff --git a/thirdparty/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md b/thirdparty/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md new file mode 100644 index 0000000000..ef954d63a7 --- /dev/null +++ b/thirdparty/rocksdb/docs/doc-type-examples/2016-04-07-blog-post-example.md @@ -0,0 +1,21 @@ +--- +title: Blog Post Example +layout: post +author: exampleauthor +category: blog +--- + +Any local blog posts would go in the `_posts` directory. + +This is an example blog post introduction, try to keep it short and about a paragraph long, to encourage people to click through to read the entire post. + +<!--truncate--> + +Everything below the `<!--truncate-->` tag will only show on the actual blog post page, not on the `/blog/` index. + +Author is defined in `_data/authors.yml` + + +## No posts? + +If you have no blog for your site, you can remove the entire `_posts` folder. Otherwise add markdown files in here. See CONTRIBUTING.md for details. diff --git a/thirdparty/rocksdb/docs/doc-type-examples/docs-hello-world.md b/thirdparty/rocksdb/docs/doc-type-examples/docs-hello-world.md new file mode 100644 index 0000000000..c7094ba5af --- /dev/null +++ b/thirdparty/rocksdb/docs/doc-type-examples/docs-hello-world.md @@ -0,0 +1,12 @@ +--- +docid: hello-world +title: Hello, World! +layout: docs +permalink: /docs/hello-world.html +--- + +Any local docs would go in the `_docs` directory. + +## No documentation? + +If you have no documentation for your site, you can remove the entire `_docs` folder. Otherwise add markdown files in here. See CONTRIBUTING.md for details. diff --git a/thirdparty/rocksdb/docs/doc-type-examples/top-level-example.md b/thirdparty/rocksdb/docs/doc-type-examples/top-level-example.md new file mode 100644 index 0000000000..67b1fa7110 --- /dev/null +++ b/thirdparty/rocksdb/docs/doc-type-examples/top-level-example.md @@ -0,0 +1,8 @@ +--- +layout: top-level +title: Support Example +id: top-level-example +category: top-level +--- + +This is a static page disconnected from the blog or docs collections that can be added at a top-level (i.e., the same level as `index.md`). 
diff --git a/thirdparty/rocksdb/docs/docs/index.html b/thirdparty/rocksdb/docs/docs/index.html new file mode 100644 index 0000000000..fa6ec8b5a6 --- /dev/null +++ b/thirdparty/rocksdb/docs/docs/index.html @@ -0,0 +1,6 @@ +--- +id: docs +title: Docs +layout: redirect +destination: getting-started.html +--- diff --git a/thirdparty/rocksdb/docs/feed.xml b/thirdparty/rocksdb/docs/feed.xml new file mode 100644 index 0000000000..725f00566c --- /dev/null +++ b/thirdparty/rocksdb/docs/feed.xml @@ -0,0 +1,30 @@ +--- +layout: null +---
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+<channel>
+  <title>{{ site.title | xml_escape }}</title>
+  <description>{{ site.description | xml_escape }}</description>
+  <link>https://rocksdb.org/feed.xml</link>
+  <atom:link href="https://rocksdb.org/feed.xml" rel="self" type="application/rss+xml" />
+  <pubDate>{{ site.time | date_to_rfc822 }}</pubDate>
+  <lastBuildDate>{{ site.time | date_to_rfc822 }}</lastBuildDate>
+  <generator>Jekyll v{{ jekyll.version }}</generator>
+  {% for post in site.posts limit:10 %}
+  <item>
+    <title>{{ post.title | xml_escape }}</title>
+    <description>{{ post.content | xml_escape }}</description>
+    <pubDate>{{ post.date | date_to_rfc822 }}</pubDate>
+    <link>{{ post.url | absolute_url }}</link>
+    <guid isPermaLink="true">{{ post.url | absolute_url }}</guid>
+    {% for tag in post.tags %}
+    <category>{{ tag | xml_escape }}</category>
+    {% endfor %}
+    {% for cat in post.categories %}
+    <category>{{ cat | xml_escape }}</category>
+    {% endfor %}
+  </item>
+  {% endfor %}
+</channel>
+</rss> diff --git a/thirdparty/rocksdb/docs/index.md b/thirdparty/rocksdb/docs/index.md new file mode 100644 index 0000000000..2b9570d230 --- /dev/null +++ b/thirdparty/rocksdb/docs/index.md @@ -0,0 +1,9 @@ +--- +layout: home +title: RocksDB | A persistent key-value store +id: home +--- + +## Features + +{% include content/gridblocks.html data_source=site.data.features align="center" %} diff --git a/thirdparty/rocksdb/docs/static/favicon.png b/thirdparty/rocksdb/docs/static/favicon.png new file mode 100644 index 0000000000..7f668f38f7 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/favicon.png differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff new file mode 100644 index 0000000000..d1e2579bf8 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff2 b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff2 new file mode 100644 index 0000000000..4127b4d0b9 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Black.woff2 differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff new file mode 100644 index 0000000000..142c1c9c48 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2 b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2 new file mode 100644 index 0000000000..e9862e6909 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-BlackItalic.woff2 differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff new file mode 100644 index 0000000000..d8cf84c8b9 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2 b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2 new file mode 100644 index 0000000000..aaa5a35c3d Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Italic.woff2 differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff 
b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff new file mode 100644 index 0000000000..e7d4278cce Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff2 b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff2 new file mode 100644 index 0000000000..b6d028836e Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Light.woff2 differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff new file mode 100644 index 0000000000..bf73a6d9f9 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff differ diff --git a/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2 b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2 new file mode 100644 index 0000000000..a4d084bfb7 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/fonts/LatoLatin-Regular.woff2 differ diff --git a/thirdparty/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg b/thirdparty/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg new file mode 100644 index 0000000000..9f93151019 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/Resize-of-20140327_200754-300x225.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/binaryseek.png b/thirdparty/rocksdb/docs/static/images/binaryseek.png new file mode 100644 index 0000000000..0e213f0482 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/binaryseek.png differ diff --git a/thirdparty/rocksdb/docs/static/images/compaction/full-range.png b/thirdparty/rocksdb/docs/static/images/compaction/full-range.png new file mode 100644 index 0000000000..5b2c9fc61e Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/compaction/full-range.png differ diff --git a/thirdparty/rocksdb/docs/static/images/compaction/l0-l1-contend.png b/thirdparty/rocksdb/docs/static/images/compaction/l0-l1-contend.png new file mode 100644 index 0000000000..bcf8ec73a7 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/compaction/l0-l1-contend.png differ diff --git a/thirdparty/rocksdb/docs/static/images/compaction/l1-l2-contend.png b/thirdparty/rocksdb/docs/static/images/compaction/l1-l2-contend.png new file mode 100644 index 0000000000..6dafbbbf29 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/compaction/l1-l2-contend.png differ diff --git a/thirdparty/rocksdb/docs/static/images/compaction/part-range-old.png b/thirdparty/rocksdb/docs/static/images/compaction/part-range-old.png new file mode 100644 index 0000000000..1cc723d13b Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/compaction/part-range-old.png differ diff --git a/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png new file mode 100644 index 0000000000..0e213f0482 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-binary-seek.png differ diff --git a/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png new file mode 100644 index 0000000000..accb8639e8 Binary files /dev/null and 
b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/block-format-hash-index.png differ diff --git a/thirdparty/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png new file mode 100644 index 0000000000..9acc71d8e5 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/hash-index-data-structure.png differ diff --git a/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png new file mode 100644 index 0000000000..71788735d0 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-cache-miss.png differ diff --git a/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png new file mode 100644 index 0000000000..54948af2f8 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/data-block-hash-index/perf-throughput.png differ diff --git a/thirdparty/rocksdb/docs/static/images/delrange/delrange_collapsed.png b/thirdparty/rocksdb/docs/static/images/delrange/delrange_collapsed.png new file mode 100644 index 0000000000..52246c2c1d Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/delrange/delrange_collapsed.png differ diff --git a/thirdparty/rocksdb/docs/static/images/delrange/delrange_key_schema.png b/thirdparty/rocksdb/docs/static/images/delrange/delrange_key_schema.png new file mode 100644 index 0000000000..0a14d4a3a5 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/delrange/delrange_key_schema.png differ diff --git a/thirdparty/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png b/thirdparty/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png new file mode 100644 index 0000000000..6003e42ae8 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/delrange/delrange_sst_blocks.png differ diff --git a/thirdparty/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png b/thirdparty/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png new file mode 100644 index 0000000000..39c7097af9 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/delrange/delrange_uncollapsed.png differ diff --git a/thirdparty/rocksdb/docs/static/images/delrange/delrange_write_path.png b/thirdparty/rocksdb/docs/static/images/delrange/delrange_write_path.png new file mode 100644 index 0000000000..229dfb349a Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/delrange/delrange_write_path.png differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-blockindex.jpg b/thirdparty/rocksdb/docs/static/images/pcache-blockindex.jpg new file mode 100644 index 0000000000..9c18bde93a Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-blockindex.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-fileindex.jpg b/thirdparty/rocksdb/docs/static/images/pcache-fileindex.jpg new file mode 100644 index 0000000000..51f4e095ce Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-fileindex.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-filelayout.jpg b/thirdparty/rocksdb/docs/static/images/pcache-filelayout.jpg new file mode 100644 index 0000000000..771ee60c15 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-filelayout.jpg 
differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-readiopath.jpg b/thirdparty/rocksdb/docs/static/images/pcache-readiopath.jpg new file mode 100644 index 0000000000..4993f0072a Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-readiopath.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-tieredstorage.jpg b/thirdparty/rocksdb/docs/static/images/pcache-tieredstorage.jpg new file mode 100644 index 0000000000..c362a2d693 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-tieredstorage.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/pcache-writeiopath.jpg b/thirdparty/rocksdb/docs/static/images/pcache-writeiopath.jpg new file mode 100644 index 0000000000..561b551811 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/pcache-writeiopath.jpg differ diff --git a/thirdparty/rocksdb/docs/static/images/promo-adapt.svg b/thirdparty/rocksdb/docs/static/images/promo-adapt.svg new file mode 100644 index 0000000000..7cd44434db --- /dev/null +++ b/thirdparty/rocksdb/docs/static/images/promo-adapt.svg @@ -0,0 +1,8 @@ diff --git a/thirdparty/rocksdb/docs/static/images/promo-flash.svg b/thirdparty/rocksdb/docs/static/images/promo-flash.svg new file mode 100644 index 0000000000..79810c30a9 --- /dev/null +++ b/thirdparty/rocksdb/docs/static/images/promo-flash.svg @@ -0,0 +1,28 @@ diff --git a/thirdparty/rocksdb/docs/static/images/promo-operations.svg b/thirdparty/rocksdb/docs/static/images/promo-operations.svg new file mode 100644 index 0000000000..3036294ab9 --- /dev/null +++ b/thirdparty/rocksdb/docs/static/images/promo-operations.svg @@ -0,0 +1,6 @@ diff --git a/thirdparty/rocksdb/docs/static/images/promo-performance.svg b/thirdparty/rocksdb/docs/static/images/promo-performance.svg new file mode 100644 index 0000000000..be8a101203 --- /dev/null +++ b/thirdparty/rocksdb/docs/static/images/promo-performance.svg @@ -0,0 +1,134 @@ +netalloy chequered flag diff --git a/thirdparty/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png b/thirdparty/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png new file mode 100644 index 0000000000..b4b24849cd Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/rate-limiter/auto-tuned-write-KBps-series.png differ diff --git a/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png b/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png new file mode 100644 index 0000000000..742f985bf0 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-cdf.png differ diff --git a/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png b/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png new file mode 100644 index 0000000000..c7bdcb95aa Binary files /dev/null and b/thirdparty/rocksdb/docs/static/images/rate-limiter/write-KBps-series.png differ diff --git a/thirdparty/rocksdb/docs/static/images/tree_example1.png b/thirdparty/rocksdb/docs/static/images/tree_example1.png new file mode 100644 index 0000000000..9f725860c6 Binary files /dev/null and 
b/thirdparty/rocksdb/docs/static/images/tree_example1.png differ diff --git a/thirdparty/rocksdb/docs/static/logo.svg b/thirdparty/rocksdb/docs/static/logo.svg new file mode 100644 index 0000000000..e6e1e8afa0 --- /dev/null +++ b/thirdparty/rocksdb/docs/static/logo.svg @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + diff --git a/thirdparty/rocksdb/docs/static/og_image.png b/thirdparty/rocksdb/docs/static/og_image.png new file mode 100644 index 0000000000..4e2759e617 Binary files /dev/null and b/thirdparty/rocksdb/docs/static/og_image.png differ diff --git a/thirdparty/rocksdb/env/env.cc b/thirdparty/rocksdb/env/env.cc index ae0b111be8..fde03577d2 100644 --- a/thirdparty/rocksdb/env/env.cc +++ b/thirdparty/rocksdb/env/env.cc @@ -22,6 +22,22 @@ namespace rocksdb { Env::~Env() { } +std::string Env::PriorityToString(Env::Priority priority) { + switch (priority) { + case Env::Priority::BOTTOM: + return "Bottom"; + case Env::Priority::LOW: + return "Low"; + case Env::Priority::HIGH: + return "High"; + case Env::Priority::USER: + return "User"; + case Env::Priority::TOTAL: + assert(false); + } + return "Invalid"; +} + uint64_t Env::GetThreadID() const { std::hash hasher; return hasher(std::this_thread::get_id()); @@ -29,7 +45,7 @@ uint64_t Env::GetThreadID() const { Status Env::ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { Status s = RenameFile(old_fname, fname); if (!s.ok()) { @@ -73,9 +89,21 @@ RandomAccessFile::~RandomAccessFile() { WritableFile::~WritableFile() { } -Logger::~Logger() { +MemoryMappedFileBuffer::~MemoryMappedFileBuffer() {} + +Logger::~Logger() {} + +Status Logger::Close() { + if (!closed_) { + closed_ = true; + return CloseImpl(); + } else { + return Status::OK(); + } } +Status Logger::CloseImpl() { return Status::NotSupported(); } + FileLock::~FileLock() { } @@ -85,15 +113,19 @@ void LogFlush(Logger *info_log) { } } -void Log(Logger* info_log, const char* format, ...) { +static void Logv(Logger *info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); - va_end(ap); } } +void Log(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Logv(info_log, format, ap); + va_end(ap); +} + void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) { static const char* kInfoLogLevelNames[5] = { "DEBUG", "INFO", "WARN", "ERROR", "FATAL" }; @@ -108,6 +140,8 @@ void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) // are INFO level. We don't want to add extra costs to those existing // logging. Logv(format, ap); + } else if (log_level == InfoLogLevel::HEADER_LEVEL) { + LogHeader(format, ap); } else { char new_format[500]; snprintf(new_format, sizeof(new_format) - 1, "[%s] %s", @@ -116,157 +150,166 @@ void Logger::Logv(const InfoLogLevel log_level, const char* format, va_list ap) } } - -void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, - ...) 
{ +static void Logv(const InfoLogLevel log_level, Logger *info_log, const char *format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= log_level) { - va_list ap; - va_start(ap, format); - if (log_level == InfoLogLevel::HEADER_LEVEL) { info_log->LogHeader(format, ap); } else { info_log->Logv(log_level, format, ap); } - - va_end(ap); } } -void Header(Logger* info_log, const char* format, ...) { +void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, + ...) { + va_list ap; + va_start(ap, format); + Logv(log_level, info_log, format, ap); + va_end(ap); +} + +static void Headerv(Logger *info_log, const char *format, va_list ap) { if (info_log) { - va_list ap; - va_start(ap, format); info_log->LogHeader(format, ap); - va_end(ap); } } -void Debug(Logger* info_log, const char* format, ...) { +void Header(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log, format, ap); + va_end(ap); +} + +static void Debugv(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); - va_end(ap); } } -void Info(Logger* info_log, const char* format, ...) { +void Debug(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log, format, ap); + va_end(ap); +} + +static void Infov(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); - va_end(ap); } } -void Warn(Logger* info_log, const char* format, ...) { +void Info(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log, format, ap); + va_end(ap); +} + +static void Warnv(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); - va_end(ap); } } -void Error(Logger* info_log, const char* format, ...) { + +void Warn(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Warnv(info_log, format, ap); + va_end(ap); +} + +static void Errorv(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); - va_end(ap); } } -void Fatal(Logger* info_log, const char* format, ...) { + +void Error(Logger* info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Errorv(info_log, format, ap); + va_end(ap); +} + +static void Fatalv(Logger* info_log, const char* format, va_list ap) { if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) { - va_list ap; - va_start(ap, format); info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); - va_end(ap); } } -void LogFlush(const shared_ptr& info_log) { - if (info_log) { - info_log->Flush(); - } +void Fatal(Logger* info_log, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + Fatalv(info_log, format, ap); + va_end(ap); +} + +void LogFlush(const std::shared_ptr& info_log) { + LogFlush(info_log.get()); } -void Log(const InfoLogLevel log_level, const shared_ptr& info_log, +void Log(const InfoLogLevel log_level, const std::shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(log_level, format, ap); - va_end(ap); - } + va_list ap; + va_start(ap, format); + Logv(log_level, info_log.get(), format, ap); + va_end(ap); } -void Header(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->LogHeader(format, ap); - va_end(ap); - } +void Header(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Headerv(info_log.get(), format, ap); + va_end(ap); } -void Debug(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); - va_end(ap); - } +void Debug(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Debugv(info_log.get(), format, ap); + va_end(ap); } -void Info(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); - va_end(ap); - } +void Info(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Infov(info_log.get(), format, ap); + va_end(ap); } -void Warn(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); - va_end(ap); - } +void Warn(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Warnv(info_log.get(), format, ap); + va_end(ap); } -void Error(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); - va_end(ap); - } +void Error(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Errorv(info_log.get(), format, ap); + va_end(ap); } -void Fatal(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); - va_end(ap); - } +void Fatal(const std::shared_ptr& info_log, const char* format, ...) { + va_list ap; + va_start(ap, format); + Fatalv(info_log.get(), format, ap); + va_end(ap); } -void Log(const shared_ptr& info_log, const char* format, ...) { - if (info_log) { - va_list ap; - va_start(ap, format); - info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); - va_end(ap); - } +void Log(const std::shared_ptr& info_log, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + Logv(info_log.get(), format, ap); + va_end(ap); } Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, bool should_sync) { - unique_ptr file; + std::unique_ptr file; EnvOptions soptions; Status s = env->NewWritableFile(fname, &file, soptions); if (!s.ok()) { @@ -285,7 +328,7 @@ Status WriteStringToFile(Env* env, const Slice& data, const std::string& fname, Status ReadFileToString(Env* env, const std::string& fname, std::string* data) { EnvOptions soptions; data->clear(); - unique_ptr file; + std::unique_ptr file; Status s = env->NewSequentialFile(fname, &file, soptions); if (!s.ok()) { return s; @@ -333,6 +376,8 @@ EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options, const DBOptions& db_options) const { EnvOptions optimized_env_options(env_options); optimized_env_options.bytes_per_sync = db_options.wal_bytes_per_sync; + optimized_env_options.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; return optimized_env_options; } @@ -363,8 +408,7 @@ EnvOptions Env::OptimizeForCompactionTableWrite( EnvOptions Env::OptimizeForCompactionTableRead( const EnvOptions& env_options, const ImmutableDBOptions& db_options) const { EnvOptions optimized_env_options(env_options); - optimized_env_options.use_direct_reads = - db_options.use_direct_io_for_flush_and_compaction; + optimized_env_options.use_direct_reads = db_options.use_direct_reads; return optimized_env_options; } diff --git a/thirdparty/rocksdb/env/env_basic_test.cc b/thirdparty/rocksdb/env/env_basic_test.cc index 254c71fadc..3efae758a2 100644 --- a/thirdparty/rocksdb/env/env_basic_test.cc +++ b/thirdparty/rocksdb/env/env_basic_test.cc @@ -21,8 +21,8 @@ class NormalizingEnvWrapper : public EnvWrapper { explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} // Removes . and .. from directory listing - virtual Status GetChildren(const std::string& dir, - std::vector* result) override { + Status GetChildren(const std::string& dir, + std::vector* result) override { Status status = EnvWrapper::GetChildren(dir, result); if (status.ok()) { result->erase(std::remove_if(result->begin(), result->end(), @@ -35,7 +35,7 @@ class NormalizingEnvWrapper : public EnvWrapper { } // Removes . and .. from directory listing - virtual Status GetChildrenFileAttributes( + Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); if (status.ok()) { @@ -57,14 +57,12 @@ class EnvBasicTestWithParam : public testing::Test, std::string test_dir_; EnvBasicTestWithParam() : env_(GetParam()) { - test_dir_ = test::TmpDir(env_) + "/env_basic_test"; + test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() { - env_->CreateDirIfMissing(test_dir_); - } + void SetUp() override { env_->CreateDirIfMissing(test_dir_); } - void TearDown() { + void TearDown() override { std::vector files; env_->GetChildren(test_dir_, &files); for (const auto& file : files) { @@ -133,7 +131,7 @@ INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, TEST_P(EnvBasicTestWithParam, Basics) { uint64_t file_size; - unique_ptr writable_file; + std::unique_ptr writable_file; std::vector children; // Check that the directory is empty. @@ -186,8 +184,8 @@ TEST_P(EnvBasicTestWithParam, Basics) { ASSERT_EQ(0U, file_size); // Check that opening non-existent file fails. 
- unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file, soptions_) .ok()); @@ -208,9 +206,9 @@ TEST_P(EnvBasicTestWithParam, Basics) { } TEST_P(EnvBasicTestWithParam, ReadWrite) { - unique_ptr writable_file; - unique_ptr seq_file; - unique_ptr rand_file; + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; Slice result; char scratch[100]; @@ -247,7 +245,7 @@ TEST_P(EnvBasicTestWithParam, ReadWrite) { } TEST_P(EnvBasicTestWithParam, Misc) { - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_)); // These are no-ops, but we test they return success. @@ -266,14 +264,14 @@ TEST_P(EnvBasicTestWithParam, LargeWrite) { write_data.append(1, static_cast(i)); } - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); ASSERT_OK(writable_file->Append("foo")); ASSERT_OK(writable_file->Append(write_data)); ASSERT_OK(writable_file->Close()); writable_file.reset(); - unique_ptr seq_file; + std::unique_ptr seq_file; Slice result; ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". @@ -340,7 +338,7 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { // if dir is a file, returns IOError ASSERT_OK(env_->CreateDir(test_dir_)); - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK( env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); ASSERT_OK(writable_file->Close()); diff --git a/thirdparty/rocksdb/env/env_chroot.cc b/thirdparty/rocksdb/env/env_chroot.cc index 6a1fda8a83..8a7fb44999 100644 --- a/thirdparty/rocksdb/env/env_chroot.cc +++ b/thirdparty/rocksdb/env/env_chroot.cc @@ -38,9 +38,9 @@ class ChrootEnv : public EnvWrapper { #endif } - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -49,9 +49,9 @@ class ChrootEnv : public EnvWrapper { options); } - virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -60,9 +60,9 @@ class ChrootEnv : public EnvWrapper { options); } - virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -71,10 +71,10 @@ class ChrootEnv : public EnvWrapper { options); } - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - unique_ptr* result, - const EnvOptions& options) override { + Status ReuseWritableFile(const std::string& fname, + const 
std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -88,9 +88,9 @@ class ChrootEnv : public EnvWrapper { options); } - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -99,8 +99,8 @@ class ChrootEnv : public EnvWrapper { options); } - virtual Status NewDirectory(const std::string& dir, - unique_ptr* result) override { + Status NewDirectory(const std::string& dir, + std::unique_ptr* result) override { auto status_and_enc_path = EncodePathWithNewBasename(dir); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -108,7 +108,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::NewDirectory(status_and_enc_path.second, result); } - virtual Status FileExists(const std::string& fname) override { + Status FileExists(const std::string& fname) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -116,8 +116,8 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::FileExists(status_and_enc_path.second); } - virtual Status GetChildren(const std::string& dir, - std::vector* result) override { + Status GetChildren(const std::string& dir, + std::vector* result) override { auto status_and_enc_path = EncodePath(dir); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -125,7 +125,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::GetChildren(status_and_enc_path.second, result); } - virtual Status GetChildrenFileAttributes( + Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { auto status_and_enc_path = EncodePath(dir); if (!status_and_enc_path.first.ok()) { @@ -135,7 +135,7 @@ class ChrootEnv : public EnvWrapper { result); } - virtual Status DeleteFile(const std::string& fname) override { + Status DeleteFile(const std::string& fname) override { auto status_and_enc_path = EncodePath(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -143,7 +143,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::DeleteFile(status_and_enc_path.second); } - virtual Status CreateDir(const std::string& dirname) override { + Status CreateDir(const std::string& dirname) override { auto status_and_enc_path = EncodePathWithNewBasename(dirname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -151,7 +151,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::CreateDir(status_and_enc_path.second); } - virtual Status CreateDirIfMissing(const std::string& dirname) override { + Status CreateDirIfMissing(const std::string& dirname) override { auto status_and_enc_path = EncodePathWithNewBasename(dirname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -159,7 +159,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::CreateDirIfMissing(status_and_enc_path.second); } - virtual Status DeleteDir(const std::string& dirname) override { + Status DeleteDir(const std::string& dirname) override { auto status_and_enc_path = EncodePath(dirname); 
if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -167,8 +167,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::DeleteDir(status_and_enc_path.second); } - virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override { + Status GetFileSize(const std::string& fname, uint64_t* file_size) override { auto status_and_enc_path = EncodePath(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -176,8 +175,8 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::GetFileSize(status_and_enc_path.second, file_size); } - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { auto status_and_enc_path = EncodePath(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -186,8 +185,7 @@ class ChrootEnv : public EnvWrapper { file_mtime); } - virtual Status RenameFile(const std::string& src, - const std::string& dest) override { + Status RenameFile(const std::string& src, const std::string& dest) override { auto status_and_src_enc_path = EncodePath(src); if (!status_and_src_enc_path.first.ok()) { return status_and_src_enc_path.first; @@ -200,8 +198,7 @@ class ChrootEnv : public EnvWrapper { status_and_dest_enc_path.second); } - virtual Status LinkFile(const std::string& src, - const std::string& dest) override { + Status LinkFile(const std::string& src, const std::string& dest) override { auto status_and_src_enc_path = EncodePath(src); if (!status_and_src_enc_path.first.ok()) { return status_and_src_enc_path.first; @@ -214,7 +211,7 @@ class ChrootEnv : public EnvWrapper { status_and_dest_enc_path.second); } - virtual Status LockFile(const std::string& fname, FileLock** lock) override { + Status LockFile(const std::string& fname, FileLock** lock) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -225,7 +222,7 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::LockFile(status_and_enc_path.second, lock); } - virtual Status GetTestDirectory(std::string* path) override { + Status GetTestDirectory(std::string* path) override { // Adapted from PosixEnv's implementation since it doesn't provide a way to // create directory in the chroot. 
char buf[256]; @@ -237,8 +234,8 @@ class ChrootEnv : public EnvWrapper { return Status::OK(); } - virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override { + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override { auto status_and_enc_path = EncodePathWithNewBasename(fname); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; @@ -246,8 +243,8 @@ class ChrootEnv : public EnvWrapper { return EnvWrapper::NewLogger(status_and_enc_path.second, result); } - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { auto status_and_enc_path = EncodePath(db_path); if (!status_and_enc_path.first.ok()) { return status_and_enc_path.first; diff --git a/thirdparty/rocksdb/env/env_encryption.cc b/thirdparty/rocksdb/env/env_encryption.cc index 6b688a6602..aa59e66357 100644 --- a/thirdparty/rocksdb/env/env_encryption.cc +++ b/thirdparty/rocksdb/env/env_encryption.cc @@ -8,6 +8,7 @@ #include #include #include +#include #include "rocksdb/env_encryption.h" #include "util/aligned_buffer.h" @@ -42,7 +43,7 @@ class EncryptedSequentialFile : public SequentialFile { // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization - virtual Status Read(size_t n, Slice* result, char* scratch) override { + Status Read(size_t n, Slice* result, char* scratch) override { assert(scratch); Status status = file_->Read(n, result, scratch); if (!status.ok()) { @@ -60,7 +61,7 @@ class EncryptedSequentialFile : public SequentialFile { // file, and Skip will return OK. // // REQUIRES: External synchronization - virtual Status Skip(uint64_t n) override { + Status Skip(uint64_t n) override { auto status = file_->Skip(n); if (!status.ok()) { return status; @@ -71,26 +72,25 @@ class EncryptedSequentialFile : public SequentialFile { // Indicates the upper layers if the current SequentialFile implementation // uses direct IO. - virtual bool use_direct_io() const override { - return file_->use_direct_io(); - } + bool use_direct_io() const override { return file_->use_direct_io(); } // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); + size_t GetRequiredBufferAlignment() const override { + return file_->GetRequiredBufferAlignment(); } // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- virtual Status InvalidateCache(size_t offset, size_t length) override { + Status InvalidateCache(size_t offset, size_t length) override { return file_->InvalidateCache(offset + prefixLength_, length); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, char* scratch) override { + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { assert(scratch); offset += prefixLength_; // Skip prefix auto status = file_->PositionedRead(offset, n, result, scratch); @@ -101,7 +101,6 @@ class EncryptedSequentialFile : public SequentialFile { status = stream_->Decrypt(offset, (char*)result->data(), result->size()); return status; } - }; // A file abstraction for randomly reading the contents of a file. @@ -125,7 +124,8 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { assert(scratch); offset += prefixLength_; auto status = file_->Read(offset, n, result, scratch); @@ -137,7 +137,7 @@ class EncryptedRandomAccessFile : public RandomAccessFile { } // Readahead the file starting from offset by n bytes for caching. - virtual Status Prefetch(uint64_t offset, size_t n) override { + Status Prefetch(uint64_t offset, size_t n) override { //return Status::OK(); return file_->Prefetch(offset + prefixLength_, n); } @@ -150,37 +150,33 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // may not have been modified. // // This function guarantees, for IDs from a given environment, two unique ids - // cannot be made equal to eachother by adding arbitrary bytes to one of + // cannot be made equal to each other by adding arbitrary bytes to one of // them. That is, no unique ID is the prefix of another. // // This function guarantees that the returned ID will not be interpretable as // a single varint. // // Note: these IDs are only valid for the duration of the process. - virtual size_t GetUniqueId(char* id, size_t max_size) const override { + size_t GetUniqueId(char* id, size_t max_size) const override { return file_->GetUniqueId(id, max_size); }; - virtual void Hint(AccessPattern pattern) override { - file_->Hint(pattern); - } + void Hint(AccessPattern pattern) override { file_->Hint(pattern); } // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - virtual bool use_direct_io() const override { - return file_->use_direct_io(); - } + bool use_direct_io() const override { return file_->use_direct_io(); } // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); + size_t GetRequiredBufferAlignment() const override { + return file_->GetRequiredBufferAlignment(); } // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- virtual Status InvalidateCache(size_t offset, size_t length) override { + Status InvalidateCache(size_t offset, size_t length) override { return file_->InvalidateCache(offset + prefixLength_, length); } }; @@ -247,16 +243,18 @@ class EncryptedWritableFile : public WritableFileWrapper { // Indicates the upper layers if the current WritableFile implementation // uses direct IO. - virtual bool use_direct_io() const override { return file_->use_direct_io(); } + bool use_direct_io() const override { return file_->use_direct_io(); } // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { return file_->GetRequiredBufferAlignment(); } + size_t GetRequiredBufferAlignment() const override { + return file_->GetRequiredBufferAlignment(); + } /* * Get the size of valid data in the file. */ - virtual uint64_t GetFileSize() override { + uint64_t GetFileSize() override { return file_->GetFileSize() - prefixLength_; } @@ -264,7 +262,7 @@ class EncryptedWritableFile : public WritableFileWrapper { // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. - virtual Status Truncate(uint64_t size) override { + Status Truncate(uint64_t size) override { return file_->Truncate(size + prefixLength_); } @@ -272,7 +270,7 @@ class EncryptedWritableFile : public WritableFileWrapper { // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) override { + Status InvalidateCache(size_t offset, size_t length) override { return file_->InvalidateCache(offset + prefixLength_, length); } @@ -282,7 +280,7 @@ class EncryptedWritableFile : public WritableFileWrapper { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override { + Status RangeSync(uint64_t offset, uint64_t nbytes) override { return file_->RangeSync(offset + prefixLength_, nbytes); } @@ -291,12 +289,12 @@ class EncryptedWritableFile : public WritableFileWrapper { // of space on devices where it can result in less file // fragmentation and/or less waste from over-zealous filesystem // pre-allocation. - virtual void PrepareWrite(size_t offset, size_t len) override { + void PrepareWrite(size_t offset, size_t len) override { file_->PrepareWrite(offset + prefixLength_, len); } // Pre-allocates space for a file. 
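// Illustrative sketch, not part of the patch: the Encrypted*File wrappers
// above all follow one pattern -- shift the logical offset past the per-file
// encryption prefix, delegate the raw I/O to the wrapped file, then
// encrypt/decrypt in place, keyed by the *shifted* offset. The toy XOR
// stream and the names kPrefixLength/ToyDecrypt/ReadDecrypted below are
// hypothetical stand-ins, not rocksdb API.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

static const size_t kPrefixLength = 4096;  // hypothetical prefix size

// Toy stand-in for BlockAccessCipherStream::Decrypt().
static void ToyDecrypt(uint64_t fileOffset, char* data, size_t n) {
  for (size_t i = 0; i < n; i++) {
    data[i] ^= static_cast<char>(fileOffset + i);
  }
}

// Mirrors the shape of EncryptedRandomAccessFile::Read(): the offset is
// shifted before the raw read, and the shifted offset keys the decryption.
static bool ReadDecrypted(const std::string& raw_file, uint64_t offset,
                          size_t n, char* scratch) {
  offset += kPrefixLength;                       // skip the encryption prefix
  if (offset + n > raw_file.size()) return false;
  memcpy(scratch, raw_file.data() + offset, n);  // the raw, encrypted read
  ToyDecrypt(offset, scratch, n);                // decrypt at shifted offset
  return true;
}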
- virtual Status Allocate(uint64_t offset, uint64_t len) override { + Status Allocate(uint64_t offset, uint64_t len) override { return file_->Allocate(offset + prefixLength_, len); } }; @@ -314,17 +312,17 @@ class EncryptedRandomRWFile : public RandomRWFile { // Indicates if the class makes use of direct I/O // If false you must pass aligned buffer to Write() - virtual bool use_direct_io() const override { return file_->use_direct_io(); } + bool use_direct_io() const override { return file_->use_direct_io(); } // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); + size_t GetRequiredBufferAlignment() const override { + return file_->GetRequiredBufferAlignment(); } // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. - virtual Status Write(uint64_t offset, const Slice& data) override { + Status Write(uint64_t offset, const Slice& data) override { AlignedBuffer buf; Status status; Slice dataToWrite(data); @@ -347,7 +345,8 @@ class EncryptedRandomRWFile : public RandomRWFile { // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. - virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { assert(scratch); offset += prefixLength_; auto status = file_->Read(offset, n, result, scratch); @@ -358,21 +357,13 @@ class EncryptedRandomRWFile : public RandomRWFile { return status; } - virtual Status Flush() override { - return file_->Flush(); - } + Status Flush() override { return file_->Flush(); } - virtual Status Sync() override { - return file_->Sync(); - } + Status Sync() override { return file_->Sync(); } - virtual Status Fsync() override { - return file_->Fsync(); - } + Status Fsync() override { return file_->Fsync(); } - virtual Status Close() override { - return file_->Close(); - } + Status Close() override { return file_->Close(); } }; // EncryptedEnv implements an Env wrapper that adds encryption to files stored on disk. @@ -384,9 +375,9 @@ class EncryptedEnv : public EnvWrapper { } // NewSequentialFile opens a file for sequential reading. - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_reads) { return Status::InvalidArgument(); @@ -421,9 +412,9 @@ class EncryptedEnv : public EnvWrapper { } // NewRandomAccessFile opens a file for random read access. - virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_reads) { return Status::InvalidArgument(); @@ -456,11 +447,11 @@ class EncryptedEnv : public EnvWrapper { (*result) = std::unique_ptr(new EncryptedRandomAccessFile(underlying.release(), stream.release(), prefixLength)); return Status::OK(); } - + // NewWritableFile opens a file for sequential writing. 
- virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { return Status::InvalidArgument(); @@ -504,9 +495,9 @@ class EncryptedEnv : public EnvWrapper { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { return Status::InvalidArgument(); @@ -544,10 +535,10 @@ class EncryptedEnv : public EnvWrapper { } // Reuse an existing file by renaming it and opening it as writable. - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - unique_ptr* result, - const EnvOptions& options) override { + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_writes) { return Status::InvalidArgument(); @@ -584,14 +575,14 @@ class EncryptedEnv : public EnvWrapper { return Status::OK(); } - // Open `fname` for random read and write, if file dont exist the file + // Open `fname` for random read and write, if file doesn't exist the file // will be created. On success, stores a pointer to the new file in // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); if (options.use_mmap_reads || options.use_mmap_writes) { return Status::InvalidArgument(); @@ -649,7 +640,8 @@ class EncryptedEnv : public EnvWrapper { // NotFound if "dir" does not exist, the calling process does not have // permission to access "dir", or if "dir" is invalid. // IOError if an IO Error was encountered - virtual Status GetChildrenFileAttributes(const std::string& dir, std::vector* result) override { + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); if (!status.ok()) { return status; @@ -660,10 +652,10 @@ class EncryptedEnv : public EnvWrapper { it->size_bytes -= prefixLength; } return Status::OK(); - } + } // Store the size of fname in *file_size. 
- virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) override { + Status GetFileSize(const std::string& fname, uint64_t* file_size) override { auto status = EnvWrapper::GetFileSize(fname, file_size); if (!status.ok()) { return status; @@ -671,7 +663,7 @@ class EncryptedEnv : public EnvWrapper { size_t prefixLength = provider_->GetPrefixLength(); assert(*file_size >= prefixLength); *file_size -= prefixLength; - return Status::OK(); + return Status::OK(); } private: @@ -692,7 +684,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; size_t blockOffset = fileOffset % blockSize; - unique_ptr blockBuffer; + std::unique_ptr blockBuffer; std::string scratch; AllocateScratch(scratch); @@ -705,8 +697,8 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // We're not encrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { - // Allocate buffer - blockBuffer = unique_ptr(new char[blockSize]); + // Allocate buffer + blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); // Copy plain data to block buffer @@ -737,11 +729,13 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; size_t blockOffset = fileOffset % blockSize; - unique_ptr blockBuffer; + std::unique_ptr blockBuffer; std::string scratch; AllocateScratch(scratch); + assert(fileOffset < dataSize); + // Decrypt individual blocks. while (1) { char *block = data; @@ -750,8 +744,8 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // We're not decrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { - // Allocate buffer - blockBuffer = unique_ptr(new char[blockSize]); + // Allocate buffer + blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); // Copy encrypted data to block buffer @@ -765,6 +759,14 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // Copy decrypted data back to `data`. memmove(data, block + blockOffset, n); } + + // Simply decrementing dataSize by n could cause it to underflow, + // which will very likely make it read over the original bounds later + assert(dataSize >= n); + if (dataSize < n) { + return Status::Corruption("Cannot decrypt data at given offset"); + } + dataSize -= n; if (dataSize == 0) { return Status::OK(); @@ -828,7 +830,7 @@ Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scra // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of -// the a page size. +// the page size. size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; } @@ -844,7 +846,9 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. -Status CTREncryptionProvider::CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) { +Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) { // Create & seed rnd. Random rnd((uint32_t)Env::Default()->NowMicros()); // Fill entire prefix block with random values. 
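// Illustrative sketch, not part of the patch: two pieces of arithmetic this
// file leans on. First, Encrypt()/Decrypt() above walk a request one cipher
// block at a time; the defensive guard the patch adds before `dataSize -= n`
// keeps a miscomputed n from wrapping dataSize around and reading far past
// the original bounds. Second, CreateCipherStream() just below decodes a
// prefix whose first block starts with the initial counter and whose second
// block is the IV, both in plain text; the new 2 * blockSize check rejects
// prefixes too short to contain them. NextBlockSpan and DecodePrefix are
// hypothetical helpers, not rocksdb API.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

struct BlockSpan {
  uint64_t blockIndex;  // cipher block the current offset falls into
  size_t blockOffset;   // position of the data within that block
  size_t n;             // bytes of the request covered by this block
};

static bool NextBlockSpan(uint64_t fileOffset, size_t dataSize,
                          size_t blockSize, BlockSpan* out) {
  out->blockIndex = fileOffset / blockSize;
  out->blockOffset = static_cast<size_t>(fileOffset % blockSize);
  out->n = std::min(dataSize, blockSize - out->blockOffset);
  // Mirrors the guard added in Decrypt(): refuse to let dataSize drop
  // below zero, since size_t arithmetic would silently wrap around.
  return dataSize >= out->n;
}

static bool DecodePrefix(const std::string& prefix, size_t blockSize,
                         uint64_t* initialCounter, std::string* iv) {
  if (prefix.size() < 2 * blockSize) return false;  // the new bounds check
  memcpy(initialCounter, prefix.data(), sizeof(*initialCounter));
  iv->assign(prefix.data() + blockSize, blockSize);  // IV is block 1
  return true;
}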
@@ -873,18 +877,29 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& fname, char *pr // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. -size_t CTREncryptionProvider::PopulateSecretPrefixPart(char *prefix, size_t prefixLength, size_t blockSize) { +size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, + size_t /*prefixLength*/, + size_t /*blockSize*/) { // Nothing to do here, put in custom data in override when needed. return 0; } -Status CTREncryptionProvider::CreateCipherStream(const std::string& fname, const EnvOptions& options, Slice &prefix, unique_ptr* result) { +Status CTREncryptionProvider::CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) { // Read plain text part of prefix. auto blockSize = cipher_.BlockSize(); uint64_t initialCounter; Slice iv; decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); + // If the prefix is smaller than twice the block size, we would read a + // very large chunk of the file below (and very likely read past its bounds) + assert(prefix.size() >= 2 * blockSize); + if (prefix.size() < 2 * blockSize) { + return Status::Corruption("Unable to read from file " + fname + ": read attempt would read beyond file bounds"); + } + // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter); auto status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), prefix.size() - (2 * blockSize)); @@ -898,9 +913,12 @@ Status CTREncryptionProvider::CreateCipherStream(const std::string& fname, const // CreateCipherStreamFromPrefix creates a block access cipher stream for a file given // given name and options. The given prefix is already decrypted. -Status CTREncryptionProvider::CreateCipherStreamFromPrefix(const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, unique_ptr* result) { - (*result) = unique_ptr(new CTRCipherStream(cipher_, iv.data(), initialCounter)); +Status CTREncryptionProvider::CreateCipherStreamFromPrefix( + const std::string& /*fname*/, const EnvOptions& /*options*/, + uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, + std::unique_ptr* result) { + (*result) = std::unique_ptr( + new CTRCipherStream(cipher_, iv.data(), initialCounter)); return Status::OK(); } diff --git a/thirdparty/rocksdb/env/env_hdfs.cc b/thirdparty/rocksdb/env/env_hdfs.cc index d98020c76b..5acf9301c6 100644 --- a/thirdparty/rocksdb/env/env_hdfs.cc +++ b/thirdparty/rocksdb/env/env_hdfs.cc @@ -11,13 +11,14 @@ #ifndef ROCKSDB_HDFS_FILE_C #define ROCKSDB_HDFS_FILE_C -#include #include #include #include +#include #include #include #include "rocksdb/status.h" +#include "util/logging.h" #include "util/string_util.h" #define HDFS_EXISTS 0 @@ -36,9 +37,11 @@ namespace { // Log error message static Status IOError(const std::string& context, int err_number) { - return (err_number == ENOSPC) ? - Status::NoSpace(context, strerror(err_number)) : - Status::IOError(context, strerror(err_number)); + return (err_number == ENOSPC) + ? Status::NoSpace(context, strerror(err_number)) + : (err_number == ENOENT) + ? Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); } // assume that there is one global logger for now.
It is not thread-safe, @@ -222,7 +225,7 @@ class HdfsWritableFile: public WritableFile { filename_.c_str()); const char* src = data.data(); size_t left = data.size(); - size_t ret = hdfsWrite(fileSys_, hfile_, src, left); + size_t ret = hdfsWrite(fileSys_, hfile_, src, static_cast(left)); ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); if (ret != left) { @@ -252,7 +255,8 @@ class HdfsWritableFile: public WritableFile { // This is used by HdfsLogger to write data to the debug log file virtual Status Append(const char* src, size_t size) { - if (hdfsWrite(fileSys_, hfile_, src, size) != (tSize)size) { + if (hdfsWrite(fileSys_, hfile_, src, static_cast(size)) != + static_cast(size)) { return IOError(filename_, errno); } return Status::OK(); @@ -277,6 +281,18 @@ class HdfsLogger : public Logger { HdfsWritableFile* file_; uint64_t (*gettid_)(); // Return the thread id for the current thread + Status HdfsCloseHelper() { + ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); + if (mylog != nullptr && mylog == this) { + mylog = nullptr; + } + return Status::OK(); + } + + protected: + virtual Status CloseImpl() override { return HdfsCloseHelper(); } + public: HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { @@ -284,16 +300,15 @@ class HdfsLogger : public Logger { file_->getName().c_str()); } - virtual ~HdfsLogger() { - ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsLogger closed %s\n", - file_->getName().c_str()); - delete file_; - if (mylog != nullptr && mylog == this) { - mylog = nullptr; + ~HdfsLogger() override { + if (!closed_) { + closed_ = true; + HdfsCloseHelper(); } } - virtual void Logv(const char* format, va_list ap) { + using Logger::Logv; + void Logv(const char* format, va_list ap) override { const uint64_t thread_id = (*gettid_)(); // We try twice: the first time with a fixed-size stack allocated buffer, @@ -370,8 +385,8 @@ const std::string HdfsEnv::pathsep = "/"; // open a file for sequential reading Status HdfsEnv::NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& /*options*/) { result->reset(); HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); if (f == nullptr || !f->isValid()) { @@ -385,8 +400,8 @@ Status HdfsEnv::NewSequentialFile(const std::string& fname, // open a file for random reading Status HdfsEnv::NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& /*options*/) { result->reset(); HdfsReadableFile* f = new HdfsReadableFile(fileSys_, fname); if (f == nullptr || !f->isValid()) { @@ -400,8 +415,8 @@ Status HdfsEnv::NewRandomAccessFile(const std::string& fname, // create a new file for writing Status HdfsEnv::NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& /*options*/) { result->reset(); Status s; HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); @@ -419,14 +434,16 @@ class HdfsDirectory : public Directory { explicit HdfsDirectory(int fd) : fd_(fd) {} ~HdfsDirectory() {} - virtual Status Fsync() { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + + int GetFd() const { return fd_; } private: int fd_; }; Status HdfsEnv::NewDirectory(const std::string& name, - unique_ptr* result) { + std::unique_ptr* result) { int value = hdfsExists(fileSys_, name.c_str()); switch 
(value) { case HDFS_EXISTS: @@ -464,10 +481,10 @@ Status HdfsEnv::GetChildren(const std::string& path, pHdfsFileInfo = hdfsListDirectory(fileSys_, path.c_str(), &numEntries); if (numEntries >= 0) { for(int i = 0; i < numEntries; i++) { - char* pathname = pHdfsFileInfo[i].mName; - char* filename = std::rindex(pathname, '/'); - if (filename != nullptr) { - result->push_back(filename+1); + std::string pathname(pHdfsFileInfo[i].mName); + size_t pos = pathname.rfind("/"); + if (std::string::npos != pos) { + result->push_back(pathname.substr(pos + 1)); } } if (pHdfsFileInfo != nullptr) { @@ -558,19 +575,17 @@ Status HdfsEnv::RenameFile(const std::string& src, const std::string& target) { return IOError(src, errno); } -Status HdfsEnv::LockFile(const std::string& fname, FileLock** lock) { +Status HdfsEnv::LockFile(const std::string& /*fname*/, FileLock** lock) { // there isn's a very good way to atomically check and create // a file via libhdfs *lock = nullptr; return Status::OK(); } -Status HdfsEnv::UnlockFile(FileLock* lock) { - return Status::OK(); -} +Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); } Status HdfsEnv::NewLogger(const std::string& fname, - shared_ptr* result) { + std::shared_ptr* result) { HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname); if (f == nullptr || !f->isValid()) { delete f; @@ -598,13 +613,13 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { // dummy placeholders used when HDFS is not available namespace rocksdb { - Status HdfsEnv::NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { - return Status::NotSupported("Not compiled with hdfs support"); - } +Status HdfsEnv::NewSequentialFile(const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) { + return Status::NotSupported("Not compiled with hdfs support"); +} - Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname) { + Status NewHdfsEnv(Env** /*hdfs_env*/, const std::string& /*fsname*/) { return Status::NotSupported("Not compiled with hdfs support"); } } diff --git a/thirdparty/rocksdb/env/env_posix.cc b/thirdparty/rocksdb/env/env_posix.cc index 5a671d72fe..387c027939 100644 --- a/thirdparty/rocksdb/env/env_posix.cc +++ b/thirdparty/rocksdb/env/env_posix.cc @@ -20,10 +20,12 @@ #include #include #include -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) #include #include +#include #endif +#include #include #include #include @@ -48,6 +50,7 @@ #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "util/coding.h" +#include "util/compression_context_cache.h" #include "util/logging.h" #include "util/random.h" #include "util/string_util.h" @@ -73,32 +76,15 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } +inline mode_t GetDBFileMode(bool allow_non_owner_access) { + return allow_non_owner_access ? 0644 : 0600; +} + // list of pathnames that are locked static std::set lockedFiles; static port::Mutex mutex_lockedFiles; -static int LockOrUnlock(const std::string& fname, int fd, bool lock) { - mutex_lockedFiles.Lock(); - if (lock) { - // If it already exists in the lockedFiles set, then it is already locked, - // and fail this lock attempt. Otherwise, insert it into lockedFiles. - // This check is needed because fcntl() does not detect lock conflict - // if the fcntl is issued by the same thread that earlier acquired - // this lock. 
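// Illustrative sketch, not part of the patch: the HdfsEnv::GetChildren()
// change above replaces the non-portable std::rindex() and raw char*
// handling with std::string::rfind(). In isolation the extraction looks
// like this (Basename is a hypothetical name):
#include <string>

static std::string Basename(const std::string& pathname) {
  size_t pos = pathname.rfind('/');
  return pos == std::string::npos ? pathname : pathname.substr(pos + 1);
}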
- if (lockedFiles.insert(fname).second == false) { - mutex_lockedFiles.Unlock(); - errno = ENOLCK; - return -1; - } - } else { - // If we are unlocking, then verify that we had locked it earlier, - // it should already exist in lockedFiles. Remove it from lockedFiles. - if (lockedFiles.erase(fname) != 1) { - mutex_lockedFiles.Unlock(); - errno = ENOLCK; - return -1; - } - } +static int LockOrUnlock(int fd, bool lock) { errno = 0; struct flock f; memset(&f, 0, sizeof(f)); @@ -107,11 +93,7 @@ static int LockOrUnlock(const std::string& fname, int fd, bool lock) { f.l_start = 0; f.l_len = 0; // Lock/unlock entire file int value = fcntl(fd, F_SETLK, &f); - if (value == -1 && lock) { - // if there is an error in locking, then remove the pathname from lockedfiles - lockedFiles.erase(fname); - } - mutex_lockedFiles.Unlock(); + return value; } @@ -121,11 +103,23 @@ class PosixFileLock : public FileLock { std::string filename; }; +int cloexec_flags(int flags, const EnvOptions* options) { + // If the system supports opening the file with cloexec enabled, + // do so, as this avoids a race condition if a db is opened around + // the same time that a child process is forked +#ifdef O_CLOEXEC + if (options == nullptr || options->set_fd_cloexec) { + flags |= O_CLOEXEC; + } +#endif + return flags; +} + class PosixEnv : public Env { public: PosixEnv(); - virtual ~PosixEnv() { + ~PosixEnv() override { for (const auto tid : threads_to_join_) { pthread_join(tid, nullptr); } @@ -147,12 +141,12 @@ class PosixEnv : public Env { } } - virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); int fd = -1; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); FILE* file = nullptr; if (options.use_direct_reads && !options.use_mmap_reads) { @@ -166,7 +160,7 @@ class PosixEnv : public Env { do { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), flags, 0644); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { return IOError("While opening a file for sequentially reading", fname, @@ -197,13 +191,14 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); Status s; int fd; - int flags = O_RDONLY; + int flags = cloexec_flags(O_RDONLY, &options); + if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef ROCKSDB_LITE return Status::IOError(fname, "Direct I/O not supported in RocksDB lite"); @@ -216,7 +211,7 @@ class PosixEnv : public Env { do { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), flags, 0644); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { return IOError("While open a file for random read", fname, errno); @@ -236,9 +231,9 @@ class PosixEnv : public Env { size, options)); } else { s = IOError("while mmap file for read", fname, errno); + close(fd); } } - close(fd); } else { if (options.use_direct_reads && !options.use_mmap_reads) { #ifdef OS_MACOSX @@ -254,7 +249,7 @@ class PosixEnv : public Env { } virtual Status OpenWritableFile(const std::string& fname, - 
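// Illustrative sketch, not part of the patch: what cloexec_flags() above
// buys. Setting O_CLOEXEC in the open() call itself is atomic, so a
// descriptor can never leak into a child process that fork()+exec()s in the
// window between open() and a separate fcntl(fd, F_SETFD, FD_CLOEXEC).
// OpenForRead is a hypothetical name.
#include <fcntl.h>

static int OpenForRead(const char* path) {
  int flags = O_RDONLY;
#ifdef O_CLOEXEC
  flags |= O_CLOEXEC;  // close-on-exec set atomically with the open
#endif
  return open(path, flags);
}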
unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options, bool reopen = false) { result->reset(); @@ -285,9 +280,11 @@ class PosixEnv : public Env { flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), flags, 0644); + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { @@ -335,22 +332,22 @@ class PosixEnv : public Env { return s; } - virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { return OpenWritableFile(fname, result, options, false); } - virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { return OpenWritableFile(fname, result, options, true); } - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - unique_ptr* result, - const EnvOptions& options) override { + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) override { result->reset(); Status s; int fd = -1; @@ -373,9 +370,12 @@ class PosixEnv : public Env { flags |= O_WRONLY; } + flags = cloexec_flags(flags, &options); + do { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(old_fname.c_str(), flags, 0644); + fd = open(old_fname.c_str(), flags, + GetDBFileMode(allow_non_owner_access_)); } while (fd < 0 && errno == EINTR); if (fd < 0) { s = IOError("while reopen file for write", fname, errno); @@ -427,17 +427,18 @@ class PosixEnv : public Env { result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); } return s; - - return s; } - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override { int fd = -1; + int flags = cloexec_flags(O_RDWR, &options); + while (fd < 0) { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_CREAT | O_RDWR, 0644); + + fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_)); if (fd < 0) { // Error while opening the file if (errno == EINTR) { @@ -452,13 +453,57 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override { + int fd = -1; + Status status; + int flags = cloexec_flags(O_RDWR, nullptr); + + while (fd < 0) { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(fname.c_str(), flags, 0644); + if (fd < 0) { + // Error while opening the file + if (errno == EINTR) { + continue; + } + status = + IOError("While open file for raw mmap buffer access", fname, errno); + break; + } + } + uint64_t size; + if (status.ok()) { + status = GetFileSize(fname, &size); + } + void* base = nullptr; + if (status.ok()) { + base = mmap(nullptr, static_cast(size), PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (base == MAP_FAILED) { + status = IOError("while mmap file for read", fname, errno); + } + } + if (status.ok()) { + result->reset( + new PosixMemoryMappedFileBuffer(base, 
static_cast(size))); + } + if (fd >= 0) { + // don't need to keep it open after mmap has been called + close(fd); + } + return status; + } + + Status NewDirectory(const std::string& name, + std::unique_ptr* result) override { result->reset(); int fd; + int flags = cloexec_flags(0, nullptr); { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(name.c_str(), 0); + fd = open(name.c_str(), flags); } if (fd < 0) { return IOError("While open directory", name, errno); @@ -468,14 +513,15 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status FileExists(const std::string& fname) override { + Status FileExists(const std::string& fname) override { int result = access(fname.c_str(), F_OK); if (result == 0) { return Status::OK(); } - switch (errno) { + int err = errno; + switch (err) { case EACCES: case ELOOP: case ENAMETOOLONG: @@ -483,14 +529,14 @@ class PosixEnv : public Env { case ENOTDIR: return Status::NotFound(); default: - assert(result == EIO || result == ENOMEM); - return Status::IOError("Unexpected error(" + ToString(result) + + assert(err == EIO || err == ENOMEM); + return Status::IOError("Unexpected error(" + ToString(err) + ") accessing file `" + fname + "' "); } } - virtual Status GetChildren(const std::string& dir, - std::vector* result) override { + Status GetChildren(const std::string& dir, + std::vector* result) override { result->clear(); DIR* d = opendir(dir.c_str()); if (d == nullptr) { @@ -511,7 +557,7 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status DeleteFile(const std::string& fname) override { + Status DeleteFile(const std::string& fname) override { Status result; if (unlink(fname.c_str()) != 0) { result = IOError("while unlink() file", fname, errno); @@ -519,7 +565,7 @@ class PosixEnv : public Env { return result; }; - virtual Status CreateDir(const std::string& name) override { + Status CreateDir(const std::string& name) override { Status result; if (mkdir(name.c_str(), 0755) != 0) { result = IOError("While mkdir", name, errno); @@ -527,7 +573,7 @@ class PosixEnv : public Env { return result; }; - virtual Status CreateDirIfMissing(const std::string& name) override { + Status CreateDirIfMissing(const std::string& name) override { Status result; if (mkdir(name.c_str(), 0755) != 0) { if (errno != EEXIST) { @@ -541,7 +587,7 @@ class PosixEnv : public Env { return result; }; - virtual Status DeleteDir(const std::string& name) override { + Status DeleteDir(const std::string& name) override { Status result; if (rmdir(name.c_str()) != 0) { result = IOError("file rmdir", name, errno); @@ -549,8 +595,7 @@ class PosixEnv : public Env { return result; }; - virtual Status GetFileSize(const std::string& fname, - uint64_t* size) override { + Status GetFileSize(const std::string& fname, uint64_t* size) override { Status s; struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { @@ -562,8 +607,8 @@ class PosixEnv : public Env { return s; } - virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override { + Status GetFileModificationTime(const std::string& fname, + uint64_t* file_mtime) override { struct stat s; if (stat(fname.c_str(), &s) !=0) { return IOError("while stat a file for modification time", fname, errno); @@ -571,8 +616,8 @@ class PosixEnv : public Env { *file_mtime = static_cast(s.st_mtime); return Status::OK(); } - virtual Status RenameFile(const std::string& src, - const std::string& target) override { + Status RenameFile(const std::string& src, + const std::string& target) override { Status result; if 
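// Illustrative sketch, not part of the patch: the core of
// NewMemoryMappedFileBuffer() above -- map an existing file read/write, then
// close the descriptor immediately, since the mapping keeps the file
// referenced on its own. MapWholeFile is a hypothetical helper.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstddef>

static void* MapWholeFile(const char* path, size_t* len) {
  int fd = open(path, O_RDWR);
  if (fd < 0) return nullptr;
  struct stat sb;
  if (fstat(fd, &sb) != 0) {
    close(fd);
    return nullptr;
  }
  void* base = mmap(nullptr, static_cast<size_t>(sb.st_size),
                    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd);  // safe: the mapping keeps the file open
  if (base == MAP_FAILED) return nullptr;
  *len = static_cast<size_t>(sb.st_size);
  return base;
}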
(rename(src.c_str(), target.c_str()) != 0) { result = IOError("While renaming a file to " + target, src, errno); @@ -580,8 +625,7 @@ class PosixEnv : public Env { return result; } - virtual Status LinkFile(const std::string& src, - const std::string& target) override { + Status LinkFile(const std::string& src, const std::string& target) override { Status result; if (link(src.c_str(), target.c_str()) != 0) { if (errno == EXDEV) { @@ -592,17 +636,67 @@ class PosixEnv : public Env { return result; } - virtual Status LockFile(const std::string& fname, FileLock** lock) override { + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + struct stat s; + if (stat(fname.c_str(), &s) != 0) { + return IOError("while stat a file for num file links", fname, errno); + } + *count = static_cast(s.st_nlink); + return Status::OK(); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + struct stat statbuf[2]; + if (stat(first.c_str(), &statbuf[0]) != 0) { + return IOError("stat file", first, errno); + } + if (stat(second.c_str(), &statbuf[1]) != 0) { + return IOError("stat file", second, errno); + } + + if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) || + minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) || + statbuf[0].st_ino != statbuf[1].st_ino) { + *res = false; + } else { + *res = true; + } + return Status::OK(); + } + + Status LockFile(const std::string& fname, FileLock** lock) override { *lock = nullptr; Status result; + + mutex_lockedFiles.Lock(); + // If it already exists in the lockedFiles set, then it is already locked, + // and fail this lock attempt. Otherwise, insert it into lockedFiles. + // This check is needed because fcntl() does not detect lock conflict + // if the fcntl is issued by the same thread that earlier acquired + // this lock. + // We must do this check *before* opening the file: + // Otherwise, we will open a new file descriptor. Locks are associated with + // a process, not a file descriptor and when *any* file descriptor is closed, + // all locks the process holds for that *file* are released + if (lockedFiles.insert(fname).second == false) { + mutex_lockedFiles.Unlock(); + errno = ENOLCK; + return IOError("lock ", fname, errno); + } + int fd; + int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr); + { IOSTATS_TIMER_GUARD(open_nanos); - fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + fd = open(fname.c_str(), flags, 0644); } if (fd < 0) { result = IOError("while open a file for lock", fname, errno); - } else if (LockOrUnlock(fname, fd, true) == -1) { + } else if (LockOrUnlock(fd, true) == -1) { + // if there is an error in locking, then remove the pathname from lockedfiles + lockedFiles.erase(fname); result = IOError("While lock file", fname, errno); close(fd); } else { @@ -612,33 +706,42 @@ class PosixEnv : public Env { my_lock->filename = fname; *lock = my_lock; } + + mutex_lockedFiles.Unlock(); return result; } - virtual Status UnlockFile(FileLock* lock) override { + Status UnlockFile(FileLock* lock) override { PosixFileLock* my_lock = reinterpret_cast(lock); Status result; - if (LockOrUnlock(my_lock->filename, my_lock->fd_, false) == -1) { + mutex_lockedFiles.Lock(); + // If we are unlocking, then verify that we had locked it earlier, + // it should already exist in lockedFiles. Remove it from lockedFiles. 
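// Illustrative sketch, not part of the patch: the POSIX behaviour the
// comment above is defending against. fcntl() record locks belong to the
// process, not to a file descriptor, and closing *any* descriptor of a file
// drops every lock the process holds on it -- which is why LockFile() now
// consults lockedFiles before ever calling open(). SetLock mirrors the
// refactored LockOrUnlock() above.
#include <fcntl.h>
#include <unistd.h>
#include <cstring>

static int SetLock(int fd, bool lock) {
  struct flock f;
  memset(&f, 0, sizeof(f));
  f.l_type = lock ? F_WRLCK : F_UNLCK;
  f.l_whence = SEEK_SET;
  f.l_start = 0;
  f.l_len = 0;  // zero length means: the entire file
  return fcntl(fd, F_SETLK, &f);
}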
+ if (lockedFiles.erase(my_lock->filename) != 1) { + errno = ENOLCK; + result = IOError("unlock", my_lock->filename, errno); + } else if (LockOrUnlock(my_lock->fd_, false) == -1) { result = IOError("unlock", my_lock->filename, errno); } close(my_lock->fd_); delete my_lock; + mutex_lockedFiles.Unlock(); return result; } - virtual void Schedule(void (*function)(void* arg1), void* arg, - Priority pri = LOW, void* tag = nullptr, - void (*unschedFunction)(void* arg) = 0) override; + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, + void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) override; - virtual int UnSchedule(void* arg, Priority pri) override; + int UnSchedule(void* arg, Priority pri) override; - virtual void StartThread(void (*function)(void* arg), void* arg) override; + void StartThread(void (*function)(void* arg), void* arg) override; - virtual void WaitForJoin() override; + void WaitForJoin() override; - virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; - virtual Status GetTestDirectory(std::string* result) override { + Status GetTestDirectory(std::string* result) override { const char* env = getenv("TEST_TMPDIR"); if (env && env[0] != '\0') { *result = env; @@ -652,8 +755,7 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status GetThreadList( - std::vector* thread_list) override { + Status GetThreadList(std::vector* thread_list) override { assert(thread_status_updater_); return thread_status_updater_->GetThreadList(thread_list); } @@ -669,16 +771,31 @@ class PosixEnv : public Env { return gettid(tid); } - virtual uint64_t GetThreadID() const override { - return gettid(pthread_self()); + uint64_t GetThreadID() const override { return gettid(pthread_self()); } + + Status GetFreeSpace(const std::string& fname, uint64_t* free_space) override { + struct statvfs sbuf; + + if (statvfs(fname.c_str(), &sbuf) < 0) { + return IOError("While doing statvfs", fname, errno); + } + + *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree); + return Status::OK(); } - virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override { + Status NewLogger(const std::string& fname, + std::shared_ptr* result) override { FILE* f; { IOSTATS_TIMER_GUARD(open_nanos); - f = fopen(fname.c_str(), "w"); + f = fopen(fname.c_str(), "w" +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 7) + "e" // glibc extension to enable O_CLOEXEC +#endif +#endif + ); } if (f == nullptr) { result->reset(); @@ -694,13 +811,13 @@ class PosixEnv : public Env { } } - virtual uint64_t NowMicros() override { + uint64_t NowMicros() override { struct timeval tv; gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; } - virtual uint64_t NowNanos() override { + uint64_t NowNanos() override { #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -720,9 +837,19 @@ class PosixEnv : public Env { #endif } - virtual void SleepForMicroseconds(int micros) override { usleep(micros); } + uint64_t NowCPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_AIX) || \ + defined(__MACH__) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } - virtual Status GetHostName(char* name, uint64_t len) override { + void SleepForMicroseconds(int micros) override { 
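// Illustrative sketch, not part of the patch: the statvfs() arithmetic used
// by GetFreeSpace() above. f_bsize * f_bfree counts the blocks free to a
// privileged process; f_bavail (not used above) would count only the blocks
// available to unprivileged users. FreeSpaceBytes is a hypothetical name.
#include <sys/statvfs.h>
#include <cstdint>

static bool FreeSpaceBytes(const char* path, uint64_t* free_space) {
  struct statvfs sbuf;
  if (statvfs(path, &sbuf) < 0) return false;
  *free_space = static_cast<uint64_t>(sbuf.f_bsize) * sbuf.f_bfree;
  return true;
}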
usleep(micros); } + + Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) @@ -733,7 +860,7 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status GetCurrentTime(int64_t* unix_time) override { + Status GetCurrentTime(int64_t* unix_time) override { time_t ret = time(nullptr); if (ret == (time_t) -1) { return IOError("GetCurrentTime", "", errno); @@ -742,9 +869,9 @@ class PosixEnv : public Env { return Status::OK(); } - virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override { - if (db_path.find('/') == 0) { + Status GetAbsolutePath(const std::string& db_path, + std::string* output_path) override { + if (!db_path.empty() && db_path[0] == '/') { *output_path = db_path; return Status::OK(); } @@ -760,30 +887,46 @@ class PosixEnv : public Env { } // Allow increasing the number of worker threads. - virtual void SetBackgroundThreads(int num, Priority pri) override { + void SetBackgroundThreads(int num, Priority pri) override { assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); thread_pools_[pri].SetBackgroundThreads(num); } - virtual int GetBackgroundThreads(Priority pri) override { + int GetBackgroundThreads(Priority pri) override { assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); return thread_pools_[pri].GetBackgroundThreads(); } + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + allow_non_owner_access_ = allow_non_owner_access; + return Status::OK(); + } + // Allow increasing the number of worker threads. - virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); } - virtual void LowerThreadPoolIOPriority(Priority pool = LOW) override { + void LowerThreadPoolIOPriority(Priority pool = LOW) override { assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); #ifdef OS_LINUX thread_pools_[pool].LowerIOPriority(); +#else + (void)pool; +#endif + } + + void LowerThreadPoolCPUPriority(Priority pool = LOW) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); +#ifdef OS_LINUX + thread_pools_[pool].LowerCPUPriority(); +#else + (void)pool; #endif } - virtual std::string TimeToString(uint64_t secondsSince1970) override { + std::string TimeToString(uint64_t secondsSince1970) override { const time_t seconds = (time_t)secondsSince1970; struct tm t; int maxsize = 64; @@ -813,6 +956,8 @@ class PosixEnv : public Env { // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit // test and make this false optimized.fallocate_with_keep_size = true; + optimized.writable_file_max_buffer_size = + db_options.writable_file_max_buffer_size; return optimized; } @@ -855,6 +1000,7 @@ class PosixEnv : public Env { return false; } #else + (void)path; return false; #endif } @@ -864,13 +1010,17 @@ class PosixEnv : public Env { std::vector thread_pools_; pthread_mutex_t mu_; std::vector threads_to_join_; + // If true, allow non owner read access for db files. Otherwise, non-owner + // has no access to db files. 
+ bool allow_non_owner_access_; }; PosixEnv::PosixEnv() : checkedDiskForMmap_(false), forceMmapOff_(false), page_size_(getpagesize()), - thread_pools_(Priority::TOTAL) { + thread_pools_(Priority::TOTAL), + allow_non_owner_access_(true) { ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( @@ -969,6 +1119,8 @@ Env* Env::Default() { // the destructor of static PosixEnv will go first, then the // the singletons of ThreadLocalPtr. ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); static PosixEnv default_env; return &default_env; } diff --git a/thirdparty/rocksdb/env/env_test.cc b/thirdparty/rocksdb/env/env_test.cc index 9ec2f142ed..4780092849 100644 --- a/thirdparty/rocksdb/env/env_test.cc +++ b/thirdparty/rocksdb/env/env_test.cc @@ -73,7 +73,7 @@ struct Deleter { std::unique_ptr NewAligned(const size_t size, const char ch) { char* ptr = nullptr; #ifdef OS_WIN - if (!(ptr = reinterpret_cast(_aligned_malloc(size, kPageSize)))) { + if (nullptr == (ptr = reinterpret_cast(_aligned_malloc(size, kPageSize)))) { return std::unique_ptr(nullptr, Deleter(_aligned_free)); } std::unique_ptr uptr(ptr, Deleter(_aligned_free)); @@ -118,14 +118,14 @@ class EnvPosixTestWithParam } } - ~EnvPosixTestWithParam() { WaitThreadPoolsEmpty(); } + ~EnvPosixTestWithParam() override { WaitThreadPoolsEmpty(); } }; static void SetBool(void* ptr) { reinterpret_cast*>(ptr)->store(true); } -TEST_F(EnvPosixTest, RunImmediately) { +TEST_F(EnvPosixTest, DISABLED_RunImmediately) { for (int pri = Env::BOTTOM; pri < Env::TOTAL; ++pri) { std::atomic called(false); env_->SetBackgroundThreads(1, static_cast(pri)); @@ -135,6 +135,118 @@ TEST_F(EnvPosixTest, RunImmediately) { } } +TEST_F(EnvPosixTest, RunEventually) { + std::atomic called(false); + env_->StartThread(&SetBool, &called); + env_->WaitForJoin(); + ASSERT_TRUE(called.load()); +} + +#ifdef OS_WIN +TEST_F(EnvPosixTest, AreFilesSame) { + { + bool tmp; + if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) { + fprintf(stderr, + "skipping EnvBasicTestWithParam.AreFilesSame due to " + "unsupported Env::AreFilesSame\n"); + return; + } + } + + const EnvOptions soptions; + auto* env = Env::Default(); + std::string same_file_name = test::PerThreadDBPath(env, "same_file"); + std::string same_file_link_name = same_file_name + "_link"; + + std::unique_ptr same_file; + ASSERT_OK(env->NewWritableFile(same_file_name, + &same_file, soptions)); + same_file->Append("random_data"); + ASSERT_OK(same_file->Flush()); + same_file.reset(); + + ASSERT_OK(env->LinkFile(same_file_name, same_file_link_name)); + bool result = false; + ASSERT_OK(env->AreFilesSame(same_file_name, same_file_link_name, &result)); + ASSERT_TRUE(result); +} +#endif + +#ifdef OS_LINUX +TEST_F(EnvPosixTest, DISABLED_FilePermission) { + // Only works for Linux environment + if (env_ == Env::Default()) { + EnvOptions soptions; + std::vector fileNames{ + test::PerThreadDBPath(env_, "testfile"), + test::PerThreadDBPath(env_, "testfile1")}; + std::unique_ptr wfile; + ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions)); + ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions)); + wfile.reset(); + std::unique_ptr rwfile; + ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions)); + + struct stat sb; + for (const auto& filename : fileNames) { + if (::stat(filename.c_str(), &sb) == 0) { + ASSERT_EQ(sb.st_mode & 
0777, 0644); + } + env_->DeleteFile(filename); + } + + env_->SetAllowNonOwnerAccess(false); + ASSERT_OK(env_->NewWritableFile(fileNames[0], &wfile, soptions)); + ASSERT_OK(env_->NewWritableFile(fileNames[1], &wfile, soptions)); + wfile.reset(); + ASSERT_OK(env_->NewRandomRWFile(fileNames[1], &rwfile, soptions)); + + for (const auto& filename : fileNames) { + if (::stat(filename.c_str(), &sb) == 0) { + ASSERT_EQ(sb.st_mode & 0777, 0600); + } + env_->DeleteFile(filename); + } + } +} +#endif + +TEST_F(EnvPosixTest, MemoryMappedFileBuffer) { + const int kFileBytes = 1 << 15; // 32 KB + std::string expected_data; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + { + std::unique_ptr wfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + + Random rnd(301); + test::RandomString(&rnd, kFileBytes, &expected_data); + ASSERT_OK(wfile->Append(expected_data)); + } + + std::unique_ptr mmap_buffer; + Status status = env_->NewMemoryMappedFileBuffer(fname, &mmap_buffer); + // it should be supported at least on Linux +#if !defined(OS_LINUX) + if (status.IsNotSupported()) { + fprintf(stderr, + "skipping EnvPosixTest.MemoryMappedFileBuffer due to " + "unsupported Env::NewMemoryMappedFileBuffer\n"); + return; + } +#endif // !defined(OS_LINUX) + + ASSERT_OK(status); + ASSERT_NE(nullptr, mmap_buffer.get()); + ASSERT_NE(nullptr, mmap_buffer->GetBase()); + ASSERT_EQ(kFileBytes, mmap_buffer->GetLen()); + std::string actual_data(reinterpret_cast(mmap_buffer->GetBase()), + mmap_buffer->GetLen()); + ASSERT_EQ(expected_data, actual_data); +} + TEST_P(EnvPosixTestWithParam, UnSchedule) { std::atomic called(false); env_->SetBackgroundThreads(1, Env::LOW); @@ -171,6 +283,11 @@ TEST_P(EnvPosixTestWithParam, UnSchedule) { WaitThreadPoolsEmpty(); } +// This test assumes that the last scheduled +// task will run last. In fact, in the allotted +// sleeping time nothing may actually run, or the tasks may +// run in any order. The purpose of the test is unclear.
+#ifndef OS_WIN TEST_P(EnvPosixTestWithParam, RunMany) { std::atomic last_id(0); @@ -203,6 +320,7 @@ TEST_P(EnvPosixTestWithParam, RunMany) { ASSERT_EQ(4, cur); WaitThreadPoolsEmpty(); } +#endif struct State { port::Mutex mu; @@ -694,14 +812,13 @@ class IoctlFriendlyTmpdir { #ifndef ROCKSDB_LITE TEST_F(EnvPosixTest, PositionedAppend) { - unique_ptr writable_file; + std::unique_ptr writable_file; EnvOptions options; options.use_direct_writes = true; options.use_mmap_writes = false; IoctlFriendlyTmpdir ift; ASSERT_OK(env_->NewWritableFile(ift.name() + "/f", &writable_file, options)); const size_t kBlockSize = 4096; - const size_t kPageSize = 4096; const size_t kDataSize = kPageSize; // Write a page worth of 'a' auto data_ptr = NewAligned(kDataSize, 'a'); @@ -715,7 +832,7 @@ TEST_F(EnvPosixTest, PositionedAppend) { // The file now has 1 sector worth of a followed by a page worth of b // Verify the above - unique_ptr seq_file; + std::unique_ptr seq_file; ASSERT_OK(env_->NewSequentialFile(ift.name() + "/f", &seq_file, options)); char scratch[kPageSize * 2]; Slice result; @@ -734,10 +851,10 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueID) { soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; IoctlFriendlyTmpdir ift; std::string fname = ift.name() + "/testfile"; - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); - unique_ptr file; + std::unique_ptr file; // Get Unique ID ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); @@ -804,7 +921,7 @@ TEST_P(EnvPosixTestWithParam, AllocateTest) { EnvOptions soptions; soptions.use_mmap_writes = false; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); // allocate 100 MB @@ -873,14 +990,14 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDConcurrent) { fnames.push_back(ift.name() + "/" + "testfile" + ToString(i)); // Create file. - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fnames[i], &wfile, soptions)); } // Collect and check whether the IDs are unique. std::unordered_set ids; for (const std::string fname : fnames) { - unique_ptr file; + std::unique_ptr file; std::string unique_id; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); @@ -916,14 +1033,14 @@ TEST_P(EnvPosixTestWithParam, RandomAccessUniqueIDDeletes) { for (int i = 0; i < 1000; ++i) { // Create file. { - unique_ptr wfile; + std::unique_ptr wfile; ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); } // Get Unique ID std::string unique_id; { - unique_ptr file; + std::unique_ptr file; ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); size_t id_size = file->GetUniqueId(temp_id, MAX_ID_SIZE); ASSERT_TRUE(id_size > 0); @@ -951,7 +1068,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); EnvOptions soptions; soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; - std::string fname = test::TmpDir(env_) + "/" + "testfile"; + std::string fname = test::PerThreadDBPath(env_, "testfile"); const size_t kSectorSize = 512; auto data = NewAligned(kSectorSize, 0); @@ -959,7 +1076,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Create file. 
{ - unique_ptr wfile; + std::unique_ptr wfile; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) if (soptions.use_direct_writes) { soptions.use_direct_writes = false; @@ -973,7 +1090,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Random Read { - unique_ptr file; + std::unique_ptr file; auto scratch = NewAligned(kSectorSize, 0); Slice result; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) @@ -990,7 +1107,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { // Sequential Read { - unique_ptr file; + std::unique_ptr file; auto scratch = NewAligned(kSectorSize, 0); Slice result; #if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) @@ -1018,7 +1135,7 @@ TEST_P(EnvPosixTestWithParam, InvalidateCache) { class TestLogger : public Logger { public: using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { + void Logv(const char* format, va_list ap) override { log_count++; char new_format[550]; @@ -1100,7 +1217,7 @@ class TestLogger2 : public Logger { public: explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { + void Logv(const char* format, va_list ap) override { char new_format[2000]; std::fill_n(new_format, sizeof(new_format), '2'); { @@ -1134,11 +1251,11 @@ TEST_P(EnvPosixTestWithParam, LogBufferMaxSizeTest) { TEST_P(EnvPosixTestWithParam, Preallocation) { rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - const std::string src = test::TmpDir(env_) + "/" + "testfile"; - unique_ptr srcfile; - EnvOptions soptions; - soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + const std::string src = test::PerThreadDBPath(env_, "testfile"); + std::unique_ptr srcfile; + EnvOptions soptions; + soptions.use_direct_reads = soptions.use_direct_writes = direct_io_; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "NewWritableFile:O_DIRECT", [&](void* arg) { @@ -1196,11 +1313,10 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { std::string data; for (int i = 0; i < kNumChildren; ++i) { - std::ostringstream oss; - oss << test::TmpDir(env_) << "/testfile_" << i; - const std::string path = oss.str(); - unique_ptr file; -#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) + const std::string path = + test::TmpDir(env_) + "/" + "testfile_" + std::to_string(i); + std::unique_ptr file; +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) if (soptions.use_direct_writes) { rocksdb::SyncPoint::GetInstance()->SetCallBack( "NewWritableFile:O_DIRECT", [&](void* arg) { @@ -1219,9 +1335,7 @@ TEST_P(EnvPosixTestWithParam, ConsistentChildrenAttributes) { std::vector file_attrs; ASSERT_OK(env_->GetChildrenFileAttributes(test::TmpDir(env_), &file_attrs)); for (int i = 0; i < kNumChildren; ++i) { - std::ostringstream oss; - oss << "testfile_" << i; - const std::string name = oss.str(); + const std::string name = "testfile_" + std::to_string(i); const std::string path = test::TmpDir(env_) + "/" + name; auto file_attrs_iter = std::find_if( @@ -1250,51 +1364,114 @@ 
TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { inc(0); } - Status Append(const Slice& data) override { inc(1); return Status::OK(); } - Status Truncate(uint64_t size) override { return Status::OK(); } - Status Close() override { inc(2); return Status::OK(); } - Status Flush() override { inc(3); return Status::OK(); } - Status Sync() override { inc(4); return Status::OK(); } - Status Fsync() override { inc(5); return Status::OK(); } - void SetIOPriority(Env::IOPriority pri) override { inc(6); } - uint64_t GetFileSize() override { inc(7); return 0; } - void GetPreallocationStatus(size_t* block_size, - size_t* last_allocated_block) override { + Status Append(const Slice& /*data*/) override { + inc(1); + return Status::OK(); + } + + Status PositionedAppend(const Slice& /*data*/, + uint64_t /*offset*/) override { + inc(2); + return Status::OK(); + } + + Status Truncate(uint64_t /*size*/) override { + inc(3); + return Status::OK(); + } + + Status Close() override { + inc(4); + return Status::OK(); + } + + Status Flush() override { + inc(5); + return Status::OK(); + } + + Status Sync() override { + inc(6); + return Status::OK(); + } + + Status Fsync() override { + inc(7); + return Status::OK(); + } + + bool IsSyncThreadSafe() const override { inc(8); + return true; } - size_t GetUniqueId(char* id, size_t max_size) const override { + + bool use_direct_io() const override { inc(9); - return 0; + return true; } - Status InvalidateCache(size_t offset, size_t length) override { + + size_t GetRequiredBufferAlignment() const override { inc(10); + return 0; + } + + void SetIOPriority(Env::IOPriority /*pri*/) override { inc(11); } + + Env::IOPriority GetIOPriority() override { + inc(12); + return Env::IOPriority::IO_LOW; + } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint /*hint*/) override { + inc(13); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + inc(14); + return Env::WriteLifeTimeHint::WLTH_NOT_SET; + } + + uint64_t GetFileSize() override { + inc(15); + return 0; + } + + void SetPreallocationBlockSize(size_t /*size*/) override { inc(16); } + + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override { + inc(17); + } + + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + inc(18); + return 0; + } + + Status InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + inc(19); return Status::OK(); } - protected: - Status Allocate(uint64_t offset, uint64_t len) override { - inc(11); + Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) override { + inc(20); return Status::OK(); } - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - inc(12); + + void PrepareWrite(size_t /*offset*/, size_t /*len*/) override { inc(21); } + + Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) override { + inc(22); return Status::OK(); } public: - ~Base() { - inc(13); - } + ~Base() override { inc(23); } }; class Wrapper : public WritableFileWrapper { public: explicit Wrapper(WritableFile* target) : WritableFileWrapper(target) {} - - void CallProtectedMethods() { - Allocate(0, 0); - RangeSync(0, 0); - } }; int step = 0; @@ -1303,27 +1480,48 @@ TEST_P(EnvPosixTestWithParam, WritableFileWrapper) { Base b(&step); Wrapper w(&b); w.Append(Slice()); + w.PositionedAppend(Slice(), 0); + w.Truncate(0); w.Close(); w.Flush(); w.Sync(); w.Fsync(); + w.IsSyncThreadSafe(); + w.use_direct_io(); + w.GetRequiredBufferAlignment(); w.SetIOPriority(Env::IOPriority::IO_HIGH); + w.GetIOPriority(); + 
w.SetWriteLifeTimeHint(Env::WriteLifeTimeHint::WLTH_NOT_SET);
+    w.GetWriteLifeTimeHint();
     w.GetFileSize();
+    w.SetPreallocationBlockSize(0);
     w.GetPreallocationStatus(nullptr, nullptr);
     w.GetUniqueId(nullptr, 0);
     w.InvalidateCache(0, 0);
-    w.CallProtectedMethods();
+    w.RangeSync(0, 0);
+    w.PrepareWrite(0, 0);
+    w.Allocate(0, 0);
   }
-  EXPECT_EQ(14, step);
+  EXPECT_EQ(24, step);
 }
 
 TEST_P(EnvPosixTestWithParam, PosixRandomRWFile) {
-  const std::string path = test::TmpDir(env_) + "/random_rw_file";
+  const std::string path = test::PerThreadDBPath(env_, "random_rw_file");
 
   env_->DeleteFile(path);
 
   std::unique_ptr<RandomRWFile> file;
+
+  // Cannot open non-existing file.
+  ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+
+  // Create the file using WritableFile
+  {
+    std::unique_ptr<WritableFile> wf;
+    ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+  }
+
   ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
 
   char buf[10000];
@@ -1437,10 +1635,22 @@ class RandomRWFileWithMirrorString {
 };
 
 TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) {
-  const std::string path = test::TmpDir(env_) + "/random_rw_file_rand";
+  const std::string path = test::PerThreadDBPath(env_, "random_rw_file_rand");
   env_->DeleteFile(path);
 
-  unique_ptr<RandomRWFile> file;
+  std::unique_ptr<RandomRWFile> file;
+
+#ifdef OS_LINUX
+  // Cannot open non-existing file.
+  ASSERT_NOK(env_->NewRandomRWFile(path, &file, EnvOptions()));
+#endif
+
+  // Create the file using WritableFile
+  {
+    std::unique_ptr<WritableFile> wf;
+    ASSERT_OK(env_->NewWritableFile(path, &wf, EnvOptions()));
+  }
+
   ASSERT_OK(env_->NewRandomRWFile(path, &file, EnvOptions()));
 
   RandomRWFileWithMirrorString file_with_mirror(file.get());
@@ -1470,6 +1680,72 @@ TEST_P(EnvPosixTestWithParam, PosixRandomRWFileRandomized) {
   env_->DeleteFile(path);
 }
 
+class TestEnv : public EnvWrapper {
+ public:
+  explicit TestEnv() : EnvWrapper(Env::Default()),
+                       close_count(0) { }
+
+  class TestLogger : public Logger {
+   public:
+    using Logger::Logv;
+    explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+    ~TestLogger() override {
+      if (!closed_) {
+        CloseHelper();
+      }
+    }
+    void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+   protected:
+    Status CloseImpl() override { return CloseHelper(); }
+
+   private:
+    Status CloseHelper() {
+      env->CloseCountInc();
+      return Status::OK();
+    }
+    TestEnv* env;
+  };
+
+  void CloseCountInc() { close_count++; }
+
+  int GetCloseCount() { return close_count; }
+
+  Status NewLogger(const std::string& /*fname*/,
+                   std::shared_ptr<Logger>* result) override {
+    result->reset(new TestLogger(this));
+    return Status::OK();
+  }
+
+ private:
+  int close_count;
+};
+
+class EnvTest : public testing::Test {};
+
+TEST_F(EnvTest, Close) {
+  TestEnv* env = new TestEnv();
+  std::shared_ptr<Logger> logger;
+  Status s;
+
+  s = env->NewLogger("", &logger);
+  ASSERT_EQ(s, Status::OK());
+  logger.get()->Close();
+  ASSERT_EQ(env->GetCloseCount(), 1);
+  // Call Close() again.
CloseHelper() should not be called again + logger.get()->Close(); + ASSERT_EQ(env->GetCloseCount(), 1); + logger.reset(); + ASSERT_EQ(env->GetCloseCount(), 1); + + s = env->NewLogger("", &logger); + ASSERT_EQ(s, Status::OK()); + logger.reset(); + ASSERT_EQ(env->GetCloseCount(), 2); + + delete env; +} + INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); @@ -1480,8 +1756,8 @@ INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam, #endif // !defined(ROCKSDB_LITE) #if !defined(ROCKSDB_LITE) && !defined(OS_WIN) -static unique_ptr chroot_env(NewChrootEnv(Env::Default(), - test::TmpDir(Env::Default()))); +static std::unique_ptr chroot_env( + NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); INSTANTIATE_TEST_CASE_P( ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(chroot_env.get(), false))); diff --git a/thirdparty/rocksdb/env/io_posix.cc b/thirdparty/rocksdb/env/io_posix.cc index c5b14d3eff..628ed84130 100644 --- a/thirdparty/rocksdb/env/io_posix.cc +++ b/thirdparty/rocksdb/env/io_posix.cc @@ -35,6 +35,11 @@ #include "util/string_util.h" #include "util/sync_point.h" +#if defined(OS_LINUX) && !defined(F_SET_RW_HINT) +#define F_LINUX_SPECIFIC_BASE 1024 +#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) +#endif + namespace rocksdb { // A wrapper for fadvise, if the platform doesn't support fadvise, @@ -43,6 +48,10 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) { #ifdef OS_LINUX return posix_fadvise(fd, offset, len, advice); #else + (void)fd; + (void)offset; + (void)len; + (void)advice; return 0; // simply do nothing. #endif } @@ -74,10 +83,14 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { if (!device_dir.empty() && device_dir.back() == '/') { device_dir.pop_back(); } - // NOTE: sda3 does not have a `queue/` subdir, only the parent sda has it. + // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda + // and nvme0n1 have it. // $ ls -al '/sys/dev/block/8:3' // lrwxrwxrwx. 
1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> // ../../block/sda/sda3 + // $ ls -al '/sys/dev/block/259:4' + // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> + // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); if (parent_end == std::string::npos) { return kDefaultPageSize; @@ -86,8 +99,11 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) { if (parent_begin == std::string::npos) { return kDefaultPageSize; } - if (device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1) != - "block") { + std::string parent = + device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); + std::string child = device_dir.substr(parent_end + 1, std::string::npos); + if (parent != "block" && + (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { device_dir = device_dir.substr(0, parent_end); } std::string fname = device_dir + "/queue/logical_block_size"; @@ -175,16 +191,15 @@ Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) { Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, char* scratch) { - if (use_direct_io()) { - assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); - assert(IsSectorAligned(n, GetRequiredBufferAlignment())); - assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); - } + assert(use_direct_io()); + assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); + assert(IsSectorAligned(n, GetRequiredBufferAlignment())); + assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); + Status s; ssize_t r = -1; size_t left = n; char* ptr = scratch; - assert(use_direct_io()); while (left > 0) { r = pread(fd_, ptr, left, static_cast(offset)); if (r <= 0) { @@ -222,6 +237,8 @@ Status PosixSequentialFile::Skip(uint64_t n) { Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { #ifndef OS_LINUX + (void)offset; + (void)length; return Status::OK(); #else if (!use_direct_io()) { @@ -248,7 +265,6 @@ size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) { struct stat buf; int result = fstat(fd, &buf); - assert(result != -1); if (result == -1) { return 0; } @@ -405,6 +421,8 @@ Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { return Status::OK(); } #ifndef OS_LINUX + (void)offset; + (void)length; return Status::OK(); #else // free OS pages @@ -429,6 +447,9 @@ PosixMmapReadableFile::PosixMmapReadableFile(const int fd, void* base, size_t length, const EnvOptions& options) : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { +#ifdef NDEBUG + (void)options; +#endif fd_ = fd_ + 0; // suppress the warning for used variables assert(options.use_mmap_reads); assert(!options.use_direct_reads); @@ -440,10 +461,11 @@ PosixMmapReadableFile::~PosixMmapReadableFile() { fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", mmapped_region_, length_); } + close(fd_); } Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { + char* /*scratch*/) const { Status s; if (offset > length_) { *result = Slice(); @@ -459,6 +481,8 @@ Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { #ifndef OS_LINUX + (void)offset; + (void)length; return Status::OK(); #else // free OS pages @@ -567,6 +591,8 @@ PosixMmapFile::PosixMmapFile(const 
std::string& fname, int fd, size_t page_size, #ifdef ROCKSDB_FALLOCATE_PRESENT allow_fallocate_ = options.allow_fallocate; fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#else + (void)options; #endif assert((page_size & (page_size - 1)) == 0); assert(options.use_mmap_writes); @@ -667,6 +693,8 @@ uint64_t PosixMmapFile::GetFileSize() { Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { #ifndef OS_LINUX + (void)offset; + (void)length; return Status::OK(); #else // free OS pages @@ -794,9 +822,10 @@ Status PosixWritableFile::Close() { // trim the extra space preallocated at the end of the file // NOTE(ljin): we probably don't want to surface failure as an IOError, // but it will be nice to log these errors. - int dummy __attribute__((unused)); + int dummy __attribute__((__unused__)); dummy = ftruncate(fd_, filesize_); -#if defined(ROCKSDB_FALLOCATE_PRESENT) && !defined(TRAVIS) +#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \ + !defined(TRAVIS) // in some file systems, ftruncate only trims trailing space if the // new file size is smaller than the current size. Calling fallocate // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused @@ -858,11 +887,31 @@ bool PosixWritableFile::IsSyncThreadSafe() const { return true; } uint64_t PosixWritableFile::GetFileSize() { return filesize_; } +void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { +#ifdef OS_LINUX +// Suppress Valgrind "Unimplemented functionality" error. +#ifndef ROCKSDB_VALGRIND_RUN + if (hint == write_hint_) { + return; + } + if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) { + write_hint_ = hint; + } +#else + (void)hint; +#endif // ROCKSDB_VALGRIND_RUN +#else + (void)hint; +#endif // OS_LINUX +} + Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { if (use_direct_io()) { return Status::OK(); } #ifndef OS_LINUX + (void)offset; + (void)length; return Status::OK(); #else // free OS pages @@ -922,7 +971,7 @@ size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { */ PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, - const EnvOptions& options) + const EnvOptions& /*options*/) : filename_(fname), fd_(fd) {} PosixRandomRWFile::~PosixRandomRWFile() { @@ -1010,6 +1059,11 @@ Status PosixRandomRWFile::Close() { return Status::OK(); } +PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { + // TODO should have error handling though not much we can do... 
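An editorial aside on the TODO above: the destructor body that follows calls munmap and discards the result. If error handling were ever added, one hedged sketch of its shape (UnmapWithLogging is an invented helper, not part of the patch):

    #include <cerrno>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>
    #include <sys/mman.h>

    // Report, but otherwise swallow, a failed munmap; a destructor has no
    // good way to propagate the error.
    void UnmapWithLogging(void* base, size_t length) {
      if (munmap(base, length) != 0) {
        fprintf(stderr, "failed to munmap %p length %zu: %s\n", base, length,
                strerror(errno));
      }
    }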
+ munmap(this->base_, length_); +} + /* * PosixDirectory */ diff --git a/thirdparty/rocksdb/env/io_posix.h b/thirdparty/rocksdb/env/io_posix.h index 69c98438f2..e6824d3e87 100644 --- a/thirdparty/rocksdb/env/io_posix.h +++ b/thirdparty/rocksdb/env/io_posix.h @@ -41,6 +41,9 @@ static Status IOError(const std::string& context, const std::string& file_name, strerror(err_number)); case ESTALE: return Status::IOError(Status::kStaleFile); + case ENOENT: + return Status::PathNotFound(IOErrorMsg(context, file_name), + strerror(err_number)); default: return Status::IOError(IOErrorMsg(context, file_name), strerror(err_number)); @@ -132,6 +135,7 @@ class PosixWritableFile : public WritableFile { virtual Status Fsync() override; virtual bool IsSyncThreadSafe() const override; virtual bool use_direct_io() const override { return use_direct_io_; } + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override; virtual uint64_t GetFileSize() override; virtual Status InvalidateCache(size_t offset, size_t length) override; virtual size_t GetRequiredBufferAlignment() const override { @@ -201,7 +205,7 @@ class PosixMmapFile : public WritableFile { // Means Close() will properly take care of truncate // and it does not need any additional information - virtual Status Truncate(uint64_t size) override { return Status::OK(); } + virtual Status Truncate(uint64_t /*size*/) override { return Status::OK(); } virtual Status Close() override; virtual Status Append(const Slice& data) override; virtual Status Flush() override; @@ -235,6 +239,12 @@ class PosixRandomRWFile : public RandomRWFile { int fd_; }; +struct PosixMemoryMappedFileBuffer : public MemoryMappedFileBuffer { + PosixMemoryMappedFileBuffer(void* _base, size_t _length) + : MemoryMappedFileBuffer(_base, _length) {} + virtual ~PosixMemoryMappedFileBuffer(); +}; + class PosixDirectory : public Directory { public: explicit PosixDirectory(int fd) : fd_(fd) {} diff --git a/thirdparty/rocksdb/env/mock_env.cc b/thirdparty/rocksdb/env/mock_env.cc index 669011c4ee..793a0837ab 100644 --- a/thirdparty/rocksdb/env/mock_env.cc +++ b/thirdparty/rocksdb/env/mock_env.cc @@ -72,9 +72,7 @@ class MemFile { } } - uint64_t Size() const { - return size_; - } + uint64_t Size() const { return size_; } void Truncate(size_t size) { MutexLock lock(&mutex_); @@ -94,35 +92,37 @@ class MemFile { uint64_t end = std::min(start + 512, size_.load()); MutexLock lock(&mutex_); for (uint64_t pos = start; pos < end; ++pos) { - data_[pos] = static_cast(rnd_.Uniform(256)); + data_[static_cast(pos)] = static_cast(rnd_.Uniform(256)); } } Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { MutexLock lock(&mutex_); const uint64_t available = Size() - std::min(Size(), offset); + size_t offset_ = static_cast(offset); if (n > available) { - n = available; + n = static_cast(available); } if (n == 0) { *result = Slice(); return Status::OK(); } if (scratch) { - memcpy(scratch, &(data_[offset]), n); + memcpy(scratch, &(data_[offset_]), n); *result = Slice(scratch, n); } else { - *result = Slice(&(data_[offset]), n); + *result = Slice(&(data_[offset_]), n); } return Status::OK(); } Status Write(uint64_t offset, const Slice& data) { MutexLock lock(&mutex_); + size_t offset_ = static_cast(offset); if (offset + data.size() > data_.size()) { - data_.resize(offset + data.size()); + data_.resize(offset_ + data.size()); } - data_.replace(offset, data.size(), data.data(), data.size()); + data_.replace(offset_, data.size(), data.data(), data.size()); size_ = data_.size(); 
modified_time_ = Now(); return Status::OK(); @@ -141,9 +141,7 @@ class MemFile { return Status::OK(); } - uint64_t ModifiedTime() const { - return modified_time_; - } + uint64_t ModifiedTime() const { return modified_time_; } private: uint64_t Now() { @@ -154,9 +152,7 @@ class MemFile { } // Private since only Unref() should be used to delete it. - ~MemFile() { - assert(refs_ == 0); - } + ~MemFile() { assert(refs_ == 0); } // No copying allowed. MemFile(const MemFile&); @@ -187,11 +183,9 @@ class MockSequentialFile : public SequentialFile { file_->Ref(); } - ~MockSequentialFile() { - file_->Unref(); - } + ~MockSequentialFile() override { file_->Unref(); } - virtual Status Read(size_t n, Slice* result, char* scratch) override { + Status Read(size_t n, Slice* result, char* scratch) override { Status s = file_->Read(pos_, n, result, scratch); if (s.ok()) { pos_ += result->size(); @@ -199,15 +193,15 @@ class MockSequentialFile : public SequentialFile { return s; } - virtual Status Skip(uint64_t n) override { + Status Skip(uint64_t n) override { if (pos_ > file_->Size()) { return Status::IOError("pos_ > file_->Size()"); } - const size_t available = file_->Size() - pos_; + const uint64_t available = file_->Size() - pos_; if (n > available) { n = available; } - pos_ += n; + pos_ += static_cast(n); return Status::OK(); } @@ -218,16 +212,12 @@ class MockSequentialFile : public SequentialFile { class MockRandomAccessFile : public RandomAccessFile { public: - explicit MockRandomAccessFile(MemFile* file) : file_(file) { - file_->Ref(); - } + explicit MockRandomAccessFile(MemFile* file) : file_(file) { file_->Ref(); } - ~MockRandomAccessFile() { - file_->Unref(); - } + ~MockRandomAccessFile() override { file_->Unref(); } - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { return file_->Read(offset, n, result, scratch); } @@ -239,22 +229,22 @@ class MockRandomRWFile : public RandomRWFile { public: explicit MockRandomRWFile(MemFile* file) : file_(file) { file_->Ref(); } - ~MockRandomRWFile() { file_->Unref(); } + ~MockRandomRWFile() override { file_->Unref(); } - virtual Status Write(uint64_t offset, const Slice& data) override { + Status Write(uint64_t offset, const Slice& data) override { return file_->Write(offset, data); } - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { return file_->Read(offset, n, result, scratch); } - virtual Status Close() override { return file_->Fsync(); } + Status Close() override { return file_->Fsync(); } - virtual Status Flush() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return file_->Fsync(); } + Status Sync() override { return file_->Fsync(); } private: MemFile* file_; @@ -263,17 +253,14 @@ class MockRandomRWFile : public RandomRWFile { class MockWritableFile : public WritableFile { public: MockWritableFile(MemFile* file, RateLimiter* rate_limiter) - : file_(file), - rate_limiter_(rate_limiter) { + : file_(file), rate_limiter_(rate_limiter) { file_->Ref(); } - ~MockWritableFile() { - file_->Unref(); - } + ~MockWritableFile() override { file_->Unref(); } - virtual Status Append(const Slice& data) override { - uint64_t bytes_written = 0; + Status Append(const Slice& data) override { + size_t bytes_written = 0; while 
(bytes_written < data.size()) { auto bytes = RequestToken(data.size() - bytes_written); Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); @@ -284,23 +271,23 @@ class MockWritableFile : public WritableFile { } return Status::OK(); } - virtual Status Truncate(uint64_t size) override { - file_->Truncate(size); + Status Truncate(uint64_t size) override { + file_->Truncate(static_cast(size)); return Status::OK(); } - virtual Status Close() override { return file_->Fsync(); } + Status Close() override { return file_->Fsync(); } - virtual Status Flush() override { return Status::OK(); } + Status Flush() override { return Status::OK(); } - virtual Status Sync() override { return file_->Fsync(); } + Status Sync() override { return file_->Fsync(); } - virtual uint64_t GetFileSize() override { return file_->Size(); } + uint64_t GetFileSize() override { return file_->Size(); } private: inline size_t RequestToken(size_t bytes) { if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { - bytes = std::min(bytes, - static_cast(rate_limiter_->GetSingleBurstBytes())); + bytes = std::min( + bytes, static_cast(rate_limiter_->GetSingleBurstBytes())); rate_limiter_->Request(bytes, io_priority_); } return bytes; @@ -312,17 +299,14 @@ class MockWritableFile : public WritableFile { class MockEnvDirectory : public Directory { public: - virtual Status Fsync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } }; class MockEnvFileLock : public FileLock { public: - explicit MockEnvFileLock(const std::string& fname) - : fname_(fname) {} + explicit MockEnvFileLock(const std::string& fname) : fname_(fname) {} - std::string FileName() const { - return fname_; - } + std::string FileName() const { return fname_; } private: const std::string fname_; @@ -335,7 +319,7 @@ class TestMemLogger : public Logger { static const uint64_t flush_every_seconds_ = 5; std::atomic_uint_fast64_t last_flush_micros_; Env* env_; - bool flush_pending_; + std::atomic flush_pending_; public: TestMemLogger(std::unique_ptr f, Env* env, @@ -346,10 +330,9 @@ class TestMemLogger : public Logger { last_flush_micros_(0), env_(env), flush_pending_(false) {} - virtual ~TestMemLogger() { - } + ~TestMemLogger() override {} - virtual void Flush() override { + void Flush() override { if (flush_pending_) { flush_pending_ = false; } @@ -357,7 +340,7 @@ class TestMemLogger : public Logger { } using Logger::Logv; - virtual void Logv(const char* format, va_list ap) override { + void Logv(const char* format, va_list ap) override { // We try twice: the first time with a fixed-size stack allocated buffer, // and the second time with a much larger dynamically allocated buffer. 
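The comment above describes the standard two-pass formatting idiom that the Logv body below implements: try a stack buffer first, then retry once with a heap buffer sized from the first pass. A compact self-contained sketch of the same idea (FormatTwice is an invented name, not part of the patch):

    #include <cstdarg>
    #include <cstdio>
    #include <memory>
    #include <string>

    // Two-pass vsnprintf: cheap stack attempt, exact-size heap fallback.
    std::string FormatTwice(const char* format, va_list ap) {
      char stack_buf[500];
      va_list backup;
      va_copy(backup, ap);  // vsnprintf consumes ap; keep a copy for pass two
      int needed = vsnprintf(stack_buf, sizeof(stack_buf), format, ap);
      if (needed < 0) {  // encoding error
        va_end(backup);
        return std::string();
      }
      if (static_cast<size_t>(needed) < sizeof(stack_buf)) {
        va_end(backup);
        return std::string(stack_buf, needed);  // first pass sufficed
      }
      std::unique_ptr<char[]> heap_buf(new char[needed + 1]);
      vsnprintf(heap_buf.get(), needed + 1, format, backup);
      va_end(backup);
      return std::string(heap_buf.get(), needed);
    }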
char buffer[500]; @@ -379,17 +362,12 @@ class TestMemLogger : public Logger { const time_t seconds = now_tv.tv_sec; struct tm t; memset(&t, 0, sizeof(t)); - auto ret __attribute__((__unused__)) = localtime_r(&seconds, &t); + struct tm* ret __attribute__((__unused__)); + ret = localtime_r(&seconds, &t); assert(ret); - p += snprintf(p, limit - p, - "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", - t.tm_year + 1900, - t.tm_mon + 1, - t.tm_mday, - t.tm_hour, - t.tm_min, - t.tm_sec, - static_cast(now_tv.tv_usec)); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec)); // Print the message if (p < limit) { @@ -402,7 +380,7 @@ class TestMemLogger : public Logger { // Truncate to available space if necessary if (p >= limit) { if (iter == 0) { - continue; // Try again with larger buffer + continue; // Try again with larger buffer } else { p = limit - 1; } @@ -419,8 +397,8 @@ class TestMemLogger : public Logger { file_->Append(Slice(base, write_size)); flush_pending_ = true; log_size_ += write_size; - uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + - now_tv.tv_usec; + uint64_t now_micros = + static_cast(now_tv.tv_sec) * 1000000 + now_tv.tv_usec; if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { flush_pending_ = false; last_flush_micros_ = now_micros; @@ -444,14 +422,14 @@ MockEnv::~MockEnv() { } } - // Partial implementation of the Env interface. +// Partial implementation of the Env interface. Status MockEnv::NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { + std::unique_ptr* result, + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -463,12 +441,12 @@ Status MockEnv::NewSequentialFile(const std::string& fname, } Status MockEnv::NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { + std::unique_ptr* result, + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -480,12 +458,12 @@ Status MockEnv::NewRandomAccessFile(const std::string& fname, } Status MockEnv::NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& soptions) { + std::unique_ptr* result, + const EnvOptions& /*soptions*/) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); if (file_map_.find(fn) == file_map_.end()) { - *result = NULL; + *result = nullptr; return Status::IOError(fn, "File not found"); } auto* f = file_map_[fn]; @@ -498,7 +476,7 @@ Status MockEnv::NewRandomRWFile(const std::string& fname, Status MockEnv::ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) { auto s = RenameFile(old_fname, fname); if (!s.ok()) { @@ -509,7 +487,7 @@ Status MockEnv::ReuseWritableFile(const std::string& fname, } Status MockEnv::NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& env_options) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -524,8 +502,8 @@ Status MockEnv::NewWritableFile(const std::string& 
fname, return Status::OK(); } -Status MockEnv::NewDirectory(const std::string& name, - unique_ptr* result) { +Status MockEnv::NewDirectory(const std::string& /*name*/, + std::unique_ptr* result) { result->reset(new MockEnvDirectory()); return Status::OK(); } @@ -540,8 +518,7 @@ Status MockEnv::FileExists(const std::string& fname) { // Now also check if fn exists as a dir for (const auto& iter : file_map_) { const std::string& filename = iter.first; - if (filename.size() >= fn.size() + 1 && - filename[fn.size()] == '/' && + if (filename.size() >= fn.size() + 1 && filename[fn.size()] == '/' && Slice(filename).starts_with(Slice(fn))) { return Status::OK(); } @@ -550,7 +527,7 @@ Status MockEnv::FileExists(const std::string& fname) { } Status MockEnv::GetChildren(const std::string& dir, - std::vector* result) { + std::vector* result) { auto d = NormalizePath(dir); bool found_dir = false; { @@ -566,8 +543,8 @@ Status MockEnv::GetChildren(const std::string& dir, found_dir = true; size_t next_slash = filename.find('/', d.size() + 1); if (next_slash != std::string::npos) { - result->push_back(filename.substr( - d.size() + 1, next_slash - d.size() - 1)); + result->push_back( + filename.substr(d.size() + 1, next_slash - d.size() - 1)); } else { result->push_back(filename.substr(d.size() + 1)); } @@ -598,6 +575,17 @@ Status MockEnv::DeleteFile(const std::string& fname) { return Status::OK(); } +Status MockEnv::Truncate(const std::string& fname, size_t size) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->Truncate(size); + return Status::OK(); +} + Status MockEnv::CreateDir(const std::string& dirname) { auto dn = NormalizePath(dirname); if (file_map_.find(dn) == file_map_.end()) { @@ -632,7 +620,7 @@ Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { } Status MockEnv::GetFileModificationTime(const std::string& fname, - uint64_t* time) { + uint64_t* time) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); @@ -672,7 +660,7 @@ Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { } Status MockEnv::NewLogger(const std::string& fname, - shared_ptr* result) { + std::shared_ptr* result) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); auto iter = file_map_.find(fn); @@ -748,18 +736,6 @@ uint64_t MockEnv::NowNanos() { return EnvWrapper::NowNanos() + fake_sleep_micros_.load() * 1000; } -// Non-virtual functions, specific to MockEnv -Status MockEnv::Truncate(const std::string& fname, size_t size) { - auto fn = NormalizePath(fname); - MutexLock lock(&mutex_); - auto iter = file_map_.find(fn); - if (iter == file_map_.end()) { - return Status::IOError(fn, "File not found"); - } - iter->second->Truncate(size); - return Status::OK(); -} - Status MockEnv::CorruptBuffer(const std::string& fname) { auto fn = NormalizePath(fname); MutexLock lock(&mutex_); @@ -792,7 +768,7 @@ Env* NewMemEnv(Env* base_env) { return new MockEnv(base_env); } #else // ROCKSDB_LITE -Env* NewMemEnv(Env* base_env) { return nullptr; } +Env* NewMemEnv(Env* /*base_env*/) { return nullptr; } #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/env/mock_env.h b/thirdparty/rocksdb/env/mock_env.h index ba1e5fa31e..87b8deaf8c 100644 --- a/thirdparty/rocksdb/env/mock_env.h +++ b/thirdparty/rocksdb/env/mock_env.h @@ -28,28 +28,28 @@ class MockEnv : public EnvWrapper { // Partial implementation of the Env 
interface. virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override; virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& soptions) override; virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override; virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) override; virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& env_options) override; virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override; + std::unique_ptr* result) override; virtual Status FileExists(const std::string& fname) override; @@ -60,6 +60,8 @@ class MockEnv : public EnvWrapper { virtual Status DeleteFile(const std::string& fname) override; + virtual Status Truncate(const std::string& fname, size_t size) override; + virtual Status CreateDir(const std::string& dirname) override; virtual Status CreateDirIfMissing(const std::string& dirname) override; @@ -79,7 +81,7 @@ class MockEnv : public EnvWrapper { const std::string& target) override; virtual Status NewLogger(const std::string& fname, - shared_ptr* result) override; + std::shared_ptr* result) override; virtual Status LockFile(const std::string& fname, FileLock** flock) override; @@ -92,9 +94,6 @@ class MockEnv : public EnvWrapper { virtual uint64_t NowMicros() override; virtual uint64_t NowNanos() override; - // Non-virtual functions, specific to MockEnv - Status Truncate(const std::string& fname, size_t size); - Status CorruptBuffer(const std::string& fname); // Doesn't really sleep, just affects output of GetCurrentTime(), NowMicros() diff --git a/thirdparty/rocksdb/env/mock_env_test.cc b/thirdparty/rocksdb/env/mock_env_test.cc index 19e259ccd8..2daf682e76 100644 --- a/thirdparty/rocksdb/env/mock_env_test.cc +++ b/thirdparty/rocksdb/env/mock_env_test.cc @@ -20,16 +20,14 @@ class MockEnvTest : public testing::Test { MockEnvTest() : env_(new MockEnv(Env::Default())) { } - ~MockEnvTest() { - delete env_; - } + ~MockEnvTest() override { delete env_; } }; TEST_F(MockEnvTest, Corrupt) { const std::string kGood = "this is a good string, synced to disk"; const std::string kCorrupted = "this part may be corrupted"; const std::string kFileName = "/dir/f"; - unique_ptr writable_file; + std::unique_ptr writable_file; ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_)); ASSERT_OK(writable_file->Append(kGood)); ASSERT_TRUE(writable_file->GetFileSize() == kGood.size()); @@ -37,7 +35,7 @@ TEST_F(MockEnvTest, Corrupt) { std::string scratch; scratch.resize(kGood.size() + kCorrupted.size() + 16); Slice result; - unique_ptr rand_file; + std::unique_ptr rand_file; ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); ASSERT_EQ(result.compare(kGood), 0); diff --git a/thirdparty/rocksdb/env/posix_logger.h b/thirdparty/rocksdb/env/posix_logger.h index 3ec6f574a3..401df6a3ff 100644 --- a/thirdparty/rocksdb/env/posix_logger.h +++ b/thirdparty/rocksdb/env/posix_logger.h @@ -24,6 +24,7 @@ #endif #include +#include "env/io_posix.h" #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" #include "util/sync_point.h" @@ -32,6 +33,15 @@ 
namespace rocksdb { class PosixLogger : public Logger { private: + Status PosixCloseHelper() { + int ret; + + ret = fclose(file_); + if (ret) { + return IOError("Unable to close log file", "", ret); + } + return Status::OK(); + } FILE* file_; uint64_t (*gettid_)(); // Return the thread id for the current thread std::atomic_size_t log_size_; @@ -40,6 +50,10 @@ class PosixLogger : public Logger { std::atomic_uint_fast64_t last_flush_micros_; Env* env_; std::atomic flush_pending_; + + protected: + virtual Status CloseImpl() override { return PosixCloseHelper(); } + public: PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env, const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) @@ -52,7 +66,10 @@ class PosixLogger : public Logger { env_(env), flush_pending_(false) {} virtual ~PosixLogger() { - fclose(file_); + if (!closed_) { + closed_ = true; + PosixCloseHelper(); + } } virtual void Flush() override { TEST_SYNC_POINT("PosixLogger::Flush:Begin1"); @@ -148,7 +165,6 @@ class PosixLogger : public Logger { size_t sz = fwrite(base, 1, write_size, file_); flush_pending_ = true; - assert(sz == write_size); if (sz > 0) { log_size_ += write_size; } diff --git a/thirdparty/rocksdb/examples/.gitignore b/thirdparty/rocksdb/examples/.gitignore index b5a05e44a2..823664ae1f 100644 --- a/thirdparty/rocksdb/examples/.gitignore +++ b/thirdparty/rocksdb/examples/.gitignore @@ -2,6 +2,7 @@ c_simple_example column_families_example compact_files_example compaction_filter_example +multi_processes_example optimistic_transaction_example options_file_example simple_example diff --git a/thirdparty/rocksdb/examples/Makefile b/thirdparty/rocksdb/examples/Makefile index 57cd1a75a1..27a6f0f421 100644 --- a/thirdparty/rocksdb/examples/Makefile +++ b/thirdparty/rocksdb/examples/Makefile @@ -43,8 +43,11 @@ transaction_example: librocksdb transaction_example.cc options_file_example: librocksdb options_file_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) +multi_processes_example: librocksdb multi_processes_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example + rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example librocksdb: cd .. && $(MAKE) static_lib diff --git a/thirdparty/rocksdb/examples/multi_processes_example.cc b/thirdparty/rocksdb/examples/multi_processes_example.cc new file mode 100644 index 0000000000..b1c1d02ba2 --- /dev/null +++ b/thirdparty/rocksdb/examples/multi_processes_example.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// How to use this example +// Open two terminals, in one of them, run `./multi_processes_example 0` to +// start a process running the primary instance. This will create a new DB in +// kDBPath. 
The process will run for a while inserting keys to the normal +// RocksDB database. +// Next, go to the other terminal and run `./multi_processes_example 1` to +// start a process running the secondary instance. This will create a secondary +// instance following the aforementioned primary instance. This process will +// run for a while, tailing the logs of the primary. After process with primary +// instance exits, this process will keep running until you hit 'CTRL+C'. + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(OS_LINUX) +#include +#include +#include +#include +#include +#include +#endif // !OS_LINUX + +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" + +using rocksdb::ColumnFamilyDescriptor; +using rocksdb::ColumnFamilyHandle; +using rocksdb::ColumnFamilyOptions; +using rocksdb::DB; +using rocksdb::FlushOptions; +using rocksdb::Iterator; +using rocksdb::Options; +using rocksdb::ReadOptions; +using rocksdb::Slice; +using rocksdb::Status; +using rocksdb::WriteOptions; + +const std::string kDBPath = "/tmp/rocksdb_multi_processes_example"; +const std::string kPrimaryStatusFile = + "/tmp/rocksdb_multi_processes_example_primary_status"; +const uint64_t kMaxKey = 600000; +const size_t kMaxValueLength = 256; +const size_t kNumKeysPerFlush = 1000; + +const std::vector& GetColumnFamilyNames() { + static std::vector column_family_names = { + rocksdb::kDefaultColumnFamilyName, "pikachu"}; + return column_family_names; +} + +inline bool IsLittleEndian() { + uint32_t x = 1; + return *reinterpret_cast(&x) != 0; +} + +static std::atomic& ShouldSecondaryWait() { + static std::atomic should_secondary_wait{1}; + return should_secondary_wait; +} + +static std::string Key(uint64_t k) { + std::string ret; + if (IsLittleEndian()) { + ret.append(reinterpret_cast(&k), sizeof(k)); + } else { + char buf[sizeof(k)]; + buf[0] = k & 0xff; + buf[1] = (k >> 8) & 0xff; + buf[2] = (k >> 16) & 0xff; + buf[3] = (k >> 24) & 0xff; + buf[4] = (k >> 32) & 0xff; + buf[5] = (k >> 40) & 0xff; + buf[6] = (k >> 48) & 0xff; + buf[7] = (k >> 56) & 0xff; + ret.append(buf, sizeof(k)); + } + size_t i = 0, j = ret.size() - 1; + while (i < j) { + char tmp = ret[i]; + ret[i] = ret[j]; + ret[j] = tmp; + ++i; + --j; + } + return ret; +} + +static uint64_t Key(std::string key) { + assert(key.size() == sizeof(uint64_t)); + size_t i = 0, j = key.size() - 1; + while (i < j) { + char tmp = key[i]; + key[i] = key[j]; + key[j] = tmp; + ++i; + --j; + } + uint64_t ret = 0; + if (IsLittleEndian()) { + memcpy(&ret, key.c_str(), sizeof(uint64_t)); + } else { + const char* buf = key.c_str(); + ret |= static_cast(buf[0]); + ret |= (static_cast(buf[1]) << 8); + ret |= (static_cast(buf[2]) << 16); + ret |= (static_cast(buf[3]) << 24); + ret |= (static_cast(buf[4]) << 32); + ret |= (static_cast(buf[5]) << 40); + ret |= (static_cast(buf[6]) << 48); + ret |= (static_cast(buf[7]) << 56); + } + return ret; +} + +static Slice GenerateRandomValue(const size_t max_length, char scratch[]) { + size_t sz = 1 + (std::rand() % max_length); + int rnd = std::rand(); + for (size_t i = 0; i != sz; ++i) { + scratch[i] = static_cast(rnd ^ i); + } + return Slice(scratch, sz); +} + +static bool ShouldCloseDB() { return true; } + +// TODO: port this example to other systems. It should be straightforward for +// POSIX-compliant systems. 
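One detail of the example above worth spelling out: the byte reversal in the two Key() helpers stores integers big-endian, so that bytewise key comparison matches numeric order. A self-contained sanity check of that property (BigEndianKey is an invented stand-in for the example's Key(uint64_t); sketch only):

    #include <cassert>
    #include <cstdint>
    #include <string>

    static std::string BigEndianKey(uint64_t k) {
      std::string ret(sizeof(k), '\0');
      for (size_t i = 0; i < sizeof(k); ++i) {
        // Most significant byte first, so lexicographic == numeric order.
        ret[i] = static_cast<char>((k >> (8 * (sizeof(k) - 1 - i))) & 0xff);
      }
      return ret;
    }

    int main() {
      assert(BigEndianKey(1) < BigEndianKey(256));
      assert(BigEndianKey(0xff) < BigEndianKey(0x100));
      return 0;
    }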
+#if defined(OS_LINUX) +void CreateDB() { + long my_pid = static_cast(getpid()); + Options options; + Status s = rocksdb::DestroyDB(kDBPath, options); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to destroy DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + options.create_if_missing = true; + DB* db = nullptr; + s = DB::Open(options, kDBPath, &db); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + std::vector handles; + ColumnFamilyOptions cf_opts(options); + for (const auto& cf_name : GetColumnFamilyNames()) { + if (rocksdb::kDefaultColumnFamilyName != cf_name) { + ColumnFamilyHandle* handle = nullptr; + s = db->CreateColumnFamily(cf_opts, cf_name, &handle); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to create CF %s: %s\n", my_pid, + cf_name.c_str(), s.ToString().c_str()); + assert(false); + } + handles.push_back(handle); + } + } + fprintf(stdout, "[process %ld] Column families created\n", my_pid); + for (auto h : handles) { + delete h; + } + handles.clear(); + delete db; +} + +void RunPrimary() { + long my_pid = static_cast(getpid()); + fprintf(stdout, "[process %ld] Primary instance starts\n", my_pid); + CreateDB(); + std::srand(time(nullptr)); + DB* db = nullptr; + Options options; + options.create_if_missing = false; + std::vector column_families; + for (const auto& cf_name : GetColumnFamilyNames()) { + column_families.push_back(ColumnFamilyDescriptor(cf_name, options)); + } + std::vector handles; + WriteOptions write_opts; + char val_buf[kMaxValueLength] = {0}; + uint64_t curr_key = 0; + while (curr_key < kMaxKey) { + Status s; + if (nullptr == db) { + s = DB::Open(options, kDBPath, column_families, &handles, &db); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to open DB: %s\n", my_pid, + s.ToString().c_str()); + assert(false); + } + } + assert(nullptr != db); + assert(handles.size() == GetColumnFamilyNames().size()); + for (auto h : handles) { + assert(nullptr != h); + for (size_t i = 0; i != kNumKeysPerFlush; ++i) { + Slice key = Key(curr_key + static_cast(i)); + Slice value = GenerateRandomValue(kMaxValueLength, val_buf); + s = db->Put(write_opts, h, key, value); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to insert\n", my_pid); + assert(false); + } + } + s = db->Flush(FlushOptions(), h); + if (!s.ok()) { + fprintf(stderr, "[process %ld] Failed to flush\n", my_pid); + assert(false); + } + } + curr_key += static_cast(kNumKeysPerFlush); + if (ShouldCloseDB()) { + for (auto h : handles) { + delete h; + } + handles.clear(); + delete db; + db = nullptr; + } + } + if (nullptr != db) { + for (auto h : handles) { + delete h; + } + handles.clear(); + delete db; + db = nullptr; + } + fprintf(stdout, "[process %ld] Finished adding keys\n", my_pid); +} + +void secondary_instance_sigint_handler(int signal) { + ShouldSecondaryWait().store(0, std::memory_order_relaxed); + fprintf(stdout, "\n"); + fflush(stdout); +}; + +void RunSecondary() { + ::signal(SIGINT, secondary_instance_sigint_handler); + long my_pid = static_cast(getpid()); + const std::string kSecondaryPath = + "/tmp/rocksdb_multi_processes_example_secondary"; + // Create directory if necessary + if (nullptr == opendir(kSecondaryPath.c_str())) { + int ret = + mkdir(kSecondaryPath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); + if (ret < 0) { + perror("failed to create directory for secondary instance"); + exit(0); + } + } + DB* db = nullptr; + Options options; + options.create_if_missing 
= false;
+  options.max_open_files = -1;
+  Status s = DB::OpenAsSecondary(options, kDBPath, kSecondaryPath, &db);
+  if (!s.ok()) {
+    fprintf(stderr, "[process %ld] Failed to open in secondary mode: %s\n",
+            my_pid, s.ToString().c_str());
+    assert(false);
+  } else {
+    fprintf(stdout, "[process %ld] Secondary instance starts\n", my_pid);
+  }
+
+  ReadOptions ropts;
+  ropts.verify_checksums = true;
+  ropts.total_order_seek = true;
+
+  std::vector<std::thread> test_threads;
+  test_threads.emplace_back([&]() {
+    while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+      std::unique_ptr<Iterator> iter(db->NewIterator(ropts));
+      iter->SeekToFirst();
+      size_t count = 0;
+      for (; iter->Valid(); iter->Next()) {
+        ++count;
+      }
+    }
+    fprintf(stdout, "[process %ld] Range_scan thread finished\n", my_pid);
+  });
+
+  test_threads.emplace_back([&]() {
+    std::srand(time(nullptr));
+    while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+      // Keep the key alive for the Get(): a Slice bound to the temporary
+      // returned by Key() would dangle.
+      std::string key = Key(std::rand() % kMaxKey);
+      std::string value;
+      db->Get(ropts, key, &value);
+    }
+    fprintf(stdout, "[process %ld] Point lookup thread finished\n", my_pid);
+  });
+
+  uint64_t curr_key = 0;
+  while (1 == ShouldSecondaryWait().load(std::memory_order_relaxed)) {
+    s = db->TryCatchUpWithPrimary();
+    if (!s.ok()) {
+      fprintf(stderr,
+              "[process %ld] error while trying to catch up with "
+              "primary %s\n",
+              my_pid, s.ToString().c_str());
+      assert(false);
+    }
+    {
+      std::unique_ptr<Iterator> iter(db->NewIterator(ropts));
+      if (!iter) {
+        fprintf(stderr, "[process %ld] Failed to create iterator\n", my_pid);
+        assert(false);
+      }
+      iter->SeekToLast();
+      if (iter->Valid()) {
+        uint64_t curr_max_key = Key(iter->key().ToString());
+        if (curr_max_key != curr_key) {
+          fprintf(stdout, "[process %ld] Observed key %" PRIu64 "\n", my_pid,
+                  curr_key);
+          curr_key = curr_max_key;
+        }
+      }
+    }
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+  }
+  s = db->TryCatchUpWithPrimary();
+  if (!s.ok()) {
+    fprintf(stderr,
+            "[process %ld] error while trying to catch up with "
+            "primary %s\n",
+            my_pid, s.ToString().c_str());
+    assert(false);
+  }
+
+  std::vector<ColumnFamilyDescriptor> column_families;
+  for (const auto& cf_name : GetColumnFamilyNames()) {
+    column_families.push_back(ColumnFamilyDescriptor(cf_name, options));
+  }
+  std::vector<ColumnFamilyHandle*> handles;
+  DB* verification_db = nullptr;
+  s = DB::OpenForReadOnly(options, kDBPath, column_families, &handles,
+                          &verification_db);
+  assert(s.ok());
+  Iterator* iter1 = verification_db->NewIterator(ropts);
+  iter1->SeekToFirst();
+
+  Iterator* iter = db->NewIterator(ropts);
+  iter->SeekToFirst();
+  for (; iter->Valid() && iter1->Valid(); iter->Next(), iter1->Next()) {
+    if (iter->key().ToString() != iter1->key().ToString()) {
+      fprintf(stderr, "%" PRIu64 " != %" PRIu64 "\n",
+              Key(iter->key().ToString()), Key(iter1->key().ToString()));
+      assert(false);
+    } else if (iter->value().ToString() != iter1->value().ToString()) {
+      fprintf(stderr, "Value mismatch\n");
+      assert(false);
+    }
+  }
+  fprintf(stdout, "[process %ld] Verification succeeded\n", my_pid);
+  for (auto& thr : test_threads) {
+    thr.join();
+  }
+  delete iter;
+  delete iter1;
+  delete db;
+  delete verification_db;
+}
+
+int main(int argc, char** argv) {
+  if (argc < 2) {
+    fprintf(stderr, "%s <0 for primary, 1 for secondary>\n", argv[0]);
+    return 0;
+  }
+  if (atoi(argv[1]) == 0) {
+    RunPrimary();
+  } else {
+    RunSecondary();
+  }
+  return 0;
+}
+#else  // OS_LINUX
+int main() {
+  fprintf(stderr, "Not implemented.\n");
+  return 0;
+}
+#endif  // !OS_LINUX
diff --git a/thirdparty/rocksdb/examples/rocksdb_option_file_example.ini
b/thirdparty/rocksdb/examples/rocksdb_option_file_example.ini index 8e07131b39..351f1ed010 100644
--- a/thirdparty/rocksdb/examples/rocksdb_option_file_example.ini
+++ b/thirdparty/rocksdb/examples/rocksdb_option_file_example.ini
@@ -138,6 +138,7 @@
   block_restart_interval=16
   cache_index_and_filter_blocks=false
   pin_l0_filter_and_index_blocks_in_cache=false
+  pin_top_level_index_and_filter=false
   index_type=kBinarySearch
   hash_index_allow_collision=true
   flush_block_policy_factory=FlushBlockBySizePolicyFactory
diff --git a/thirdparty/rocksdb/hdfs/env_hdfs.h b/thirdparty/rocksdb/hdfs/env_hdfs.h
index 3a62bc8cb9..903e32ef92 100644
--- a/thirdparty/rocksdb/hdfs/env_hdfs.h
+++ b/thirdparty/rocksdb/hdfs/env_hdfs.h
@@ -54,110 +54,109 @@ class HdfsEnv : public Env {
     hdfsDisconnect(fileSys_);
   }

-  virtual Status NewSequentialFile(const std::string& fname,
-                                   std::unique_ptr<SequentialFile>* result,
-                                   const EnvOptions& options);
+  Status NewSequentialFile(const std::string& fname,
+                           std::unique_ptr<SequentialFile>* result,
+                           const EnvOptions& options) override;

-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     std::unique_ptr<RandomAccessFile>* result,
-                                     const EnvOptions& options);
+  Status NewRandomAccessFile(const std::string& fname,
+                             std::unique_ptr<RandomAccessFile>* result,
+                             const EnvOptions& options) override;

-  virtual Status NewWritableFile(const std::string& fname,
-                                 std::unique_ptr<WritableFile>* result,
-                                 const EnvOptions& options);
+  Status NewWritableFile(const std::string& fname,
+                         std::unique_ptr<WritableFile>* result,
+                         const EnvOptions& options) override;

-  virtual Status NewDirectory(const std::string& name,
-                              std::unique_ptr<Directory>* result);
+  Status NewDirectory(const std::string& name,
+                      std::unique_ptr<Directory>* result) override;

-  virtual Status FileExists(const std::string& fname);
+  Status FileExists(const std::string& fname) override;

-  virtual Status GetChildren(const std::string& path,
-                             std::vector<std::string>* result);
+  Status GetChildren(const std::string& path,
+                     std::vector<std::string>* result) override;

-  virtual Status DeleteFile(const std::string& fname);
+  Status DeleteFile(const std::string& fname) override;

-  virtual Status CreateDir(const std::string& name);
+  Status CreateDir(const std::string& name) override;

-  virtual Status CreateDirIfMissing(const std::string& name);
+  Status CreateDirIfMissing(const std::string& name) override;

-  virtual Status DeleteDir(const std::string& name);
+  Status DeleteDir(const std::string& name) override;

-  virtual Status GetFileSize(const std::string& fname, uint64_t* size);
+  Status GetFileSize(const std::string& fname, uint64_t* size) override;

-  virtual Status GetFileModificationTime(const std::string& fname,
-                                         uint64_t* file_mtime);
+  Status GetFileModificationTime(const std::string& fname,
+                                 uint64_t* file_mtime) override;

-  virtual Status RenameFile(const std::string& src, const std::string& target);
+  Status RenameFile(const std::string& src, const std::string& target) override;

-  virtual Status LinkFile(const std::string& src, const std::string& target) {
+  Status LinkFile(const std::string& /*src*/,
+                  const std::string& /*target*/) override {
     return Status::NotSupported(); // not supported
   }

-  virtual Status LockFile(const std::string& fname, FileLock** lock);
+  Status LockFile(const std::string& fname, FileLock** lock) override;

-  virtual Status UnlockFile(FileLock* lock);
+  Status UnlockFile(FileLock* lock) override;

-  virtual Status NewLogger(const std::string& fname,
-                           std::shared_ptr<Logger>* result);
+  Status NewLogger(const std::string& fname,
+                   std::shared_ptr<Logger>* result) override;

-  virtual void Schedule(void (*function)(void* arg), void* arg,
-                        Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = 0) {
+  void Schedule(void (*function)(void* arg), void* arg, Priority pri = LOW,
+                void* tag = nullptr,
+                void (*unschedFunction)(void* arg) = 0) override {
     posixEnv->Schedule(function, arg, pri, tag, unschedFunction);
   }

-  virtual int UnSchedule(void* tag, Priority pri) {
+  int UnSchedule(void* tag, Priority pri) override {
     return posixEnv->UnSchedule(tag, pri);
   }

-  virtual void StartThread(void (*function)(void* arg), void* arg) {
+  void StartThread(void (*function)(void* arg), void* arg) override {
     posixEnv->StartThread(function, arg);
   }

-  virtual void WaitForJoin() { posixEnv->WaitForJoin(); }
+  void WaitForJoin() override { posixEnv->WaitForJoin(); }

-  virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const
-      override {
+  unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override {
     return posixEnv->GetThreadPoolQueueLen(pri);
   }

-  virtual Status GetTestDirectory(std::string* path) {
+  Status GetTestDirectory(std::string* path) override {
     return posixEnv->GetTestDirectory(path);
   }

-  virtual uint64_t NowMicros() {
-    return posixEnv->NowMicros();
-  }
+  uint64_t NowMicros() override { return posixEnv->NowMicros(); }

-  virtual void SleepForMicroseconds(int micros) {
+  void SleepForMicroseconds(int micros) override {
     posixEnv->SleepForMicroseconds(micros);
   }

-  virtual Status GetHostName(char* name, uint64_t len) {
+  Status GetHostName(char* name, uint64_t len) override {
     return posixEnv->GetHostName(name, len);
   }

-  virtual Status GetCurrentTime(int64_t* unix_time) {
+  Status GetCurrentTime(int64_t* unix_time) override {
     return posixEnv->GetCurrentTime(unix_time);
   }

-  virtual Status GetAbsolutePath(const std::string& db_path,
-                                 std::string* output_path) {
+  Status GetAbsolutePath(const std::string& db_path,
+                         std::string* output_path) override {
     return posixEnv->GetAbsolutePath(db_path, output_path);
   }

-  virtual void SetBackgroundThreads(int number, Priority pri = LOW) {
+  void SetBackgroundThreads(int number, Priority pri = LOW) override {
     posixEnv->SetBackgroundThreads(number, pri);
   }

-  virtual int GetBackgroundThreads(Priority pri = LOW) {
+  int GetBackgroundThreads(Priority pri = LOW) override {
     return posixEnv->GetBackgroundThreads(pri);
   }

-  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
+  void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
     posixEnv->IncBackgroundThreadsIfNeeded(number, pri);
   }

-  virtual std::string TimeToString(uint64_t number) {
+  std::string TimeToString(uint64_t number) override {
     return posixEnv->TimeToString(number);
   }

@@ -166,9 +165,7 @@ class HdfsEnv : public Env {
     return (uint64_t)pthread_self();
   }

-  virtual uint64_t GetThreadID() const override {
-    return HdfsEnv::gettid();
-  }
+  uint64_t GetThreadID() const override { return HdfsEnv::gettid(); }

  private:
   std::string fsname_;  // string of the form "hdfs://hostname:port/"

@@ -206,7 +203,7 @@ class HdfsEnv : public Env {
     std::string host(parts[0]);
     std::string remaining(parts[1]);

-    int rem = remaining.find(pathsep);
+    int rem = static_cast<int>(remaining.find(pathsep));
     std::string portStr = (rem == 0 ? remaining : remaining.substr(0, rem));

@@ -245,7 +242,7 @@ static const Status notsup;

 class HdfsEnv : public Env {

  public:
-  explicit HdfsEnv(const std::string& fsname) {
+  explicit HdfsEnv(const std::string& /*fsname*/) {
     fprintf(stderr, "You have not build rocksdb with HDFS support\n");
     fprintf(stderr, "Please see hdfs/README for details\n");
     abort();
@@ -255,115 +252,129 @@ class HdfsEnv : public Env {
   }

   virtual Status NewSequentialFile(const std::string& fname,
-                                   unique_ptr<SequentialFile>* result,
+                                   std::unique_ptr<SequentialFile>* result,
                                    const EnvOptions& options) override;

-  virtual Status NewRandomAccessFile(const std::string& fname,
-                                     unique_ptr<RandomAccessFile>* result,
-                                     const EnvOptions& options) override {
+  virtual Status NewRandomAccessFile(
+      const std::string& /*fname*/,
+      std::unique_ptr<RandomAccessFile>* /*result*/,
+      const EnvOptions& /*options*/) override {
     return notsup;
   }

-  virtual Status NewWritableFile(const std::string& fname,
-                                 unique_ptr<WritableFile>* result,
-                                 const EnvOptions& options) override {
+  virtual Status NewWritableFile(const std::string& /*fname*/,
+                                 std::unique_ptr<WritableFile>* /*result*/,
+                                 const EnvOptions& /*options*/) override {
     return notsup;
   }

-  virtual Status NewDirectory(const std::string& name,
-                              unique_ptr<Directory>* result) override {
+  virtual Status NewDirectory(const std::string& /*name*/,
+                              std::unique_ptr<Directory>* /*result*/) override {
     return notsup;
   }

-  virtual Status FileExists(const std::string& fname) override {
+  virtual Status FileExists(const std::string& /*fname*/) override {
     return notsup;
   }

-  virtual Status GetChildren(const std::string& path,
-                             std::vector<std::string>* result) override {
+  virtual Status GetChildren(const std::string& /*path*/,
+                             std::vector<std::string>* /*result*/) override {
     return notsup;
   }

-  virtual Status DeleteFile(const std::string& fname) override {
+  virtual Status DeleteFile(const std::string& /*fname*/) override {
     return notsup;
   }

-  virtual Status CreateDir(const std::string& name) override { return notsup; }
+  virtual Status CreateDir(const std::string& /*name*/) override {
+    return notsup;
+  }

-  virtual Status CreateDirIfMissing(const std::string& name) override {
+  virtual Status CreateDirIfMissing(const std::string& /*name*/) override {
     return notsup;
   }

-  virtual Status DeleteDir(const std::string& name) override { return notsup; }
+  virtual Status DeleteDir(const std::string& /*name*/) override {
+    return notsup;
+  }

-  virtual Status GetFileSize(const std::string& fname,
-                             uint64_t* size) override {
+  virtual Status GetFileSize(const std::string& /*fname*/,
+                             uint64_t* /*size*/) override {
     return notsup;
   }

-  virtual Status GetFileModificationTime(const std::string& fname,
-                                         uint64_t* time) override {
+  virtual Status GetFileModificationTime(const std::string& /*fname*/,
+                                         uint64_t* /*time*/) override {
     return notsup;
   }

-  virtual Status RenameFile(const std::string& src,
-                            const std::string& target) override {
+  virtual Status RenameFile(const std::string& /*src*/,
+                            const std::string& /*target*/) override {
     return notsup;
   }

-  virtual Status LinkFile(const std::string& src,
-                          const std::string& target) override {
+  virtual Status LinkFile(const std::string& /*src*/,
+                          const std::string& /*target*/) override {
     return notsup;
   }

-  virtual Status LockFile(const std::string& fname, FileLock** lock) override {
+  virtual Status LockFile(const std::string& /*fname*/,
+                          FileLock** /*lock*/) override {
     return notsup;
   }

-  virtual Status UnlockFile(FileLock* lock) override { return notsup; }
+  virtual Status UnlockFile(FileLock* /*lock*/) override { return notsup; }

-  virtual Status NewLogger(const std::string& fname,
-                           shared_ptr<Logger>* result) override {
+  virtual Status NewLogger(const std::string& /*fname*/,
+                           std::shared_ptr<Logger>* /*result*/) override {
     return notsup;
   }

-  virtual void Schedule(void (*function)(void* arg), void* arg,
-                        Priority pri = LOW, void* tag = nullptr,
-                        void (*unschedFunction)(void* arg) = 0) override {}
+  virtual void Schedule(void (* /*function*/)(void* arg), void* /*arg*/,
+                        Priority /*pri*/ = LOW, void* /*tag*/ = nullptr,
+                        void (* /*unschedFunction*/)(void* arg) = 0) override {}

-  virtual int UnSchedule(void* tag, Priority pri) override { return 0; }
+  virtual int UnSchedule(void* /*tag*/, Priority /*pri*/) override { return 0; }

-  virtual void StartThread(void (*function)(void* arg), void* arg) override {}
+  virtual void StartThread(void (* /*function*/)(void* arg),
+                           void* /*arg*/) override {}

   virtual void WaitForJoin() override {}

   virtual unsigned int GetThreadPoolQueueLen(
-      Priority pri = LOW) const override {
+      Priority /*pri*/ = LOW) const override {
     return 0;
   }

-  virtual Status GetTestDirectory(std::string* path) override { return notsup; }
+  virtual Status GetTestDirectory(std::string* /*path*/) override {
+    return notsup;
+  }

   virtual uint64_t NowMicros() override { return 0; }

-  virtual void SleepForMicroseconds(int micros) override {}
+  virtual void SleepForMicroseconds(int /*micros*/) override {}

-  virtual Status GetHostName(char* name, uint64_t len) override {
+  virtual Status GetHostName(char* /*name*/, uint64_t /*len*/) override {
     return notsup;
   }

-  virtual Status GetCurrentTime(int64_t* unix_time) override { return notsup; }
+  virtual Status GetCurrentTime(int64_t* /*unix_time*/) override {
+    return notsup;
+  }

-  virtual Status GetAbsolutePath(const std::string& db_path,
-                                 std::string* outputpath) override {
+  virtual Status GetAbsolutePath(const std::string& /*db_path*/,
+                                 std::string* /*outputpath*/) override {
     return notsup;
   }

-  virtual void SetBackgroundThreads(int number, Priority pri = LOW) override {}
-  virtual int GetBackgroundThreads(Priority pri = LOW) override { return 0; }
-  virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override {
+  virtual void SetBackgroundThreads(int /*number*/,
+                                    Priority /*pri*/ = LOW) override {}
+  virtual int GetBackgroundThreads(Priority /*pri*/ = LOW) override {
+    return 0;
   }
-  virtual std::string TimeToString(uint64_t number) override { return ""; }
+  virtual void IncBackgroundThreadsIfNeeded(int /*number*/,
+                                            Priority /*pri*/) override {}
+  virtual std::string TimeToString(uint64_t /*number*/) override { return ""; }

   virtual uint64_t GetThreadID() const override {
     return 0;
diff --git a/thirdparty/rocksdb/hdfs/setup.sh b/thirdparty/rocksdb/hdfs/setup.sh
old mode 100644
new mode 100755
index ac69b525df..ba76ec2090
--- a/thirdparty/rocksdb/hdfs/setup.sh
+++ b/thirdparty/rocksdb/hdfs/setup.sh
@@ -1,7 +1,8 @@
+# shellcheck disable=SC2148
 export USE_HDFS=1
-export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server:$JAVA_HOME/jre/lib/amd64:/usr/lib/hadoop/lib/native
+export LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server:$JAVA_HOME/jre/lib/amd64:$HADOOP_HOME/lib/native

-export CLASSPATH=
+export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob`
 for f in `find /usr/lib/hadoop-hdfs | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
 for f in `find /usr/lib/hadoop | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
 for f in `find /usr/lib/hadoop/client | grep jar`; do export CLASSPATH=$CLASSPATH:$f; done
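The env_hdfs.h hunks above replace redundant `virtual` keywords with `override` on HdfsEnv's delegating methods. A minimal sketch of the idiom with toy types (not the real rocksdb::Env API): with `override`, any drift between a wrapper and its base-class signature becomes a compile error instead of a silently unrelated virtual function.

```cpp
#include <string>

struct Status {};  // toy stand-in

class Env {
 public:
  virtual ~Env() = default;
  virtual Status FileExists(const std::string& fname) = 0;
};

// Same delegation pattern as HdfsEnv: forward every call to another Env.
class DelegatingEnv : public Env {
 public:
  explicit DelegatingEnv(Env* base) : base_(base) {}
  // `override` (without `virtual`) is checked against Env at compile time.
  Status FileExists(const std::string& fname) override {
    return base_->FileExists(fname);
  }

 private:
  Env* base_;  // not owned
};
```

diff --git a/thirdparty/rocksdb/include/rocksdb/advanced_options.h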
b/thirdparty/rocksdb/include/rocksdb/advanced_options.h index 6f45134a68..b7ab7c584b 100644 --- a/thirdparty/rocksdb/include/rocksdb/advanced_options.h +++ b/thirdparty/rocksdb/include/rocksdb/advanced_options.h @@ -62,13 +62,6 @@ struct CompactionOptionsFIFO { // Default: 1GB uint64_t max_table_files_size; - // Drop files older than TTL. TTL based deletion will take precedence over - // size based deletion if ttl > 0. - // delete if sst_file_creation_time < (current_time - ttl) - // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 - // Default: 0 (disabled) - uint64_t ttl = 0; - // If true, try to do compaction to compact smaller files into larger ones. // Minimum files to compact follows options.level0_file_num_compaction_trigger // and compaction won't trigger if average compact bytes per del file is @@ -78,35 +71,78 @@ struct CompactionOptionsFIFO { bool allow_compaction = false; CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} - CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction, - uint64_t _ttl = 0) + CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) : max_table_files_size(_max_table_files_size), - ttl(_ttl), allow_compaction(_allow_compaction) {} }; // Compression options for different compression algorithms like Zlib struct CompressionOptions { + // RocksDB's generic default compression level. Internally it'll be translated + // to the default compression level specific to the library being used (see + // comment above `ColumnFamilyOptions::compression`). + // + // The default value is the max 16-bit int as it'll be written out in OPTIONS + // file, which should be portable. + const static int kDefaultCompressionLevel = 32767; + int window_bits; int level; int strategy; - // Maximum size of dictionary used to prime the compression library. Currently - // this dictionary will be constructed by sampling the first output file in a - // subcompaction when the target level is bottommost. This dictionary will be - // loaded into the compression library before compressing/uncompressing each - // data block of subsequent files in the subcompaction. Effectively, this - // improves compression ratios when there are repetitions across data blocks. - // A value of 0 indicates the feature is disabled. + + // Maximum size of dictionaries used to prime the compression library. + // Enabling dictionary can improve compression ratios when there are + // repetitions across data blocks. + // + // The dictionary is created by sampling the SST file data. If + // `zstd_max_train_bytes` is nonzero, the samples are passed through zstd's + // dictionary generator. Otherwise, the random samples are used directly as + // the dictionary. + // + // When compression dictionary is disabled, we compress and write each block + // before buffering data for the next one. When compression dictionary is + // enabled, we buffer all SST file data in-memory so we can sample it, as data + // can only be compressed and written after the dictionary has been finalized. + // So users of this feature may see increased memory usage. + // // Default: 0. uint32_t max_dict_bytes; + // Maximum size of training data passed to zstd's dictionary trainer. Using + // zstd's dictionary trainer can achieve even better compression ratio + // improvements than using `max_dict_bytes` alone. + // + // The training data will be used to generate a dictionary of max_dict_bytes. + // + // Default: 0. 
+  uint32_t zstd_max_train_bytes;
+
+  // `enabled` is set to true as soon as the compression options are set by
+  // the user.
+  // For bottommost_compression_opts, the user must set enabled=true
+  // explicitly; otherwise bottommost compression uses compression_opts as its
+  // default compression options.
+  //
+  // For compression_opts itself, the options are still applied to the
+  // compression process even when compression_opts.enabled=false.
+  //
+  // Default: false.
+  bool enabled;
+
   CompressionOptions()
-      : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {}
-  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes)
+      : window_bits(-14),
+        level(kDefaultCompressionLevel),
+        strategy(0),
+        max_dict_bytes(0),
+        zstd_max_train_bytes(0),
+        enabled(false) {}
+  CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes,
+                     int _zstd_max_train_bytes, bool _enabled)
       : window_bits(wbits),
         level(_lev),
         strategy(_strategy),
-        max_dict_bytes(_max_dict_bytes) {}
+        max_dict_bytes(_max_dict_bytes),
+        zstd_max_train_bytes(_zstd_max_train_bytes),
+        enabled(_enabled) {}
 };

 enum UpdateStatus {    // Return status For inplace update callback
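A minimal sketch of wiring up the dictionary-compression fields introduced above, assuming the conventional `compression_opts`/`bottommost_compression_opts` members on ColumnFamilyOptions (they are referenced by the comments here but declared elsewhere):

```cpp
#include "rocksdb/options.h"

rocksdb::ColumnFamilyOptions MakeDictCompressedCf() {
  rocksdb::ColumnFamilyOptions cf;
  // Build up to 16KB dictionaries by sampling SST file data...
  cf.compression_opts.max_dict_bytes = 16 * 1024;
  // ...and pass up to 100x that much sample data to zstd's trainer.
  cf.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  // Bottommost options only take effect when explicitly enabled.
  cf.bottommost_compression_opts = cf.compression_opts;
  cf.bottommost_compression_opts.enabled = true;
  return cf;
}
```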
@@ -229,13 +265,22 @@ struct AdvancedColumnFamilyOptions {
   // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0,
   // create prefix bloom for memtable with the size of
   // write_buffer_size * memtable_prefix_bloom_size_ratio.
-  // If it is larger than 0.25, it is santinized to 0.25.
+  // If it is larger than 0.25, it is sanitized to 0.25.
   //
   // Default: 0 (disable)
   //
   // Dynamically changeable through SetOptions() API
   double memtable_prefix_bloom_size_ratio = 0.0;

+  // Enable whole key bloom filter in memtable. Note this will only take effect
+  // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering
+  // can potentially reduce CPU usage for point-look-ups.
+  //
+  // Default: false (disable)
+  //
+  // Dynamically changeable through SetOptions() API
+  bool memtable_whole_key_filtering = false;
+
   // Page size for huge page for the arena used by the memtable. If <=0, it
   // won't allocate from huge page but from malloc.
   // Users are responsible to reserve huge pages for it to be allocated. For
@@ -368,6 +413,7 @@ struct AdvancedColumnFamilyOptions {
   // of the level.
   // At the same time max_bytes_for_level_multiplier and
   // max_bytes_for_level_multiplier_additional are still satisfied.
+  // (When L0 is too large, we make some adjustment. See below.)
   //
   // With this option on, from an empty DB, we make last level the base level,
   // which means merging L0 data into the last level, until it exceeds
@@ -406,13 +452,34 @@
   // max_bytes_for_level_base, for a more predictable LSM tree shape. It is
   // useful to limit worse case space amplification.
   //
+  //
+  // If compaction from L0 lags behind, a special mode is turned on to
+  // prioritize write amplification over max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. L0 compaction is considered lagging when the
+  // number of L0 files is at least double level0_file_num_compaction_trigger,
+  // or the total L0 size is at least max_bytes_for_level_base. In this mode,
+  // the target of L1 grows to the actual data size in L0, and the targets of
+  // the other levels are then chosen so that every level has the same level
+  // multiplier.
+  //
+  // For example, when L0 size is 100MB, the size of the last level is 1600MB,
+  // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10.
+  // Since the L0 size is larger than max_bytes_for_level_base, this is an L0
+  // compaction backlogged mode, so the L1 target is set to 100MB. Based on
+  // max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will be
+  // needed. The level multiplier is calculated to be 4 and the three levels'
+  // targets to be [100MB, 400MB, 1600MB].
+  //
+  // In this mode, the number of levels will be no more than in the normal
+  // mode, and the level multiplier will be lower. The write amplification is
+  // likely to be reduced.
+  //
+  //
   // max_bytes_for_level_multiplier_additional is ignored with this flag on.
   //
   // Turning this feature on or off for an existing DB can cause unexpected
   // LSM tree structure so it's not recommended.
   //
-  // NOTE: this option is experimental
-  //
   // Default: false
   bool level_compaction_dynamic_level_bytes = false;
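The arithmetic in the backlogged-L0 comment above can be reproduced in a few lines. This is a sketch of the described target computation only, not the code RocksDB actually runs:

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// L1's target becomes the actual L0 size; the per-level multiplier is then
// the smallest one (over the fewest levels) not exceeding max_multiplier.
// Assumes last_level_size > l0_size > 0.
std::vector<uint64_t> BackloggedLevelTargets(uint64_t l0_size,
                                             uint64_t last_level_size,
                                             double max_multiplier = 10.0) {
  int levels = 2;  // L1 plus the last level, at minimum
  double mult = static_cast<double>(last_level_size) / l0_size;
  while (mult > max_multiplier) {
    ++levels;
    mult = std::pow(static_cast<double>(last_level_size) / l0_size,
                    1.0 / (levels - 1));
  }
  std::vector<uint64_t> targets;
  double target = static_cast<double>(l0_size);
  for (int i = 0; i < levels; ++i, target *= mult) {
    targets.push_back(static_cast<uint64_t>(target));
  }
  return targets;  // 100MB L0, 1600MB last level -> {100MB, 400MB, 1600MB}
}
```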
@@ -435,19 +502,25 @@
   // threshold. But it's not guaranteed.
   // Value 0 will be sanitized.
   //
-  // Default: result.target_file_size_base * 25
+  // Default: target_file_size_base * 25
+  //
+  // Dynamically changeable through SetOptions() API
   uint64_t max_compaction_bytes = 0;

   // All writes will be slowed down to at least delayed_write_rate if estimated
   // bytes needed to be compaction exceed this threshold.
   //
   // Default: 64GB
+  //
+  // Dynamically changeable through SetOptions() API
   uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull;

   // All writes are stopped if estimated bytes needed to be compaction exceed
   // this threshold.
   //
   // Default: 256GB
+  //
+  // Dynamically changeable through SetOptions() API
   uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull;

   // The compaction style. Default: kCompactionStyleLevel
@@ -455,13 +528,21 @@
   // If level compaction_style = kCompactionStyleLevel, for each level,
   // which files are prioritized to be picked to compact.
-  // Default: kByCompensatedSize
-  CompactionPri compaction_pri = kByCompensatedSize;
+  // Default: kMinOverlappingRatio
+  CompactionPri compaction_pri = kMinOverlappingRatio;

   // The options needed to support Universal Style compactions
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  // SetOptions("compaction_options_universal", "{size_ratio=2;}")
   CompactionOptionsUniversal compaction_options_universal;

   // The options for FIFO compaction style
+  //
+  // Dynamically changeable through SetOptions() API
+  // Dynamic change example:
+  // SetOptions("compaction_options_fifo", "{max_table_files_size=100;}")
   CompactionOptionsFIFO compaction_options_fifo;

   // An iteration->Next() sequentially skips over keys with the same
@@ -531,19 +612,44 @@
   bool optimize_filters_for_hits = false;

   // After writing every SST file, reopen it and read all the keys.
+  //
   // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
   bool paranoid_file_checks = false;

-  // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM
+  // In debug mode, RocksDB runs consistency checks on the LSM every time the LSM
   // change (Flush, Compaction, AddFile). These checks are disabled in release
   // mode, use this option to enable them in release mode as well.
   // Default: false
   bool force_consistency_checks = false;

   // Measure IO stats in compactions and flushes, if true.
+  //
   // Default: false
+  //
+  // Dynamically changeable through SetOptions() API
   bool report_bg_io_stats = false;

+  // Files older than TTL will go through the compaction process.
+  // Supported in Level and FIFO compaction.
+  // Pre-req: This needs max_open_files to be set to -1.
+  // In Level: Non-bottom-level files older than TTL will go through the
+  //           compaction process.
+  // In FIFO: Files older than TTL will be deleted.
+  // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60
+  //
+  // Default: 0 (disabled)
+  //
+  // Dynamically changeable through SetOptions() API
+  uint64_t ttl = 0;
+
+  // If this option is set then 1 in N blocks are compressed
+  // using a fast (lz4) and slow (zstd) compression algorithm.
+  // The compressibility is reported as stats and the stored
+  // data is left uncompressed (unless compression is also requested).
+  uint64_t sample_for_compression = 0;
+
   // Create ColumnFamilyOptions with default values for all fields
   AdvancedColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
diff --git a/thirdparty/rocksdb/include/rocksdb/c.h b/thirdparty/rocksdb/include/rocksdb/c.h
index 2269f7261c..05699492c9 100644
--- a/thirdparty/rocksdb/include/rocksdb/c.h
+++ b/thirdparty/rocksdb/include/rocksdb/c.h
@@ -42,9 +42,6 @@
   (5) All of the pointer arguments must be non-NULL.
 */

-#ifndef STORAGE_ROCKSDB_INCLUDE_C_H_
-#define STORAGE_ROCKSDB_INCLUDE_C_H_
-
 #pragma once

 #ifdef _WIN32
@@ -113,20 +110,30 @@
 typedef struct rocksdb_envoptions_t rocksdb_envoptions_t;
 typedef struct rocksdb_ingestexternalfileoptions_t rocksdb_ingestexternalfileoptions_t;
 typedef struct rocksdb_sstfilewriter_t rocksdb_sstfilewriter_t;
 typedef struct rocksdb_ratelimiter_t rocksdb_ratelimiter_t;
+typedef struct rocksdb_perfcontext_t rocksdb_perfcontext_t;
 typedef struct rocksdb_pinnableslice_t rocksdb_pinnableslice_t;
 typedef struct rocksdb_transactiondb_options_t rocksdb_transactiondb_options_t;
 typedef struct rocksdb_transactiondb_t rocksdb_transactiondb_t;
 typedef struct rocksdb_transaction_options_t rocksdb_transaction_options_t;
-typedef struct rocksdb_optimistictransactiondb_t rocksdb_optimistictransactiondb_t;
-typedef struct rocksdb_optimistictransaction_options_t rocksdb_optimistictransaction_options_t;
+typedef struct rocksdb_optimistictransactiondb_t
+    rocksdb_optimistictransactiondb_t;
+typedef struct rocksdb_optimistictransaction_options_t
+    rocksdb_optimistictransaction_options_t;
 typedef struct rocksdb_transaction_t rocksdb_transaction_t;
 typedef struct rocksdb_checkpoint_t rocksdb_checkpoint_t;
+typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t;
+typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t;
+typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t;
+typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t;

 /* DB operations */

 extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open(
     const rocksdb_options_t* options, const char* name, char** errptr);

+extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_with_ttl(
+    const rocksdb_options_t* options, const char* name, int ttl, char** errptr);
+
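Every fallible function in this C header follows the same convention: a trailing char** errptr that stays NULL on success and receives a heap-allocated message on failure. A short sketch against the declarations above (the path and the 86400-second TTL are arbitrary example values):

```cpp
#include <stdio.h>
#include <stdlib.h>

#include "rocksdb/c.h"

int open_with_ttl_example(void) {
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);

  char* err = NULL;
  rocksdb_t* db = rocksdb_open_with_ttl(opts, "/tmp/ttl_db", 86400, &err);
  if (err != NULL) {
    fprintf(stderr, "open failed: %s\n", err);
    free(err);  // error strings are heap-allocated by the library
    rocksdb_options_destroy(opts);
    return 1;
  }
  rocksdb_close(db);
  rocksdb_options_destroy(opts);
  return 0;
}
```

 extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_open_for_read_only(
     const rocksdb_options_t* options, const char* name,
     unsigned char error_if_log_file_exist, char** errptr);

@@ -137,6 +144,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_backup_engine_t* rocksdb_backup_engine_open(
 extern ROCKSDB_LIBRARY_API void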
rocksdb_backup_engine_create_new_backup( rocksdb_backup_engine_t* be, rocksdb_t* db, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_create_new_backup_flush( + rocksdb_backup_engine_t* be, rocksdb_t* db, unsigned char flush_before_backup, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_purge_old_backups( rocksdb_backup_engine_t* be, uint32_t num_backups_to_keep, char** errptr); @@ -147,6 +158,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_destroy( extern ROCKSDB_LIBRARY_API void rocksdb_restore_options_set_keep_log_files( rocksdb_restore_options_t* opt, int v); +extern ROCKSDB_LIBRARY_API void +rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be, + uint32_t backup_id, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_backup_engine_restore_db_from_latest_backup( rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, @@ -291,6 +306,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_multi_get_cf( extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator( rocksdb_t* db, const rocksdb_readoptions_t* options); +extern ROCKSDB_LIBRARY_API rocksdb_wal_iterator_t* rocksdb_get_updates_since( + rocksdb_t* db, uint64_t seq_number, + const rocksdb_wal_readoptions_t* options, + char** errptr +); + extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_create_iterator_cf( rocksdb_t* db, const rocksdb_readoptions_t* options, rocksdb_column_family_handle_t* column_family); @@ -394,6 +415,14 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_iter_value( extern ROCKSDB_LIBRARY_API void rocksdb_iter_get_error( const rocksdb_iterator_t*, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter); +extern ROCKSDB_LIBRARY_API unsigned char rocksdb_wal_iter_valid( + const rocksdb_wal_iterator_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_status (const rocksdb_wal_iterator_t* iter, char** errptr) ; +extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_wal_iter_get_batch (const rocksdb_wal_iterator_t* iter, uint64_t* seq) ; +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_get_latest_sequence_number (rocksdb_t *db); +extern ROCKSDB_LIBRARY_API void rocksdb_wal_iter_destroy (const rocksdb_wal_iterator_t* iter) ; + /* Write batch */ extern ROCKSDB_LIBRARY_API rocksdb_writebatch_t* rocksdb_writebatch_create(); @@ -608,7 +637,6 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_writebatch_wi_create_iter rocksdb_iterator_t* base_iterator, rocksdb_column_family_handle_t* cf); - /* Block based table options */ extern ROCKSDB_LIBRARY_API rocksdb_block_based_table_options_t* @@ -623,6 +651,18 @@ rocksdb_block_based_options_set_block_size_deviation( extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_restart_interval( rocksdb_block_based_table_options_t* options, int block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_index_block_restart_interval( + rocksdb_block_based_table_options_t* options, int index_block_restart_interval); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_metadata_block_size( + rocksdb_block_based_table_options_t* options, uint64_t metadata_block_size); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_partition_filters( + rocksdb_block_based_table_options_t* options, unsigned char partition_filters); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_use_delta_encoding( + rocksdb_block_based_table_options_t* options, unsigned char 
use_delta_encoding); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_filter_policy( rocksdb_block_based_table_options_t* options, rocksdb_filterpolicy_t* filter_policy); @@ -653,8 +693,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_cache_index_and_filter_blocks( rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_top_level_index_and_filter( + rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); @@ -682,6 +728,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cuckoo_table_factory( extern ROCKSDB_LIBRARY_API void rocksdb_set_options( rocksdb_t* db, int count, const char* const keys[], const char* const values[], char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_set_options_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* handle, int count, const char* const keys[], const char* const values[], char** errptr); + extern ROCKSDB_LIBRARY_API rocksdb_options_t* rocksdb_options_create(); extern ROCKSDB_LIBRARY_API void rocksdb_options_destroy(rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_increase_parallelism( @@ -693,6 +742,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_level_style_compaction( extern ROCKSDB_LIBRARY_API void rocksdb_options_optimize_universal_style_compaction( rocksdb_options_t* opt, uint64_t memtable_memory_budget); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_allow_ingest_behind(rocksdb_options_t*, + unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter( rocksdb_options_t*, rocksdb_compactionfilter_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_compaction_filter_factory( @@ -717,7 +769,7 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_error_if_exists( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_paranoid_checks( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths(rocksdb_options_t*, - const rocksdb_dbpath_t** path_values, + const rocksdb_dbpath_t** path_values, size_t num_paths); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); @@ -765,8 +817,9 @@ rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( rocksdb_options_t*); -extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open( - rocksdb_options_t* opt, unsigned char val); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, + unsigned char val); /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( @@ -779,6 +832,12 @@ rocksdb_options_set_min_write_buffer_number_to_merge(rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number_to_maintain(rocksdb_options_t*, int); +extern ROCKSDB_LIBRARY_API void 
rocksdb_options_set_enable_pipelined_write( + rocksdb_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_subcompactions( + rocksdb_options_t*, uint32_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_jobs( + rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_max_background_compactions( rocksdb_options_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_base_background_compactions( @@ -851,6 +910,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_use_adaptive_mutex( rocksdb_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_bytes_per_sync( rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_wal_bytes_per_sync( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t*, uint64_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t*, unsigned char); @@ -941,6 +1004,99 @@ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t*); +/* PerfContext */ +enum { + rocksdb_uninitialized = 0, + rocksdb_disable = 1, + rocksdb_enable_count = 2, + rocksdb_enable_time_except_for_mutex = 3, + rocksdb_enable_time = 4, + rocksdb_out_of_bounds = 5 +}; + +enum { + rocksdb_user_key_comparison_count = 0, + rocksdb_block_cache_hit_count, + rocksdb_block_read_count, + rocksdb_block_read_byte, + rocksdb_block_read_time, + rocksdb_block_checksum_time, + rocksdb_block_decompress_time, + rocksdb_get_read_bytes, + rocksdb_multiget_read_bytes, + rocksdb_iter_read_bytes, + rocksdb_internal_key_skipped_count, + rocksdb_internal_delete_skipped_count, + rocksdb_internal_recent_skipped_count, + rocksdb_internal_merge_count, + rocksdb_get_snapshot_time, + rocksdb_get_from_memtable_time, + rocksdb_get_from_memtable_count, + rocksdb_get_post_process_time, + rocksdb_get_from_output_files_time, + rocksdb_seek_on_memtable_time, + rocksdb_seek_on_memtable_count, + rocksdb_next_on_memtable_count, + rocksdb_prev_on_memtable_count, + rocksdb_seek_child_seek_time, + rocksdb_seek_child_seek_count, + rocksdb_seek_min_heap_time, + rocksdb_seek_max_heap_time, + rocksdb_seek_internal_seek_time, + rocksdb_find_next_user_entry_time, + rocksdb_write_wal_time, + rocksdb_write_memtable_time, + rocksdb_write_delay_time, + rocksdb_write_pre_and_post_process_time, + rocksdb_db_mutex_lock_nanos, + rocksdb_db_condition_wait_nanos, + rocksdb_merge_operator_time_nanos, + rocksdb_read_index_block_nanos, + rocksdb_read_filter_block_nanos, + rocksdb_new_table_block_iter_nanos, + rocksdb_new_table_iterator_nanos, + rocksdb_block_seek_nanos, + rocksdb_find_table_nanos, + rocksdb_bloom_memtable_hit_count, + rocksdb_bloom_memtable_miss_count, + rocksdb_bloom_sst_hit_count, + rocksdb_bloom_sst_miss_count, + rocksdb_key_lock_wait_time, + rocksdb_key_lock_wait_count, + rocksdb_env_new_sequential_file_nanos, + rocksdb_env_new_random_access_file_nanos, + rocksdb_env_new_writable_file_nanos, + rocksdb_env_reuse_writable_file_nanos, + rocksdb_env_new_random_rw_file_nanos, + rocksdb_env_new_directory_nanos, + rocksdb_env_file_exists_nanos, + rocksdb_env_get_children_nanos, + rocksdb_env_get_children_file_attributes_nanos, + rocksdb_env_delete_file_nanos, + rocksdb_env_create_dir_nanos, + 
rocksdb_env_create_dir_if_missing_nanos, + rocksdb_env_delete_dir_nanos, + rocksdb_env_get_file_size_nanos, + rocksdb_env_get_file_modification_time_nanos, + rocksdb_env_rename_file_nanos, + rocksdb_env_link_file_nanos, + rocksdb_env_lock_file_nanos, + rocksdb_env_unlock_file_nanos, + rocksdb_env_new_logger_nanos, + rocksdb_total_metric_count = 68 +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); +extern ROCKSDB_LIBRARY_API rocksdb_perfcontext_t* rocksdb_perfcontext_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_reset( + rocksdb_perfcontext_t* context); +extern ROCKSDB_LIBRARY_API char* rocksdb_perfcontext_report( + rocksdb_perfcontext_t* context, unsigned char exclude_zero_counters); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_perfcontext_metric( + rocksdb_perfcontext_t* context, int metric); +extern ROCKSDB_LIBRARY_API void rocksdb_perfcontext_destroy( + rocksdb_perfcontext_t* context); + /* Compaction Filter */ extern ROCKSDB_LIBRARY_API rocksdb_compactionfilter_t* @@ -1040,16 +1196,29 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_upper_bound( rocksdb_readoptions_t*, const char* key, size_t keylen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iterate_lower_bound( + rocksdb_readoptions_t*, const char* key, size_t keylen); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_tailing( rocksdb_readoptions_t*, unsigned char); +// The functionality that this option controlled has been removed. +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_managed( + rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_readahead_size( rocksdb_readoptions_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_prefix_same_as_start( + rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_pin_data( rocksdb_readoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_total_order_seek( rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_max_skippable_internal_keys( + rocksdb_readoptions_t*, uint64_t); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_background_purge_on_iterator_cleanup( + rocksdb_readoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_ignore_range_deletions( + rocksdb_readoptions_t*, unsigned char); /* Write options */ @@ -1061,6 +1230,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_sync( rocksdb_writeoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_disable_WAL( rocksdb_writeoptions_t* opt, int disable); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_ignore_missing_column_families( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_no_slowdown( + rocksdb_writeoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_writeoptions_set_low_pri( + rocksdb_writeoptions_t*, unsigned char); /* Compact range options */ @@ -1071,6 +1246,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_destroy( extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_exclusive_manual_compaction( rocksdb_compactoptions_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void 
+rocksdb_compactoptions_set_bottommost_level_compaction( + rocksdb_compactoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_change_level( rocksdb_compactoptions_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_compactoptions_set_target_level( @@ -1143,6 +1321,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_delete( char** errptr); extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_finish( rocksdb_sstfilewriter_t* writer, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_file_size( + rocksdb_sstfilewriter_t* writer, uint64_t* file_size); extern ROCKSDB_LIBRARY_API void rocksdb_sstfilewriter_destroy( rocksdb_sstfilewriter_t* writer); @@ -1162,6 +1342,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( rocksdb_ingestexternalfileoptions_t* opt, unsigned char allow_blocking_flush); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_ingest_behind( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char ingest_behind); extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( rocksdb_ingestexternalfileoptions_t* opt); @@ -1240,6 +1424,10 @@ extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_smallestkey( const rocksdb_livefiles_t*, int index, size_t* size); extern ROCKSDB_LIBRARY_API const char* rocksdb_livefiles_largestkey( const rocksdb_livefiles_t*, int index, size_t* size); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_entries( + const rocksdb_livefiles_t*, int index); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_livefiles_deletions( + const rocksdb_livefiles_t*, int index); extern ROCKSDB_LIBRARY_API void rocksdb_livefiles_destroy( const rocksdb_livefiles_t*); @@ -1271,6 +1459,13 @@ extern ROCKSDB_LIBRARY_API rocksdb_transactiondb_t* rocksdb_transactiondb_open( const rocksdb_transactiondb_options_t* txn_db_options, const char* name, char** errptr); +rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families( + const rocksdb_options_t* options, + const rocksdb_transactiondb_options_t* txn_db_options, const char* name, + int num_column_families, const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + extern ROCKSDB_LIBRARY_API const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(rocksdb_transactiondb_t* txn_db); @@ -1289,6 +1484,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_commit( extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback( rocksdb_transaction_t* txn, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_set_savepoint( + rocksdb_transaction_t* txn); + +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_rollback_to_savepoint( + rocksdb_transaction_t* txn, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transaction_destroy( rocksdb_transaction_t* txn); @@ -1310,6 +1511,11 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_transaction_get_for_update( const char* key, size_t klen, size_t* vlen, unsigned char exclusive, char** errptr); +char* rocksdb_transaction_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + size_t* vlen, unsigned char exclusive, char** errptr); + extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, const char* key, size_t klen, 
size_t* vlen, char** errptr); @@ -1344,10 +1550,19 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge( rocksdb_transaction_t* txn, const char* key, size_t klen, const char* val, size_t vlen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_merge_cf( + rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family, + const char* key, size_t klen, const char* val, size_t vlen, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, const char* key, size_t klen, const char* val, size_t vlen, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_merge_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, size_t klen, + const char* val, size_t vlen, char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transaction_delete( rocksdb_transaction_t* txn, const char* key, size_t klen, char** errptr); @@ -1368,10 +1583,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_transaction_create_iterator(rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transaction_create_iterator_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options); +extern ROCKSDB_LIBRARY_API rocksdb_iterator_t* +rocksdb_transactiondb_create_iterator_cf( + rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family); + extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close( rocksdb_transactiondb_t* txn_db); @@ -1383,6 +1608,20 @@ extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(const rocksdb_options_t* options, const char* name, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_optimistictransactiondb_t* +rocksdb_optimistictransactiondb_open_column_families( + const rocksdb_options_t* options, const char* name, int num_column_families, + const char** column_family_names, + const rocksdb_options_t** column_family_options, + rocksdb_column_family_handle_t** column_family_handles, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_t* +rocksdb_optimistictransactiondb_get_base_db( + rocksdb_optimistictransactiondb_t* otxn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_optimistictransactiondb_close_base_db( + rocksdb_t* base_db); + extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_optimistictransaction_begin( rocksdb_optimistictransactiondb_t* otxn_db, @@ -1441,7 +1680,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_options_set_max_write_batch_size( rocksdb_transaction_options_t* opt, size_t size); - extern ROCKSDB_LIBRARY_API rocksdb_optimistictransaction_options_t* rocksdb_optimistictransaction_options_create(); @@ -1468,8 +1706,33 @@ extern ROCKSDB_LIBRARY_API void rocksdb_pinnableslice_destroy( extern ROCKSDB_LIBRARY_API const char* rocksdb_pinnableslice_value( const rocksdb_pinnableslice_t* t, size_t* vlen); +extern ROCKSDB_LIBRARY_API rocksdb_memory_consumers_t* + rocksdb_memory_consumers_create(); +extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_add_db( + rocksdb_memory_consumers_t* consumers, rocksdb_t* db); +extern ROCKSDB_LIBRARY_API void 
rocksdb_memory_consumers_add_cache(
+    rocksdb_memory_consumers_t* consumers, rocksdb_cache_t* cache);
+extern ROCKSDB_LIBRARY_API void rocksdb_memory_consumers_destroy(
+    rocksdb_memory_consumers_t* consumers);
+extern ROCKSDB_LIBRARY_API rocksdb_memory_usage_t*
+rocksdb_approximate_memory_usage_create(rocksdb_memory_consumers_t* consumers,
+                                        char** errptr);
+extern ROCKSDB_LIBRARY_API void rocksdb_approximate_memory_usage_destroy(
+    rocksdb_memory_usage_t* usage);
+
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_total(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+    rocksdb_memory_usage_t* memory_usage);
+extern ROCKSDB_LIBRARY_API uint64_t
+rocksdb_approximate_memory_usage_get_cache_total(
+    rocksdb_memory_usage_t* memory_usage);
+
 #ifdef __cplusplus
 }  /* end extern "C" */
 #endif
-
-#endif  /* STORAGE_ROCKSDB_INCLUDE_C_H_ */
diff --git a/thirdparty/rocksdb/include/rocksdb/cache.h b/thirdparty/rocksdb/include/rocksdb/cache.h
index 5ebd66bde8..ed7790aebb 100644
--- a/thirdparty/rocksdb/include/rocksdb/cache.h
+++ b/thirdparty/rocksdb/include/rocksdb/cache.h
@@ -25,6 +25,7 @@
 #include <stdint.h>
 #include <memory>
 #include <string>
+#include "rocksdb/memory_allocator.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/statistics.h"
 #include "rocksdb/status.h"
@@ -33,6 +34,61 @@ namespace rocksdb {

 class Cache;

+extern const bool kDefaultToAdaptiveMutex;
+
+struct LRUCacheOptions {
+  // Capacity of the cache.
+  size_t capacity = 0;
+
+  // Cache is sharded into 2^num_shard_bits shards,
+  // by hash of key. Refer to NewLRUCache for further
+  // information.
+  int num_shard_bits = -1;
+
+  // If strict_capacity_limit is set,
+  // insert to the cache will fail when cache is full.
+  bool strict_capacity_limit = false;
+
+  // Percentage of cache reserved for high priority entries.
+  // If greater than zero, the LRU list will be split into a high-pri
+  // list and a low-pri list. High-pri entries will be inserted at the
+  // tail of the high-pri list, while low-pri entries will be first inserted
+  // into the low-pri list (the midpoint). This is referred to as the
+  // midpoint insertion strategy; it makes entries that never get hit age
+  // out of the cache faster.
+  //
+  // See also
+  // BlockBasedTableOptions::cache_index_and_filter_blocks_with_high_priority.
+  double high_pri_pool_ratio = 0.0;
+
+  // If non-nullptr will use this allocator instead of system allocator when
+  // allocating memory for cache blocks. Call this method before you start using
+  // the cache!
+  //
+  // Caveat: when the cache is used as block cache, the memory allocator is
+  // ignored when dealing with compression libraries that allocate memory
+  // internally (currently only XPRESS).
+  std::shared_ptr<MemoryAllocator> memory_allocator;
+
+  // Whether to use adaptive mutexes for cache shards. Note that adaptive
+  // mutexes need to be supported by the platform in order for this to have any
+  // effect. The default value is true if RocksDB is compiled with
+  // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise.
+  bool use_adaptive_mutex = kDefaultToAdaptiveMutex;
+
+  LRUCacheOptions() {}
+  LRUCacheOptions(size_t _capacity, int _num_shard_bits,
+                  bool _strict_capacity_limit, double _high_pri_pool_ratio,
+                  std::shared_ptr<MemoryAllocator> _memory_allocator = nullptr,
+                  bool _use_adaptive_mutex = kDefaultToAdaptiveMutex)
+      : capacity(_capacity),
+        num_shard_bits(_num_shard_bits),
+        strict_capacity_limit(_strict_capacity_limit),
+        high_pri_pool_ratio(_high_pri_pool_ratio),
+        memory_allocator(std::move(_memory_allocator)),
+        use_adaptive_mutex(_use_adaptive_mutex) {}
+};
+
 // Create a new cache with a fixed size capacity. The cache is sharded
 // to 2^num_shard_bits shards, by hash of the key. The total capacity
 // is divided and evenly assigned to each shard. If strict_capacity_limit
@@ -41,10 +97,13 @@
 // high_pri_pool_pct.
 // num_shard_bits = -1 means it is automatically determined: every shard
 // will be at least 512KB and number of shard bits will not exceed 6.
-extern std::shared_ptr<Cache> NewLRUCache(size_t capacity,
-                                          int num_shard_bits = -1,
-                                          bool strict_capacity_limit = false,
-                                          double high_pri_pool_ratio = 0.0);
+extern std::shared_ptr<Cache> NewLRUCache(
+    size_t capacity, int num_shard_bits = -1,
+    bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.0,
+    std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
+    bool use_adaptive_mutex = kDefaultToAdaptiveMutex);
+
+extern std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts);

 // Similar to NewLRUCache, but create a cache based on CLOCK algorithm with
 // better concurrent performance in some cases. See util/clock_cache.cc for
@@ -61,7 +120,8 @@ class Cache {
   // likely to get evicted than low priority entries.
   enum class Priority { HIGH, LOW };

-  Cache() {}
+  Cache(std::shared_ptr<MemoryAllocator> allocator = nullptr)
+      : memory_allocator_(std::move(allocator)) {}

   // Destroys all existing entries by calling the "deleter"
   // function that was passed via the Insert() function.
@@ -189,12 +249,17 @@
   // Mark the last inserted object as being a raw data block. This will be used
   // in tests. The default implementation does nothing.
-  virtual void TEST_mark_as_data_block(const Slice& key, size_t charge) {}
+  virtual void TEST_mark_as_data_block(const Slice& /*key*/,
+                                       size_t /*charge*/) {}
+
+  MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); }

  private:
   // No copying allowed
   Cache(const Cache&);
   Cache& operator=(const Cache&);
+
+  std::shared_ptr<MemoryAllocator> memory_allocator_;
 };

 }  // namespace rocksdb
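A minimal sketch of the new options-struct overload declared above; the capacity and pool ratio are arbitrary example values:

```cpp
#include <memory>

#include "rocksdb/cache.h"

std::shared_ptr<rocksdb::Cache> MakeBlockCache() {
  rocksdb::LRUCacheOptions opts;
  opts.capacity = 64 << 20;        // 64MB total across all shards
  opts.num_shard_bits = -1;        // let RocksDB pick the shard count
  opts.high_pri_pool_ratio = 0.5;  // reserve half for high-pri entries
  opts.use_adaptive_mutex = rocksdb::kDefaultToAdaptiveMutex;
  return rocksdb::NewLRUCache(opts);
}
```

diff --git a/thirdparty/rocksdb/include/rocksdb/cleanable.h b/thirdparty/rocksdb/include/rocksdb/cleanable.h
index cd2e9425f1..6dba8d9531 100644
--- a/thirdparty/rocksdb/include/rocksdb/cleanable.h
+++ b/thirdparty/rocksdb/include/rocksdb/cleanable.h
@@ -16,8 +16,7 @@
 // non-const method, all threads accessing the same Iterator must use
 // external synchronization.

-#ifndef INCLUDE_ROCKSDB_CLEANABLE_H_
-#define INCLUDE_ROCKSDB_CLEANABLE_H_
+#pragma once

 namespace rocksdb {

@@ -30,7 +29,7 @@ class Cleanable {
   Cleanable(Cleanable&) = delete;
   Cleanable& operator=(Cleanable&) = delete;

-  // Move consturctor and move assignment is allowed.
+  // Move constructor and move assignment is allowed.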
   Cleanable(Cleanable&&);
   Cleanable& operator=(Cleanable&&);

@@ -78,5 +77,3 @@ class Cleanable {
 };

 }  // namespace rocksdb
-
-#endif  // INCLUDE_ROCKSDB_CLEANABLE_H_
diff --git a/thirdparty/rocksdb/include/rocksdb/compaction_filter.h b/thirdparty/rocksdb/include/rocksdb/compaction_filter.h
index 64f61a35e0..5d476fb8e6 100644
--- a/thirdparty/rocksdb/include/rocksdb/compaction_filter.h
+++ b/thirdparty/rocksdb/include/rocksdb/compaction_filter.h
@@ -6,8 +6,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
-#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
+#pragma once

 #include <memory>
 #include <string>
@@ -76,14 +75,11 @@ class CompactionFilter {
   // to modify the existing_value and pass it back through new_value.
   // value_changed needs to be set to true in this case.
   //
-  // If you use snapshot feature of RocksDB (i.e. call GetSnapshot() API on a
-  // DB* object), CompactionFilter might not be very useful for you. Due to
-  // guarantees we need to maintain, compaction process will not call Filter()
-  // on any keys that were written before the latest snapshot. In other words,
-  // compaction will only call Filter() on keys written after your most recent
-  // call to GetSnapshot(). In most cases, Filter() will not be called very
-  // often. This is something we're fixing. See the discussion at:
-  // https://www.facebook.com/groups/mysqlonrocksdb/permalink/999723240091865/
+  // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a
+  // DB* object) will not guarantee to preserve the state of the DB with
+  // CompactionFilter. Data seen from a snapshot might disappear after a
+  // compaction finishes. If you use snapshots, think twice about whether you
+  // want to use compaction filter and whether you are using it in a safe way.
   //
   // If multithreaded compaction is being used *and* a single CompactionFilter
   // instance was supplied via Options::compaction_filter, this method may be
@@ -94,12 +90,10 @@
   // be used by a single thread that is doing the compaction run, and this
   // call does not need to be thread-safe. However, multiple filters may be
   // in existence and operating concurrently.
-  //
-  // The last paragraph is not true if you set max_subcompactions to more than
-  // 1. In that case, subcompaction from multiple threads may call a single
-  // CompactionFilter concurrently.
-  virtual bool Filter(int level, const Slice& key, const Slice& existing_value,
-                      std::string* new_value, bool* value_changed) const {
+  virtual bool Filter(int /*level*/, const Slice& /*key*/,
+                      const Slice& /*existing_value*/,
+                      std::string* /*new_value*/,
+                      bool* /*value_changed*/) const {
     return false;
   }

@@ -112,8 +106,8 @@
   // may not realize there is a write conflict and may allow a Transaction to
   // Commit that should have failed. Instead, it is better to implement any
   // Merge filtering inside the MergeOperator.
-  virtual bool FilterMergeOperand(int level, const Slice& key,
-                                  const Slice& operand) const {
+  virtual bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+                                  const Slice& /*operand*/) const {
     return false;
   }

@@ -138,9 +132,9 @@
   //
   // Caveats:
   //  - The keys are skipped even if there are snapshots containing them,
-  //    as if IgnoreSnapshots() was true; i.e. values removed
-  //    by kRemoveAndSkipUntil can disappear from a snapshot - beware
-  //    if you're using TransactionDB or DB::GetSnapshot().
+  //    i.e. values removed by kRemoveAndSkipUntil can disappear from a
+  //    snapshot - beware if you're using TransactionDB or
+  //    DB::GetSnapshot().
   //  - If value for a key was overwritten or merged into (multiple Put()s
   //    or Merge()s), and compaction filter skips this key with
   //    kRemoveAndSkipUntil, it's possible that it will remove only
@@ -158,7 +152,7 @@
   // MergeOperator.
   virtual Decision FilterV2(int level, const Slice& key, ValueType value_type,
                             const Slice& existing_value, std::string* new_value,
-                            std::string* skip_until) const {
+                            std::string* /*skip_until*/) const {
     switch (value_type) {
       case ValueType::kValue: {
         bool value_changed = false;
@@ -179,15 +173,12 @@
     return Decision::kKeep;
   }

-  // By default, compaction will only call Filter() on keys written after the
-  // most recent call to GetSnapshot(). However, if the compaction filter
-  // overrides IgnoreSnapshots to make it return true, the compaction filter
-  // will be called even if the keys were written before the last snapshot.
-  // This behavior is to be used only when we want to delete a set of keys
-  // irrespective of snapshots. In particular, care should be taken
-  // to understand that the values of these keys will change even if we are
-  // using a snapshot.
-  virtual bool IgnoreSnapshots() const { return false; }
+  // This function is deprecated. Snapshots will always be ignored for
+  // compaction filters, because we realized that not ignoring snapshots
+  // doesn't provide the guarantee we initially thought it would provide.
+  // Repeatable reads will not be guaranteed anyway. If you override the
+  // function and return false, we will fail the compaction.
+  virtual bool IgnoreSnapshots() const { return true; }

   // Returns a name that identifies this compaction filter.
   // The name will be printed to LOG file on start up for diagnosis.
@@ -198,7 +189,7 @@
 // application to know about different compactions
 class CompactionFilterFactory {
  public:
-  virtual ~CompactionFilterFactory() { }
+  virtual ~CompactionFilterFactory() {}

   virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) = 0;
@@ -208,5 +199,3 @@ class CompactionFilterFactory {
 };

 }  // namespace rocksdb
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
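A sketch of a filter written against the V2 interface above: it drops plain values whose key starts with "tmp/" and keeps everything else. Decision::kRemove comes from the full Decision enum, which this hunk only shows in part; note that since snapshots are now always ignored, dropped keys can disappear from existing snapshots.

```cpp
#include <string>

#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

class DropTmpFilter : public rocksdb::CompactionFilter {
 public:
  Decision FilterV2(int /*level*/, const rocksdb::Slice& key,
                    ValueType value_type, const rocksdb::Slice& /*value*/,
                    std::string* /*new_value*/,
                    std::string* /*skip_until*/) const override {
    if (value_type == ValueType::kValue && key.starts_with("tmp/")) {
      return Decision::kRemove;  // drop this key/value during compaction
    }
    return Decision::kKeep;
  }
  const char* Name() const override { return "example.DropTmpFilter"; }
};
```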
diff --git a/thirdparty/rocksdb/include/rocksdb/compaction_job_stats.h b/thirdparty/rocksdb/include/rocksdb/compaction_job_stats.h
index ebb04a46bf..4021fcab20 100644
--- a/thirdparty/rocksdb/include/rocksdb/compaction_job_stats.h
+++ b/thirdparty/rocksdb/include/rocksdb/compaction_job_stats.h
@@ -18,6 +18,9 @@ struct CompactionJobStats {
   // the elapsed time of this compaction in microseconds.
   uint64_t elapsed_micros;

+  // the elapsed CPU time of this compaction in microseconds.
+  uint64_t cpu_micros;
+
   // the number of compaction input records.
   uint64_t num_input_records;
   // the number of compaction input files.
@@ -72,7 +75,7 @@
   // Time spent on file fsync.
   uint64_t file_fsync_nanos;

-  // Time spent on preparing file write (falocate, etc)
+  // Time spent on preparing file write (fallocate, etc)
   uint64_t file_prepare_write_nanos;

   // 0-terminated strings storing the first 8 bytes of the smallest and
diff --git a/thirdparty/rocksdb/include/rocksdb/comparator.h b/thirdparty/rocksdb/include/rocksdb/comparator.h
index 64db73a724..46279f9a69 100644
--- a/thirdparty/rocksdb/include/rocksdb/comparator.h
+++ b/thirdparty/rocksdb/include/rocksdb/comparator.h
@@ -6,8 +6,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#ifndef STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
-#define STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
+#pragma once

 #include <string>

@@ -21,7 +20,7 @@ class Slice;
 // from multiple threads.
 class Comparator {
  public:
-  virtual ~Comparator();
+  virtual ~Comparator() {}

   // Three-way comparison. Returns value:
   //   < 0 iff "a" < "b",
@@ -56,9 +55,8 @@
   // If *start < limit, changes *start to a short string in [start,limit).
   // Simple comparator implementations may return with *start unchanged,
   // i.e., an implementation of this method that does nothing is correct.
-  virtual void FindShortestSeparator(
-      std::string* start,
-      const Slice& limit) const = 0;
+  virtual void FindShortestSeparator(std::string* start,
+                                     const Slice& limit) const = 0;

   // Changes *key to a short string >= *key.
   // Simple comparator implementations may return with *key unchanged,
@@ -68,6 +66,18 @@
   // if it is a wrapped comparator, may return the root one.
   // return itself it is not wrapped.
   virtual const Comparator* GetRootComparator() const { return this; }
+
+  // given two keys, determine if t is the successor of s
+  virtual bool IsSameLengthImmediateSuccessor(const Slice& /*s*/,
+                                              const Slice& /*t*/) const {
+    return false;
+  }
+
+  // return true if two keys with different byte sequences can be regarded
+  // as equal by this comparator.
+  // The major use case is to determine if DataBlockHashIndex is compatible
+  // with the customized comparator.
+  virtual bool CanKeysWithDifferentByteContentsBeEqual() const { return true; }
 };

 // Return a builtin comparator that uses lexicographic byte-wise
@@ -80,5 +90,3 @@
 extern const Comparator* BytewiseComparator();

 extern const Comparator* ReverseBytewiseComparator();

 }  // namespace rocksdb
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_
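A sketch of a user-defined comparator against the interface above. Compare and Name are the essential overrides; doing nothing in FindShortestSeparator/FindShortSuccessor is explicitly documented as correct:

```cpp
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

class ReverseOrderComparator : public rocksdb::Comparator {
 public:
  int Compare(const rocksdb::Slice& a,
              const rocksdb::Slice& b) const override {
    return -a.compare(b);  // invert the byte-wise order
  }
  const char* Name() const override { return "example.ReverseOrderComparator"; }
  void FindShortestSeparator(std::string* /*start*/,
                             const rocksdb::Slice& /*limit*/) const override {}
  void FindShortSuccessor(std::string* /*key*/) const override {}
};
```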
+ virtual const std::string& GetName() const = 0; + + // Set max concurrent tasks. + // limit = 0 means no new task allowed. + // limit < 0 means no limitation. + virtual void SetMaxOutstandingTask(int32_t limit) = 0; + + // Reset to unlimited max concurrent task. + virtual void ResetMaxOutstandingTask() = 0; + + // Returns current outstanding task count. + virtual int32_t GetOutstandingTask() const = 0; +}; + +// Create a ConcurrentTaskLimiter that can be shared with multiple CFs +// across RocksDB instances to control concurrent tasks. +// +// @param name: Name of the limiter. +// @param limit: max concurrent tasks. +// limit = 0 means no new task allowed. +// limit < 0 means no limitation. +extern ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit); + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/convenience.h b/thirdparty/rocksdb/include/rocksdb/convenience.h index 4a60afb11d..d3cbe6016a 100644 --- a/thirdparty/rocksdb/include/rocksdb/convenience.h +++ b/thirdparty/rocksdb/include/rocksdb/convenience.h @@ -277,15 +277,13 @@ Status GetPlainTableOptionsFromMap( // BlockBasedTableOptions as part of the string for block-based table factory: // "write_buffer_size=1024;block_based_table_factory={block_size=4k};" // "max_write_buffer_num=2" -Status GetColumnFamilyOptionsFromString( - const ColumnFamilyOptions& base_options, - const std::string& opts_str, - ColumnFamilyOptions* new_options); +Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); -Status GetDBOptionsFromString( - const DBOptions& base_options, - const std::string& opts_str, - DBOptions* new_options); +Status GetDBOptionsFromString(const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); Status GetStringFromDBOptions(std::string* opts_str, const DBOptions& db_options, @@ -301,14 +299,12 @@ Status GetStringFromCompressionType(std::string* compression_str, std::vector<CompressionType> GetSupportedCompressions(); Status GetBlockBasedTableOptionsFromString( - const BlockBasedTableOptions& table_options, - const std::string& opts_str, + const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options); -Status GetPlainTableOptionsFromString( - const PlainTableOptions& table_options, - const std::string& opts_str, - PlainTableOptions* new_table_options); +Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, + const std::string& opts_str, + PlainTableOptions* new_table_options); Status GetMemTableRepFactoryFromString( const std::string& opts_str, @@ -325,10 +321,19 @@ void CancelAllBackgroundWork(DB* db, bool wait = false); // Delete files which are entirely in the given range // Could leave some keys in the range which are in files which are not -// entirely in the range. +// entirely in the range. Also leaves L0 files regardless of whether they're +// in the range. // Snapshots before the delete might not see the data in the given range. Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end); + const Slice* begin, const Slice* end, + bool include_end = true); + +// Delete files in multiple ranges at once +// Deleting files in a lot of ranges one at a time can be slow; use this API +// for better performance in that case.
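A usage sketch for the new limiter: one instance shared by several column families caps their combined compaction concurrency. This assumes the ColumnFamilyOptions::compaction_thread_limiter member that accompanies this header in the same RocksDB drop:

```cpp
#include <memory>
#include "rocksdb/concurrent_task_limiter.h"
#include "rocksdb/options.h"

void ShareCompactionBudget(rocksdb::ColumnFamilyOptions* cf_a,
                           rocksdb::ColumnFamilyOptions* cf_b) {
  // At most 4 concurrent compaction tasks across both column families.
  std::shared_ptr<rocksdb::ConcurrentTaskLimiter> limiter(
      rocksdb::NewConcurrentTaskLimiter("shared-compactions", 4));
  // Assumed option name; both families point at the same limiter object.
  cf_a->compaction_thread_limiter = limiter;
  cf_b->compaction_thread_limiter = limiter;
}
```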
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, + const RangePtr* ranges, size_t n, + bool include_end = true); // Verify the checksum of file Status VerifySstFileChecksum(const Options& options, diff --git a/thirdparty/rocksdb/include/rocksdb/db.h b/thirdparty/rocksdb/include/rocksdb/db.h index 964f7b1db4..b40af20e27 100644 --- a/thirdparty/rocksdb/include/rocksdb/db.h +++ b/thirdparty/rocksdb/include/rocksdb/db.h @@ -6,8 +6,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_ROCKSDB_INCLUDE_DB_H_ -#define STORAGE_ROCKSDB_INCLUDE_DB_H_ +#pragma once #include <stdint.h> #include <stdio.h> @@ -53,8 +52,11 @@ struct ExternalSstFileInfo; class WriteBatch; class Env; class EventListener; - -using std::unique_ptr; +class StatsHistoryIterator; +class TraceWriter; +#ifdef ROCKSDB_LITE +class CompactionJobInfo; +#endif extern const std::string kDefaultColumnFamilyName; struct ColumnFamilyDescriptor { @@ -92,11 +94,25 @@ static const int kMinorVersion = __ROCKSDB_MINOR__; // A range of keys struct Range { - Slice start; // Included in the range - Slice limit; // Not included in the range + Slice start; + Slice limit; + + Range() {} + Range(const Slice& s, const Slice& l) : start(s), limit(l) {} +}; + +struct RangePtr { + const Slice* start; + const Slice* limit; - Range() { } - Range(const Slice& s, const Slice& l) : start(s), limit(l) { } + RangePtr() : start(nullptr), limit(nullptr) {} + RangePtr(const Slice* s, const Slice* l) : start(s), limit(l) {} +}; + +struct IngestExternalFileArg { + ColumnFamilyHandle* column_family = nullptr; + std::vector<std::string> external_files; + IngestExternalFileOptions options; }; // A collections of table properties objects, where @@ -115,8 +131,7 @@ class DB { // OK on success. // Stores nullptr in *dbptr and returns a non-OK status on error. // Caller should delete *dbptr when it is no longer needed. - static Status Open(const Options& options, - const std::string& name, + static Status Open(const Options& options, const std::string& name, DB** dbptr); // Open the database for read only. All DB interfaces @@ -126,9 +141,9 @@ // // Not supported in ROCKSDB_LITE, in which case the function will // return Status::NotSupported. - static Status OpenForReadOnly(const Options& options, - const std::string& name, DB** dbptr, - bool error_if_log_file_exist = false); + static Status OpenForReadOnly(const Options& options, const std::string& name, + DB** dbptr, + bool error_if_log_file_exist = false); // Open the database for read only with column families. When opening DB with // read only, you can specify only a subset of column families in the @@ -144,6 +159,54 @@ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr, bool error_if_log_file_exist = false); + // The following OpenAsSecondary functions create a secondary instance that + // can dynamically tail the MANIFEST of a primary that must have already been + // created. User can call TryCatchUpWithPrimary to make the secondary + // instance catch up with primary (WAL tailing is NOT supported now) whenever + // the user feels necessary. Column families created by the primary after the + // secondary instance starts are currently ignored by the secondary instance. + // Column families opened by secondary and dropped by the primary will be + // dropped by secondary as well.
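The RangePtr/DeleteFilesInRanges() pair introduced above batches what would otherwise be repeated DeleteFilesInRange() calls. A minimal sketch; the key bounds are illustrative:

```cpp
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"

// Drops SST files wholly contained in ["a","b"] or ["x","y"] in one pass.
// Files straddling a boundary (and all L0 files) are left alone, as the
// comment in convenience.h warns.
rocksdb::Status DropColdRanges(rocksdb::DB* db) {
  rocksdb::Slice a("a"), b("b"), x("x"), y("y");
  rocksdb::RangePtr ranges[] = {rocksdb::RangePtr(&a, &b),
                                rocksdb::RangePtr(&x, &y)};
  return rocksdb::DeleteFilesInRanges(db, db->DefaultColumnFamily(), ranges,
                                      2, /*include_end=*/true);
}
```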
However the user of the secondary instance + // can still access the data of such dropped column family as long as they + // do not destroy the corresponding column family handle. + // WAL tailing is not supported at present, but will arrive soon. + // + // The options argument specifies the options to open the secondary instance. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the user should + // delete it after use. + // Open DB as secondary instance with only the default column family. + // Return OK on success, non-OK on failures. + static Status OpenAsSecondary(const Options& options, const std::string& name, + const std::string& secondary_path, DB** dbptr); + + // Open DB as secondary instance with column families. You can open a subset + // of column families in secondary mode. + // The db_options specify the database specific options. + // The name argument specifies the name of the primary db that you have used + // to open the primary instance. + // The secondary_path argument points to a directory where the secondary + // instance stores its info log. + // The column_families argument specifies a list of column families to open. + // If any of the column families does not exist, the function returns non-OK + // status. + // The handles is an out-arg corresponding to the opened database column + // family handles. + // The dbptr is an out-arg corresponding to the opened secondary instance. + // The pointer points to a heap-allocated database, and the caller should + // delete it after use. Before deleting the dbptr, the user should also + // delete the pointers stored in handles vector. + // Return OK on success, non-OK on failures. + static Status OpenAsSecondary( + const DBOptions& db_options, const std::string& name, + const std::string& secondary_path, + const std::vector<ColumnFamilyDescriptor>& column_families, + std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); + // Open DB with column families. // db_options specify database specific options // column_families is the vector of all column families in the database, @@ -162,6 +225,18 @@ const std::vector<ColumnFamilyDescriptor>& column_families, std::vector<ColumnFamilyHandle*>* handles, DB** dbptr); + virtual Status Resume() { return Status::NotSupported(); } + + // Close the DB by releasing resources, closing files etc. This should be + // called before calling the destructor so that the caller can get back a + // status in case there are any errors. This will not fsync the WAL files. + // If syncing is required, the caller must first call SyncWAL(), or Write() + // using an empty write batch with WriteOptions.sync=true. + // Regardless of the return status, the DB must be freed. If the return + // status is NotSupported(), then the DB implementation does cleanup in the + // destructor + virtual Status Close() { return Status::NotSupported(); } + // ListColumnFamilies will open the DB specified by argument name // and return the list of all column families in that DB // through column_families argument. The ordering of @@ -170,7 +245,7 @@ const std::string& name, std::vector<std::string>* column_families); - DB() { } + DB() {} virtual ~DB(); // Create a column_family and return the handle of column family @@ -267,16 +342,12 @@ class DB { // a non-OK status on error.
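A sketch of the secondary-instance flow these comments describe: open against the primary's path, catch up on demand, then read. Paths and the key are illustrative:

```cpp
#include <memory>
#include <string>
#include "rocksdb/db.h"

rocksdb::Status ReadFromSecondary(std::string* value) {
  rocksdb::Options options;
  rocksdb::DB* raw = nullptr;
  rocksdb::Status s = rocksdb::DB::OpenAsSecondary(
      options, "/path/to/primary", "/path/to/secondary_info_log", &raw);
  if (!s.ok()) return s;
  std::unique_ptr<rocksdb::DB> db(raw);  // caller owns the instance
  s = db->TryCatchUpWithPrimary();       // replay the primary's MANIFEST
  if (!s.ok()) return s;
  return db->Get(rocksdb::ReadOptions(), "some-key", value);
}
```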
It is not an error if no keys exist in the range // ["begin_key", "end_key"). // - // This feature is currently an experimental performance optimization for - // deleting very large ranges of contiguous keys. Invoking it many times or on - // small ranges may severely degrade read performance; in particular, the - // resulting performance can be worse than calling Delete() for each key in - // the range. Note also the degraded read performance affects keys outside the - // deleted ranges, and affects database operations involving scans, like flush - // and compaction. - // - // Consider setting ReadOptions::ignore_range_deletions = true to speed - // up reads for key(s) that are known to be unaffected by range deletions. + // This feature is now usable in production, with the following caveats: + // 1) Accumulating many range tombstones in the memtable will degrade read + // performance; this can be avoided by manually flushing occasionally. + // 2) Limiting the maximum number of open files in the presence of range + // tombstones can degrade read performance. To avoid this problem, set + // max_open_files to -1 whenever possible. virtual Status DeleteRange(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key); @@ -322,7 +393,8 @@ virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) = 0; - virtual Status Get(const ReadOptions& options, const Slice& key, std::string* value) { + virtual Status Get(const ReadOptions& options, const Slice& key, + std::string* value) { return Get(options, DefaultColumnFamily(), key, value); } @@ -343,9 +415,10 @@ virtual std::vector<Status> MultiGet(const ReadOptions& options, const std::vector<Slice>& keys, std::vector<std::string>* values) { - return MultiGet(options, std::vector<ColumnFamilyHandle*>( - keys.size(), DefaultColumnFamily()), - keys, values); + return MultiGet( + options, + std::vector<ColumnFamilyHandle*>(keys.size(), DefaultColumnFamily()), + keys, values); } // If the key definitely does not exist in the database, then this method @@ -552,11 +625,20 @@ // log files that should be kept. static const std::string kMinLogNumberToKeep; + // "rocksdb.min-obsolete-sst-number-to-keep" - return the minimum file + // number for an obsolete SST to be kept. The max value of `uint64_t` + // will be returned if all obsolete files can be deleted. + static const std::string kMinObsoleteSstNumberToKeep; + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST // files. // WARNING: may slow down online queries if there are too many files. static const std::string kTotalSstFilesSize; + // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST + // files belonging to the latest LSM tree. + static const std::string kLiveSstFilesSize; + // "rocksdb.base-level" - returns number of level to which L0 data will be // compacted. static const std::string kBaseLevel; @@ -588,6 +670,21 @@ // FIFO compaction with // compaction_options_fifo.allow_compaction = false. static const std::string kEstimateOldestKeyTime; + + // "rocksdb.block-cache-capacity" - returns block cache capacity. + static const std::string kBlockCacheCapacity; + + // "rocksdb.block-cache-usage" - returns the memory size for the entries + // residing in block cache. + static const std::string kBlockCacheUsage; + + // "rocksdb.block-cache-pinned-usage" - returns the memory size for the + // entries being pinned.
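Caveat (1) above suggests pairing DeleteRange() with an occasional manual flush. A small sketch of that pattern; the threshold and key bounds are illustrative:

```cpp
#include <cstdint>
#include "rocksdb/db.h"

rocksdb::Status PurgeUserRange(rocksdb::DB* db, uint64_t* tombstones) {
  rocksdb::Status s =
      db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                      "user1000", "user2000");
  if (s.ok() && ++*tombstones >= 1000) {
    *tombstones = 0;  // flush the memtable before tombstones pile up
    s = db->Flush(rocksdb::FlushOptions(), db->DefaultColumnFamily());
  }
  return s;
}
```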
+ static const std::string kBlockCachePinnedUsage; + + // "rocksdb.options-statistics" - returns multi-line string + // of options.statistics + static const std::string kOptionsStatistics; }; #endif /* ROCKSDB_LITE */ @@ -602,9 +699,9 @@ class DB { } virtual bool GetMapProperty(ColumnFamilyHandle* column_family, const Slice& property, - std::map* value) = 0; + std::map* value) = 0; virtual bool GetMapProperty(const Slice& property, - std::map* value) { + std::map* value) { return GetMapProperty(DefaultColumnFamily(), property, value); } @@ -631,7 +728,9 @@ class DB { // "rocksdb.current-super-version-number" // "rocksdb.estimate-live-data-size" // "rocksdb.min-log-number-to-keep" + // "rocksdb.min-obsolete-sst-number-to-keep" // "rocksdb.total-sst-files-size" + // "rocksdb.live-sst-files-size" // "rocksdb.base-level" // "rocksdb.estimate-pending-compaction-bytes" // "rocksdb.num-running-compactions" @@ -639,6 +738,9 @@ class DB { // "rocksdb.actual-delayed-write-rate" // "rocksdb.is-write-stopped" // "rocksdb.estimate-oldest-key-time" + // "rocksdb.block-cache-capacity" + // "rocksdb.block-cache-usage" + // "rocksdb.block-cache-pinned-usage" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { @@ -678,13 +780,10 @@ class DB { // include_flags should be of type DB::SizeApproximationFlags virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* range, int n, uint64_t* sizes, - uint8_t include_flags - = INCLUDE_FILES) = 0; + uint8_t include_flags = INCLUDE_FILES) = 0; virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes, - uint8_t include_flags - = INCLUDE_FILES) { - GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, - include_flags); + uint8_t include_flags = INCLUDE_FILES) { + GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); } // The method is similar to GetApproximateSizes, except it @@ -701,8 +800,7 @@ class DB { // Deprecated versions of GetApproximateSizes ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( - const Range* range, int n, uint64_t* sizes, - bool include_memtable) { + const Range* range, int n, uint64_t* sizes, bool include_memtable) { uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; if (include_memtable) { include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; @@ -710,9 +808,8 @@ class DB { GetApproximateSizes(DefaultColumnFamily(), range, n, sizes, include_flags); } ROCKSDB_DEPRECATED_FUNC virtual void GetApproximateSizes( - ColumnFamilyHandle* column_family, - const Range* range, int n, uint64_t* sizes, - bool include_memtable) { + ColumnFamilyHandle* column_family, const Range* range, int n, + uint64_t* sizes, bool include_memtable) { uint8_t include_flags = SizeApproximationFlags::INCLUDE_FILES; if (include_memtable) { include_flags |= SizeApproximationFlags::INCLUDE_MEMTABLES; @@ -789,20 +886,25 @@ class DB { virtual Status CompactFiles( const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, - const std::vector& input_file_names, - const int output_level, const int output_path_id = -1) = 0; + const std::vector& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) = 0; virtual Status CompactFiles( const CompactionOptions& compact_options, - const std::vector& input_file_names, - const int 
output_level, const int output_path_id = -1) { + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) { return CompactFiles(compact_options, DefaultColumnFamily(), - input_file_names, output_level, output_path_id); + input_file_names, output_level, output_path_id, + output_file_names, compaction_job_info); } // This function will wait until all currently running background processes // finish. After it returns, no background process will be run until - // UnblockBackgroundWork is called + // ContinueBackgroundWork is called virtual Status PauseBackgroundWork() = 0; virtual Status ContinueBackgroundWork() = 0; @@ -854,15 +956,28 @@ virtual DBOptions GetDBOptions() const = 0; // Flush all mem-table data. + // Flush a single column family, even when atomic flush is enabled. To flush + // multiple column families, use Flush(options, column_families). virtual Status Flush(const FlushOptions& options, ColumnFamilyHandle* column_family) = 0; virtual Status Flush(const FlushOptions& options) { return Flush(options, DefaultColumnFamily()); } + // Flushes multiple column families. + // If atomic flush is not enabled, Flush(options, column_families) is + // equivalent to calling Flush(options, column_family) multiple times. + // If atomic flush is enabled, Flush(options, column_families) will flush all + // column families specified in 'column_families' up to the latest sequence + // number at the time when flush is requested. + // Note that RocksDB 5.15 and earlier may not be able to open later versions + // with atomic flush enabled. + virtual Status Flush( + const FlushOptions& options, + const std::vector<ColumnFamilyHandle*>& column_families) = 0; // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL // afterwards. - virtual Status FlushWAL(bool sync) { + virtual Status FlushWAL(bool /*sync*/) { return Status::NotSupported("FlushWAL not implemented"); } // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the @@ -871,9 +986,27 @@ // Currently only works if allow_mmap_writes = false in Options. virtual Status SyncWAL() = 0; + // Lock the WAL. Also flushes the WAL after locking. + virtual Status LockWAL() { + return Status::NotSupported("LockWAL not implemented"); + } + + // Unlock the WAL. + virtual Status UnlockWAL() { + return Status::NotSupported("UnlockWAL not implemented"); + } + // The sequence number of the most recent transaction. virtual SequenceNumber GetLatestSequenceNumber() const = 0; + // Instructs DB to preserve deletes with sequence numbers >= passed seqnum. + // Has no effect if DBOptions.preserve_deletes is set to false. + // This function assumes that user calls this function with monotonically + // increasing seqnums (otherwise we can't guarantee that a particular delete + // hasn't been already processed); returns true if the value was successfully + // updated, false if user attempted to call it with seqnum <= current value. + virtual bool SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) = 0; + #ifndef ROCKSDB_LITE // Prevent file deletions. Compactions will continue to occur, @@ -895,14 +1028,14 @@ // GetLiveFiles followed by GetSortedWalFiles can generate a lossless backup // Retrieve the list of all files in the database. The files are - // relative to the dbname and are not absolute paths. The valid size of the - // manifest file is returned in manifest_file_size.
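The multi-column-family Flush() overload above pairs with atomic flush. A sketch, assuming the DB was opened with DBOptions::atomic_flush = true (the option that gives this call its all-or-nothing semantics):

```cpp
#include <vector>
#include "rocksdb/db.h"

rocksdb::Status FlushConsistently(rocksdb::DB* db,
                                  rocksdb::ColumnFamilyHandle* users,
                                  rocksdb::ColumnFamilyHandle* indexes) {
  rocksdb::FlushOptions fo;
  fo.wait = true;  // block until both memtables are persisted
  // With atomic flush enabled, both families are flushed up to a single
  // sequence number; otherwise this is just two flushes back to back.
  return db->Flush(fo, {users, indexes});
}
```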
The manifest file is an - // ever growing file, but only the portion specified by manifest_file_size is - // valid for this snapshot. - // Setting flush_memtable to true does Flush before recording the live files. - // Setting flush_memtable to false is useful when we don't want to wait for - // flush which may have to wait for compaction to complete taking an - // indeterminate time. + // relative to the dbname and are not absolute paths. Despite being relative + // paths, the file names begin with "/". The valid size of the manifest file + // is returned in manifest_file_size. The manifest file is an ever growing + // file, but only the portion specified by manifest_file_size is valid for + // this snapshot. Setting flush_memtable to true does Flush before recording + // the live files. Setting flush_memtable to false is useful when we don't + // want to wait for flush which may have to wait for compaction to complete + // taking an indeterminate time. // // In case you have multiple column families, even if flush_memtable is true, // you still need to call GetSortedWalFiles after GetLiveFiles to compensate @@ -915,6 +1048,7 @@ class DB { // Retrieve the sorted list of all wal files with earliest file first virtual Status GetSortedWalFiles(VectorLogPtr& files) = 0; + // Note: this API is not yet consistent with WritePrepared transactions. // Sets iter to an iterator that is positioned at a write-batch containing // seq_number. If the sequence number is non existent, it returns an iterator // at the first available seq_no after the requested seq_no @@ -924,9 +1058,9 @@ class DB { // cleared aggressively and the iterator might keep getting invalid before // an update is read. virtual Status GetUpdatesSince( - SequenceNumber seq_number, unique_ptr* iter, - const TransactionLogIterator::ReadOptions& - read_options = TransactionLogIterator::ReadOptions()) = 0; + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options = + TransactionLogIterator::ReadOptions()) = 0; // Windows API macro interference #undef DeleteFile @@ -941,17 +1075,11 @@ class DB { std::vector* /*metadata*/) {} // Obtains the meta data of the specified column family of the DB. - // Status::NotFound() will be returned if the current DB does not have - // any column family match the specified name. - // - // If cf_name is not specified, then the metadata of the default - // column family will be returned. virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) {} // Get the metadata of the default column family. - void GetColumnFamilyMetaData( - ColumnFamilyMetaData* metadata) { + void GetColumnFamilyMetaData(ColumnFamilyMetaData* metadata) { GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); } @@ -963,7 +1091,7 @@ class DB { // the file can fit in, and ingest the file into this level (2). A file that // have a key range that overlap with the memtable key range will require us // to Flush the memtable first before ingesting the file. - // In the second mode we will always ingest in the bottom mode level (see + // In the second mode we will always ingest in the bottom most level (see // docs to IngestExternalFileOptions::ingest_behind). 
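GetUpdatesSince() is the WAL-tailing entry point documented above. A sketch of the iterator loop; note the documented caveat that the first batch may start later than the requested sequence number:

```cpp
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/transaction_log.h"

rocksdb::Status TailWal(rocksdb::DB* db, rocksdb::SequenceNumber from,
                        rocksdb::SequenceNumber* last_seen) {
  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
  rocksdb::Status s = db->GetUpdatesSince(from, &iter);
  if (!s.ok()) return s;
  while (iter->Valid()) {
    rocksdb::BatchResult batch = iter->GetBatch();
    *last_seen = batch.sequence;  // may exceed `from` if old WALs were purged
    iter->Next();
  }
  return iter->status();  // distinguishes exhaustion from failure
}
```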
// // (1) External SST files can be created using SstFileWriter @@ -983,6 +1111,24 @@ return IngestExternalFile(DefaultColumnFamily(), external_files, options); } + // IngestExternalFiles() will ingest files for multiple column families, and + // record the result atomically to the MANIFEST. + // If this function returns OK, all column families' ingestion must succeed. + // If this function returns a non-OK status, or the process crashes, then + // none of the files will be ingested into the database after recovery. + // Note that it is possible for the application to observe a mixed state + // during the execution of this function. If the user performs range scan + // over the column families with iterators, iterator on one column family may + // return ingested data, while iterator on other column family returns old + // data. + // Users can use snapshot for a consistent view of data. + // If your db ingests multiple SST files using this API, i.e. args.size() + // > 1, then RocksDB 5.15 and earlier will not be able to open it. + // + // REQUIRES: each arg corresponds to a different column family: namely, for + // 0 <= i < j < len(args), args[i].column_family != args[j].column_family. + virtual Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& args) = 0; + virtual Status VerifyChecksum() = 0; // AddFile() is deprecated, please use IngestExternalFile() @@ -1107,21 +1253,56 @@ ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) = 0; - virtual Status SuggestCompactRange(ColumnFamilyHandle* column_family, - const Slice* begin, const Slice* end) { + virtual Status SuggestCompactRange(ColumnFamilyHandle* /*column_family*/, + const Slice* /*begin*/, + const Slice* /*end*/) { return Status::NotSupported("SuggestCompactRange() is not implemented."); } - virtual Status PromoteL0(ColumnFamilyHandle* column_family, - int target_level) { + virtual Status PromoteL0(ColumnFamilyHandle* /*column_family*/, + int /*target_level*/) { return Status::NotSupported("PromoteL0() is not implemented."); } + // Trace DB operations. Use EndTrace() to stop tracing. + virtual Status StartTrace(const TraceOptions& /*options*/, + std::unique_ptr<TraceWriter>&& /*trace_writer*/) { + return Status::NotSupported("StartTrace() is not implemented."); + } + + virtual Status EndTrace() { + return Status::NotSupported("EndTrace() is not implemented."); + } #endif // ROCKSDB_LITE // Needed for StackableDB virtual DB* GetRootDB() { return this; } + // Given a time window, return an iterator for accessing stats history + // User is responsible for deleting StatsHistoryIterator after use + virtual Status GetStatsHistory( + uint64_t /*start_time*/, uint64_t /*end_time*/, + std::unique_ptr<StatsHistoryIterator>* /*stats_iterator*/) { + return Status::NotSupported("GetStatsHistory() is not implemented."); + } + +#ifndef ROCKSDB_LITE + // Make the secondary instance catch up with the primary by tailing and + // replaying the MANIFEST and WAL of the primary. + // Column families created by the primary after the secondary instance starts + // will be ignored unless the secondary instance closes and restarts with the + // newly created column families. + // Column families that exist before secondary instance starts and dropped by + // the primary afterwards will be marked as dropped. However, as long as the + // secondary instance does not delete the corresponding column family + // handles, the data of the column family is still accessible to the + // secondary. + // TODO: we will support WAL tailing soon.
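A sketch of the atomic multi-family ingestion documented above; the SST paths are illustrative, and each arg names a distinct column family as the REQUIRES clause demands:

```cpp
#include <vector>
#include "rocksdb/db.h"

rocksdb::Status IngestPair(rocksdb::DB* db,
                           rocksdb::ColumnFamilyHandle* cf1,
                           rocksdb::ColumnFamilyHandle* cf2) {
  rocksdb::IngestExternalFileArg arg1, arg2;
  arg1.column_family = cf1;
  arg1.external_files = {"/tmp/cf1.sst"};
  arg2.column_family = cf2;
  arg2.external_files = {"/tmp/cf2.sst"};
  // Either both families see their files after recovery, or neither does.
  return db->IngestExternalFiles({arg1, arg2});
}
```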
+ virtual Status TryCatchUpWithPrimary() { + return Status::NotSupported("Supported only by secondary instance"); + } +#endif // !ROCKSDB_LITE + private: // No copying allowed DB(const DB&); @@ -1130,7 +1311,9 @@ class DB { // Destroy the contents of the specified database. // Be very careful using this method. -Status DestroyDB(const std::string& name, const Options& options); +Status DestroyDB(const std::string& name, const Options& options, + const std::vector& column_families = + std::vector()); #ifndef ROCKSDB_LITE // If a DB cannot be opened, you may attempt to call this method to @@ -1158,5 +1341,3 @@ Status RepairDB(const std::string& dbname, const Options& options); #endif } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_DB_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/db_dump_tool.h b/thirdparty/rocksdb/include/rocksdb/db_dump_tool.h index cb9a265f5c..aeaa3422df 100644 --- a/thirdparty/rocksdb/include/rocksdb/db_dump_tool.h +++ b/thirdparty/rocksdb/include/rocksdb/db_dump_tool.h @@ -17,7 +17,7 @@ struct DumpOptions { std::string db_path; // File location that will contain dump output std::string dump_location; - // Dont include db information header in the dump + // Don't include db information header in the dump bool anonymous = false; }; diff --git a/thirdparty/rocksdb/include/rocksdb/env.h b/thirdparty/rocksdb/include/rocksdb/env.h index 709d503668..4d3a96fe28 100644 --- a/thirdparty/rocksdb/include/rocksdb/env.h +++ b/thirdparty/rocksdb/include/rocksdb/env.h @@ -14,8 +14,7 @@ // All Env implementations are safe for concurrent access from // multiple threads without any external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ENV_H_ -#define STORAGE_ROCKSDB_INCLUDE_ENV_H_ +#pragma once #include #include @@ -33,6 +32,13 @@ #undef GetCurrentTime #endif +#if defined(__GNUC__) || defined(__clang__) +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) \ + __attribute__((__format__(__printf__, format_param, dots_param))) +#else +#define ROCKSDB_PRINTF_FORMAT_ATTR(format_param, dots_param) +#endif + namespace rocksdb { class FileLock; @@ -42,31 +48,29 @@ class SequentialFile; class Slice; class WritableFile; class RandomRWFile; +class MemoryMappedFileBuffer; class Directory; struct DBOptions; struct ImmutableDBOptions; +struct MutableDBOptions; class RateLimiter; class ThreadStatusUpdater; struct ThreadStatus; -using std::unique_ptr; -using std::shared_ptr; - const size_t kDefaultPageSize = 4 * 1024; // Options while opening a file to read/write struct EnvOptions { - // Construct with default Options EnvOptions(); // Construct from Options explicit EnvOptions(const DBOptions& options); - // If true, then use mmap to read data + // If true, then use mmap to read data bool use_mmap_reads = false; - // If true, then use mmap to write data + // If true, then use mmap to write data bool use_mmap_writes = true; // If true, then use O_DIRECT for reading data @@ -136,9 +140,8 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status NewSequentialFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) - = 0; + std::unique_ptr* result, + const EnvOptions& options) = 0; // Create a brand new random access read-only file with the // specified name. On success, stores a pointer to the new file in @@ -148,9 +151,18 @@ class Env { // // The returned file may be concurrently accessed by multiple threads. 
virtual Status NewRandomAccessFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) - = 0; + std::unique_ptr* result, + const EnvOptions& options) = 0; + // These values match Linux definition + // https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/fcntl.h#n56 + enum WriteLifeTimeHint { + WLTH_NOT_SET = 0, // No hint information set + WLTH_NONE, // No hints about write life time + WLTH_SHORT, // Data written has a short life time + WLTH_MEDIUM, // Data written has a medium life time + WLTH_LONG, // Data written has a long life time + WLTH_EXTREME, // Data written has an extremely long life time + }; // Create an object that writes to a new file with the specified // name. Deletes any existing file with the same name and creates a @@ -160,7 +172,7 @@ class Env { // // The returned file will only be accessed by one thread at a time. virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options) = 0; // Create an object that writes to a new file with the specified @@ -170,16 +182,16 @@ class Env { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { + virtual Status ReopenWritableFile(const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) { return Status::NotSupported(); } // Reuse an existing file by renaming it and opening it as writable. virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* result, + std::unique_ptr* result, const EnvOptions& options); // Open `fname` for random read and write, if file doesn't exist the file @@ -187,12 +199,22 @@ class Env { // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) { + virtual Status NewRandomRWFile(const std::string& /*fname*/, + std::unique_ptr* /*result*/, + const EnvOptions& /*options*/) { return Status::NotSupported("RandomRWFile is not implemented in this Env"); } + // Opens `fname` as a memory-mapped file for read and write (in-place updates + // only, i.e., no appends). On success, stores a raw buffer covering the whole + // file in `*result`. The file must exist prior to this call. + virtual Status NewMemoryMappedFileBuffer( + const std::string& /*fname*/, + std::unique_ptr* /*result*/) { + return Status::NotSupported( + "MemoryMappedFileBuffer is not implemented in this Env"); + } + // Create an object that represents a directory. Will fail if directory // doesn't exist. If the directory exists, it will open the directory // and create a new Directory object. @@ -201,7 +223,7 @@ class Env { // *result and returns OK. On failure stores nullptr in *result and // returns non-OK. virtual Status NewDirectory(const std::string& name, - unique_ptr* result) = 0; + std::unique_ptr* result) = 0; // Returns OK if the named file exists. // NotFound if the named file does not exist, @@ -236,6 +258,11 @@ class Env { // Delete the named file. virtual Status DeleteFile(const std::string& fname) = 0; + // Truncate the named file to the specified size. 
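The new WriteLifeTimeHint plumbs fcntl-style life-time hints through WritableFile. A sketch of an Env-based write path using it; the file name is illustrative:

```cpp
#include <memory>
#include "rocksdb/env.h"

rocksdb::Status WriteLongLivedBlob(rocksdb::Env* env,
                                   const rocksdb::Slice& data) {
  std::unique_ptr<rocksdb::WritableFile> file;
  rocksdb::EnvOptions opts;
  rocksdb::Status s = env->NewWritableFile("/tmp/blob.dat", &file, opts);
  if (!s.ok()) return s;
  file->SetWriteLifeTimeHint(rocksdb::Env::WLTH_LONG);  // long-lived data
  s = file->Append(data);
  if (s.ok()) s = file->Sync();
  if (s.ok()) s = file->Close();
  return s;
}
```

Called with rocksdb::Env::Default(), this exercises the same code path RocksDB itself uses for SST and log files.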
+ virtual Status Truncate(const std::string& /*fname*/, size_t /*size*/) { + return Status::NotSupported("Truncate is not supported for this Env"); + } + // Create the specified directory. Returns error if directory exists. virtual Status CreateDir(const std::string& dirname) = 0; @@ -257,10 +284,22 @@ class Env { const std::string& target) = 0; // Hard Link file src to target. - virtual Status LinkFile(const std::string& src, const std::string& target) { + virtual Status LinkFile(const std::string& /*src*/, + const std::string& /*target*/) { return Status::NotSupported("LinkFile is not supported for this Env"); } + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/) { + return Status::NotSupported( + "Getting number of file links is not supported for this Env"); + } + + virtual Status AreFilesSame(const std::string& /*first*/, + const std::string& /*second*/, bool* /*res*/) { + return Status::NotSupported("AreFilesSame is not supported for this Env"); + } + // Lock the specified file. Used to prevent concurrent access to // the same db by multiple processes. On failure, stores nullptr in // *lock and returns non-OK. @@ -283,14 +322,12 @@ class Env { virtual Status UnlockFile(FileLock* lock) = 0; // Priority for scheduling job in thread pool - enum Priority { BOTTOM, LOW, HIGH, TOTAL }; + enum Priority { BOTTOM, LOW, HIGH, USER, TOTAL }; + + static std::string PriorityToString(Priority priority); // Priority for requesting bytes in rate limiter scheduler - enum IOPriority { - IO_LOW = 0, - IO_HIGH = 1, - IO_TOTAL = 2 - }; + enum IOPriority { IO_LOW = 0, IO_HIGH = 1, IO_TOTAL = 2 }; // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' @@ -304,11 +341,11 @@ class Env { // registered at the time of Schedule is invoked with arg as a parameter. virtual void Schedule(void (*function)(void* arg), void* arg, Priority pri = LOW, void* tag = nullptr, - void (*unschedFunction)(void* arg) = 0) = 0; + void (*unschedFunction)(void* arg) = nullptr) = 0; // Arrange to remove jobs for given arg from the queue_ if they are not // already scheduled. Caller is expected to have exclusive lock on arg. - virtual int UnSchedule(void* arg, Priority pri) { return 0; } + virtual int UnSchedule(void* /*arg*/, Priority /*pri*/) { return 0; } // Start a new thread, invoking "function(arg)" within the new thread. // When "function(arg)" returns, the thread will be destroyed. @@ -318,7 +355,7 @@ class Env { virtual void WaitForJoin() {} // Get thread pool queue length for specific thread pool. - virtual unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const { + virtual unsigned int GetThreadPoolQueueLen(Priority /*pri*/ = LOW) const { return 0; } @@ -330,7 +367,7 @@ class Env { // Create and return a log file for storing informational messages. virtual Status NewLogger(const std::string& fname, - shared_ptr* result) = 0; + std::shared_ptr* result) = 0; // Returns the number of micro-seconds since some fixed point in time. // It is often used as system time such as in GenericRateLimiter @@ -342,11 +379,12 @@ class Env { // Default implementation simply relies on NowMicros. // In platform-specific implementations, NowNanos() should return time points // that are MONOTONIC. - virtual uint64_t NowNanos() { - return NowMicros() * 1000; - } + virtual uint64_t NowNanos() { return NowMicros() * 1000; } + + // 0 indicates not supported. 
+ virtual uint64_t NowCPUNanos() { return 0; } - // Sleep/delay the thread for the perscribed number of micro-seconds. + // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; // Get the current host name. @@ -358,7 +396,7 @@ class Env { // Get full directory name for this db. virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) = 0; + std::string* output_path) = 0; // The number of background worker threads of a specific thread pool // for this environment. 'LOW' is the default pool. @@ -366,13 +404,20 @@ class Env { virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; virtual int GetBackgroundThreads(Priority pri = LOW) = 0; + virtual Status SetAllowNonOwnerAccess(bool /*allow_non_owner_access*/) { + return Status::NotSupported("Not supported."); + } + // Enlarge number of background worker threads of a specific thread pool // for this environment if it is smaller than specified. 'LOW' is the default // pool. virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; // Lower IO priority for threads from the specified pool. - virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {} + virtual void LowerThreadPoolIOPriority(Priority /*pool*/ = LOW) {} + + // Lower CPU priority for threads from the specified pool. + virtual void LowerThreadPoolCPUPriority(Priority /*pool*/ = LOW) {} // Converts seconds-since-Jan-01-1970 to a printable string virtual std::string TimeToString(uint64_t time) = 0; @@ -406,7 +451,7 @@ class Env { // table files. virtual EnvOptions OptimizeForCompactionTableWrite( const EnvOptions& env_options, - const ImmutableDBOptions& db_options) const; + const ImmutableDBOptions& immutable_ops) const; // OptimizeForCompactionTableWrite will create a new EnvOptions object that // is a copy of the EnvOptions in the parameters, but is optimized for reading @@ -416,7 +461,7 @@ class Env { const ImmutableDBOptions& db_options) const; // Returns the status of all threads that belong to the current Env. - virtual Status GetThreadList(std::vector* thread_list) { + virtual Status GetThreadList(std::vector* /*thread_list*/) { return Status::NotSupported("Not supported."); } @@ -430,6 +475,17 @@ class Env { // Returns the ID of the current thread. virtual uint64_t GetThreadID() const; +// This seems to clash with a macro on Windows, so #undef it here +#undef GetFreeSpace + + // Get the amount of free disk space + virtual Status GetFreeSpace(const std::string& /*path*/, + uint64_t* /*diskfree*/) { + return Status::NotSupported(); + } + + // If you're adding methods here, remember to add them to EnvWrapper too. + protected: // The pointer to an internal structure that will update the // status of each thread. @@ -449,7 +505,7 @@ ThreadStatusUpdater* CreateThreadStatusUpdater(); // A file abstraction for reading sequentially through a file class SequentialFile { public: - SequentialFile() { } + SequentialFile() {} virtual ~SequentialFile(); // Read up to "n" bytes from the file. "scratch[0..n-1]" may be @@ -482,23 +538,25 @@ class SequentialFile { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, - char* scratch) { + virtual Status PositionedRead(uint64_t /*offset*/, size_t /*n*/, + Slice* /*result*/, char* /*scratch*/) { return Status::NotSupported(); } + + // If you're adding methods here, remember to add them to + // SequentialFileWrapper too. }; // A file abstraction for randomly reading the contents of a file. class RandomAccessFile { public: - - RandomAccessFile() { } + RandomAccessFile() {} virtual ~RandomAccessFile(); // Read up to "n" bytes from the file starting at "offset". @@ -515,7 +573,7 @@ class RandomAccessFile { char* scratch) const = 0; // Readahead the file starting from offset by n bytes for caching. - virtual Status Prefetch(uint64_t offset, size_t n) { + virtual Status Prefetch(uint64_t /*offset*/, size_t /*n*/) { return Status::OK(); } @@ -534,14 +592,14 @@ class RandomAccessFile { // a single varint. // // Note: these IDs are only valid for the duration of the process. - virtual size_t GetUniqueId(char* id, size_t max_size) const { - return 0; // Default implementation to prevent issues with backwards - // compatibility. + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards + // compatibility. }; enum AccessPattern { NORMAL, RANDOM, SEQUENTIAL, WILLNEED, DONTNEED }; - virtual void Hint(AccessPattern pattern) {} + virtual void Hint(AccessPattern /*pattern*/) {} // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. @@ -554,9 +612,12 @@ class RandomAccessFile { // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } + + // If you're adding methods here, remember to add them to + // RandomAccessFileWrapper too. }; // A file abstraction for sequential writing. The implementation @@ -565,10 +626,10 @@ class RandomAccessFile { class WritableFile { public: WritableFile() - : last_preallocated_block_(0), - preallocation_block_size_(0), - io_priority_(Env::IO_TOTAL) { - } + : last_preallocated_block_(0), + preallocation_block_size_(0), + io_priority_(Env::IO_TOTAL), + write_hint_(Env::WLTH_NOT_SET) {} virtual ~WritableFile(); // Append data to the end of the file @@ -596,7 +657,8 @@ class WritableFile { // // PositionedAppend() requires aligned buffer to be passed in. The alignment // required is queried via GetRequiredBufferAlignment() - virtual Status PositionedAppend(const Slice& /* data */, uint64_t /* offset */) { + virtual Status PositionedAppend(const Slice& /* data */, + uint64_t /* offset */) { return Status::NotSupported(); } @@ -604,12 +666,10 @@ class WritableFile { // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. 
- virtual Status Truncate(uint64_t size) { - return Status::OK(); - } + virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); } virtual Status Close() = 0; virtual Status Flush() = 0; - virtual Status Sync() = 0; // sync data + virtual Status Sync() = 0; // sync data /* * Sync data and/or metadata as well. @@ -617,15 +677,11 @@ class WritableFile { * Override this method for environments where we need to sync * metadata as well. */ - virtual Status Fsync() { - return Sync(); - } + virtual Status Fsync() { return Sync(); } // true if Sync() and Fsync() are safe to call concurrently with Append() // and Flush(). - virtual bool IsSyncThreadSafe() const { - return false; - } + virtual bool IsSyncThreadSafe() const { return false; } // Indicates the upper layers if the current WritableFile implementation // uses direct IO. @@ -638,18 +694,19 @@ class WritableFile { * Change the priority in rate limiter if rate limiting is enabled. * If rate limiting is not enabled, this call has no effect. */ - virtual void SetIOPriority(Env::IOPriority pri) { - io_priority_ = pri; - } + virtual void SetIOPriority(Env::IOPriority pri) { io_priority_ = pri; } virtual Env::IOPriority GetIOPriority() { return io_priority_; } + virtual void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { + write_hint_ = hint; + } + + virtual Env::WriteLifeTimeHint GetWriteLifeTimeHint() { return write_hint_; } /* * Get the size of valid data in the file. */ - virtual uint64_t GetFileSize() { - return 0; - } + virtual uint64_t GetFileSize() { return 0; } /* * Get and set the default pre-allocation block size for writes to @@ -668,15 +725,15 @@ class WritableFile { } // For documentation, refer to RandomAccessFile::GetUniqueId() - virtual size_t GetUniqueId(char* id, size_t max_size) const { - return 0; // Default implementation to prevent issues with backwards + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; // Default implementation to prevent issues with backwards } // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) { + virtual Status InvalidateCache(size_t /*offset*/, size_t /*length*/) { return Status::NotSupported("InvalidateCache not supported."); } @@ -686,7 +743,9 @@ class WritableFile { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(uint64_t offset, uint64_t nbytes) { return Status::OK(); } + virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) { + return Status::OK(); + } // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation @@ -702,10 +761,10 @@ class WritableFile { // cover this write would be and Allocate to that point. 
const auto block_size = preallocation_block_size_; size_t new_last_preallocated_block = - (offset + len + block_size - 1) / block_size; + (offset + len + block_size - 1) / block_size; if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = - new_last_preallocated_block - last_preallocated_block_; + new_last_preallocated_block - last_preallocated_block_; Allocate(block_size * last_preallocated_block_, block_size * num_spanned_blocks); last_preallocated_block_ = new_last_preallocated_block; @@ -713,10 +772,13 @@ class WritableFile { } // Pre-allocates space for a file. - virtual Status Allocate(uint64_t offset, uint64_t len) { + virtual Status Allocate(uint64_t /*offset*/, uint64_t /*len*/) { return Status::OK(); } + // If you're adding methods here, remember to add them to + // WritableFileWrapper too. + protected: size_t preallocation_block_size() { return preallocation_block_size_; } @@ -728,10 +790,8 @@ class WritableFile { void operator=(const WritableFile&); protected: - friend class WritableFileWrapper; - friend class WritableFileMirror; - Env::IOPriority io_priority_; + Env::WriteLifeTimeHint write_hint_; }; // A file abstraction for random reading and writing. @@ -766,11 +826,36 @@ class RandomRWFile { virtual Status Close() = 0; + // If you're adding methods here, remember to add them to + // RandomRWFileWrapper too. + // No copying allowed RandomRWFile(const RandomRWFile&) = delete; RandomRWFile& operator=(const RandomRWFile&) = delete; }; +// MemoryMappedFileBuffer object represents a memory-mapped file's raw buffer. +// Subclasses should release the mapping upon destruction. +class MemoryMappedFileBuffer { + public: + MemoryMappedFileBuffer(void* _base, size_t _length) + : base_(_base), length_(_length) {} + + virtual ~MemoryMappedFileBuffer() = 0; + + // We do not want to unmap this twice. We can make this class + // movable if desired, however, since + MemoryMappedFileBuffer(const MemoryMappedFileBuffer&) = delete; + MemoryMappedFileBuffer& operator=(const MemoryMappedFileBuffer&) = delete; + + void* GetBase() const { return base_; } + size_t GetLen() const { return length_; } + + protected: + void* base_; + const size_t length_; +}; + // Directory object represents collection of files and implements // filesystem operations that can be executed on directories. class Directory { @@ -778,6 +863,13 @@ class Directory { virtual ~Directory() {} // Fsync directory. Can be called concurrently from multiple threads. virtual Status Fsync() = 0; + + virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { + return 0; + } + + // If you're adding methods here, remember to add them to + // DirectoryWrapper too. }; enum InfoLogLevel : unsigned char { @@ -796,9 +888,14 @@ class Logger { size_t kDoNotSupportGetLogFileSize = (std::numeric_limits::max)(); explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) - : log_level_(log_level) {} + : closed_(false), log_level_(log_level) {} virtual ~Logger(); + // Close the log file. Must be called before destructor. If the return + // status is NotSupported(), it means the implementation does cleanup in + // the destructor + virtual Status Close(); + // Write a header to the log file with the specified format // It is recommended that you log all header information at the start of the // application. But it is not enforced. @@ -815,7 +912,8 @@ class Logger { // and format. 
Any log with level under the internal log level // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be // printed. - virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap); + virtual void Logv(const InfoLogLevel log_level, const char* format, + va_list ap); virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; } // Flush to the OS buffers @@ -825,6 +923,12 @@ class Logger { log_level_ = log_level; } + // If you're adding methods here, remember to add them to LoggerWrapper too. + + protected: + virtual Status CloseImpl(); + bool closed_; + private: // No copying allowed Logger(const Logger&); @@ -832,58 +936,65 @@ class Logger { InfoLogLevel log_level_; }; - // Identifies a locked file. class FileLock { public: - FileLock() { } + FileLock() {} virtual ~FileLock(); + private: // No copying allowed FileLock(const FileLock&); void operator=(const FileLock&); }; -extern void LogFlush(const shared_ptr& info_log); +extern void LogFlush(const std::shared_ptr& info_log); extern void Log(const InfoLogLevel log_level, - const shared_ptr& info_log, const char* format, ...); + const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); // a set of log functions with different log levels. -extern void Header(const shared_ptr& info_log, const char* format, ...); -extern void Debug(const shared_ptr& info_log, const char* format, ...); -extern void Info(const shared_ptr& info_log, const char* format, ...); -extern void Warn(const shared_ptr& info_log, const char* format, ...); -extern void Error(const shared_ptr& info_log, const char* format, ...); -extern void Fatal(const shared_ptr& info_log, const char* format, ...); +extern void Header(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // Log the specified data to *info_log if info_log is non-nullptr. // The default info log level is InfoLogLevel::INFO_LEVEL. -extern void Log(const shared_ptr& info_log, const char* format, ...) -# if defined(__GNUC__) || defined(__clang__) - __attribute__((__format__ (__printf__, 2, 3))) -# endif - ; +extern void Log(const std::shared_ptr& info_log, const char* format, + ...) ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); -extern void LogFlush(Logger *info_log); +extern void LogFlush(Logger* info_log); extern void Log(const InfoLogLevel log_level, Logger* info_log, - const char* format, ...); + const char* format, ...) ROCKSDB_PRINTF_FORMAT_ATTR(3, 4); // The default info log level is InfoLogLevel::INFO_LEVEL. extern void Log(Logger* info_log, const char* format, ...) -# if defined(__GNUC__) || defined(__clang__) - __attribute__((__format__ (__printf__, 2, 3))) -# endif - ; + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // a set of log functions with different log levels. 
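A custom Logger honoring the new Close() contract might look like the sketch below; it overrides the base class's plain Logv(const char*, va_list), which this header declares alongside the level-aware overload shown above:

```cpp
#include <cstdarg>
#include <cstdio>
#include "rocksdb/env.h"

class StderrLogger : public rocksdb::Logger {
 public:
  using rocksdb::Logger::Logv;  // keep the level-aware overload visible
  void Logv(const char* format, va_list ap) override {
    vfprintf(stderr, format, ap);
    fputc('\n', stderr);
  }

 protected:
  // Close() funnels into CloseImpl(); nothing to release for stderr.
  rocksdb::Status CloseImpl() override { return rocksdb::Status::OK(); }
};
```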
-extern void Header(Logger* info_log, const char* format, ...); -extern void Debug(Logger* info_log, const char* format, ...); -extern void Info(Logger* info_log, const char* format, ...); -extern void Warn(Logger* info_log, const char* format, ...); -extern void Error(Logger* info_log, const char* format, ...); -extern void Fatal(Logger* info_log, const char* format, ...); +extern void Header(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Debug(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Info(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Warn(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Error(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); +extern void Fatal(Logger* info_log, const char* format, ...) + ROCKSDB_PRINTF_FORMAT_ATTR(2, 3); // A utility routine: write "data" to the named file. extern Status WriteStringToFile(Env* env, const Slice& data, @@ -894,50 +1005,79 @@ extern Status WriteStringToFile(Env* env, const Slice& data, extern Status ReadFileToString(Env* env, const std::string& fname, std::string* data); +// Below are helpers for wrapping most of the classes in this file. +// They forward all calls to another instance of the class. +// Useful when wrapping the default implementations. +// Typical usage is to inherit your wrapper from *Wrapper, e.g.: +// +// class MySequentialFileWrapper : public rocksdb::SequentialFileWrapper { +// public: +// MySequentialFileWrapper(rocksdb::SequentialFile* target): +// rocksdb::SequentialFileWrapper(target) {} +// Status Read(size_t n, Slice* result, char* scratch) override { +// cout << "Doing a read of size " << n << "!" << endl; +// return rocksdb::SequentialFileWrapper::Read(n, result, scratch); +// } +// // All other methods are forwarded to target_ automatically. +// }; +// +// This is often more convenient than inheriting the class directly because +// (a) Don't have to override and forward all methods - the Wrapper will +// forward everything you're not explicitly overriding. +// (b) Don't need to update the wrapper when more methods are added to the +// rocksdb class. Unless you actually want to override the behavior. +// (And unless rocksdb people forgot to update the *Wrapper class.) + // An implementation of Env that forwards all calls to another Env. // May be useful to clients who wish to override just part of the // functionality of another Env. 
class EnvWrapper : public Env { public: // Initialize an EnvWrapper that delegates all calls to *t - explicit EnvWrapper(Env* t) : target_(t) { } + explicit EnvWrapper(Env* t) : target_(t) {} ~EnvWrapper() override; // Return the target to which this Env forwards all calls Env* target() const { return target_; } // The following text is boilerplate that forwards all methods to target() - Status NewSequentialFile(const std::string& f, unique_ptr<SequentialFile>* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr<SequentialFile>* r, const EnvOptions& options) override { return target_->NewSequentialFile(f, r, options); } Status NewRandomAccessFile(const std::string& f, - unique_ptr<RandomAccessFile>* r, + std::unique_ptr<RandomAccessFile>* r, const EnvOptions& options) override { return target_->NewRandomAccessFile(f, r, options); } - Status NewWritableFile(const std::string& f, unique_ptr<WritableFile>* r, + Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r, const EnvOptions& options) override { return target_->NewWritableFile(f, r, options); } Status ReopenWritableFile(const std::string& fname, - unique_ptr<WritableFile>* result, + std::unique_ptr<WritableFile>* result, const EnvOptions& options) override { return target_->ReopenWritableFile(fname, result, options); } Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr<WritableFile>* r, + std::unique_ptr<WritableFile>* r, const EnvOptions& options) override { return target_->ReuseWritableFile(fname, old_fname, r, options); } Status NewRandomRWFile(const std::string& fname, - unique_ptr<RandomRWFile>* result, + std::unique_ptr<RandomRWFile>* result, const EnvOptions& options) override { return target_->NewRandomRWFile(fname, result, options); } + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr<MemoryMappedFileBuffer>* result) override { + return target_->NewMemoryMappedFileBuffer(fname, result); + } Status NewDirectory(const std::string& name, - unique_ptr<Directory>* result) override { + std::unique_ptr<Directory>* result) override { return target_->NewDirectory(name, result); } Status FileExists(const std::string& f) override { @@ -954,6 +1094,9 @@ class EnvWrapper : public Env { Status DeleteFile(const std::string& f) override { return target_->DeleteFile(f); } + Status Truncate(const std::string& fname, size_t size) override { + return target_->Truncate(fname, size); + } Status CreateDir(const std::string& d) override { return target_->CreateDir(d); } @@ -980,6 +1123,15 @@ return target_->LinkFile(s, t); } + Status NumFileLinks(const std::string& fname, uint64_t* count) override { + return target_->NumFileLinks(fname, count); + } + + Status AreFilesSame(const std::string& first, const std::string& second, + bool* res) override { + return target_->AreFilesSame(first, second, res); + } + Status LockFile(const std::string& f, FileLock** l) override { return target_->LockFile(f, l); } @@ -987,7 +1139,7 @@ Status UnlockFile(FileLock* l) override { return target_->UnlockFile(l); } void Schedule(void (*f)(void* arg), void* a, Priority pri, - void* tag = nullptr, void (*u)(void* arg) = 0) override { + void* tag = nullptr, void (*u)(void* arg) = nullptr) override { return target_->Schedule(f, a, pri, tag, u); } @@ -1006,10 +1158,12 @@ return target_->GetTestDirectory(path); } Status NewLogger(const std::string& fname, - shared_ptr<Logger>* result) override { + std::shared_ptr<Logger>* result) override { return target_->NewLogger(fname, result); } uint64_t NowMicros() override { return target_->NowMicros(); } + uint64_t NowNanos() override { return target_->NowNanos(); }
+ uint64_t NowCPUNanos() override { return target_->NowCPUNanos(); } void SleepForMicroseconds(int micros) override { target_->SleepForMicroseconds(micros); @@ -1031,6 +1185,10 @@ class EnvWrapper : public Env { return target_->GetBackgroundThreads(pri); } + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + return target_->SetAllowNonOwnerAccess(allow_non_owner_access); + } + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { return target_->IncBackgroundThreadsIfNeeded(num, pri); } @@ -1039,6 +1197,10 @@ class EnvWrapper : public Env { target_->LowerThreadPoolIOPriority(pool); } + void LowerThreadPoolCPUPriority(Priority pool = LOW) override { + target_->LowerThreadPoolCPUPriority(pool); + } + std::string TimeToString(uint64_t time) override { return target_->TimeToString(time); } @@ -1051,26 +1213,100 @@ class EnvWrapper : public Env { return target_->GetThreadStatusUpdater(); } - uint64_t GetThreadID() const override { - return target_->GetThreadID(); - } + uint64_t GetThreadID() const override { return target_->GetThreadID(); } std::string GenerateUniqueId() override { return target_->GenerateUniqueId(); } + EnvOptions OptimizeForLogRead(const EnvOptions& env_options) const override { + return target_->OptimizeForLogRead(env_options); + } + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override { + return target_->OptimizeForManifestRead(env_options); + } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, + const DBOptions& db_options) const override { + return target_->OptimizeForLogWrite(env_options, db_options); + } + EnvOptions OptimizeForManifestWrite( + const EnvOptions& env_options) const override { + return target_->OptimizeForManifestWrite(env_options); + } + EnvOptions OptimizeForCompactionTableWrite( + const EnvOptions& env_options, + const ImmutableDBOptions& immutable_ops) const override { + return target_->OptimizeForCompactionTableWrite(env_options, immutable_ops); + } + EnvOptions OptimizeForCompactionTableRead( + const EnvOptions& env_options, + const ImmutableDBOptions& db_options) const override { + return target_->OptimizeForCompactionTableRead(env_options, db_options); + } + Status GetFreeSpace(const std::string& path, uint64_t* diskfree) override { + return target_->GetFreeSpace(path, diskfree); + } + private: Env* target_; }; -// An implementation of WritableFile that forwards all calls to another -// WritableFile. May be useful to clients who wish to override just part of the -// functionality of another WritableFile. -// It's declared as friend of WritableFile to allow forwarding calls to -// protected virtual methods. 
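As a usage illustration of the EnvWrapper just defined (hypothetical, not part of the patch), a subclass can override a single factory method and let the forwarding boilerplate handle everything else; the CountingEnv name and counter are assumptions for this sketch:

#include <atomic>
#include <cstdint>
#include <memory>
#include <string>
// Counts writable-file creations; every other Env call falls through to the
// wrapped Env via EnvWrapper's forwarding boilerplate.
class CountingEnv : public rocksdb::EnvWrapper {
 public:
  explicit CountingEnv(rocksdb::Env* base) : rocksdb::EnvWrapper(base) {}
  rocksdb::Status NewWritableFile(const std::string& f,
                                  std::unique_ptr<rocksdb::WritableFile>* r,
                                  const rocksdb::EnvOptions& options) override {
    writable_files_created_.fetch_add(1, std::memory_order_relaxed);
    return target()->NewWritableFile(f, r, options);
  }
  uint64_t writable_files_created() const {
    return writable_files_created_.load(std::memory_order_relaxed);
  }
 private:
  std::atomic<uint64_t> writable_files_created_{0};
};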
+class SequentialFileWrapper : public SequentialFile { + public: + explicit SequentialFileWrapper(SequentialFile* target) : target_(target) {} + + Status Read(size_t n, Slice* result, char* scratch) override { + return target_->Read(n, result, scratch); + } + Status Skip(uint64_t n) override { return target_->Skip(n); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override { + return target_->PositionedRead(offset, n, result, scratch); + } + + private: + SequentialFile* target_; +}; + +class RandomAccessFileWrapper : public RandomAccessFile { + public: + explicit RandomAccessFileWrapper(RandomAccessFile* target) + : target_(target) {} + + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status Prefetch(uint64_t offset, size_t n) override { + return target_->Prefetch(offset, n); + } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + }; + void Hint(AccessPattern pattern) override { target_->Hint(pattern); } + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status InvalidateCache(size_t offset, size_t length) override { + return target_->InvalidateCache(offset, length); + } + + private: + RandomAccessFile* target_; +}; + class WritableFileWrapper : public WritableFile { public: - explicit WritableFileWrapper(WritableFile* t) : target_(t) { } + explicit WritableFileWrapper(WritableFile* t) : target_(t) {} Status Append(const Slice& data) override { return target_->Append(data); } Status PositionedAppend(const Slice& data, uint64_t offset) override { @@ -1082,41 +1318,127 @@ class WritableFileWrapper : public WritableFile { Status Sync() override { return target_->Sync(); } Status Fsync() override { return target_->Fsync(); } bool IsSyncThreadSafe() const override { return target_->IsSyncThreadSafe(); } + + bool use_direct_io() const override { return target_->use_direct_io(); } + + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + void SetIOPriority(Env::IOPriority pri) override { target_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } + + void SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) override { + target_->SetWriteLifeTimeHint(hint); + } + + Env::WriteLifeTimeHint GetWriteLifeTimeHint() override { + return target_->GetWriteLifeTimeHint(); + } + uint64_t GetFileSize() override { return target_->GetFileSize(); } + + void SetPreallocationBlockSize(size_t size) override { + target_->SetPreallocationBlockSize(size); + } + void GetPreallocationStatus(size_t* block_size, size_t* last_allocated_block) override { target_->GetPreallocationStatus(block_size, last_allocated_block); } + size_t GetUniqueId(char* id, size_t max_size) const override { return target_->GetUniqueId(id, max_size); } + Status InvalidateCache(size_t offset, size_t length) override { return target_->InvalidateCache(offset, length); } - void SetPreallocationBlockSize(size_t size) override { - 
target_->SetPreallocationBlockSize(size); + Status RangeSync(uint64_t offset, uint64_t nbytes) override { + return target_->RangeSync(offset, nbytes); } + void PrepareWrite(size_t offset, size_t len) override { target_->PrepareWrite(offset, len); } - protected: Status Allocate(uint64_t offset, uint64_t len) override { return target_->Allocate(offset, len); } - Status RangeSync(uint64_t offset, uint64_t nbytes) override { - return target_->RangeSync(offset, nbytes); - } private: WritableFile* target_; }; +class RandomRWFileWrapper : public RandomRWFile { + public: + explicit RandomRWFileWrapper(RandomRWFile* target) : target_(target) {} + + bool use_direct_io() const override { return target_->use_direct_io(); } + size_t GetRequiredBufferAlignment() const override { + return target_->GetRequiredBufferAlignment(); + } + Status Write(uint64_t offset, const Slice& data) override { + return target_->Write(offset, data); + } + Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override { + return target_->Read(offset, n, result, scratch); + } + Status Flush() override { return target_->Flush(); } + Status Sync() override { return target_->Sync(); } + Status Fsync() override { return target_->Fsync(); } + Status Close() override { return target_->Close(); } + + private: + RandomRWFile* target_; +}; + +class DirectoryWrapper : public Directory { + public: + explicit DirectoryWrapper(Directory* target) : target_(target) {} + + Status Fsync() override { return target_->Fsync(); } + size_t GetUniqueId(char* id, size_t max_size) const override { + return target_->GetUniqueId(id, max_size); + } + + private: + Directory* target_; +}; + +class LoggerWrapper : public Logger { + public: + explicit LoggerWrapper(Logger* target) : target_(target) {} + + Status Close() override { return target_->Close(); } + void LogHeader(const char* format, va_list ap) override { + return target_->LogHeader(format, ap); + } + void Logv(const char* format, va_list ap) override { + return target_->Logv(format, ap); + } + void Logv(const InfoLogLevel log_level, const char* format, + va_list ap) override { + return target_->Logv(log_level, format, ap); + } + size_t GetLogFileSize() const override { return target_->GetLogFileSize(); } + void Flush() override { return target_->Flush(); } + InfoLogLevel GetInfoLogLevel() const override { + return target_->GetInfoLogLevel(); + } + void SetInfoLogLevel(const InfoLogLevel log_level) override { + return target_->SetInfoLogLevel(log_level); + } + + private: + Logger* target_; +}; + // Returns a new environment that stores its data in memory and delegates // all non-file-storage tasks to base_env. The caller must delete the result // when it is no longer needed. @@ -1133,5 +1455,3 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); Env* NewTimedEnv(Env* base_env); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ENV_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/env_encryption.h b/thirdparty/rocksdb/include/rocksdb/env_encryption.h index e4c924a4b4..a80da963a3 100644 --- a/thirdparty/rocksdb/include/rocksdb/env_encryption.h +++ b/thirdparty/rocksdb/include/rocksdb/env_encryption.h @@ -5,7 +5,7 @@ #pragma once -#if !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include @@ -15,180 +15,190 @@ namespace rocksdb { class EncryptionProvider; -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. 
Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); -// BlockAccessCipherStream is the base class for any cipher stream that -// supports random access at block level (without requiring data from other blocks). -// E.g. CTR (Counter operation mode) supports this requirement. +// BlockAccessCipherStream is the base class for any cipher stream that +// supports random access at block level (without requiring data from other +// blocks). E.g. CTR (Counter operation mode) supports this requirement. class BlockAccessCipherStream { - public: - virtual ~BlockAccessCipherStream() {}; + public: + virtual ~BlockAccessCipherStream(){}; - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() = 0; + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; - // Encrypt one or more (partial) blocks of data at the file offset. - // Length of data is given in dataSize. - virtual Status Encrypt(uint64_t fileOffset, char *data, size_t dataSize); + // Encrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize); - // Decrypt one or more (partial) blocks of data at the file offset. - // Length of data is given in dataSize. - virtual Status Decrypt(uint64_t fileOffset, char *data, size_t dataSize); + // Decrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize); - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) = 0; + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) = 0; - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char *data, char* scratch) = 0; + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char *data, char* scratch) = 0; + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; }; -// BlockCipher +// BlockCipher class BlockCipher { - public: - virtual ~BlockCipher() {}; + public: + virtual ~BlockCipher(){}; - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() = 0; + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char *data) = 0; + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) = 0; - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char *data) = 0; + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) = 0; }; // Implements a BlockCipher using ROT13. 
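Before the sample cipher declared below, a hedged wiring sketch (not in the patch) of how these pieces compose: a BlockCipher feeds a CTREncryptionProvider, which NewEncryptedEnv above consumes. OpenEncryptedDb is a hypothetical helper, and the ROT13 cipher is for illustration only, never production use:

#include "rocksdb/db.h"
#include "rocksdb/env_encryption.h"
rocksdb::Status OpenEncryptedDb(const std::string& path, rocksdb::DB** db) {
  // Static lifetime: the Env must outlive every DB that uses it.
  static rocksdb::ROT13BlockCipher cipher(/*blockSize=*/32);
  static rocksdb::CTREncryptionProvider provider(cipher);
  static rocksdb::Env* encrypted_env =
      rocksdb::NewEncryptedEnv(rocksdb::Env::Default(), &provider);
  rocksdb::Options options;
  options.create_if_missing = true;
  options.env = encrypted_env;  // all file I/O goes through the cipher stream
  return rocksdb::DB::Open(options, path, db);
}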
// -// Note: This is a sample implementation of BlockCipher, +// Note: This is a sample implementation of BlockCipher, // it is NOT considered safe and should NOT be used in production. class ROT13BlockCipher : public BlockCipher { - private: - size_t blockSize_; - public: - ROT13BlockCipher(size_t blockSize) - : blockSize_(blockSize) {} - virtual ~ROT13BlockCipher() {}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return blockSize_; } - - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char *data) override; - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char *data) override; + private: + size_t blockSize_; + + public: + ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} + virtual ~ROT13BlockCipher(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) override; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) override; }; -// CTRCipherStream implements BlockAccessCipherStream using an -// Counter operations mode. +// CTRCipherStream implements BlockAccessCipherStream using a +// Counter operation mode. // See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation // -// Note: This is a possible implementation of BlockAccessCipherStream, +// Note: This is a possible implementation of BlockAccessCipherStream, // it is considered suitable for use. class CTRCipherStream final : public BlockAccessCipherStream { - private: - BlockCipher& cipher_; - std::string iv_; - uint64_t initialCounter_; - public: - CTRCipherStream(BlockCipher& c, const char *iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter) {}; - virtual ~CTRCipherStream() {}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return cipher_.BlockSize(); } - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char *data, char *scratch) override; - - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char *data, char *scratch) override; + private: + BlockCipher& cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) + : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return cipher_.BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index.
+ // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; }; -// The encryption provider is used to create a cipher stream for a specific file. -// The returned cipher stream will be used for actual encryption/decryption -// actions. +// The encryption provider is used to create a cipher stream for a specific +// file. The returned cipher stream will be used for actual +// encryption/decryption actions. class EncryptionProvider { public: - virtual ~EncryptionProvider() {}; - - // GetPrefixLength returns the length of the prefix that is added to every file - // and used for storing encryption options. - // For optimal performance, the prefix length should be a multiple of - // the a page size. - virtual size_t GetPrefixLength() = 0; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) = 0; - - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream(const std::string& fname, const EnvOptions& options, - Slice& prefix, unique_ptr<BlockAccessCipherStream>* result) = 0; + virtual ~EncryptionProvider(){}; + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() = 0; + + // CreateNewPrefix initializes an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) = 0; + + // CreateCipherStream creates a block access cipher stream for a file given + // its name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) = 0; }; -// This encryption provider uses a CTR cipher stream, with a given block cipher // and IV. // -// Note: This is a possible implementation of EncryptionProvider, +// Note: This is a possible implementation of EncryptionProvider, // it is considered suitable for use, provided a safe BlockCipher is used. class CTREncryptionProvider : public EncryptionProvider { - private: - BlockCipher& cipher_; - protected: - const static size_t defaultPrefixLength = 4096; + private: + BlockCipher& cipher_; + + protected: + const static size_t defaultPrefixLength = 4096; public: - CTREncryptionProvider(BlockCipher& c) - : cipher_(c) {}; - virtual ~CTREncryptionProvider() {} - - // GetPrefixLength returns the length of the prefix that is added to every file - // and used for storing encryption options. - // For optimal performance, the prefix length should be a multiple of - // the a page size. - virtual size_t GetPrefixLength() override; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) override; - - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options.
- virtual Status CreateCipherStream(const std::string& fname, const EnvOptions& options, - Slice& prefix, unique_ptr<BlockAccessCipherStream>* result) override; - - protected: - // PopulateSecretPrefixPart initializes the data into a new prefix block - // that will be encrypted. This function will store the data in plain text. - // It will be encrypted later (before written to disk). - // Returns the amount of space (starting from the start of the prefix) - // that has been initialized. - virtual size_t PopulateSecretPrefixPart(char *prefix, size_t prefixLength, size_t blockSize); - - // CreateCipherStreamFromPrefix creates a block access cipher stream for a file given - // given name and options. The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix(const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, unique_ptr<BlockAccessCipherStream>* result); + CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; + virtual ~CTREncryptionProvider() {} + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() override; + + // CreateNewPrefix initializes an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) override; + + // CreateCipherStream creates a block access cipher stream for a file given + // its name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result) override; + + protected: + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize); + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file, given its name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr<BlockAccessCipherStream>* result); }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/filter_policy.h b/thirdparty/rocksdb/include/rocksdb/filter_policy.h index 8add48e496..5d465b7820 100644 --- a/thirdparty/rocksdb/include/rocksdb/filter_policy.h +++ b/thirdparty/rocksdb/include/rocksdb/filter_policy.h @@ -17,12 +17,11 @@ // Most people will want to use the builtin bloom filter support (see // NewBloomFilterPolicy() below). -#ifndef STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ -#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +#pragma once +#include #include #include -#include #include #include @@ -46,7 +45,11 @@ class FilterBitsBuilder { virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0; // Calculate num of entries fit into a space.
- virtual int CalculateNumEntry(const uint32_t space) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4702) // unreachable code +#endif + virtual int CalculateNumEntry(const uint32_t /*space*/) { #ifndef ROCKSDB_LITE throw std::runtime_error("CalculateNumEntry not Implemented"); #else @@ -54,6 +57,9 @@ class FilterBitsBuilder { #endif return 0; } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif }; // A class that checks if a key can be in filter @@ -96,8 +102,8 @@ class FilterPolicy { // // Warning: do not change the initial contents of *dst. Instead, // append the newly constructed filter to *dst. - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) - const = 0; + virtual void CreateFilter(const Slice* keys, int n, + std::string* dst) const = 0; // "filter" contains the data appended by a preceding call to // CreateFilter() on this class. This method must return true if @@ -108,14 +114,13 @@ class FilterPolicy { // Get the FilterBitsBuilder, which is ONLY used for full filter block // It contains interface to take individual key, then generate filter - virtual FilterBitsBuilder* GetFilterBitsBuilder() const { - return nullptr; - } + virtual FilterBitsBuilder* GetFilterBitsBuilder() const { return nullptr; } // Get the FilterBitsReader, which is ONLY used for full filter block // It contains interface to tell if key can be in filter // The input slice should NOT be deleted by FilterPolicy - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { + virtual FilterBitsReader* GetFilterBitsReader( + const Slice& /*contents*/) const { return nullptr; } }; @@ -138,8 +143,6 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. -extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, - bool use_block_based_builder = true); -} - -#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ +extern const FilterPolicy* NewBloomFilterPolicy( + int bits_per_key, bool use_block_based_builder = false); +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/flush_block_policy.h b/thirdparty/rocksdb/include/rocksdb/flush_block_policy.h index 5daa967624..38807249ce 100644 --- a/thirdparty/rocksdb/include/rocksdb/flush_block_policy.h +++ b/thirdparty/rocksdb/include/rocksdb/flush_block_policy.h @@ -20,10 +20,9 @@ class FlushBlockPolicy { public: // Keep track of the key/value sequences and return the boolean value to // determine if table builder should flush current data block. 
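One behavioral note on the NewBloomFilterPolicy change above, before the flush-policy interface continues: the use_block_based_builder default flips from true to false, so callers now get full-format filters unless they opt back in. A hedged usage sketch (MakeBloomOptions is a hypothetical helper, not part of the patch):

#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
rocksdb::Options MakeBloomOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.filter_policy.reset(
      rocksdb::NewBloomFilterPolicy(10));  // 10 bits/key; full filter now
  // rocksdb::NewBloomFilterPolicy(10, true) restores block-based filters.
  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}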
- virtual bool Update(const Slice& key, - const Slice& value) = 0; + virtual bool Update(const Slice& key, const Slice& value) = 0; - virtual ~FlushBlockPolicy() { } + virtual ~FlushBlockPolicy() {} }; class FlushBlockPolicyFactory { @@ -41,7 +40,7 @@ class FlushBlockPolicyFactory { const BlockBasedTableOptions& table_options, const BlockBuilder& data_block_builder) const = 0; - virtual ~FlushBlockPolicyFactory() { } + virtual ~FlushBlockPolicyFactory() {} }; class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { @@ -59,4 +58,4 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory { const BlockBuilder& data_block_builder); }; -} // rocksdb +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/iostats_context.h b/thirdparty/rocksdb/include/rocksdb/iostats_context.h index 77a59643a1..67f2b32177 100644 --- a/thirdparty/rocksdb/include/rocksdb/iostats_context.h +++ b/thirdparty/rocksdb/include/rocksdb/iostats_context.h @@ -44,6 +44,10 @@ struct IOStatsContext { uint64_t prepare_write_nanos; // time spent in Logger::Logv(). uint64_t logger_nanos; + // CPU time spent in write() and pwrite() + uint64_t cpu_write_nanos; + // CPU time spent in read() and pread() + uint64_t cpu_read_nanos; }; // Get Thread-local IOStatsContext object pointer diff --git a/thirdparty/rocksdb/include/rocksdb/iterator.h b/thirdparty/rocksdb/include/rocksdb/iterator.h index d4ac528181..e99b434a01 100644 --- a/thirdparty/rocksdb/include/rocksdb/iterator.h +++ b/thirdparty/rocksdb/include/rocksdb/iterator.h @@ -16,8 +16,7 @@ // non-const method, all threads accessing the same Iterator must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#pragma once #include #include "rocksdb/cleanable.h" @@ -33,6 +32,7 @@ class Iterator : public Cleanable { // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). virtual bool Valid() const = 0; // Position at the first key in the source. The iterator is Valid() @@ -43,15 +43,18 @@ // Valid() after this call iff the source is not empty. virtual void SeekToLast() = 0; - // Position at the first key in the source that at or past target + // Position at the first key in the source that is at or past target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. virtual void Seek(const Slice& target) = 0; - // Position at the last key in the source that at or before target + // Position at the last key in the source that is at or before target. // The iterator is Valid() after this call iff the source contains // an entry that comes at or before target. - virtual void SeekForPrev(const Slice& target) {} + virtual void SeekForPrev(const Slice& target) = 0; // Moves to the next entry in the source. After this call, Valid() is // true iff the iterator was not positioned at the last entry in the source. @@ -72,7 +75,7 @@ class Iterator : public Cleanable { // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator.
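A short usage sketch (not part of the patch) of the strengthened iterator contract above, where Valid() is always false on a non-OK status and Seek*() clears prior errors; ScanPrefix is a hypothetical helper:

#include <memory>
#include "rocksdb/db.h"
void ScanPrefix(rocksdb::DB* db, const rocksdb::Slice& prefix) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->Seek(prefix); it->Valid() && it->key().starts_with(prefix);
       it->Next()) {
    // consume it->key() / it->value() here
  }
  // Valid() going false can mean end-of-data or an error; check which.
  if (!it->status().ok()) {
    // handle the I/O error
  }
}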
- // REQUIRES: !AtEnd() && !AtStart() + // REQUIRES: Valid() virtual Slice value() const = 0; // If an error has occurred, return it. Else return an ok status. @@ -97,6 +100,9 @@ class Iterator : public Cleanable { // Property "rocksdb.iterator.super-version-number": // LSM version used by the iterator. The same format as DB Property // kCurrentSuperVersionNumber. See its comment for more information. + // Property "rocksdb.iterator.internal-key": + // Get the user-key portion of the internal key at which the iteration + // stopped. virtual Status GetProperty(std::string prop_name, std::string* prop); private: @@ -112,5 +118,3 @@ extern Iterator* NewEmptyIterator(); extern Iterator* NewErrorIterator(const Status& status); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/ldb_tool.h b/thirdparty/rocksdb/include/rocksdb/ldb_tool.h index 0ec2da9fc0..636605ff7f 100644 --- a/thirdparty/rocksdb/include/rocksdb/ldb_tool.h +++ b/thirdparty/rocksdb/include/rocksdb/ldb_tool.h @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once +#ifndef ROCKSDB_LITE #include #include #include "rocksdb/db.h" @@ -38,6 +38,6 @@ class LDBTool { const std::vector* column_families = nullptr); }; -} // namespace rocksdb +} // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/listener.h b/thirdparty/rocksdb/include/rocksdb/listener.h index e132033db2..d4a61c20e3 100644 --- a/thirdparty/rocksdb/include/rocksdb/listener.h +++ b/thirdparty/rocksdb/include/rocksdb/listener.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -27,6 +28,7 @@ enum class TableFileCreationReason { kFlush, kCompaction, kRecovery, + kMisc, }; struct TableFileCreationBriefInfo { @@ -55,8 +57,8 @@ struct TableFileCreationInfo : public TableFileCreationBriefInfo { Status status; }; -enum class CompactionReason { - kUnknown, +enum class CompactionReason : int { + kUnknown = 0, // [Level] number of L0 files > level0_file_num_compaction_trigger kLevelL0FilesNum, // [Level] total size of level > MaxBytesForLevel() @@ -77,6 +79,33 @@ enum class CompactionReason { kManualCompaction, // DB::SuggestCompactRange() marked files for compaction kFilesMarkedForCompaction, + // [Level] Automatic compaction within bottommost level to cleanup duplicate + // versions of same user key, usually due to a released snapshot. + kBottommostFiles, + // Compaction based on TTL + kTtl, + // According to the comments in flush_job.cc, RocksDB treats flush as + // a level 0 compaction in internal stats. + kFlush, + // Compaction caused by external sst file ingestion + kExternalSstIngestion, + // total number of compaction reasons, new reasons must be added above this. 
+ kNumOfReasons, +}; + +enum class FlushReason : int { + kOthers = 0x00, + kGetLiveFiles = 0x01, + kShutDown = 0x02, + kExternalFileIngestion = 0x03, + kManualCompaction = 0x04, + kWriteBufferManager = 0x05, + kWriteBufferFull = 0x06, + kTest = 0x07, + kDeleteFiles = 0x08, + kAutoCompaction = 0x09, + kManualFlush = 0x0a, + kErrorRecovery = 0xb, }; enum class BackgroundErrorReason { @@ -86,6 +115,22 @@ enum class BackgroundErrorReason { kMemTable, }; +enum class WriteStallCondition { + kNormal, + kDelayed, + kStopped, +}; + +struct WriteStallInfo { + // the name of the column family + std::string cf_name; + // state of the write controller + struct { + WriteStallCondition cur; + WriteStallCondition prev; + } condition; +}; + #ifndef ROCKSDB_LITE struct TableFileDeletionInfo { @@ -99,7 +144,24 @@ struct TableFileDeletionInfo { Status status; }; +struct FileOperationInfo { + using TimePoint = std::chrono::time_point; + + const std::string& path; + uint64_t offset; + size_t length; + const TimePoint& start_timestamp; + const TimePoint& finish_timestamp; + Status status; + FileOperationInfo(const std::string& _path, const TimePoint& start, + const TimePoint& finish) + : path(_path), start_timestamp(start), finish_timestamp(finish) {} +}; + struct FlushJobInfo { + // the id of the column family + uint32_t cf_id; // the name of the column family std::string cf_name; // the path to the newly created file @@ -124,13 +186,17 @@ struct FlushJobInfo { SequenceNumber largest_seqno; // Table properties of the table being flushed TableProperties table_properties; + + FlushReason flush_reason; }; struct CompactionJobInfo { CompactionJobInfo() = default; - explicit CompactionJobInfo(const CompactionJobStats& _stats) : - stats(_stats) {} + explicit CompactionJobInfo(const CompactionJobStats& _stats) + : stats(_stats) {} + // the id of the column family where the compaction happened. + uint32_t cf_id; // the name of the column family where the compaction happened. std::string cf_name; // the status indicating whether the compaction was successful or not. @@ -178,7 +244,6 @@ struct MemTableInfo { uint64_t num_entries; // Total number of deletes in memtable uint64_t num_deletes; - }; struct ExternalFileIngestionInfo { @@ -194,36 +259,12 @@ struct ExternalFileIngestionInfo { TableProperties table_properties; }; -// A call-back function to RocksDB which will be called when the compaction -// iterator is compacting values. It is mean to be returned from -// EventListner::GetCompactionEventListner() at the beginning of compaction -// job. -class CompactionEventListener { - public: - enum CompactionListenerValueType { - kValue, - kMergeOperand, - kDelete, - kSingleDelete, - kRangeDelete, - kBlobIndex, - kInvalid, - }; - - virtual void OnCompaction(int level, const Slice& key, - CompactionListenerValueType value_type, - const Slice& existing_value, - const SequenceNumber& sn, bool is_new) = 0; - - virtual ~CompactionEventListener() = default; -}; - -// EventListener class contains a set of call-back functions that will +// EventListener class contains a set of callback functions that will // be called when specific RocksDB event happens such as flush. It can // be used as a building block for developing custom features such as // stats-collector or external compaction algorithm. // -// Note that call-back functions should not run for an extended period of +// Note that callback functions should not run for an extended period of // time before the function returns, otherwise RocksDB may be blocked. 
// For example, it is not suggested to do DB::CompactFiles() (as it may // run for a long while) or issue many of DB::Put() (as Put may be blocked @@ -239,17 +280,10 @@ class CompactionEventListener { // [Locking] All EventListener callbacks are designed to be called without // the current thread holding any DB mutex. This is to prevent potential // deadlock and performance issue when using EventListener callback -// in a complex way. However, all EventListener call-back functions -// should not run for an extended period of time before the function -// returns, otherwise RocksDB may be blocked. For example, it is not -// suggested to do DB::CompactFiles() (as it may run for a long while) -// or issue many of DB::Put() (as Put may be blocked in certain cases) -// in the same thread in the EventListener callback. However, doing -// DB::CompactFiles() and DB::Put() in a thread other than the -// EventListener callback thread is considered safe. +// in a complex way. class EventListener { public: - // A call-back function to RocksDB which will be called whenever a + // A callback function to RocksDB which will be called whenever a // registered RocksDB flushes a file. The default implementation is // no-op. // @@ -259,7 +293,7 @@ class EventListener { virtual void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*flush_job_info*/) {} - // A call-back function to RocksDB which will be called before a + // A callback function to RocksDB which will be called before a // RocksDB starts to flush memtables. The default implementation is // no-op. // @@ -269,9 +303,9 @@ virtual void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*flush_job_info*/) {} - // A call-back function for RocksDB which will be called whenever + // A callback function for RocksDB which will be called whenever // a SST file is deleted. Different from OnCompactionCompleted and - // OnFlushCompleted, this call-back is designed for external logging + // OnFlushCompleted, this callback is designed for external logging // service and thus only provide string parameters instead // of a pointer to DB. Applications that build logic basic based // on file creations and deletions is suggested to implement @@ -282,7 +316,16 @@ // returned value. virtual void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) {} - // A call-back function for RocksDB which will be called whenever + // A callback function to RocksDB which will be called before a + // RocksDB starts to compact. The default implementation is + // no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& /*ci*/) {} + + // A callback function for RocksDB which will be called whenever + // a registered RocksDB compacts a file. The default implementation // is a no-op. // @@ -298,9 +341,9 @@ virtual void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& /*ci*/) {} - // A call-back function for RocksDB which will be called whenever + // A callback function for RocksDB which will be called whenever // a SST file is created. Different from OnCompactionCompleted and - // OnFlushCompleted, this call-back is designed for external logging + // OnFlushCompleted, this callback is designed for external logging // service and thus only provide string parameters instead // of a pointer to DB.
Applications that build logic basic based // on file creations and deletions is suggested to implement @@ -315,7 +358,7 @@ class EventListener { // returned value. virtual void OnTableFileCreated(const TableFileCreationInfo& /*info*/) {} - // A call-back function for RocksDB which will be called before + // A callback function for RocksDB which will be called before // a SST file is being created. It will follow by OnTableFileCreated after // the creation finishes. // @@ -325,7 +368,7 @@ class EventListener { virtual void OnTableFileCreationStarted( const TableFileCreationBriefInfo& /*info*/) {} - // A call-back function for RocksDB which will be called before + // A callback function for RocksDB which will be called before // a memtable is made immutable. // // Note that the this function must be implemented in a way such that @@ -335,10 +378,9 @@ class EventListener { // Note that if applications would like to use the passed reference // outside this function call, they should make copies from these // returned value. - virtual void OnMemTableSealed( - const MemTableInfo& /*info*/) {} + virtual void OnMemTableSealed(const MemTableInfo& /*info*/) {} - // A call-back function for RocksDB which will be called before + // A callback function for RocksDB which will be called before // a column family handle is deleted. // // Note that the this function must be implemented in a way such that @@ -346,10 +388,10 @@ class EventListener { // returns. Otherwise, RocksDB may be blocked. // @param handle is a pointer to the column family handle to be deleted // which will become a dangling pointer after the deletion. - virtual void OnColumnFamilyHandleDeletionStarted(ColumnFamilyHandle* handle) { - } + virtual void OnColumnFamilyHandleDeletionStarted( + ColumnFamilyHandle* /*handle*/) {} - // A call-back function for RocksDB which will be called after an external + // A callback function for RocksDB which will be called after an external // file is ingested using IngestExternalFile. // // Note that the this function will run on the same thread as @@ -358,7 +400,7 @@ class EventListener { virtual void OnExternalFileIngested( DB* /*db*/, const ExternalFileIngestionInfo& /*info*/) {} - // A call-back function for RocksDB which will be called before setting the + // A callback function for RocksDB which will be called before setting the // background error status to a non-OK value. The new background error status // is provided in `bg_error` and can be modified by the callback. E.g., a // callback can suppress errors by resetting it to Status::OK(), thus @@ -372,19 +414,47 @@ class EventListener { virtual void OnBackgroundError(BackgroundErrorReason /* reason */, Status* /* bg_error */) {} - // Factory method to return CompactionEventListener. If multiple listeners - // provides CompactionEventListner, only the first one will be used. - virtual CompactionEventListener* GetCompactionEventListener() { - return nullptr; - } + // A callback function for RocksDB which will be called whenever a change + // of superversion triggers a change of the stall conditions. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + virtual void OnStallConditionsChanged(const WriteStallInfo& /*info*/) {} + + // A callback function for RocksDB which will be called whenever a file read + // operation finishes. 
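Stepping back for a usage illustration of the listener API in this hunk (hypothetical, not part of the patch; the OnFileReadFinish declaration introduced by the comment above continues right after this sketch):

#include <atomic>
#include <cstdint>
#include <memory>
#include "rocksdb/listener.h"
class FlushCounter : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* /*db*/,
                        const rocksdb::FlushJobInfo& info) override {
    // FlushJobInfo::flush_reason is one of the additions in this change.
    if (info.flush_reason == rocksdb::FlushReason::kWriteBufferFull) {
      full_buffer_flushes_.fetch_add(1, std::memory_order_relaxed);
    }
  }
 private:
  std::atomic<uint64_t> full_buffer_flushes_{0};
};
// Typical registration:
//   options.listeners.push_back(std::make_shared<FlushCounter>());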
+ virtual void OnFileReadFinish(const FileOperationInfo& /* info */) {} + + // A callback function for RocksDB which will be called whenever a file write + // operation finishes. + virtual void OnFileWriteFinish(const FileOperationInfo& /* info */) {} + + // If true, the OnFileReadFinish and OnFileWriteFinish will be called. If + // false, then they won't be called. + virtual bool ShouldBeNotifiedOnFileIO() { return false; } + + // A callback function for RocksDB which will be called just before + // starting the automatic recovery process for recoverable background + // errors, such as NoSpace(). The callback can suppress the automatic + // recovery by setting *auto_recovery to false. The database will then + // have to be transitioned out of read-only mode by calling DB::Resume() + virtual void OnErrorRecoveryBegin(BackgroundErrorReason /* reason */, + Status /* bg_error */, + bool* /* auto_recovery */) {} + + // A callback function for RocksDB which will be called once the database + // is recovered from read-only mode after an error. When this is called, it + // means normal writes to the database can be issued and the user can + // initiate any further recovery actions needed. + virtual void OnErrorRecoveryCompleted(Status /* old_bg_error */) {} virtual ~EventListener() {} }; #else -class EventListener { -}; +class EventListener {}; #endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/memory_allocator.h b/thirdparty/rocksdb/include/rocksdb/memory_allocator.h new file mode 100644 index 0000000000..889c0e9218 --- /dev/null +++ b/thirdparty/rocksdb/include/rocksdb/memory_allocator.h @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/status.h" + +#include <memory> + +namespace rocksdb { + +// MemoryAllocator is an interface that a client can implement to supply custom +// memory allocation and deallocation methods. See rocksdb/cache.h for more +// information. +// All methods should be thread-safe. +class MemoryAllocator { + public: + virtual ~MemoryAllocator() = default; + + // Name of the cache allocator, printed in the log + virtual const char* Name() const = 0; + + // Allocate a block of at least size. Has to be thread-safe. + virtual void* Allocate(size_t size) = 0; + + // Deallocate previously allocated block. Has to be thread-safe. + virtual void Deallocate(void* p) = 0; + + // Returns the memory size of the block allocated at p. The default + // implementation that just returns the original allocation_size is fine. + virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const { + // default implementation just returns the allocation size + return allocation_size; + } +}; + +struct JemallocAllocatorOptions { + // Jemalloc tcache caches allocations by size class. For each size class, + // it caches between 20 (for large size classes) and 200 (for small size + // classes). To reduce tcache memory usage in case the allocator is accessed + // by a large number of threads, we can control whether to cache an allocation + // by its size. + bool limit_tcache_size = false; + + // Lower bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size/4.
+ size_t tcache_size_lower_bound = 1024; + + // Upper bound of allocation size to use tcache, if limit_tcache_size=true. + // When used with block cache, it is recommended to set it to block_size. + size_t tcache_size_upper_bound = 16 * 1024; +}; + +// Generates memory allocators which allocate through Jemalloc and utilize +// MADV_DONTDUMP through madvise() to exclude cache items from core dumps. +// Applications can use the allocator with block cache to exclude block cache +// usage from core dump. +// +// Implementation details: +// The JemallocNodumpAllocator creates a dedicated jemalloc arena, and all +// allocations of the JemallocNodumpAllocator are made through the same arena. +// The memory allocator hooks the arena's memory allocation and calls +// madvise() with the MADV_DONTDUMP flag to exclude that memory from +// core dumps. A side benefit of using a single arena is reduced jemalloc +// metadata for some workloads. +// +// To mitigate mutex contention from using a single arena, jemalloc tcache +// (thread-local cache) is enabled to cache unused allocations for future use. +// The tcache normally incurs 0.5M of extra memory usage per thread. The usage +// can be reduced by limiting the allocation sizes to cache. +extern Status NewJemallocNodumpAllocator( + JemallocAllocatorOptions& options, + std::shared_ptr<MemoryAllocator>* memory_allocator); + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/memtablerep.h b/thirdparty/rocksdb/include/rocksdb/memtablerep.h index 347dd3096c..328422f570 100644 --- a/thirdparty/rocksdb/include/rocksdb/memtablerep.h +++ b/thirdparty/rocksdb/include/rocksdb/memtablerep.h @@ -35,28 +35,38 @@ #pragma once -#include -#include +#include #include #include +#include +#include namespace rocksdb { class Arena; class Allocator; class LookupKey; -class Slice; class SliceTransform; class Logger; typedef void* KeyHandle; +extern Slice GetLengthPrefixedSlice(const char* data); + class MemTableRep { public: // KeyComparator provides a means to compare keys, which are internal keys // concatenated with values. class KeyComparator { public: + typedef rocksdb::Slice DecodedType; + + virtual DecodedType decode_key(const char* key) const { + // The format of key is frozen and can be treated as a part of the API + // contract. Refer to MemTable::Add for details. + return GetLengthPrefixedSlice(key); + } + // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b virtual int operator()(const char* prefix_len_key1, @@ -65,7 +75,7 @@ virtual int operator()(const char* prefix_len_key, const Slice& key) const = 0; - virtual ~KeyComparator() { } + virtual ~KeyComparator() {} }; explicit MemTableRep(Allocator* allocator) : allocator_(allocator) {} @@ -83,25 +93,46 @@ // collection, and no concurrent modifications to the table in progress virtual void Insert(KeyHandle handle) = 0; + // Same as ::Insert + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKey(KeyHandle handle) { + Insert(handle); + return true; + } + // Same as Insert(), but in additional pass a hint to insert location for // the key. If hint points to nullptr, a new hint will be populated. // otherwise the hint will be updated to reflect the last insert location. // // Currently only skip-list based memtable implement the interface. Other // implementations will fallback to Insert() by default.
- virtual void InsertWithHint(KeyHandle handle, void** hint) { + virtual void InsertWithHint(KeyHandle handle, void** /*hint*/) { // Ignore the hint by default. Insert(handle); } + // Same as ::InsertWithHint + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKeyWithHint(KeyHandle handle, void** hint) { + InsertWithHint(handle, hint); + return true; + } + // Like Insert(handle), but may be called concurrent with other calls - // to InsertConcurrently for other handles - virtual void InsertConcurrently(KeyHandle handle) { -#ifndef ROCKSDB_LITE - throw std::runtime_error("concurrent insert not supported"); -#else - abort(); -#endif + // to InsertConcurrently for other handles. + // + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual void InsertConcurrently(KeyHandle handle); + + // Same as ::InsertConcurrently + // Returns false if MemTableRepFactory::CanHandleDuplicatedKey() is true and + // the <key, seq> already exists. + virtual bool InsertKeyConcurrently(KeyHandle handle) { + InsertConcurrently(handle); + return true; } // Returns true iff an entry that compares equal to key is in the collection. @@ -111,7 +142,15 @@ // does nothing. After MarkReadOnly() is called, this table rep will // not be written to (ie No more calls to Allocate(), Insert(), // or any writes done directly to entries accessed through the iterator.) - virtual void MarkReadOnly() { } + virtual void MarkReadOnly() {} + + // Notify this table rep that it has been flushed to stable storage. + // By default, does nothing. + // + // Invariant: MarkReadOnly() is called before MarkFlushed(). + // Note that this method, if overridden, should not run for an extended period + // of time. Otherwise, RocksDB may be blocked. + virtual void MarkFlushed() {} // Look up key from the mem table, since the first key in the mem table whose // user_key matches the one given k, call the function callback_func(), with @@ -128,8 +167,8 @@ virtual void Get(const LookupKey& k, void* callback_args, bool (*callback_func)(void* arg, const char* entry)); - virtual uint64_t ApproximateNumEntries(const Slice& start_ikey, - const Slice& end_key) { + virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/, + const Slice& /*end_key*/) { return 0; } @@ -137,7 +176,7 @@ // that was allocated through the allocator. Safe to call from any thread. virtual size_t ApproximateMemoryUsage() = 0; - virtual ~MemTableRep() { } + virtual ~MemTableRep() {} // Iteration over the contents of a skip collection class Iterator { @@ -232,6 +271,12 @@ class MemTableRepFactory { // Return true if the current MemTableRep supports concurrent inserts // Default: false virtual bool IsInsertConcurrentlySupported() const { return false; } + + // Return true if the current MemTableRep supports detecting duplicate + // <key, seq> entries at insertion time. If true, then MemTableRep::Insert* + // returns false when the <key, seq> already exists. + // Default: false + virtual bool CanHandleDuplicatedKey() const { return false; } }; // This uses a skip list to store keys. It is the default.
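A hedged configuration sketch (not in the patch) of choosing a memtable factory from this header; prefix-hash memtables require a prefix extractor, and MakePrefixHashOptions is a hypothetical helper:

#include "rocksdb/memtablerep.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"
rocksdb::Options MakePrefixHashOptions() {
  rocksdb::Options options;
  // Hash buckets are keyed on the first 8 bytes of each user key.
  options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(8));
  options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(
      /*bucket_count=*/1000000, /*skiplist_height=*/4,
      /*skiplist_branching_factor=*/4));
  return options;
}

Leaving memtable_factory unset keeps the default SkipListFactory described above, which is also the only factory in this header reporting IsInsertConcurrentlySupported() and CanHandleDuplicatedKey() as true.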
@@ -253,6 +298,8 @@ class SkipListFactory : public MemTableRepFactory { bool IsInsertConcurrentlySupported() const override { return true; } + bool CanHandleDuplicatedKey() const override { return true; } + private: const size_t lookahead_; }; @@ -270,16 +317,14 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: - explicit VectorRepFactory(size_t count = 0) : count_(count) { } + explicit VectorRepFactory(size_t count = 0) : count_(count) {} using MemTableRepFactory::CreateMemTableRep; virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, Allocator*, const SliceTransform*, Logger* logger) override; - virtual const char* Name() const override { - return "VectorRepFactory"; - } + virtual const char* Name() const override { return "VectorRepFactory"; } }; // This class contains a fixed array of buckets, each @@ -290,8 +335,7 @@ class VectorRepFactory : public MemTableRepFactory { // link lists in the skiplist extern MemTableRepFactory* NewHashSkipListRepFactory( size_t bucket_count = 1000000, int32_t skiplist_height = 4, - int32_t skiplist_branching_factor = 4 -); + int32_t skiplist_branching_factor = 4); // The factory is to create memtables based on a hash table: // it contains a fixed array of buckets, each pointing to either a linked list @@ -315,39 +359,5 @@ extern MemTableRepFactory* NewHashLinkListRepFactory( bool if_log_bucket_dist_when_flash = true, uint32_t threshold_use_skiplist = 256); -// This factory creates a cuckoo-hashing based mem-table representation. -// Cuckoo-hash is a closed-hash strategy, in which all key/value pairs -// are stored in the bucket array itself intead of in some data structures -// external to the bucket array. In addition, each key in cuckoo hash -// has a constant number of possible buckets in the bucket array. These -// two properties together makes cuckoo hash more memory efficient and -// a constant worst-case read time. Cuckoo hash is best suitable for -// point-lookup workload. -// -// When inserting a key / value, it first checks whether one of its possible -// buckets is empty. If so, the key / value will be inserted to that vacant -// bucket. Otherwise, one of the keys originally stored in one of these -// possible buckets will be "kicked out" and move to one of its possible -// buckets (and possibly kicks out another victim.) In the current -// implementation, such "kick-out" path is bounded. If it cannot find a -// "kick-out" path for a specific key, this key will be stored in a backup -// structure, and the current memtable to be forced to immutable. -// -// Note that currently this mem-table representation does not support -// snapshot (i.e., it only queries latest state) and iterators. In addition, -// MultiGet operation might also lose its atomicity due to the lack of -// snapshot support. -// -// Parameters: -// write_buffer_size: the write buffer size in bytes. -// average_data_size: the average size of key + value in bytes. This value -// together with write_buffer_size will be used to compute the number -// of buckets. -// hash_function_count: the number of hash functions that will be used by -// the cuckoo-hash. The number also equals to the number of possible -// buckets each key will have. 
-extern MemTableRepFactory* NewHashCuckooRepFactory(
-    size_t write_buffer_size, size_t average_data_size = 64,
-    unsigned int hash_function_count = 4);
 #endif  // ROCKSDB_LITE
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/merge_operator.h b/thirdparty/rocksdb/include/rocksdb/merge_operator.h
index f294710055..d8ddcc6a09 100644
--- a/thirdparty/rocksdb/include/rocksdb/merge_operator.h
+++ b/thirdparty/rocksdb/include/rocksdb/merge_operator.h
@@ -3,8 +3,7 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
-#ifndef STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
-#define STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
+#pragma once
 
 #include <deque>
 #include <memory>
@@ -66,11 +65,9 @@ class MergeOperator {
   // internal corruption. This will be treated as an error by the library.
   //
   // Also make use of the *logger for error messages.
-  virtual bool FullMerge(const Slice& key,
-                         const Slice* existing_value,
-                         const std::deque<std::string>& operand_list,
-                         std::string* new_value,
-                         Logger* logger) const {
+  virtual bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/,
+                         const std::deque<std::string>& /*operand_list*/,
+                         std::string* /*new_value*/, Logger* /*logger*/) const {
     // deprecated, please use FullMergeV2()
     assert(false);
     return false;
@@ -89,7 +86,7 @@
     // The key associated with the merge operation.
     const Slice& key;
     // The existing value of the current key, nullptr means that the
-    // value dont exist.
+    // value doesn't exist.
     const Slice* existing_value;
     // A list of operands to apply.
     const std::vector<Slice>& operand_list;
@@ -111,6 +108,23 @@
     Slice& existing_operand;
   };
 
+  // This function applies a stack of merge operands in chronological order
+  // on top of an existing value. There are two ways in which this method is
+  // being used:
+  // a) During Get() operation, it is used to calculate the final value of a key
+  // b) During compaction, in order to collapse some operands with the base
+  //    value.
+  //
+  // Note: The name of the method is somewhat misleading, as both in the cases
+  // of Get() or compaction it may be called on a subset of operands:
+  // K:    0    +1    +2    +7    +4     +5      2     +1     +2
+  //                              ^
+  //                              |
+  //                          snapshot
+  // In the example above, Get(K) operation will call FullMerge with a base
+  // value of 2 and operands [+1, +2]. Compaction process might decide to
+  // collapse the beginning of the history up to the snapshot by performing
+  // full Merge with base value of 0 and operands [+1, +2, +7, +4].
   virtual bool FullMergeV2(const MergeOperationInput& merge_in,
                            MergeOperationOutput* merge_out) const;
 
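Editorial aside (not part of the patch): a minimal FullMergeV2 implementation matching the contract above could look like the sketch below. The decimal-string operand encoding and the class name are assumptions of the example.

```cpp
// Minimal sketch of the FullMergeV2 contract documented above: a counter
// operator whose operands are decimal strings (an assumption made here).
#include <rocksdb/merge_operator.h>

#include <cstdint>
#include <string>

class CounterOperator : public rocksdb::MergeOperator {
 public:
  bool FullMergeV2(const MergeOperationInput& merge_in,
                   MergeOperationOutput* merge_out) const override {
    int64_t total = merge_in.existing_value != nullptr
                        ? std::stoll(merge_in.existing_value->ToString())
                        : 0;
    // operand_list arrives in chronological order, oldest operand first.
    for (const rocksdb::Slice& op : merge_in.operand_list) {
      total += std::stoll(op.ToString());
    }
    merge_out->new_value = std::to_string(total);
    return true;
  }

  const char* Name() const override { return "CounterOperator"; }
};
```

A production operator would also validate the operand encoding and report corruption by returning false.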
@@ -145,9 +159,10 @@
   // If there is corruption in the data, handle it in the FullMergeV2() function
   // and return false there.  The default implementation of PartialMerge will
   // always return false.
-  virtual bool PartialMerge(const Slice& key, const Slice& left_operand,
-                            const Slice& right_operand, std::string* new_value,
-                            Logger* logger) const {
+  virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/,
+                            const Slice& /*right_operand*/,
+                            std::string* /*new_value*/,
+                            Logger* /*logger*/) const {
     return false;
   }
 
@@ -184,12 +199,26 @@
   // consistent MergeOperator between DB opens.
   virtual const char* Name() const = 0;
 
-  // Determines whether the MergeOperator can be called with just a single
+  // Determines whether the PartialMerge can be called with just a single
   // merge operand.
-  // Override and return true for allowing a single operand. FullMergeV2 and
-  // PartialMerge/PartialMergeMulti should be implemented accordingly to handle
-  // a single operand.
+  // Override and return true for allowing a single operand. PartialMerge
+  // and PartialMergeMulti should be overridden and implemented
+  // correctly to properly handle a single operand.
   virtual bool AllowSingleOperand() const { return false; }
+
+  // Allows controlling when to invoke a full merge during Get.
+  // This could be used to limit the number of merge operands that are looked at
+  // during a point lookup, thereby helping to limit the number of levels to
+  // read from.
+  // Doesn't help with iterators.
+  //
+  // Note: the merge operands are passed to this function in the reversed order
+  // relative to how they were merged (passed to FullMerge or FullMergeV2)
+  // for performance reasons, see also:
+  // https://github.com/facebook/rocksdb/issues/3865
+  virtual bool ShouldMerge(const std::vector<Slice>& /*operands*/) const {
+    return false;
+  }
 };
 
 // The simpler, associative merge operator.
@@ -210,13 +239,10 @@
   // returns false, it is because client specified bad data or there was
   // internal corruption. The client should assume that this will be treated
   // as an error by the library.
-  virtual bool Merge(const Slice& key,
-                     const Slice* existing_value,
-                     const Slice& value,
-                     std::string* new_value,
+  virtual bool Merge(const Slice& key, const Slice* existing_value,
+                     const Slice& value, std::string* new_value,
                      Logger* logger) const = 0;
 
-
  private:
   // Default implementations of the MergeOperator functions
   bool FullMergeV2(const MergeOperationInput& merge_in,
@@ -228,5 +254,3 @@
 };
 
 }  // namespace rocksdb
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_MERGE_OPERATOR_H_
diff --git a/thirdparty/rocksdb/include/rocksdb/metadata.h b/thirdparty/rocksdb/include/rocksdb/metadata.h
index 37e7b50b9b..a0ab41efdf 100644
--- a/thirdparty/rocksdb/include/rocksdb/metadata.h
+++ b/thirdparty/rocksdb/include/rocksdb/metadata.h
@@ -20,10 +20,10 @@ struct SstFileMetaData;
 
 // The metadata that describes a column family.
 struct ColumnFamilyMetaData {
-  ColumnFamilyMetaData() : size(0), name("") {}
+  ColumnFamilyMetaData() : size(0), file_count(0), name("") {}
   ColumnFamilyMetaData(const std::string& _name, uint64_t _size,
-                       const std::vector<LevelMetaData>&& _levels) :
-      size(_size), name(_name), levels(_levels) {}
+                       const std::vector<LevelMetaData>&& _levels)
+      : size(_size), name(_name), levels(_levels) {}
 
   // The size of this column family in bytes, which is equal to the sum of
   // the file size of its "levels".
@@ -39,9 +39,8 @@ struct ColumnFamilyMetaData {
 // The metadata that describes a level.
 struct LevelMetaData {
   LevelMetaData(int _level, uint64_t _size,
-                const std::vector<SstFileMetaData>&& _files) :
-      level(_level), size(_size),
-      files(_files) {}
+                const std::vector<SstFileMetaData>&& _files)
+      : level(_level), size(_size), files(_files) {}
 
   // The level which this meta data describes.
   const int level;
@@ -54,9 +53,21 @@
 // The metadata that describes a SST file.
 struct SstFileMetaData {
-  SstFileMetaData() {}
+  SstFileMetaData()
+      : size(0),
+        name(""),
+        db_path(""),
+        smallest_seqno(0),
+        largest_seqno(0),
+        smallestkey(""),
+        largestkey(""),
+        num_reads_sampled(0),
+        being_compacted(false),
+        num_entries(0),
+        num_deletions(0) {}
+
   SstFileMetaData(const std::string& _file_name, const std::string& _path,
-                  uint64_t _size, SequenceNumber _smallest_seqno,
+                  size_t _size, SequenceNumber _smallest_seqno,
                   SequenceNumber _largest_seqno,
                   const std::string& _smallestkey,
                   const std::string& _largestkey,
                   uint64_t _num_reads_sampled,
@@ -69,10 +80,12 @@
         smallestkey(_smallestkey),
         largestkey(_largestkey),
         num_reads_sampled(_num_reads_sampled),
-        being_compacted(_being_compacted) {}
+        being_compacted(_being_compacted),
+        num_entries(0),
+        num_deletions(0) {}
 
   // File size in bytes.
-  uint64_t size;
+  size_t size;
   // The name of the file.
   std::string name;
   // The full path where the file locates.
@@ -80,15 +93,19 @@
   SequenceNumber smallest_seqno;  // Smallest sequence number in file.
   SequenceNumber largest_seqno;   // Largest sequence number in file.
-  std::string smallestkey;     // Smallest user defined key in the file.
-  std::string largestkey;      // Largest user defined key in the file.
-  uint64_t num_reads_sampled;  // How many times the file is read.
+  std::string smallestkey;        // Smallest user defined key in the file.
+  std::string largestkey;         // Largest user defined key in the file.
+  uint64_t num_reads_sampled;     // How many times the file is read.
   bool being_compacted;  // true if the file is currently being compacted.
+
+  uint64_t num_entries;
+  uint64_t num_deletions;
 };
 
 // The full set of metadata associated with each SST file.
 struct LiveFileMetaData : SstFileMetaData {
   std::string column_family_name;  // Name of the column family
-  int level;               // Level at which this file resides.
+  int level;                       // Level at which this file resides.
+  LiveFileMetaData() : column_family_name(), level(0) {}
 };
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/options.h b/thirdparty/rocksdb/include/rocksdb/options.h
index 4d2f143a0f..f7d6dfaf58 100644
--- a/thirdparty/rocksdb/include/rocksdb/options.h
+++ b/thirdparty/rocksdb/include/rocksdb/options.h
@@ -6,16 +6,15 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
-#ifndef STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
-#define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+#pragma once
 
 #include <stddef.h>
 #include <stdint.h>
-#include <string>
-#include <memory>
-#include <vector>
 #include <limits>
+#include <memory>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 #include "rocksdb/advanced_options.h"
 #include "rocksdb/comparator.h"
@@ -35,6 +34,7 @@ class Cache;
 class CompactionFilter;
 class CompactionFilterFactory;
 class Comparator;
+class ConcurrentTaskLimiter;
 class Env;
 enum InfoLogLevel : unsigned char;
 class SstFileManager;
@@ -77,6 +77,7 @@ enum CompressionType : unsigned char {
 };
 
 struct Options;
+struct DbPath;
 
 struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // The function recovers options to a previous version. Only 4.6 or later
@@ -93,8 +94,7 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // an iterator, only Put() and Get() API calls
   //
   // Not supported in ROCKSDB_LITE
-  ColumnFamilyOptions* OptimizeForPointLookup(
-      uint64_t block_cache_size_mb);
+  ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb);
 
   // Default values for some parameters in ColumnFamilyOptions are not
   // optimized for heavy workloads and big datasets, which means you might
@@ -188,8 +188,7 @@
   // Dynamically changeable through SetOptions() API
   size_t write_buffer_size = 64 << 20;
 
-  // Compress blocks using the specified compression algorithm.  This
-  // parameter can be changed dynamically.
+  // Compress blocks using the specified compression algorithm.
   //
   // Default: kSnappyCompression, if it's supported. If snappy is not linked
   // with the library, the default is kNoCompression.
@@ -197,20 +196,36 @@
   // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz:
   //    ~200-500MB/s compression
   //    ~400-800MB/s decompression
+  //
   // Note that these speeds are significantly faster than most
   // persistent storage speeds, and therefore it is typically never
   // worth switching to kNoCompression.  Even if the input data is
   // incompressible, the kSnappyCompression implementation will
   // efficiently detect that and will switch to uncompressed mode.
+  //
+  // If you do not set `compression_opts.level`, or set it to
+  // `CompressionOptions::kDefaultCompressionLevel`, we will attempt to pick the
+  // default corresponding to `compression` as follows:
+  //
+  // - kZSTD: 3
+  // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1)
+  // - kLZ4HCCompression: 0
+  // - For all others, we do not specify a compression level
+  //
+  // Dynamically changeable through SetOptions() API
   CompressionType compression;
 
   // Compression algorithm that will be used for the bottommost level that
-  // contain files. If level-compaction is used, this option will only affect
-  // levels after base level.
+  // contains files.
   //
   // Default: kDisableCompressionOption (Disabled)
   CompressionType bottommost_compression = kDisableCompressionOption;
 
+  // Different options for compression algorithms used by bottommost_compression
+  // if it is enabled. To enable it, please see the definition of
+  // CompressionOptions.
+  CompressionOptions bottommost_compression_opts;
+
   // different options for compression algorithms
   CompressionOptions compression_opts;
 
@@ -264,6 +279,28 @@
   // BlockBasedTableOptions.
   std::shared_ptr<TableFactory> table_factory;
 
+  // A list of paths where SST files for this column family
+  // can be put into, with its target size. Similar to db_paths,
+  // newer data is placed into paths specified earlier in the
+  // vector while older data gradually moves to paths specified
+  // later in the vector.
+  // Note that, if a path is supplied to multiple column
+  // families, it would have files and total size from all
+  // the column families combined. User should provision for the
+  // total size (from all the column families) in such cases.
+  //
+  // If left empty, db_paths will be used.
+  // Default: empty
+  std::vector<DbPath> cf_paths;
+
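Editorial aside (not part of the patch): the compression knobs documented above would typically be wired up as in the sketch below. The `enabled` field on `bottommost_compression_opts` is assumed from the CompressionOptions definition referenced, but not shown, in this hunk; availability of kZSTD depends on how the library was built.

```cpp
// Illustrative sketch of the per-column-family compression settings above.
#include <rocksdb/options.h>

void ConfigureCompression(rocksdb::ColumnFamilyOptions* cf) {
  cf->compression = rocksdb::kSnappyCompression;  // hot data: cheap and fast
  cf->bottommost_compression = rocksdb::kZSTD;    // cold data: denser
  cf->bottommost_compression_opts.level = 6;      // hypothetical level choice
  cf->bottommost_compression_opts.enabled = true; // assumed opt-in flag
}
```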
+  // Compaction concurrent thread limiter for the column family.
+  // If non-nullptr, use given concurrent thread limiter to control
+  // the max outstanding compaction tasks. Limiter can be shared with
+  // multiple column families across db instances.
+  //
+  // Default: nullptr
+  std::shared_ptr<ConcurrentTaskLimiter> compaction_thread_limiter = nullptr;
+
   // Create ColumnFamilyOptions with default values for all fields
   ColumnFamilyOptions();
   // Create ColumnFamilyOptions from Options
@@ -275,14 +312,14 @@
 
 enum class WALRecoveryMode : char {
   // Original levelDB recovery
   // We tolerate incomplete record in trailing data on all logs
-  // Use case : This is legacy behavior (default)
+  // Use case : This is legacy behavior
   kTolerateCorruptedTailRecords = 0x00,
   // Recover from clean shutdown
   // We don't expect to find any corruption in the WAL
   // Use case : This is ideal for unit tests and rare applications that
   // can require high consistency guarantee
   kAbsoluteConsistency = 0x01,
-  // Recover to point-in-time consistency
+  // Recover to point-in-time consistency (default)
   // We stop the WAL playback on discovering WAL inconsistency
   // Use case : Ideal for systems that have disk controller cache like
   // hard disk, SSD without super capacitor that store related data
@@ -303,7 +340,6 @@
   DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {}
 };
 
-
 struct DBOptions {
   // The function recovers options to the option as in version 4.6.
   DBOptions* OldDefaults(int rocksdb_major_version = 4,
@@ -378,9 +414,9 @@
   std::shared_ptr<Logger> info_log = nullptr;
 
 #ifdef NDEBUG
-      InfoLogLevel info_log_level = INFO_LEVEL;
+  InfoLogLevel info_log_level = INFO_LEVEL;
 #else
-      InfoLogLevel info_log_level = DEBUG_LEVEL;
+  InfoLogLevel info_log_level = DEBUG_LEVEL;
 #endif  // NDEBUG
 
   // Number of open files that can be used by the DB.  You may need to
@@ -388,7 +424,10 @@
   // files opened are always kept open. You can estimate number of files based
   // on target_file_size_base and target_file_size_multiplier for level-based
   // compaction. For universal-style compaction, you can usually set it to -1.
+  //
   // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
   int max_open_files = -1;
 
   // If max_open_files is -1, DB will open all files on DB::Open(). You can
@@ -401,19 +440,25 @@
   // (i.e. the ones that are causing all the space amplification). If set to 0
   // (default), we will dynamically choose the WAL size limit to be
   // [sum of all write_buffer_size * max_write_buffer_number] * 4
+  // This option takes effect only when there is more than one column family;
+  // otherwise, the WAL size is dictated by the write_buffer_size.
+  //
   // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
   uint64_t max_total_wal_size = 0;
 
   // If non-null, then we should collect metrics about database operations
   std::shared_ptr<Statistics> statistics = nullptr;
 
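Editorial aside (not part of the patch): a sketch of the WAL-related DBOptions from this hunk. The enumerator name `kPointInTimeRecovery` is assumed from the upstream header; only its comment line appears in the excerpt above.

```cpp
// Illustrative sketch of WAL recovery and sizing knobs discussed above.
#include <rocksdb/options.h>

void ConfigureWal(rocksdb::DBOptions* db) {
  // Assumed enumerator for the "(default)" point-in-time mode above.
  db->wal_recovery_mode = rocksdb::WALRecoveryMode::kPointInTimeRecovery;
  // With several column families, cap the total WAL size explicitly.
  db->max_total_wal_size = 512ull << 20;  // 512 MB
}
```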
-  // If true, then every store to stable storage will issue a fsync.
-  // If false, then every store to stable storage will issue a fdatasync.
-  // This parameter should be set to true while storing data to
-  // filesystem like ext3 that can lose files after a reboot.
-  // Default: false
-  // Note: on many platforms fdatasync is defined as fsync, so this parameter
-  // would make no difference. Refer to fdatasync definition in this code base.
+  // By default, writes to stable storage use fdatasync (on platforms
+  // where this function is available). If this option is true,
+  // fsync is used instead.
+  //
+  // fsync and fdatasync are equally safe for our purposes and fdatasync is
+  // faster, so it is rarely necessary to set this option. It is provided
+  // as a workaround for kernel/filesystem bugs, such as one that affected
+  // fdatasync with ext4 in kernel versions prior to 3.7.
   bool use_fsync = false;
 
   // A list of paths where SST files can be put into, with its target size.
@@ -461,13 +506,23 @@
   // value is 6 hours. The files that get out of scope by compaction
   // process will still get automatically delete on every compaction,
   // regardless of this setting
+  //
+  // Default: 6 hours
+  //
+  // Dynamically changeable through SetDBOptions() API.
   uint64_t delete_obsolete_files_period_micros = 6ULL * 60 * 60 * 1000000;
 
   // Maximum number of concurrent background jobs (compactions and flushes).
+  //
+  // Default: 2
+  //
+  // Dynamically changeable through SetDBOptions() API.
   int max_background_jobs = 2;
 
   // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
   // value of max_background_jobs. This option is ignored.
+  //
+  // Dynamically changeable through SetDBOptions() API.
   int base_background_compactions = -1;
 
   // NOT SUPPORTED ANYMORE: RocksDB automatically decides this based on the
@@ -482,7 +537,10 @@
   // If you're increasing this, also consider increasing number of threads in
   // LOW priority thread pool. For more information, see
   // Env::SetBackgroundThreads
+  //
   // Default: -1
+  //
+  // Dynamically changeable through SetDBOptions() API.
   int max_background_compactions = -1;
 
   // This value represents the maximum number of threads that will
@@ -543,8 +601,9 @@
   // manifest file is rolled over on reaching this limit.
   // The older manifest file be deleted.
-  // The default value is MAX_INT so that roll-over does not take place.
-  uint64_t max_manifest_file_size = std::numeric_limits<uint64_t>::max();
+  // The default value is 1GB so that the manifest file can grow, but not
+  // reach the limit of storage capacity.
+  uint64_t max_manifest_file_size = 1024 * 1024 * 1024;
 
   // Number of shards used for table cache.
   int table_cache_numshardbits = 6;
@@ -560,7 +619,7 @@
   //    then WAL_size_limit_MB, they will be deleted starting with the
   //    earliest until size_limit is met. All empty files will be deleted.
   // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
-  //    WAL files will be checked every WAL_ttl_secondsi / 2 and those that
+  //    WAL files will be checked every WAL_ttl_seconds / 2 and those that
   //    are older than WAL_ttl_seconds will be deleted.
   // 4. If both are not 0, WAL files will be checked every 10 min and both
   //    checks will be performed with ttl being first.
@@ -589,13 +648,13 @@
   //  buffered. The hardware buffer of the devices may however still
   //  be used. Memory mapped files are not impacted by these parameters.
 
-  // Use O_DIRECT for user reads
+  // Use O_DIRECT for user and compaction reads.
+  // When true, we also force new_table_reader_for_compaction_inputs to true.
   // Default: false
   // Not supported in ROCKSDB_LITE mode!
   bool use_direct_reads = false;
 
-  // Use O_DIRECT for both reads and writes in background flush and compactions
-  // When true, we also force new_table_reader_for_compaction_inputs to true.
+  // Use O_DIRECT for writes in background flush and compactions.
   // Default: false
   // Not supported in ROCKSDB_LITE mode!
   bool use_direct_io_for_flush_and_compaction = false;
 
@@ -610,9 +669,21 @@
   bool skip_log_error_on_recovery = false;
 
   // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec
+  //
   // Default: 600 (10 min)
+  //
+  // Dynamically changeable through SetDBOptions() API.
   unsigned int stats_dump_period_sec = 600;
 
+  // if not zero, dump rocksdb.stats to RocksDB every stats_persist_period_sec
+  // Default: 600
+  unsigned int stats_persist_period_sec = 600;
+
+  // if not zero, periodically take stats snapshots and store in memory, the
+  // memory size for stats snapshots is capped at stats_history_buffer_size
+  // Default: 1MB
+  size_t stats_history_buffer_size = 1024 * 1024;
+
   // If set true, will hint the underlying file system that the file
   // access pattern is random, when a sst file is opened.
   // Default: true
@@ -636,7 +707,7 @@
   // a limit, a flush will be triggered in the next DB to which the next write
   // is issued.
   //
-  // If the object is only passed to on DB, the behavior is the same as
+  // If the object is only passed to one DB, the behavior is the same as
   // db_write_buffer_size. When write_buffer_manager is set, the value set will
   // override db_write_buffer_size.
   //
@@ -649,12 +720,7 @@
   // Specify the file access pattern once a compaction is started.
   // It will be applied to all input files of a compaction.
   // Default: NORMAL
-  enum AccessHint {
-      NONE,
-      NORMAL,
-      SEQUENTIAL,
-      WILLNEED
-  };
+  enum AccessHint { NONE, NORMAL, SEQUENTIAL, WILLNEED };
   AccessHint access_hint_on_compaction_start = NORMAL;
 
   // If true, always create a new file descriptor and new table reader
@@ -677,6 +743,8 @@
   // true.
   //
   // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
   size_t compaction_readahead_size = 0;
 
   // This is a maximum buffer size that is used by WinMmapReadableFile in
@@ -703,9 +771,10 @@
   // write requests if the logical sector size is unusual
   //
   // Default: 1024 * 1024 (1 MB)
+  //
+  // Dynamically changeable through SetDBOptions() API.
   size_t writable_file_max_buffer_size = 1024 * 1024;
 
-
   // Use adaptive mutex, which spins in the user space before resorting
   // to kernel. This could reduce context switch when the mutex is not
   // heavily contended. However, if the mutex is hot, we could end up
@@ -725,20 +794,27 @@
   // to smooth out write I/Os over time. Users shouldn't rely on it for
   // persistency guarantee.
   // Issue one request for every bytes_per_sync written. 0 turns it off.
-  // Default: 0
   //
   // You may consider using rate_limiter to regulate write rate to device.
   // When rate limiter is enabled, it automatically enables bytes_per_sync
   // to 1MB.
   //
   // This option applies to table files
+  //
+  // Default: 0, turned off
+  //
+  // Note: DOES NOT apply to WAL files. See wal_bytes_per_sync instead
+  // Dynamically changeable through SetDBOptions() API.
   uint64_t bytes_per_sync = 0;
 
   // Same as bytes_per_sync, but applies to WAL files
+  //
   // Default: 0, turned off
+  //
+  // Dynamically changeable through SetDBOptions() API.
   uint64_t wal_bytes_per_sync = 0;
 
-  // A vector of EventListeners which call-back functions will be called
+  // A vector of EventListeners whose callback functions will be called
   // when specific RocksDB event happens.
   std::vector<std::shared_ptr<EventListener>> listeners;
 
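Editorial aside (not part of the patch): several of the options above are newly annotated "Dynamically changeable through SetDBOptions() API". That API takes string key/value pairs on a live instance, roughly as sketched here; the chosen values are arbitrary.

```cpp
// Illustrative sketch of runtime tuning via DB::SetDBOptions().
#include <rocksdb/db.h>

rocksdb::Status TuneRunningDb(rocksdb::DB* db) {
  return db->SetDBOptions({{"max_background_jobs", "4"},
                           {"stats_dump_period_sec", "300"},
                           {"bytes_per_sync", "1048576"}});
}
```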
@@ -762,6 +838,8 @@
   // Unit: byte per second.
   //
   // Default: 0
+  //
+  // Dynamically changeable through SetDBOptions() API.
   uint64_t delayed_write_rate = 0;
 
   // By default, a single write thread queue is maintained. The thread gets
@@ -888,17 +966,51 @@
   // Immutable.
   bool allow_ingest_behind = false;
 
+  // Needed to support differential snapshots.
+  // If set to true then DB will only process deletes with sequence number
+  // less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts).
+  // Clients are responsible for periodically calling this method to advance
+  // the cutoff time. If this method is never called and preserve_deletes
+  // is set to true NO deletes will ever be processed.
+  // At the moment this only keeps normal deletes, SingleDeletes will
+  // not be preserved.
+  // DEFAULT: false
+  // Immutable (TODO: make it dynamically changeable)
+  bool preserve_deletes = false;
+
   // If enabled it uses two queues for writes, one for the ones with
   // disable_memtable and one for the ones that also write to memtable. This
   // allows the memtable writes not to lag behind other writes. It can be used
   // to optimize MySQL 2PC in which only the commits, which are serial, write to
   // memtable.
-  bool concurrent_prepare = false;
+  bool two_write_queues = false;
 
   // If true WAL is not flushed automatically after each write. Instead it
   // relies on manual invocation of FlushWAL to write the WAL buffer to its
   // file.
   bool manual_wal_flush = false;
+
+  // If true, RocksDB supports flushing multiple column families and committing
+  // their results atomically to MANIFEST. Note that it is not
+  // necessary to set atomic_flush to true if WAL is always enabled since WAL
+  // allows the database to be restored to the last persistent state in WAL.
+  // This option is useful when there are column families with writes NOT
+  // protected by WAL.
+  // For manual flush, application has to specify which column families to
+  // flush atomically in DB::Flush.
+  // For auto-triggered flush, RocksDB atomically flushes ALL column families.
+  //
+  // Currently, any WAL-enabled writes after atomic flush may be replayed
+  // independently if the process crashes later and tries to recover.
+  bool atomic_flush = false;
+
+  // If true, ColumnFamilyHandle's and Iterator's destructors won't delete
+  // obsolete files directly and will instead schedule a background job
+  // to do it. Use it if you're destroying iterators or ColumnFamilyHandle-s
+  // from latency-sensitive threads.
+  // If set to true, takes precedence over
+  // ReadOptions::background_purge_on_iterator_cleanup.
+  bool avoid_unnecessary_blocking_io = false;
 };
 
 // Options to control the behavior of a database (passed to DB::Open)
@@ -961,14 +1073,24 @@
   // Default: nullptr
   const Snapshot* snapshot;
 
+  // `iterate_lower_bound` defines the smallest key at which the backward
+  // iterator can return an entry. Once the bound is passed, Valid() will be
+  // false. `iterate_lower_bound` is inclusive, i.e. the bound value is a valid
+  // entry.
+  //
+  // If prefix_extractor is not null, the Seek target and `iterate_lower_bound`
+  // need to have the same prefix. This is because ordering is not guaranteed
+  // outside of prefix domain.
+  //
+  // Default: nullptr
+  const Slice* iterate_lower_bound;
+
   // "iterate_upper_bound" defines the extent upto which the forward iterator
   // can returns entries. Once the bound is reached, Valid() will be false.
   // "iterate_upper_bound" is exclusive ie the bound value is
   // not a valid entry.  If iterator_extractor is not null, the Seek target
-  // and iterator_upper_bound need to have the same prefix.
+  // and iterate_upper_bound need to have the same prefix.
   // This is because ordering is not guaranteed outside of prefix domain.
-  // There is no lower bound on the iterator. If needed, that can be easily
-  // implemented.
   //
   // Default: nullptr
   const Slice* iterate_upper_bound;
@@ -996,10 +1118,11 @@
   // Default: true
   bool verify_checksums;
 
-  // Should the "data block"/"index block"/"filter block" read for this
-  // iteration be cached in memory?
+  // Should the "data block"/"index block" read for this iteration be placed in
+  // block cache?
   // Callers may wish to set this field to false for bulk scans.
-  // Default: true
+  // This would help not to change the eviction order of existing items in the
+  // block cache.
+  // Default: true
   bool fill_cache;
 
   // Specify to create a tailing iterator -- a special iterator that has a
@@ -1010,11 +1133,8 @@
   // Not supported in ROCKSDB_LITE mode!
   bool tailing;
 
-  // Specify to create a managed iterator -- a special iterator that
-  // uses less resources by having the ability to free its underlying
-  // resources on request.
-  // Default: false
-  // Not supported in ROCKSDB_LITE mode!
+  // This option is not used anymore. It was to turn on a functionality that
+  // has been removed.
   bool managed;
 
   // Enable a total order seek regardless of index format (e.g. hash index)
@@ -1053,6 +1173,21 @@
   // Default: false
   bool ignore_range_deletions;
 
+  // A callback to determine whether relevant keys for this scan exist in a
+  // given table based on the table's properties. The callback is passed the
+  // properties of each table during iteration. If the callback returns false,
+  // the table will not be scanned. This option only affects Iterators and has
+  // no impact on point lookups.
+  // Default: empty (every table will be scanned)
+  std::function<bool(const TableProperties&)> table_filter;
+
+  // Needed to support differential snapshots. Has 2 effects:
+  // 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum
+  // 2) if this param > 0 iterator will return INTERNAL keys instead of
+  //    user keys; e.g. return tombstones as well.
+  // Default: 0 (don't filter by seqnum, return user keys)
+  SequenceNumber iter_start_seqnum;
+
   ReadOptions();
   ReadOptions(bool cksum, bool cache);
 };
@@ -1078,7 +1213,11 @@
   bool sync;
 
   // If true, writes will not first go to the write ahead log,
-  // and the write may got lost after a crash.
+  // and the write may get lost after a crash. The backup engine
+  // relies on write-ahead logs to back up the memtable, so if
+  // you disable write-ahead logs, you must create backups with
+  // flush_before_backup=true to avoid losing unflushed memtable data.
+  // Default: false
   bool disableWAL;
 
   // If true and if user is trying to write to column families that don't exist
@@ -1089,6 +1228,7 @@
 
   // If true and we need to wait or sleep for the write request, fails
   // immediately with Status::Incomplete().
+  // Default: false
   bool no_slowdown;
 
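Editorial aside (not part of the patch): a sketch of a bounded, cache-friendly scan using the `iterate_lower_bound` and `iterate_upper_bound` options documented above. The key names are hypothetical.

```cpp
// Illustrative sketch of bounded iteration with the ReadOptions above.
#include <rocksdb/db.h>

#include <memory>

void ScanUserRange(rocksdb::DB* db) {
  rocksdb::Slice lower("user:1000");  // inclusive
  rocksdb::Slice upper("user:2000");  // exclusive
  rocksdb::ReadOptions ro;
  ro.iterate_lower_bound = &lower;  // bound slices must outlive the iterator
  ro.iterate_upper_bound = &upper;
  ro.fill_cache = false;  // bulk scan: keep block-cache eviction order intact
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
}
```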
   // If true, this write request is of lower priority if compaction is
@@ -1113,8 +1253,13 @@
   // If true, the flush will wait until the flush is done.
   // Default: true
   bool wait;
-
-  FlushOptions() : wait(true) {}
+  // If true, the flush would proceed immediately even if it means writes will
+  // stall for the duration of the flush; if false the operation will wait
+  // until it's possible to do flush w/o causing stall or until required flush
+  // is performed by someone else (foreground call or background thread).
+  // Default: false
+  bool allow_write_stall;
+  FlushOptions() : wait(true), allow_write_stall(false) {}
 };
 
 // Create a Logger from provided DBOptions
@@ -1126,14 +1271,20 @@
 struct CompactionOptions {
   // Compaction output compression type
   // Default: snappy
+  // If set to `kDisableCompressionOption`, RocksDB will choose compression type
+  // according to the `ColumnFamilyOptions`, taking into account the output
+  // level if `compression_per_level` is specified.
   CompressionType compression;
   // Compaction will create files of size `output_file_size_limit`.
   // Default: MAX, which means that compaction will create a single file
   uint64_t output_file_size_limit;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions;
 
   CompactionOptions()
       : compression(kSnappyCompression),
-        output_file_size_limit(std::numeric_limits<uint64_t>::max()) {}
+        output_file_size_limit(std::numeric_limits<uint64_t>::max()),
+        max_subcompactions(0) {}
 };
 
 // For level based compaction, we can configure if we want to skip/force
@@ -1166,6 +1317,11 @@
   // if there is a compaction filter
   BottommostLevelCompaction bottommost_level_compaction =
       BottommostLevelCompaction::kIfHaveCompactionFilter;
+  // If true, will execute immediately even if doing so would cause the DB to
+  // enter write stall mode. Otherwise, it'll sleep until load is low enough.
+  bool allow_write_stall = false;
+  // If > 0, it will replace the option in the DBOptions for this compaction.
+  uint32_t max_subcompactions = 0;
 };
 
 // IngestExternalFileOptions is used by IngestExternalFile()
@@ -1189,8 +1345,44 @@
   // with allow_ingest_behind=true since the dawn of time.
   // All files will be ingested at the bottommost level with seqno=0.
   bool ingest_behind = false;
+  // Set to true if you would like to write global_seqno to a given offset in
+  // the external SST file for backward compatibility. Older versions of
+  // RocksDB write a global_seqno to a given offset within ingested SST files,
+  // and new versions of RocksDB do not. If you ingest an external SST using
+  // a new version of RocksDB and would like to be able to downgrade to an
+  // older version of RocksDB, you should set 'write_global_seqno' to true. If
+  // your service is just starting to use the new RocksDB, we recommend that
+  // you set this option to false, which brings two benefits:
+  // 1. No extra random write for global_seqno during ingestion.
+  // 2. Without writing to the external SST file, its checksum can be
+  //    verified as-is.
+  // We have a plan to set this option to false by default in the future.
+  bool write_global_seqno = true;
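Editorial aside (not part of the patch): a sketch of a manual compaction using the new CompactRangeOptions fields above.

```cpp
// Illustrative sketch of DB::CompactRange() with the options documented above.
#include <rocksdb/db.h>

rocksdb::Status CompactEverything(rocksdb::DB* db) {
  rocksdb::CompactRangeOptions cro;
  cro.allow_write_stall = false;  // sleep until load is low enough instead
  cro.max_subcompactions = 4;     // per-call override of the DBOptions value
  return db->CompactRange(cro, nullptr, nullptr);  // entire key range
}
```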
+  // Set to true if you would like to verify the checksums of each block of the
+  // external SST file before ingestion.
+  // Warning: setting this to true causes slowdown in file ingestion because
+  // the external SST file has to be read.
+  bool verify_checksums_before_ingest = false;
 };
 
-}  // namespace rocksdb
+enum TraceFilterType : uint64_t {
+  // Trace all the operations
+  kTraceFilterNone = 0x0,
+  // Do not trace the get operations
+  kTraceFilterGet = 0x1 << 0,
+  // Do not trace the write operations
+  kTraceFilterWrite = 0x1 << 1
+};
+
+// TraceOptions is used for StartTrace
+struct TraceOptions {
+  // To avoid the trace file size growing larger than the available storage
+  // space, the user can set the max trace file size in bytes. Default is 64GB.
+  uint64_t max_trace_file_size = uint64_t{64} * 1024 * 1024 * 1024;
+  // Specify trace sampling option, i.e. capture one per how many requests.
+  // Default to 1 (capture every request).
+  uint64_t sampling_frequency = 1;
+  // Note: The filtering happens before sampling.
+  uint64_t filter = kTraceFilterNone;
+};
 
-#endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/perf_context.h b/thirdparty/rocksdb/include/rocksdb/perf_context.h
index 1095d063bd..a1d803c2c2 100644
--- a/thirdparty/rocksdb/include/rocksdb/perf_context.h
+++ b/thirdparty/rocksdb/include/rocksdb/perf_context.h
@@ -3,10 +3,10 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
-#ifndef STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
-#define STORAGE_ROCKSDB_INCLUDE_PERF_CONTEXT_H
+#pragma once
 
 #include <stdint.h>
+#include <map>
 #include <string>
 
 #include "rocksdb/perf_level.h"
@@ -17,18 +17,64 @@ namespace rocksdb {
 // and transparently.
 // Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats.
 
+// Break down performance counters by level and store per-level perf context in
+// PerfContextByLevel
+struct PerfContextByLevel {
+  // # of times bloom filter has avoided file reads, i.e., negatives.
+  uint64_t bloom_filter_useful = 0;
+  // # of times bloom FullFilter has not avoided the reads.
+  uint64_t bloom_filter_full_positive = 0;
+  // # of times bloom FullFilter has not avoided the reads and data actually
+  // exist.
+  uint64_t bloom_filter_full_true_positive = 0;
+
+  // total number of user keys returned (only includes keys that are found;
+  // does not include keys that are deleted or merged without a final put)
+  uint64_t user_key_return_count;
+
+  // total nanos spent on reading data from SST files
+  uint64_t get_from_table_nanos;
+
+  uint64_t block_cache_hit_count = 0;   // total number of block cache hits
+  uint64_t block_cache_miss_count = 0;  // total number of block cache misses
+
+  void Reset();  // reset all performance counters to zero
+};
+
 struct PerfContext {
+  ~PerfContext();
+
+  PerfContext() {}
 
-  void Reset(); // reset all performance counters to zero
+  PerfContext(const PerfContext&);
+  PerfContext& operator=(const PerfContext&);
+  PerfContext(PerfContext&&) noexcept;
+
+  void Reset();  // reset all performance counters to zero
 
   std::string ToString(bool exclude_zero_counters = false) const;
 
-  uint64_t user_key_comparison_count; // total number of user key comparisons
-  uint64_t block_cache_hit_count;     // total number of block cache hits
-  uint64_t block_read_count;          // total number of block reads (with IO)
-  uint64_t block_read_byte;           // total number of bytes from block reads
-  uint64_t block_read_time;           // total nanos spent on block reads
-  uint64_t block_checksum_time;       // total nanos spent on block checksum
+  // enable per level perf context and allocate storage for PerfContextByLevel
+  void EnablePerLevelPerfContext();
+
+  // temporarily disable per level perf context by setting the flag to false
+  void DisablePerLevelPerfContext();
+
+  // free the space for PerfContextByLevel, also disable per level perf context
+  void ClearPerLevelPerfContext();
+
+  uint64_t user_key_comparison_count;  // total number of user key comparisons
+  uint64_t block_cache_hit_count;      // total number of block cache hits
+  uint64_t block_read_count;           // total number of block reads (with IO)
+  uint64_t block_read_byte;            // total number of bytes from block reads
+  uint64_t block_read_time;            // total nanos spent on block reads
+  uint64_t block_cache_index_hit_count;   // total number of index block hits
+  uint64_t index_block_read_count;        // total number of index block reads
+  uint64_t block_cache_filter_hit_count;  // total number of filter block hits
+  uint64_t filter_block_read_count;       // total number of filter block reads
+  uint64_t compression_dict_block_read_count;  // total number of compression
+                                               // dictionary block reads
+  uint64_t block_checksum_time;    // total nanos spent on block checksum
   uint64_t block_decompress_time;  // total nanos spent on block decompression
 
   uint64_t get_read_bytes;  // bytes for vals returned by Get
@@ -69,9 +115,9 @@
   //
   uint64_t internal_merge_count;
 
-  uint64_t get_snapshot_time;       // total nanos spent on getting snapshot
-  uint64_t get_from_memtable_time;  // total nanos spent on querying memtables
-  uint64_t get_from_memtable_count;    // number of mem tables queried
+  uint64_t get_snapshot_time;        // total nanos spent on getting snapshot
+  uint64_t get_from_memtable_time;   // total nanos spent on querying memtables
+  uint64_t get_from_memtable_count;  // number of mem tables queried
   // total nanos spent after Get() finds a key
   uint64_t get_post_process_time;
   uint64_t get_from_output_files_time;  // total nanos reading from output files
@@ -95,16 +141,27 @@
   // total nanos spent on iterating internal entries to find the next user entry
   uint64_t find_next_user_entry_time;
 
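Editorial aside (not part of the patch): the usual pattern for reading these thread-local counters around a single operation looks roughly like this; the key name is hypothetical.

```cpp
// Illustrative sketch of measuring one Get() with the perf context above.
#include <rocksdb/db.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/perf_level.h>

#include <iostream>
#include <string>

void TimeOneGet(rocksdb::DB* db) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTime);
  rocksdb::get_perf_context()->Reset();
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), "some_key", &value);
  (void)s;
  // Print non-zero counters accumulated by the Get() above.
  std::cout << rocksdb::get_perf_context()->ToString(true) << "\n";
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}
```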
+  // This group of stats provides a breakdown of time spent by Write().
+  // May be inaccurate when 2PC, two_write_queues or enable_pipelined_write
+  // are enabled.
+  //
   // total nanos spent on writing to WAL
   uint64_t write_wal_time;
   // total nanos spent on writing to mem tables
   uint64_t write_memtable_time;
-  // total nanos spent on delaying write
+  // total nanos spent on delaying or throttling write
   uint64_t write_delay_time;
-  // total nanos spent on writing a record, excluding the above three times
+  // total nanos spent on switching memtable/wal and scheduling
+  // flushes/compactions.
+  uint64_t write_scheduling_flushes_compactions_time;
+  // total nanos spent on writing a record, excluding the above four things
   uint64_t write_pre_and_post_process_time;
 
-  uint64_t db_mutex_lock_nanos;      // time spent on acquiring DB mutex.
+  // time spent waiting for other threads of the batch group
+  uint64_t write_thread_wait_nanos;
+
+  // time spent on acquiring DB mutex.
+  uint64_t db_mutex_lock_nanos;
   // Time spent on waiting with a condition variable created with DB mutex.
   uint64_t db_condition_wait_nanos;
   // Time spent on merge operator.
@@ -131,6 +188,11 @@
   // total number of SST table bloom misses
   uint64_t bloom_sst_miss_count;
 
+  // Time spent waiting on key locks in transaction lock manager.
+  uint64_t key_lock_wait_time;
+  // number of times acquiring a lock was blocked by another transaction.
+  uint64_t key_lock_wait_count;
+
   // Total time spent in Env filesystem operations. These are only populated
   // when TimedEnv is used.
   uint64_t env_new_sequential_file_nanos;
@@ -153,12 +215,18 @@
   uint64_t env_lock_file_nanos;
   uint64_t env_unlock_file_nanos;
   uint64_t env_new_logger_nanos;
+
+  uint64_t get_cpu_nanos;
+  uint64_t iter_next_cpu_nanos;
+  uint64_t iter_prev_cpu_nanos;
+  uint64_t iter_seek_cpu_nanos;
+
+  std::map<uint32_t, PerfContextByLevel>* level_to_perf_context = nullptr;
+  bool per_level_perf_context_enabled = false;
 };
 
 // Get Thread-local PerfContext object pointer
 // if defined(NPERF_CONTEXT), then the pointer is not thread-local
 PerfContext* get_perf_context();
 
-}
-
-#endif
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/perf_level.h b/thirdparty/rocksdb/include/rocksdb/perf_level.h
index 84a331c355..de0a214d6a 100644
--- a/thirdparty/rocksdb/include/rocksdb/perf_level.h
+++ b/thirdparty/rocksdb/include/rocksdb/perf_level.h
@@ -3,8 +3,7 @@
 // COPYING file in the root directory) and Apache 2.0 License
 // (found in the LICENSE.Apache file in the root directory).
 
-#ifndef INCLUDE_ROCKSDB_PERF_LEVEL_H_
-#define INCLUDE_ROCKSDB_PERF_LEVEL_H_
+#pragma once
 
 #include <stdint.h>
 #include <string>
 
@@ -18,8 +17,11 @@ enum PerfLevel : unsigned char {
   kEnableCount = 2,                 // enable only count stats
   kEnableTimeExceptForMutex = 3,    // Other than count stats, also enable time
                                     // stats except for mutexes
-  kEnableTime = 4,  // enable count and time stats
-  kOutOfBounds = 5  // N.B. Must always be the last value!
+  // Other than time, also measure CPU time counters. Still don't measure
+  // time (neither wall time nor CPU time) for mutexes.
+  kEnableTimeAndCPUTimeExceptForMutex = 4,
+  kEnableTime = 5,  // enable count and time stats
+  kOutOfBounds = 6  // N.B. Must always be the last value!
 };
 
 // set the perf stats level for current thread
@@ -29,5 +31,3 @@
 void SetPerfLevel(PerfLevel level);
 
 PerfLevel GetPerfLevel();
 
 }  // namespace rocksdb
-
-#endif  // INCLUDE_ROCKSDB_PERF_LEVEL_H_
diff --git a/thirdparty/rocksdb/include/rocksdb/rate_limiter.h b/thirdparty/rocksdb/include/rocksdb/rate_limiter.h
index 838c98a6de..57b1169b63 100644
--- a/thirdparty/rocksdb/include/rocksdb/rate_limiter.h
+++ b/thirdparty/rocksdb/include/rocksdb/rate_limiter.h
@@ -45,7 +45,7 @@ class RateLimiter {
   // Request for token for bytes. If this request can not be satisfied, the call
   // is blocked. Caller is responsible to make sure
   // bytes <= GetSingleBurstBytes()
-  virtual void Request(const int64_t bytes, const Env::IOPriority pri) {
+  virtual void Request(const int64_t /*bytes*/, const Env::IOPriority /*pri*/) {
     assert(false);
   }
 
@@ -81,11 +81,11 @@
   // Max bytes can be granted in a single burst
   virtual int64_t GetSingleBurstBytes() const = 0;
 
-  // Total bytes that go though rate limiter
+  // Total bytes that go through rate limiter
   virtual int64_t GetTotalBytesThrough(
      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
 
-  // Total # of requests that go though rate limiter
+  // Total # of requests that go through rate limiter
   virtual int64_t GetTotalRequests(
      const Env::IOPriority pri = Env::IO_TOTAL) const = 0;
 
@@ -127,9 +127,13 @@
 //  1/fairness chance even though high-pri requests exist to avoid starvation.
 //  You should be good by leaving it at default 10.
 // @mode: Mode indicates which types of operations count against the limit.
+// @auto_tuned: Enables dynamic adjustment of rate limit within the range
+//              `[rate_bytes_per_sec / 20, rate_bytes_per_sec]`, according to
+//              the recent demand for background I/O.
 extern RateLimiter* NewGenericRateLimiter(
     int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000,
     int32_t fairness = 10,
-    RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly);
+    RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly,
+    bool auto_tuned = false);
 
 }  // namespace rocksdb
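Editorial aside (not part of the patch): the new auto-tuned limiter above would be attached to a DB roughly like this; the numbers are arbitrary.

```cpp
// Illustrative sketch: wiring the auto-tuned rate limiter into DBOptions.
#include <rocksdb/options.h>
#include <rocksdb/rate_limiter.h>

void AttachRateLimiter(rocksdb::DBOptions* db) {
  db->rate_limiter.reset(rocksdb::NewGenericRateLimiter(
      32 << 20,    // rate_bytes_per_sec ceiling (32 MB/s)
      100 * 1000,  // refill_period_us
      10,          // fairness
      rocksdb::RateLimiter::Mode::kWritesOnly,
      true));      // auto_tuned: adjusts within [rate/20, rate]
}
```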
diff --git a/thirdparty/rocksdb/include/rocksdb/slice.h b/thirdparty/rocksdb/include/rocksdb/slice.h
index 4f24c8a221..2b01e6d9a6 100644
--- a/thirdparty/rocksdb/include/rocksdb/slice.h
+++ b/thirdparty/rocksdb/include/rocksdb/slice.h
@@ -16,15 +16,18 @@
 // non-const method, all threads accessing the same Slice must use
 // external synchronization.
 
-#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_H_
-#define STORAGE_ROCKSDB_INCLUDE_SLICE_H_
+#pragma once
 
 #include <assert.h>
-#include <stdio.h>
 #include <stddef.h>
 #include <string.h>
+#include <cstdio>
 #include <string>
 
+#ifdef __cpp_lib_string_view
+#include <string_view>
+#endif
+
 #include "rocksdb/cleanable.h"
 
 namespace rocksdb {
@@ -32,18 +35,24 @@ namespace rocksdb {
 class Slice {
  public:
   // Create an empty slice.
-  Slice() : data_(""), size_(0) { }
+  Slice() : data_(""), size_(0) {}
 
   // Create a slice that refers to d[0,n-1].
-  Slice(const char* d, size_t n) : data_(d), size_(n) { }
+  Slice(const char* d, size_t n) : data_(d), size_(n) {}
 
   // Create a slice that refers to the contents of "s"
   /* implicit */
-  Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+  Slice(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+#ifdef __cpp_lib_string_view
+  // Create a slice that refers to the same contents as "sv"
+  /* implicit */
+  Slice(std::string_view sv) : data_(sv.data()), size_(sv.size()) {}
+#endif
 
   // Create a slice that refers to s[0,strlen(s)-1]
   /* implicit */
-  Slice(const char* s) : data_(s), size_(strlen(s)) { }
+  Slice(const char* s) : data_(s) { size_ = (s == nullptr) ? 0 : strlen(s); }
 
   // Create a single slice from SliceParts using buf as storage.
   // buf must exist as long as the returned Slice exists.
@@ -66,7 +75,10 @@
   }
 
   // Change this slice to refer to an empty array
-  void clear() { data_ = ""; size_ = 0; }
+  void clear() {
+    data_ = "";
+    size_ = 0;
+  }
 
   // Drop the first "n" bytes from this slice.
   void remove_prefix(size_t n) {
@@ -84,6 +96,13 @@
   // when hex is true, returns a string of twice the length hex encoded (0-9A-F)
   std::string ToString(bool hex = false) const;
 
+#ifdef __cpp_lib_string_view
+  // Return a string_view that references the same data as this slice.
+  std::string_view ToStringView() const {
+    return std::string_view(data_, size_);
+  }
+#endif
+
   // Decodes the current slice interpreted as an hexadecimal string into result,
   // if successful returns true, if this isn't a valid hex string
   // (e.g not coming from Slice::ToString(true)) DecodeHex returns false.
@@ -99,8 +118,7 @@
 
   // Return true iff "x" is a prefix of "*this"
   bool starts_with(const Slice& x) const {
-    return ((size_ >= x.size_) &&
-            (memcmp(data_, x.data_, x.size_) == 0));
+    return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
   }
 
   bool ends_with(const Slice& x) const {
@@ -111,7 +129,7 @@
   // Compare two slices and returns the first byte where they differ
   size_t difference_offset(const Slice& b) const;
 
-  // private: make these public for rocksdbjni access
+  // private: make these public for rocksdbjni access
   const char* data_;
   size_t size_;
 
@@ -121,7 +139,7 @@
 /**
  * A Slice that can be pinned with some cleanup tasks, which will be run upon
  * ::Reset() or object destruction, whichever is invoked first. This can be used
- * to avoid memcpy by having the PinnsableSlice object referring to the data
+ * to avoid memcpy by having the PinnableSlice object referring to the data
  * that is locked in the memory and release them after the data is consumed.
 */
 class PinnableSlice : public Slice, public Cleanable {
@@ -177,13 +195,14 @@
   }
 
-  void remove_prefix(size_t n) {
+  void remove_prefix(size_t /*n*/) {
     assert(0);  // Not implemented
   }
 
   void Reset() {
     Cleanable::Reset();
     pinned_ = false;
+    size_ = 0;
   }
 
   inline std::string* GetSelf() { return buf_; }
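Editorial aside (not part of the patch): PinnableSlice exists to avoid a copy on point lookups, as the class comment above describes. A minimal sketch:

```cpp
// Illustrative sketch of a zero-copy point lookup with PinnableSlice.
#include <rocksdb/db.h>

rocksdb::Status PinnedGet(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::PinnableSlice value;
  rocksdb::Status s =
      db->Get(rocksdb::ReadOptions(), db->DefaultColumnFamily(), key, &value);
  // `value` references pinned data until Reset() or destruction.
  return s;
}
```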
@@ -200,8 +219,8 @@
 // A set of Slices that are virtually concatenated together.  'parts' points
 // to an array of Slices.  The number of elements in the array is 'num_parts'.
 struct SliceParts {
-  SliceParts(const Slice* _parts, int _num_parts) :
-      parts(_parts), num_parts(_num_parts) { }
+  SliceParts(const Slice* _parts, int _num_parts)
+      : parts(_parts), num_parts(_num_parts) {}
   SliceParts() : parts(nullptr), num_parts(0) {}
 
   const Slice* parts;
@@ -213,17 +232,17 @@
 inline bool operator==(const Slice& x, const Slice& y) {
   return ((x.size() == y.size()) &&
           (memcmp(x.data(), y.data(), x.size()) == 0));
 }
 
-inline bool operator!=(const Slice& x, const Slice& y) {
-  return !(x == y);
-}
+inline bool operator!=(const Slice& x, const Slice& y) { return !(x == y); }
 
 inline int Slice::compare(const Slice& b) const {
   assert(data_ != nullptr && b.data_ != nullptr);
   const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
   int r = memcmp(data_, b.data_, min_len);
   if (r == 0) {
-    if (size_ < b.size_) r = -1;
-    else if (size_ > b.size_) r = +1;
+    if (size_ < b.size_)
+      r = -1;
+    else if (size_ > b.size_)
+      r = +1;
   }
   return r;
 }
@@ -238,5 +257,3 @@
 }
 
 }  // namespace rocksdb
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_H_
diff --git a/thirdparty/rocksdb/include/rocksdb/slice_transform.h b/thirdparty/rocksdb/include/rocksdb/slice_transform.h
index fc82bf5845..39e3d5fa13 100644
--- a/thirdparty/rocksdb/include/rocksdb/slice_transform.h
+++ b/thirdparty/rocksdb/include/rocksdb/slice_transform.h
@@ -12,8 +12,7 @@
 // define InDomain and InRange to determine which slices are in either
 // of these sets respectively.
 
-#ifndef STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
-#define STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+#pragma once
 
 #include <string>
 
@@ -22,14 +21,14 @@ namespace rocksdb {
 
 class Slice;
 
 /*
- * A SliceTranform is a generic pluggable way of transforming one string
+ * A SliceTransform is a generic pluggable way of transforming one string
 * to another. Its primary use-case is in configuring rocksdb
 * to store prefix blooms by setting prefix_extractor in
 * ColumnFamilyOptions.
 */
 class SliceTransform {
  public:
-  virtual ~SliceTransform() {};
+  virtual ~SliceTransform(){};
 
   // Return the name of this transformation.
   virtual const char* Name() const = 0;
@@ -58,7 +57,12 @@
   virtual bool InDomain(const Slice& key) const = 0;
 
   // This is currently not used and remains here for backward compatibility.
-  virtual bool InRange(const Slice& dst) const { return false; }
+  virtual bool InRange(const Slice& /*dst*/) const { return false; }
+
+  // Some SliceTransform will have a full length which can be used to
+  // determine if two keys are consecutive. Can be disabled by always
+  // returning false.
+  virtual bool FullLengthEnabled(size_t* /*len*/) const { return false; }
 
   // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix.
   //
@@ -72,7 +76,7 @@
   // by setting ReadOptions.total_order_seek = true.
   //
   // Here is an example: Suppose we implement a slice transform that returns
-  // the first part of the string after spliting it using delimiter ",":
+  // the first part of the string after splitting it using delimiter ",":
   // 1. SameResultWhenAppended("abc,") should return true. If applying prefix
   //    bloom filter using it, all slices matching "abc:.*" will be extracted
   //    to "abc,", so any SST file or memtable containing any of those key
@@ -83,7 +87,7 @@
   //    "abcd,e", the file can be filtered out and the key will be invisible.
   //
   // i.e., an implementation always returning false is safe.
-  virtual bool SameResultWhenAppended(const Slice& prefix) const {
+  virtual bool SameResultWhenAppended(const Slice& /*prefix*/) const {
     return false;
   }
 };
 
@@ -94,6 +98,4 @@
 extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len);
 
 extern const SliceTransform* NewNoopTransform();
 
-}
-
-#endif  // STORAGE_ROCKSDB_INCLUDE_SLICE_TRANSFORM_H_
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/sst_dump_tool.h b/thirdparty/rocksdb/include/rocksdb/sst_dump_tool.h
index 021faa019c..c7cc4a0fc4 100644
--- a/thirdparty/rocksdb/include/rocksdb/sst_dump_tool.h
+++ b/thirdparty/rocksdb/include/rocksdb/sst_dump_tool.h
@@ -5,11 +5,13 @@
 #ifndef ROCKSDB_LITE
 #pragma once
 
+#include "rocksdb/options.h"
+
 namespace rocksdb {
 
 class SSTDumpTool {
  public:
-  int Run(int argc, char** argv);
+  int Run(int argc, char** argv, Options options = Options());
 };
 
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/include/rocksdb/sst_file_manager.h b/thirdparty/rocksdb/include/rocksdb/sst_file_manager.h
index 692007d31a..3e3ef859b5 100644
--- a/thirdparty/rocksdb/include/rocksdb/sst_file_manager.h
+++ b/thirdparty/rocksdb/include/rocksdb/sst_file_manager.h
@@ -8,6 +8,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 #include "rocksdb/status.h"
 
@@ -16,9 +17,10 @@ namespace rocksdb {
 class Env;
 class Logger;
 
-// SstFileManager is used to track SST files in the DB and control there
+// SstFileManager is used to track SST files in the DB and control their
 // deletion rate.
 // All SstFileManager public functions are thread-safe.
+// SstFileManager is not extensible.
 class SstFileManager {
  public:
   virtual ~SstFileManager() {}
 
@@ -27,23 +29,33 @@
   // the total size of the SST files exceeds max_allowed_space, writes to
   // RocksDB will fail.
   //
-  // Setting max_allowed_space to 0 will disable this feature, maximum allowed
+  // Setting max_allowed_space to 0 will disable this feature; maximum allowed
   // space will be infinite (Default value).
   //
   // thread-safe.
   virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
 
+  // Set the amount of buffer room each compaction should be able to leave.
+  // In other words, at its maximum disk space consumption, the compaction
+  // should still leave compaction_buffer_size available on the disk so that
+  // other background functions may continue, such as logging and flushing.
+  virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
+
   // Return true if the total size of SST files exceeded the maximum allowed
   // space usage.
   //
   // thread-safe.
   virtual bool IsMaxAllowedSpaceReached() = 0;
 
+  // Returns true if the total size of SST files as well as estimated size
+  // of ongoing compactions exceeds the maximum allowed space usage.
+  virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
+
   // Return the total size of all tracked files.
   // thread-safe
   virtual uint64_t GetTotalSize() = 0;
 
-  // Return a map containing all tracked files and there corresponding sizes.
+  // Return a map containing all tracked files and their corresponding sizes.
   // thread-safe
   virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
 
@@ -55,31 +67,54 @@
   // zero means disable delete rate limiting and delete files immediately
   // thread-safe
   virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
+
+  // Return trash/DB size ratio where new files will be deleted immediately
+  // thread-safe
+  virtual double GetMaxTrashDBRatio() = 0;
+
+  // Update trash/DB size ratio where new files will be deleted immediately
+  // thread-safe
+  virtual void SetMaxTrashDBRatio(double ratio) = 0;
+
+  // Return the total size of trash files
+  // thread-safe
+  virtual uint64_t GetTotalTrashSize() = 0;
 };
 
 // Create a new SstFileManager that can be shared among multiple RocksDB
 // instances to track SST file and control there deletion rate.
+// Even though SstFileManager doesn't track WAL files, it still controls
+// their deletion rate.
 //
 // @param env: Pointer to Env object, please see "rocksdb/env.h".
 // @param info_log: If not nullptr, info_log will be used to log errors.
 //
 // == Deletion rate limiting specific arguments ==
-// @param trash_dir: Path to the directory where deleted files will be moved
-//    to be deleted in a background thread while applying rate limiting. If this
-//    directory doesn't exist, it will be created. This directory should not be
-//    used by any other process or any other SstFileManager, Set to "" to
-//    disable deletion rate limiting.
+// @param trash_dir: Deprecated, this argument has no effect
 // @param rate_bytes_per_sec: How many bytes should be deleted per second, If
 //    this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
 //    in 1 second, we will wait for another 3 seconds before we delete other
 //    files, Set to 0 to disable deletion rate limiting.
-// @param delete_existing_trash: If set to true, the newly created
-//    SstFileManager will delete files that already exist in trash_dir.
+//    This option also affects the delete rate of WAL files in the DB.
+// @param delete_existing_trash: Deprecated, this argument has no effect, but
+//    if the user provides trash_dir we will schedule deletes for files in the
+//    dir
 // @param status: If not nullptr, status will contain any errors that happened
 //    during creating the missing trash_dir or deleting existing files in trash.
+// @param max_trash_db_ratio: If the trash size constitutes more than this
+//    fraction of the total DB size we will start deleting new files passed to
+//    DeleteScheduler immediately
+// @param bytes_max_delete_chunk: if a file to delete is larger than delete
+//    chunk, ftruncate the file by this size each time, rather than dropping the
+//    whole file. 0 means to always delete the whole file. If the file has more
+//    than one linked names, the file will be deleted as a whole. Either way,
+//    `rate_bytes_per_sec` will be respected. NOTE that with this option,
+//    files already renamed as trash may be partial, so users should not
+//    directly recover them without checking.
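Editorial aside (not part of the patch): a sketch of wiring one rate-limited manager into a DB, using the factory declared immediately below; the sizes are arbitrary.

```cpp
// Illustrative sketch of sharing a rate-limited SstFileManager via DBOptions.
#include <rocksdb/options.h>
#include <rocksdb/sst_file_manager.h>

#include <memory>

void AttachSstFileManager(rocksdb::DBOptions* db, rocksdb::Env* env) {
  std::shared_ptr<rocksdb::SstFileManager> sfm(rocksdb::NewSstFileManager(
      env, nullptr /* info_log */, "" /* trash_dir (deprecated) */,
      8 << 20 /* delete at most ~8 MB/s */));
  sfm->SetMaxAllowedSpaceUsage(64ull << 30);  // fail writes beyond 64 GB
  db->sst_file_manager = sfm;
}
```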
extern SstFileManager* NewSstFileManager( Env* env, std::shared_ptr<Logger> info_log = nullptr, std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, - bool delete_existing_trash = true, Status* status = nullptr); + bool delete_existing_trash = true, Status* status = nullptr, + double max_trash_db_ratio = 0.25, + uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024); } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/sst_file_reader.h b/thirdparty/rocksdb/include/rocksdb/sst_file_reader.h new file mode 100644 index 0000000000..517907dd50 --- /dev/null +++ b/thirdparty/rocksdb/include/rocksdb/sst_file_reader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/iterator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/table_properties.h" + +namespace rocksdb { + +// SstFileReader is used to read sst files that are generated by DB or +// SstFileWriter. +class SstFileReader { + public: + SstFileReader(const Options& options); + + ~SstFileReader(); + + // Prepares to read from the file located at "file_path". + Status Open(const std::string& file_path); + + // Returns a new iterator over the table contents. + // Most read options provide the same control as we read from DB. + // If "snapshot" is nullptr, the iterator returns only the latest keys. + Iterator* NewIterator(const ReadOptions& options); + + std::shared_ptr<const TableProperties> GetTableProperties() const; + + // Verifies whether there is corruption in this table. + Status VerifyChecksum(); + + private: + struct Rep; + std::unique_ptr<Rep> rep_; +}; + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/sst_file_writer.h b/thirdparty/rocksdb/include/rocksdb/sst_file_writer.h index 04d5c271a0..273c913e4f 100644 --- a/thirdparty/rocksdb/include/rocksdb/sst_file_writer.h +++ b/thirdparty/rocksdb/include/rocksdb/sst_file_writer.h @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE - #pragma once +#ifndef ROCKSDB_LITE + #include #include @@ -28,7 +28,18 @@ class Comparator; // ExternalSstFileInfo include information about sst files created // using SstFileWriter.
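As a usage sketch (editorial, not part of the patch): the extended NewSstFileManager() factory plugs into DBOptions::sst_file_manager. The database path and the 1 MB/s rate below are illustrative assumptions, not values taken from this change.

#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_manager.h"

int main() {
  rocksdb::Status status;
  // Rate-limit file deletions to 1 MB/s and, via the new max_trash_db_ratio
  // argument, delete immediately once trash exceeds half of the DB size.
  std::shared_ptr<rocksdb::SstFileManager> manager(rocksdb::NewSstFileManager(
      rocksdb::Env::Default(), nullptr /* info_log */,
      "" /* trash_dir (deprecated) */, 1024 * 1024 /* rate_bytes_per_sec */,
      true /* delete_existing_trash */, &status,
      0.5 /* max_trash_db_ratio */));
  if (!status.ok()) return 1;
  manager->SetMaxAllowedSpaceUsage(10ull << 30);  // fail writes past ~10 GB

  rocksdb::Options options;
  options.create_if_missing = true;
  options.sst_file_manager = manager;  // may be shared by several DB instances
  rocksdb::DB* db = nullptr;
  status = rocksdb::DB::Open(options, "/tmp/sfm_example_db", &db);
  delete db;
  return status.ok() ? 0 : 1;
}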
struct ExternalSstFileInfo { - ExternalSstFileInfo() {} + ExternalSstFileInfo() + : file_path(""), + smallest_key(""), + largest_key(""), + smallest_range_del_key(""), + largest_range_del_key(""), + sequence_number(0), + file_size(0), + num_entries(0), + num_range_del_entries(0), + version(0) {} + ExternalSstFileInfo(const std::string& _file_path, const std::string& _smallest_key, const std::string& _largest_key, @@ -37,17 +48,24 @@ struct ExternalSstFileInfo { : file_path(_file_path), smallest_key(_smallest_key), largest_key(_largest_key), + smallest_range_del_key(""), + largest_range_del_key(""), sequence_number(_sequence_number), file_size(_file_size), num_entries(_num_entries), + num_range_del_entries(0), version(_version) {} - std::string file_path; // external sst file path - std::string smallest_key; // smallest user key in file - std::string largest_key; // largest user key in file - SequenceNumber sequence_number; // sequence number of all keys in file - uint64_t file_size; // file size in bytes - uint64_t num_entries; // number of entries in file + std::string file_path; // external sst file path + std::string smallest_key; // smallest user key in file + std::string largest_key; // largest user key in file + std::string + smallest_range_del_key; // smallest range deletion user key in file + std::string largest_range_del_key; // largest range deletion user key in file + SequenceNumber sequence_number; // sequence number of all keys in file + uint64_t file_size; // file size in bytes + uint64_t num_entries; // number of entries in file + uint64_t num_range_del_entries; // number of range deletion entries in file int32_t version; // file version }; @@ -59,21 +77,24 @@ class SstFileWriter { // be ingested into this column_family, note that passing nullptr means that // the column_family is unknown. // If invalidate_page_cache is set to true, SstFileWriter will give the OS a - // hint that this file pages is not needed everytime we write 1MB to the file. - // To use the rate limiter an io_priority smaller than IO_TOTAL can be passed. + // hint that these file pages are not needed every time we write 1MB to the + // file. To use the rate limiter an io_priority smaller than IO_TOTAL can be + // passed. SstFileWriter(const EnvOptions& env_options, const Options& options, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, - Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL) + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false) : SstFileWriter(env_options, options, options.comparator, column_family, - invalidate_page_cache, io_priority) {} + invalidate_page_cache, io_priority, skip_filters) {} // Deprecated API SstFileWriter(const EnvOptions& env_options, const Options& options, const Comparator* user_comparator, ColumnFamilyHandle* column_family = nullptr, bool invalidate_page_cache = true, - Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL); + Env::IOPriority io_priority = Env::IOPriority::IO_TOTAL, + bool skip_filters = false); ~SstFileWriter(); @@ -96,6 +117,9 @@ class SstFileWriter { // REQUIRES: key is after any previously added key according to comparator. Status Delete(const Slice& user_key); + // Add a range deletion tombstone to currently opened file + Status DeleteRange(const Slice& begin_key, const Slice& end_key); + // Finalize writing to sst file and close file.
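A minimal sketch of the writer side with the new DeleteRange() entry point (the file path and keys are illustrative; point keys must still be added in comparator order):

#include <string>
#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

rocksdb::Status WriteSampleSst(const std::string& path) {
  rocksdb::Options options;
  rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
  rocksdb::Status s = writer.Open(path);
  if (!s.ok()) return s;
  s = writer.Put("key1", "value1");  // point entries, in order
  if (s.ok()) s = writer.Put("key2", "value2");
  if (s.ok()) s = writer.DeleteRange("old_a", "old_z");  // range tombstone
  rocksdb::ExternalSstFileInfo info;
  if (s.ok()) s = writer.Finish(&info);  // info.num_range_del_entries == 1
  return s;
}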
// // An optional ExternalSstFileInfo pointer can be passed to the function diff --git a/thirdparty/rocksdb/include/rocksdb/statistics.h b/thirdparty/rocksdb/include/rocksdb/statistics.h index 731ff78096..bad1c87ec5 100644 --- a/thirdparty/rocksdb/include/rocksdb/statistics.h +++ b/thirdparty/rocksdb/include/rocksdb/statistics.h @@ -3,14 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ +#pragma once #include #include #include -#include +#include #include +#include #include #include "rocksdb/status.h" @@ -22,6 +22,8 @@ namespace rocksdb { * 1. Any ticker should be added before TICKER_ENUM_MAX. * 2. Add a readable string in TickersNameMap below for the newly added ticker. * 3. Add a corresponding enum value to TickerType.java in the java API + * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType + * and toCppTickers */ enum Tickers : uint32_t { // total block cache misses @@ -71,8 +73,13 @@ enum Tickers : uint32_t { // # of bytes written into cache. BLOCK_CACHE_BYTES_WRITE, - // # of times bloom filter has avoided file reads. + // # of times bloom filter has avoided file reads, i.e., negatives. BLOOM_FILTER_USEFUL, + // # of times bloom FullFilter has not avoided the reads. + BLOOM_FILTER_FULL_POSITIVE, + // # of times bloom FullFilter has not avoided the reads and data actually + // exist. + BLOOM_FILTER_FULL_TRUE_POSITIVE, // # persistent cache hit PERSISTENT_CACHE_HIT, @@ -108,6 +115,8 @@ enum Tickers : uint32_t { COMPACTION_RANGE_DEL_DROP_OBSOLETE, // all keys in range were deleted. // Deletions obsoleted before bottom level due to file gap optimization. COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + // If a compaction was cancelled in sfm to prevent ENOSPC + COMPACTION_CANCELLED, // Number of keys written to the database via the Put and Write call's NUMBER_KEYS_WRITTEN, @@ -149,7 +158,8 @@ enum Tickers : uint32_t { // Disabled by default. To enable it set stats level to kAll DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, - NO_ITERATORS, // number of iterators currently open + // DEPRECATED number of iterators currently open + NO_ITERATORS, // Number of MultiGet calls, keys read, and bytes read NUMBER_MULTIGET_CALLS, @@ -223,112 +233,115 @@ enum Tickers : uint32_t { // Number of refill intervals where rate limiter's bytes are fully consumed. NUMBER_RATE_LIMITER_DRAINS, + // Number of internal keys skipped by Iterator + NUMBER_ITER_SKIP, + + // BlobDB specific stats + // # of Put/PutTTL/PutUntil to BlobDB. + BLOB_DB_NUM_PUT, + // # of Write to BlobDB. + BLOB_DB_NUM_WRITE, + // # of Get to BlobDB. + BLOB_DB_NUM_GET, + // # of MultiGet to BlobDB. + BLOB_DB_NUM_MULTIGET, + // # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator. + BLOB_DB_NUM_SEEK, + // # of Next to BlobDB iterator. + BLOB_DB_NUM_NEXT, + // # of Prev to BlobDB iterator. + BLOB_DB_NUM_PREV, + // # of keys written to BlobDB. + BLOB_DB_NUM_KEYS_WRITTEN, + // # of keys read from BlobDB. + BLOB_DB_NUM_KEYS_READ, + // # of bytes (key + value) written to BlobDB. + BLOB_DB_BYTES_WRITTEN, + // # of bytes (keys + value) read from BlobDB. + BLOB_DB_BYTES_READ, + // # of keys written by BlobDB as non-TTL inlined value. + BLOB_DB_WRITE_INLINED, + // # of keys written by BlobDB as TTL inlined value. + BLOB_DB_WRITE_INLINED_TTL, + // # of keys written by BlobDB as non-TTL blob value. 
+ BLOB_DB_WRITE_BLOB, + // # of keys written by BlobDB as TTL blob value. + BLOB_DB_WRITE_BLOB_TTL, + // # of bytes written to blob file. + BLOB_DB_BLOB_FILE_BYTES_WRITTEN, + // # of bytes read from blob file. + BLOB_DB_BLOB_FILE_BYTES_READ, + // # of times a blob file is synced. + BLOB_DB_BLOB_FILE_SYNCED, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of expiration. + BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of expiration. + BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, + // # of blob index evicted from base DB by BlobDB compaction filter because + // of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + // size of blob index evicted from base DB by BlobDB compaction filter + // because of corresponding file deleted. + BLOB_DB_BLOB_INDEX_EVICTED_SIZE, + // # of blob files being garbage collected. + BLOB_DB_GC_NUM_FILES, + // # of blob files generated by garbage collection. + BLOB_DB_GC_NUM_NEW_FILES, + // # of BlobDB garbage collection failures. + BLOB_DB_GC_FAILURES, + // # of keys dropped by BlobDB garbage collection because they had been + // overwritten. + BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, + // # of keys dropped by BlobDB garbage collection because of expiration. + BLOB_DB_GC_NUM_KEYS_EXPIRED, + // # of keys relocated to new blob file by garbage collection. + BLOB_DB_GC_NUM_KEYS_RELOCATED, + // # of bytes dropped by BlobDB garbage collection because they had been + // overwritten. + BLOB_DB_GC_BYTES_OVERWRITTEN, + // # of bytes dropped by BlobDB garbage collection because of expiration. + BLOB_DB_GC_BYTES_EXPIRED, + // # of bytes relocated to new blob file by garbage collection. + BLOB_DB_GC_BYTES_RELOCATED, + // # of blob files evicted because BlobDB is full. + BLOB_DB_FIFO_NUM_FILES_EVICTED, + // # of keys in the blob files evicted because BlobDB is full. + BLOB_DB_FIFO_NUM_KEYS_EVICTED, + // # of bytes in the blob files evicted because BlobDB is full. + BLOB_DB_FIFO_BYTES_EVICTED, + + // These counters indicate a performance issue in WritePrepared transactions. + // We should not see them ticking much. + // # of times prepare_mutex_ is acquired in the fast path. + TXN_PREPARE_MUTEX_OVERHEAD, + // # of times old_commit_map_mutex_ is acquired in the fast path. + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + // # of times we checked a batch for duplicate keys. + TXN_DUPLICATE_KEY_OVERHEAD, + // # of times snapshot_mutex_ is acquired in the fast path.
+ TXN_SNAPSHOT_MUTEX_OVERHEAD, + + // Number of keys actually found in MultiGet calls (vs number requested by + // caller) + // NUMBER_MULTIGET_KEYS_READ gives the number requested by caller + NUMBER_MULTIGET_KEYS_FOUND, + + NO_ITERATOR_CREATED, // number of iterators created + NO_ITERATOR_DELETED, // number of iterators deleted + + BLOCK_CACHE_COMPRESSION_DICT_MISS, + BLOCK_CACHE_COMPRESSION_DICT_HIT, + BLOCK_CACHE_COMPRESSION_DICT_ADD, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, TICKER_ENUM_MAX }; // The order of items listed in Tickers should be the same as // the order listed in TickersNameMap -const std::vector> TickersNameMap = { - {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, - {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, - {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, - {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, - {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, - {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, - {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"}, - {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, - {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, - {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, - {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, - {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"}, - {BLOCK_CACHE_FILTER_BYTES_INSERT, - "rocksdb.block.cache.filter.bytes.insert"}, - {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, - {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, - {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, - {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"}, - {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"}, - {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, - {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, - {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, - {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, - {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, - {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, - {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"}, - {MEMTABLE_HIT, "rocksdb.memtable.hit"}, - {MEMTABLE_MISS, "rocksdb.memtable.miss"}, - {GET_HIT_L0, "rocksdb.l0.hit"}, - {GET_HIT_L1, "rocksdb.l1.hit"}, - {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, - {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, - {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, - {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"}, - {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, - {COMPACTION_RANGE_DEL_DROP_OBSOLETE, - "rocksdb.compaction.range_del.drop.obsolete"}, - {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, - "rocksdb.compaction.optimized.del.drop.obsolete"}, - {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, - {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, - {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, - {BYTES_WRITTEN, "rocksdb.bytes.written"}, - {BYTES_READ, "rocksdb.bytes.read"}, - {NUMBER_DB_SEEK, "rocksdb.number.db.seek"}, - {NUMBER_DB_NEXT, "rocksdb.number.db.next"}, - {NUMBER_DB_PREV, "rocksdb.number.db.prev"}, - {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"}, - {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"}, - {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, - {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, - {NO_FILE_CLOSES,
"rocksdb.no.file.closes"}, - {NO_FILE_OPENS, "rocksdb.no.file.opens"}, - {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, - {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, - {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, - {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, - {STALL_MICROS, "rocksdb.stall.micros"}, - {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, - {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, - {NO_ITERATORS, "rocksdb.num.iterators"}, - {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, - {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, - {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, - {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, - {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, - {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, - {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, - {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, - {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, - {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, - {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, - {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, - {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, - "rocksdb.block.cachecompressed.add.failures"}, - {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, - {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, - {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, - {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, - {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, - {WRITE_WITH_WAL, "rocksdb.write.wal"}, - {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, - {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, - {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, - {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, - "rocksdb.number.direct.load.table.properties"}, - {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, - {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, - {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, - {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, - {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, - {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, - {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, - {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, - {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, - {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, - {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, - {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, - {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, -}; +extern const std::vector<std::pair<Tickers, std::string>> TickersNameMap; /** * Keep adding histogram's here. @@ -342,6 +355,7 @@ enum Histograms : uint32_t { DB_GET = 0, DB_WRITE, COMPACTION_TIME, + COMPACTION_CPU_TIME, SUBCOMPACTION_SETUP_TIME, TABLE_SYNC_MICROS, COMPACTION_OUTFILE_SYNC_MICROS, @@ -379,42 +393,42 @@ enum Histograms : uint32_t { // requests. READ_NUM_MERGE_OPERANDS, - HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match + // BlobDB specific stats + // Size of keys written to BlobDB. + BLOB_DB_KEY_SIZE, + // Size of values written to BlobDB. + BLOB_DB_VALUE_SIZE, + // BlobDB Put/PutWithTTL/PutUntil/Write latency. + BLOB_DB_WRITE_MICROS, + // BlobDB Get latency. + BLOB_DB_GET_MICROS, + // BlobDB MultiGet latency.
+ BLOB_DB_MULTIGET_MICROS, + // BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. + BLOB_DB_SEEK_MICROS, + // BlobDB Next latency. + BLOB_DB_NEXT_MICROS, + // BlobDB Prev latency. + BLOB_DB_PREV_MICROS, + // Blob file write latency. + BLOB_DB_BLOB_FILE_WRITE_MICROS, + // Blob file read latency. + BLOB_DB_BLOB_FILE_READ_MICROS, + // Blob file sync latency. + BLOB_DB_BLOB_FILE_SYNC_MICROS, + // BlobDB garbage collection time. + BLOB_DB_GC_MICROS, + // BlobDB compression time. + BLOB_DB_COMPRESSION_MICROS, + // BlobDB decompression time. + BLOB_DB_DECOMPRESSION_MICROS, + // Time spent flushing memtable to disk + FLUSH_TIME, + + HISTOGRAM_ENUM_MAX, }; -const std::vector> HistogramsNameMap = { - {DB_GET, "rocksdb.db.get.micros"}, - {DB_WRITE, "rocksdb.db.write.micros"}, - {COMPACTION_TIME, "rocksdb.compaction.times.micros"}, - {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"}, - {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"}, - {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"}, - {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"}, - {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"}, - {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"}, - {DB_MULTIGET, "rocksdb.db.multiget.micros"}, - {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, - {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, - {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, - {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, - {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, - {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, - {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, - {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, - {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, - {DB_SEEK, "rocksdb.db.seek.micros"}, - {WRITE_STALL, "rocksdb.db.write.stall"}, - {SST_READ_MICROS, "rocksdb.sst.read.micros"}, - {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, - {BYTES_PER_READ, "rocksdb.bytes.per.read"}, - {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, - {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, - {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, - {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, - {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, - {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, - {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, -}; +extern const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap; struct HistogramData { double median; @@ -425,9 +439,16 @@ struct HistogramData { // zero-initialize new members since old Statistics::histogramData() // implementations won't write them. double max = 0.0; + uint64_t count = 0; + uint64_t sum = 0; + double min = 0.0; }; -enum StatsLevel { +enum StatsLevel : uint8_t { + // Disable timer stats, and skip histogram stats + kExceptHistogramOrTimers, + // Skip timer stats + kExceptTimers, // Collect all stats except time inside mutex lock AND time spent on // compression.
kExceptDetailedTimers, @@ -448,16 +469,33 @@ class Statistics { virtual uint64_t getTickerCount(uint32_t tickerType) const = 0; virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; - virtual std::string getHistogramString(uint32_t type) const { return ""; } + virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; } virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0; virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0; virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0; - virtual void measureTime(uint32_t histogramType, uint64_t time) = 0; + virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) { + if (get_stats_level() <= StatsLevel::kExceptTimers) { + return; + } + recordInHistogram(histogramType, time); + } + // The function is here only for backward compatibility reasons. + // Users implementing their own Statistics class should override + // recordInHistogram() instead and leave measureTime() as it is. + virtual void measureTime(uint32_t /*histogramType*/, uint64_t /*time*/) { + // This is not supposed to be called. + assert(false); + } + virtual void recordInHistogram(uint32_t histogramType, uint64_t time) { + // measureTime() is the old and inaccurate function name. + // To keep backward compatibility: if users implement their own + // statistics, which overrides measureTime() but doesn't override + // this function, we forward to measureTime(). + measureTime(histogramType, time); + } // Resets all ticker and histogram stats - virtual Status Reset() { - return Status::NotSupported("Not implemented"); - } + virtual Status Reset() { return Status::NotSupported("Not implemented"); } // String representation of the statistic object. virtual std::string ToString() const { @@ -465,17 +503,27 @@ class Statistics { return std::string("ToString(): not implemented"); } + virtual bool getTickerMap(std::map<std::string, uint64_t>*) const { + // Do nothing by default + return false; + }; + // Override this function to disable particular histogram collection virtual bool HistEnabledForType(uint32_t type) const { return type < HISTOGRAM_ENUM_MAX; } + void set_stats_level(StatsLevel sl) { + stats_level_.store(sl, std::memory_order_relaxed); + } + StatsLevel get_stats_level() const { + return stats_level_.load(std::memory_order_relaxed); + } - StatsLevel stats_level_ = kExceptDetailedTimers; + private: + std::atomic<StatsLevel> stats_level_{kExceptDetailedTimers}; }; // Create a concrete DBStatistics object std::shared_ptr<Statistics> CreateDBStatistics(); } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATISTICS_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/stats_history.h b/thirdparty/rocksdb/include/rocksdb/stats_history.h new file mode 100644 index 0000000000..40ea51d1ff --- /dev/null +++ b/thirdparty/rocksdb/include/rocksdb/stats_history.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors.
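To make the new histogram plumbing concrete, a small sketch against the interface above: set_stats_level() gates the reportTimeToHistogram() path, while recordInHistogram() always records. The numbers are arbitrary.

#include <iostream>
#include "rocksdb/statistics.h"

int main() {
  std::shared_ptr<rocksdb::Statistics> stats = rocksdb::CreateDBStatistics();
  // Skip timer histograms; tickers and explicit histogram records still work.
  stats->set_stats_level(rocksdb::kExceptTimers);

  stats->reportTimeToHistogram(rocksdb::DB_GET, 42);  // dropped at this level
  stats->recordInHistogram(rocksdb::DB_GET, 42);      // recorded regardless

  rocksdb::HistogramData data;
  stats->histogramData(rocksdb::DB_GET, &data);
  std::cout << "count=" << data.count << " sum=" << data.sum << std::endl;
  return 0;
}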
+ +#pragma once + +#include +#include + +// #include "db/db_impl.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +class DBImpl; + +class StatsHistoryIterator { + public: + StatsHistoryIterator() {} + virtual ~StatsHistoryIterator() {} + + virtual bool Valid() const = 0; + + // Moves to the next stats history record. After this call, Valid() is + // true iff the iterator was not positioned at the last entry in the source. + // REQUIRES: Valid() + virtual void Next() = 0; + + // Return the time stamp (in microseconds) when stats history is recorded. + // REQUIRES: Valid() + virtual uint64_t GetStatsTime() const = 0; + + // Return the current stats history as an std::map which specifies the + // mapping from stats name to stats value. The underlying storage + // for the returned map is valid only until the next modification of + // the iterator. + // REQUIRES: Valid() + virtual const std::map<std::string, uint64_t>& GetStatsMap() const = 0; + + // If an error has occurred, return it. Else return an ok status. + virtual Status status() const = 0; +}; + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/status.h b/thirdparty/rocksdb/include/rocksdb/status.h index 709f383709..12e8070d1e 100644 --- a/thirdparty/rocksdb/include/rocksdb/status.h +++ b/thirdparty/rocksdb/include/rocksdb/status.h @@ -14,8 +14,7 @@ // non-const method, all threads accessing the same Status must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_STATUS_H_ -#define STORAGE_ROCKSDB_INCLUDE_STATUS_H_ +#pragma once #include #include "rocksdb/slice.h" @@ -25,7 +24,7 @@ namespace rocksdb { class Status { public: // Create a success status. - Status() : code_(kOk), subcode_(kNone), state_(nullptr) {} + Status() : code_(kOk), subcode_(kNone), sev_(kNoError), state_(nullptr) {} ~Status() { delete[] state_; } // Copy the specified status.
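A sketch of consuming the stats_history.h iterator introduced above; how the iterator is obtained (a DB-level accessor shipped in the same release) is out of scope here, so the helper below is hypothetical.

#include <iostream>
#include "rocksdb/stats_history.h"

// Hypothetical helper: dump every persisted stats snapshot.
void DumpStatsHistory(rocksdb::StatsHistoryIterator* it) {
  for (; it->Valid(); it->Next()) {
    std::cout << "ts_micros=" << it->GetStatsTime() << '\n';
    for (const auto& kv : it->GetStatsMap()) {  // stats name -> value
      std::cout << "  " << kv.first << " = " << kv.second << '\n';
    }
  }
  if (!it->status().ok()) {
    std::cout << "iteration error: " << it->status().ToString() << '\n';
  }
}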
@@ -44,7 +43,7 @@ class Status { bool operator==(const Status& rhs) const; bool operator!=(const Status& rhs) const; - enum Code { + enum Code : unsigned char { kOk = 0, kNotFound = 1, kCorruption = 2, @@ -58,12 +57,13 @@ class Status { kAborted = 10, kBusy = 11, kExpired = 12, - kTryAgain = 13 + kTryAgain = 13, + kCompactionTooLarge = 14 }; Code code() const { return code_; } - enum SubCode { + enum SubCode : unsigned char { kNone = 0, kMutexTimeout = 1, kLockTimeout = 2, @@ -72,11 +72,25 @@ class Status { kDeadlock = 5, kStaleFile = 6, kMemoryLimit = 7, + kSpaceLimit = 8, + kPathNotFound = 9, kMaxSubCode }; SubCode subcode() const { return subcode_; } + enum Severity : unsigned char { + kNoError = 0, + kSoftError = 1, + kHardError = 2, + kFatalError = 3, + kUnrecoverableError = 4, + kMaxSeverity + }; + + Status(const Status& s, Severity sev); + Severity severity() const { return sev_; } + // Returns a C style string indicating the message of the Status const char* getState() const { return state_; } @@ -162,6 +176,14 @@ class Status { return Status(kTryAgain, msg, msg2); } + static Status CompactionTooLarge(SubCode msg = kNone) { + return Status(kCompactionTooLarge, msg); + } + static Status CompactionTooLarge(const Slice& msg, + const Slice& msg2 = Slice()) { + return Status(kCompactionTooLarge, msg, msg2); + } + static Status NoSpace() { return Status(kIOError, kNoSpace); } static Status NoSpace(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIOError, kNoSpace, msg, msg2); @@ -172,6 +194,16 @@ class Status { return Status(kAborted, kMemoryLimit, msg, msg2); } + static Status SpaceLimit() { return Status(kIOError, kSpaceLimit); } + static Status SpaceLimit(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kSpaceLimit, msg, msg2); + } + + static Status PathNotFound() { return Status(kIOError, kPathNotFound); } + static Status PathNotFound(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kIOError, kPathNotFound, msg, msg2); + } + // Returns true iff the status indicates success. bool ok() const { return code() == kOk; } @@ -221,6 +253,9 @@ class Status { // re-attempted. bool IsTryAgain() const { return code() == kTryAgain; } + // Returns true iff the status indicates the proposed compaction is too large + bool IsCompactionTooLarge() const { return code() == kCompactionTooLarge; } + // Returns true iff the status indicates a NoSpace error // This is caused by an I/O error returning the specific "out of space" // error condition. Stricto sensu, an NoSpace error is an I/O error @@ -237,6 +272,14 @@ class Status { return (code() == kAborted) && (subcode() == kMemoryLimit); } + // Returns true iff the status indicates a PathNotFound error + // This is caused by an I/O error returning the specific "no such file or + // directory" error condition. A PathNotFound error is an I/O error with + // a specific subcode, enabling users to take appropriate action if necessary + bool IsPathNotFound() const { + return (code() == kIOError) && (subcode() == kPathNotFound); + } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. std::string ToString() const; @@ -249,12 +292,11 @@ class Status { // state_[4..] 
== message Code code_; SubCode subcode_; + Severity sev_; const char* state_; - static const char* msgs[static_cast(kMaxSubCode)]; - explicit Status(Code _code, SubCode _subcode = kNone) - : code_(_code), subcode_(_subcode), state_(nullptr) {} + : code_(_code), subcode_(_subcode), sev_(kNoError), state_(nullptr) {} Status(Code _code, SubCode _subcode, const Slice& msg, const Slice& msg2); Status(Code _code, const Slice& msg, const Slice& msg2) @@ -263,7 +305,12 @@ class Status { static const char* CopyState(const char* s); }; -inline Status::Status(const Status& s) : code_(s.code_), subcode_(s.subcode_) { +inline Status::Status(const Status& s) + : code_(s.code_), subcode_(s.subcode_), sev_(s.sev_) { + state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); +} +inline Status::Status(const Status& s, Severity sev) + : code_(s.code_), subcode_(s.subcode_), sev_(sev) { state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } inline Status& Status::operator=(const Status& s) { @@ -272,6 +319,7 @@ inline Status& Status::operator=(const Status& s) { if (this != &s) { code_ = s.code_; subcode_ = s.subcode_; + sev_ = s.sev_; delete[] state_; state_ = (s.state_ == nullptr) ? nullptr : CopyState(s.state_); } @@ -296,6 +344,8 @@ inline Status& Status::operator=(Status&& s) s.code_ = kOk; subcode_ = std::move(s.subcode_); s.subcode_ = kNone; + sev_ = std::move(s.sev_); + s.sev_ = kNoError; delete[] state_; state_ = nullptr; std::swap(state_, s.state_); @@ -312,5 +362,3 @@ inline bool Status::operator!=(const Status& rhs) const { } } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_STATUS_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/table.h b/thirdparty/rocksdb/include/rocksdb/table.h index 1b4c0ced90..6c584375cc 100644 --- a/thirdparty/rocksdb/include/rocksdb/table.h +++ b/thirdparty/rocksdb/include/rocksdb/table.h @@ -16,6 +16,7 @@ // https://github.com/facebook/rocksdb/wiki/A-Tutorial-of-RocksDB-SST-formats#wiki-examples #pragma once + #include #include #include @@ -40,12 +41,11 @@ class WritableFileWriter; struct EnvOptions; struct Options; -using std::unique_ptr; - enum ChecksumType : char { kNoChecksum = 0x0, kCRC32c = 0x1, kxxHash = 0x2, + kxxHash64 = 0x3, }; // For advanced user only @@ -60,6 +60,10 @@ struct BlockBasedTableOptions { // TODO(kailiu) Temporarily disable this feature by making the default value // to be false. // + // TODO(ajkr) we need to update names of variables controlling meta-block + // caching as they should now apply to range tombstone and compression + // dictionary meta-blocks, in addition to index and filter meta-blocks. + // // Indicating if we'd put index/filter blocks to the block cache. // If not specified, each "table reader" object will pre-load index/filter // block during table initialization. @@ -77,6 +81,13 @@ struct BlockBasedTableOptions { // evicted from cache when the table reader is freed. bool pin_l0_filter_and_index_blocks_in_cache = false; + // If cache_index_and_filter_blocks is true and the below is true, then + // the top-level index of partitioned filter and index blocks are stored in + // the cache, but a reference is held in the "table reader" object so the + // blocks are pinned and only evicted from cache when the table reader is + // freed. This is not limited to l0 in LSM tree. + bool pin_top_level_index_and_filter = true; + // The index type that will be used for this table. 
enum IndexType : char { // A space efficient index block that is optimized for @@ -87,15 +98,24 @@ struct BlockBasedTableOptions { // `Options.prefix_extractor` is provided. kHashSearch, - // TODO(myabandeh): this feature is in experimental phase and shall not be - // used in production; either remove the feature or remove this comment if - // it is ready to be used in production. // A two-level index implementation. Both levels are binary search indexes. kTwoLevelIndexSearch, }; IndexType index_type = kBinarySearch; + // The index type that will be used for the data block. + enum DataBlockIndexType : char { + kDataBlockBinarySearch = 0, // traditional block type + kDataBlockBinaryAndHash = 1, // additional hash index + }; + + DataBlockIndexType data_block_index_type = kDataBlockBinarySearch; + + // #entries/#buckets. It is valid only when data_block_index_type is + // kDataBlockBinaryAndHash. + double data_block_hash_table_util_ratio = 0.75; + // This option is now deprecated. No matter what value it is set to, // it will behave as if hash_index_allow_collision=true. bool hash_index_allow_collision = true; @@ -120,6 +140,8 @@ struct BlockBasedTableOptions { // If non-NULL use the specified cache for compressed blocks. // If NULL, rocksdb will not use a compressed block cache. + // Note: though it looks similar to `block_cache`, RocksDB doesn't put the + // same type of object there. std::shared_ptr<Cache> block_cache_compressed = nullptr; // Approximate size of user data packed per block. Note that the @@ -158,10 +180,8 @@ struct BlockBasedTableOptions { // Note: currently this option requires kTwoLevelIndexSearch to be set as // well. // TODO(myabandeh): remove the note above once the limitation is lifted - // TODO(myabandeh): this feature is in experimental phase and shall not be - // used in production; either remove the feature or remove this comment if - // it is ready to be used in production. - // Use partitioned full filters for each SST file + // Use partitioned full filters for each SST file. This option is + // incompatible with block-based filters. bool partition_filters = false; // Use delta encoding to compress keys in blocks. @@ -207,7 +227,7 @@ struct BlockBasedTableOptions { // Default: 0 (disabled) uint32_t read_amp_bytes_per_bit = 0; - // We currently have three versions: + // We currently have five versions: // 0 -- This version is currently written out by all RocksDB's versions by // default. Can be read by really old RocksDB's. Doesn't support changing // checksum (default is CRC32). @@ -219,14 +239,31 @@ struct BlockBasedTableOptions { // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you // don't plan to run RocksDB before version 3.10, you should probably use // this. - // This option only affects newly written tables. When reading exising tables, - // the information about version is read from the footer. + // 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we + // encode the keys in index blocks. If you don't plan to run RocksDB before + // version 5.15, you should probably use this. + // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. + // 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we + // encode the values in index blocks. If you don't plan to run RocksDB before + // version 5.16 and you are using index_block_restart_interval > 1, you should + // probably use this as it would reduce the index size.
+ // This option only affects newly written tables. When reading existing + // tables, the information about version is read from the footer. uint32_t format_version = 2; + + // Store index blocks on disk in compressed format. Changing this option to + // false will avoid the overhead of decompression if index blocks are evicted + // and read back + bool enable_index_compression = true; + + // Align data blocks on lesser of page size and block size + bool block_align = false; }; // Table Properties that are specific to block-based table properties. struct BlockBasedTablePropertyNames { - // value of this propertis is a fixed int32 number. + // value of this property is a fixed int32 number. static const std::string kIndexType; // value is "1" for true and "0" for false. static const std::string kWholeKeyFiltering; @@ -319,13 +356,13 @@ struct PlainTableOptions { }; // -- Plain Table with prefix-only seek -// For this factory, you need to set Options.prefix_extrator properly to make it -// work. Look-up will starts with prefix hash lookup for key prefix. Inside the -// hash bucket found, a binary search is executed for hash conflicts. Finally, -// a linear search is used. +// For this factory, you need to set Options.prefix_extractor properly to make +// it work. Look-up will start with prefix hash lookup for key prefix. Inside +// the hash bucket found, a binary search is executed for hash conflicts. +// Finally, a linear search is used. -extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options = - PlainTableOptions()); +extern TableFactory* NewPlainTableFactory( + const PlainTableOptions& options = PlainTableOptions()); struct CuckooTablePropertyNames { // The key that is used to fill empty buckets. @@ -382,7 +419,7 @@ struct CuckooTableOptions { bool identity_as_first_hash = false; // If this option is set to true, module is used during hash calculation. // This often yields better space efficiency at the cost of performance. - // If this optino is set to false, # of entries in table is constrained to be + // If this option is set to false, # of entries in table is constrained to be // power of two, and bit and is used to calculate hash, which is faster in // general. bool use_module_hash = true; @@ -417,10 +454,10 @@ class TableFactory { // NewTableReader() is called in three places: // (1) TableCache::FindTable() calls the function when table cache miss // and cache the table object returned. - // (2) SstFileReader (for SST Dump) opens the table and dump the table + // (2) SstFileDumper (for SST Dump) opens the table and dumps the table // contents using the iterator of the table. - // (3) DBImpl::AddFile() calls this function to read the contents of - // the sst file it's attempting to add + // (3) DBImpl::IngestExternalFile() calls this function to read the contents + // of the sst file it's attempting to add // // table_reader_options is a TableReaderOptions which contain all the // needed parameters and configuration to open the table. @@ -429,8 +466,8 @@ class TableFactory { // table_reader is the output table reader. virtual Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, + std::unique_ptr<TableReader>* table_reader, bool prefetch_index_and_filter_in_cache = true) const = 0; // Return a table builder to write to a file for this table type.
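For reference, a hedged sketch of opting into the new block-based table knobs introduced above; the values are examples, not recommendations from this patch.

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeTableOptions() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.format_version = 4;  // readable by RocksDB >= 5.16
  // Hash index inside data blocks, sized at ~0.75 entries per hash bucket.
  table_options.data_block_index_type =
      rocksdb::BlockBasedTableOptions::kDataBlockBinaryAndHash;
  table_options.data_block_hash_table_util_ratio = 0.75;
  table_options.enable_index_compression = true;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}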
@@ -459,16 +496,15 @@ class TableFactory { // // If the function cannot find a way to sanitize the input DB Options, // a non-ok Status will be returned. - virtual Status SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const = 0; + virtual Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const = 0; // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). virtual std::string GetPrintableTableOptions() const = 0; - virtual Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const { + virtual Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const { return Status::NotSupported( "The table factory doesn't implement GetOptionString()."); } @@ -501,7 +537,8 @@ class TableFactory { // @block_based_table_factory: block based table factory to use. If NULL, use // a default one. // @plain_table_factory: plain table factory to use. If NULL, use a default one. -// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default one. +// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default +// one. extern TableFactory* NewAdaptiveTableFactory( std::shared_ptr table_factory_to_write = nullptr, std::shared_ptr block_based_table_factory = nullptr, diff --git a/thirdparty/rocksdb/include/rocksdb/table_properties.h b/thirdparty/rocksdb/include/rocksdb/table_properties.h index 2605fadd25..70e8d2cba7 100644 --- a/thirdparty/rocksdb/include/rocksdb/table_properties.h +++ b/thirdparty/rocksdb/include/rocksdb/table_properties.h @@ -15,7 +15,7 @@ namespace rocksdb { // Other than basic table properties, each table may also have the user // collected properties. // The value of the user-collected properties are encoded as raw bytes -- -// users have to interprete these values by themselves. +// users have to interpret these values by themselves. // Note: To do prefix seek/scan in `UserCollectedProperties`, you can do // something similar to: // @@ -33,11 +33,16 @@ struct TablePropertiesNames { static const std::string kIndexSize; static const std::string kIndexPartitions; static const std::string kTopLevelIndexSize; + static const std::string kIndexKeyIsUserKey; + static const std::string kIndexValueIsDeltaEncoded; static const std::string kFilterSize; static const std::string kRawKeySize; static const std::string kRawValueSize; static const std::string kNumDataBlocks; static const std::string kNumEntries; + static const std::string kDeletedKeys; + static const std::string kMergeOperands; + static const std::string kNumRangeDeletions; static const std::string kFormatVersion; static const std::string kFixedKeyLen; static const std::string kFilterPolicy; @@ -48,6 +53,7 @@ struct TablePropertiesNames { static const std::string kPrefixExtractorName; static const std::string kPropertyCollectors; static const std::string kCompression; + static const std::string kCompressionOptions; static const std::string kCreationTime; static const std::string kOldestKeyTime; }; @@ -56,18 +62,10 @@ extern const std::string kPropertiesBlock; extern const std::string kCompressionDictBlock; extern const std::string kRangeDelBlock; -enum EntryType { - kEntryPut, - kEntryDelete, - kEntrySingleDelete, - kEntryMerge, - kEntryOther, -}; - // `TablePropertiesCollector` provides the mechanism for users to collect // their own properties that they are interested in. 
This class is essentially // a collection of callback functions that will be invoked during table -// building. It is construced with TablePropertiesCollectorFactory. The methods +// building. It is constructed with TablePropertiesCollectorFactory. The methods // don't need to be thread-safe, as we will create exactly one // TablePropertiesCollector object per table and then call it sequentially class TablePropertiesCollector { @@ -95,6 +93,14 @@ class TablePropertiesCollector { return Add(key, value); } + // Called after each new block is cut + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) { + // Nothing to do here. Callback registers can override. + return; + } + // Finish() will be called when a table has already been built and is ready // for writing the properties block. // @params properties User will add their collected statistics to @@ -142,6 +148,11 @@ struct TableProperties { uint64_t index_partitions = 0; // Size of the top-level index if kTwoLevelIndexSearch is used uint64_t top_level_index_size = 0; + // Whether the index key is user key. Otherwise it includes 8 bytes of sequence + // number added by internal key format. + uint64_t index_key_is_user_key = 0; + // Whether delta encoding is used to encode the index values. + uint64_t index_value_is_delta_encoded = 0; // the size of filter block. uint64_t filter_size = 0; // total raw key size @@ -152,6 +163,12 @@ struct TableProperties { uint64_t num_data_blocks = 0; // the number of entries in this table uint64_t num_entries = 0; + // the number of deletions in the table + uint64_t num_deletions = 0; + // the number of merge operands in the table + uint64_t num_merge_operands = 0; + // the number of range deletions in this table + uint64_t num_range_deletions = 0; // format version, reserved for backward compatibility uint64_t format_version = 0; // If 0, key is variable length. Otherwise number of bytes for each key. @@ -193,6 +210,9 @@ struct TableProperties { // The compression algo used to compress the SST files. std::string compression_name; + // Compression options used to compress the SST files. + std::string compression_options; + // user collected properties UserCollectedProperties user_collected_properties; UserCollectedProperties readable_properties; @@ -214,6 +234,10 @@ struct TableProperties { // Below is a list of non-basic properties that are collected by database // itself. Especially some properties regarding to the internal keys (which // is unknown to `table`). +// +// DEPRECATED: these properties now belong as TableProperties members. Please +// use TableProperties::num_deletions and TableProperties::num_merge_operands, +// respectively. extern uint64_t GetDeletedKeys(const UserCollectedProperties& props); extern uint64_t GetMergeOperands(const UserCollectedProperties& props, bool* property_present); diff --git a/thirdparty/rocksdb/include/rocksdb/thread_status.h b/thirdparty/rocksdb/include/rocksdb/thread_status.h index 55c32ed6d2..b81c1c284e 100644 --- a/thirdparty/rocksdb/include/rocksdb/thread_status.h +++ b/thirdparty/rocksdb/include/rocksdb/thread_status.h @@ -20,8 +20,7 @@ #include #include -#if !defined(ROCKSDB_LITE) && \ - !defined(NROCKSDB_THREAD_STATUS) && \ +#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) && \ defined(ROCKSDB_SUPPORT_THREAD_LOCAL) #define ROCKSDB_USING_THREAD_STATUS #endif @@ -43,8 +42,9 @@ struct ThreadStatus { // The type of a thread.
enum ThreadType : int { HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool - LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool - USER, // User thread (Non-RocksDB BG thread) + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + BOTTOM_PRIORITY, // RocksDB BG thread in bottom-pri thread pool NUM_THREAD_TYPES }; @@ -104,22 +104,20 @@ struct ThreadStatus { NUM_STATE_TYPES }; - ThreadStatus(const uint64_t _id, - const ThreadType _thread_type, - const std::string& _db_name, - const std::string& _cf_name, + ThreadStatus(const uint64_t _id, const ThreadType _thread_type, + const std::string& _db_name, const std::string& _cf_name, const OperationType _operation_type, const uint64_t _op_elapsed_micros, const OperationStage _operation_stage, - const uint64_t _op_props[], - const StateType _state_type) : - thread_id(_id), thread_type(_thread_type), - db_name(_db_name), - cf_name(_cf_name), - operation_type(_operation_type), - op_elapsed_micros(_op_elapsed_micros), - operation_stage(_operation_stage), - state_type(_state_type) { + const uint64_t _op_props[], const StateType _state_type) + : thread_id(_id), + thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + operation_type(_operation_type), + op_elapsed_micros(_op_elapsed_micros), + operation_stage(_operation_stage), + state_type(_state_type) { for (int i = 0; i < kNumOperationProperties; ++i) { op_properties[i] = _op_props[i]; } @@ -163,7 +161,7 @@ struct ThreadStatus { // The followings are a set of utility functions for interpreting // the information of ThreadStatus - static const std::string& GetThreadTypeName(ThreadType thread_type); + static std::string GetThreadTypeName(ThreadType thread_type); // Obtain the name of an operation given its type. static const std::string& GetOperationName(OperationType op_type); @@ -171,23 +169,20 @@ struct ThreadStatus { static const std::string MicrosToString(uint64_t op_elapsed_time); // Obtain a human-readable string describing the specified operation stage. - static const std::string& GetOperationStageName( - OperationStage stage); + static const std::string& GetOperationStageName(OperationStage stage); // Obtain the name of the "i"th operation property of the // specified operation. - static const std::string& GetOperationPropertyName( - OperationType op_type, int i); + static const std::string& GetOperationPropertyName(OperationType op_type, + int i); // Translate the "i"th property of the specified operation given // a property value. - static std::map - InterpretOperationProperties( - OperationType op_type, const uint64_t* op_properties); + static std::map InterpretOperationProperties( + OperationType op_type, const uint64_t* op_properties); // Obtain the name of a state given its type. 
static const std::string& GetStateName(StateType state_type); }; - } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/threadpool.h b/thirdparty/rocksdb/include/rocksdb/threadpool.h index e871ee18c7..2e2f2b44fe 100644 --- a/thirdparty/rocksdb/include/rocksdb/threadpool.h +++ b/thirdparty/rocksdb/include/rocksdb/threadpool.h @@ -47,7 +47,6 @@ class ThreadPool { virtual void SubmitJob(const std::function<void()>&) = 0; // This moves the function in for efficiency virtual void SubmitJob(std::function<void()>&&) = 0; - }; // NewThreadPool() is a function that could be used to create a ThreadPool diff --git a/thirdparty/rocksdb/include/rocksdb/trace_reader_writer.h b/thirdparty/rocksdb/include/rocksdb/trace_reader_writer.h new file mode 100644 index 0000000000..28919a0fad --- /dev/null +++ b/thirdparty/rocksdb/include/rocksdb/trace_reader_writer.h @@ -0,0 +1,48 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/env.h" + +namespace rocksdb { + +// Allow custom implementations of TraceWriter and TraceReader. +// By default, RocksDB provides a way to capture the traces to a file using the +// factory NewFileTraceWriter(). But users could also choose to export traces to +// any other system by providing custom implementations of TraceWriter and +// TraceReader. + +// TraceWriter allows exporting RocksDB traces to any system, one operation at +// a time. +class TraceWriter { + public: + TraceWriter() {} + virtual ~TraceWriter() {} + + virtual Status Write(const Slice& data) = 0; + virtual Status Close() = 0; + virtual uint64_t GetFileSize() = 0; +}; + +// TraceReader allows reading RocksDB traces from any system, one operation at +// a time. A RocksDB Replayer could depend on this to replay operations. +class TraceReader { + public: + TraceReader() {} + virtual ~TraceReader() {} + + virtual Status Read(std::string* data) = 0; + virtual Status Close() = 0; +}; + +// Factory methods to read/write traces from/to a file. +Status NewFileTraceWriter(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceWriter>* trace_writer); +Status NewFileTraceReader(Env* env, const EnvOptions& env_options, + const std::string& trace_filename, + std::unique_ptr<TraceReader>* trace_reader); } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/transaction_log.h b/thirdparty/rocksdb/include/rocksdb/transaction_log.h index 7fc46ae264..80f373b247 100644 --- a/thirdparty/rocksdb/include/rocksdb/transaction_log.h +++ b/thirdparty/rocksdb/include/rocksdb/transaction_log.h @@ -3,21 +3,20 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ -#define STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +#pragma once +#include +#include #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/write_batch.h" -#include -#include namespace rocksdb { class LogFile; typedef std::vector<std::unique_ptr<LogFile>> VectorLogPtr; -enum WalFileType { /* Indicates that WAL file is in archive directory. WAL files are moved from * the main db directory to archive directory once they are not live and stay * there until cleaned up.
Files are cleaned depending on archive size @@ -28,7 +27,7 @@ enum WalFileType { /* Indicates that WAL file is live and resides in the main db directory */ kAliveLogFile = 1 -} ; +}; class LogFile { public: @@ -40,7 +39,6 @@ class LogFile { // For an archived-log-file = /archive/000003.log virtual std::string PathName() const = 0; - // Primary identifier for log file. // This is directly proportional to creation time of the log file virtual uint64_t LogNumber() const = 0; @@ -61,7 +59,7 @@ struct BatchResult { // Add empty __ctor and __dtor for the rule of five // However, preserve the original semantics and prohibit copying - // as the unique_ptr member does not copy. + // as the std::unique_ptr member does not copy. BatchResult() {} ~BatchResult() {} @@ -120,6 +118,4 @@ class TransactionLogIterator { : verify_checksums_(verify_checksums) {} }; }; -} // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_TRANSACTION_LOG_ITERATOR_H_ +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/types.h b/thirdparty/rocksdb/include/rocksdb/types.h index 106ac2f76b..2cd4039bd7 100644 --- a/thirdparty/rocksdb/include/rocksdb/types.h +++ b/thirdparty/rocksdb/include/rocksdb/types.h @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_INCLUDE_TYPES_H_ -#define STORAGE_ROCKSDB_INCLUDE_TYPES_H_ +#pragma once #include +#include "rocksdb/slice.h" namespace rocksdb { @@ -15,6 +15,40 @@ namespace rocksdb { // Represents a sequence number in a WAL file. typedef uint64_t SequenceNumber; -} // namespace rocksdb +const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed + +// User-oriented representation of internal key types. +enum EntryType { + kEntryPut, + kEntryDelete, + kEntrySingleDelete, + kEntryMerge, + kEntryRangeDeletion, + kEntryBlobIndex, + kEntryOther, +}; + +// <user key, sequence number, and entry type> tuple. +struct FullKey { + Slice user_key; + SequenceNumber sequence; + EntryType type; + + FullKey() : sequence(0) {} // Intentionally left uninitialized (for speed) + FullKey(const Slice& u, const SequenceNumber& seq, EntryType t) + : user_key(u), sequence(seq), type(t) {} + std::string DebugString(bool hex = false) const; -#endif // STORAGE_ROCKSDB_INCLUDE_TYPES_H_ + void clear() { + user_key.clear(); + sequence = 0; + type = EntryType::kEntryPut; + } +}; + +// Parse slice representing internal key to FullKey +// Parsed FullKey is valid for as long as the memory pointed to by +// internal_key is alive. +bool ParseFullKey(const Slice& internal_key, FullKey* result); + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/universal_compaction.h b/thirdparty/rocksdb/include/rocksdb/universal_compaction.h index ed2220873c..e219694b3f 100644 --- a/thirdparty/rocksdb/include/rocksdb/universal_compaction.h +++ b/thirdparty/rocksdb/include/rocksdb/universal_compaction.h @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory).
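A short sketch of the new FullKey/ParseFullKey() helpers from types.h above. The internal key would come from elsewhere (e.g. a table properties collector), and the helper name is ours, not the library's.

#include <iostream>
#include "rocksdb/types.h"

// Hypothetical helper: decode an internal key into its parts.
void PrintFullKey(const rocksdb::Slice& internal_key) {
  rocksdb::FullKey parsed;
  if (!rocksdb::ParseFullKey(internal_key, &parsed)) {
    std::cout << "malformed internal key\n";
    return;
  }
  std::cout << "user_key=" << parsed.user_key.ToString(true /* hex */)
            << " seq=" << parsed.sequence
            << " is_put=" << (parsed.type == rocksdb::kEntryPut) << '\n';
}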
diff --git a/thirdparty/rocksdb/include/rocksdb/universal_compaction.h b/thirdparty/rocksdb/include/rocksdb/universal_compaction.h index ed2220873c..e219694b3f 100644 --- a/thirdparty/rocksdb/include/rocksdb/universal_compaction.h +++ b/thirdparty/rocksdb/include/rocksdb/universal_compaction.h @@ -3,8 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H -#define STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H +#pragma once #include <stdint.h> #include <climits> @@ -17,13 +16,12 @@ namespace rocksdb { // into a single compaction run // enum CompactionStopStyle { - kCompactionStopStyleSimilarSize, // pick files of similar size - kCompactionStopStyleTotalSize // total size of picked files > next file + kCompactionStopStyleSimilarSize, // pick files of similar size + kCompactionStopStyleTotalSize // total size of picked files > next file }; class CompactionOptionsUniversal { public: - // Percentage flexibility while comparing file size. If the candidate file(s) // size is 1% smaller than the next file's size, then include next file into // this candidate set. // Default: 1 @@ -86,5 +84,3 @@ class CompactionOptionsUniversal { }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_UNIVERSAL_COMPACTION_OPTIONS_H diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/backupable_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/backupable_db.h index fc2b6ba43f..7817c56496 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/backupable_db.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/backupable_db.h @@ -15,10 +15,10 @@ #endif #include <inttypes.h> -#include <string> +#include <functional> #include <map> +#include <string> #include <vector> -#include <functional> #include "rocksdb/utilities/stackable_db.h" @@ -109,8 +109,14 @@ struct BackupableDBOptions { uint64_t callback_trigger_interval_size; // When Open() is called, it will open at most this many of the latest - // non-corrupted backups. If 0, it will open all available backups. - // Default: 0 + // non-corrupted backups. + // + // Note setting this to a non-default value prevents old files from being + // deleted in the shared directory, as we can't do proper ref-counting. If + // using this option, make sure to occasionally disable it (by resetting to + // INT_MAX) and run GarbageCollect to clean accumulated stale files. + // + // Default: INT_MAX int max_valid_backups_to_open; void Dump(Logger* logger) const; @@ -122,7 +128,7 @@ struct BackupableDBOptions { bool _backup_log_files = true, uint64_t _backup_rate_limit = 0, uint64_t _restore_rate_limit = 0, int _max_background_operations = 1, uint64_t _callback_trigger_interval_size = 4 * 1024 * 1024, - int _max_valid_backups_to_open = 0) + int _max_valid_backups_to_open = INT_MAX) : backup_dir(_backup_dir), backup_env(_backup_env), share_table_files(_share_table_files), @@ -251,12 +257,13 @@ class BackupEngine { // BackupableDBOptions have to be the same as the ones used in previous // BackupEngines for the same backup directory. - static Status Open(Env* db_env, - const BackupableDBOptions& options, + static Status Open(Env* db_env, const BackupableDBOptions& options, BackupEngine** backup_engine_ptr); // same as CreateNewBackup, but stores extra application metadata // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable. virtual Status CreateNewBackupWithMetadata( DB* db, const std::string& app_metadata, bool flush_before_backup = false, std::function<void()> progress_callback = []() {}) = 0; @@ -264,6 +271,8 @@ class BackupEngine { // Captures the state of the database in the latest backup // NOT a thread safe call // Flush will always trigger if 2PC is enabled. + // If write-ahead logs are disabled, set flush_before_backup=true to + // avoid losing unflushed key/value pairs from the memtable.
virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false, std::function<void()> progress_callback = []() {}) {
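Reflecting the new flush_before_backup guidance above, backing up a database that runs with the WAL disabled might look like the following sketch (function name and backup path are illustrative):

#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/utilities/backupable_db.h"

// Hypothetical snippet: back up a DB whose write-ahead log is disabled.
void BackupWithoutWal(rocksdb::DB* db) {
  rocksdb::BackupEngine* backup_engine = nullptr;
  rocksdb::Status s = rocksdb::BackupEngine::Open(
      rocksdb::Env::Default(),
      rocksdb::BackupableDBOptions("/tmp/minifi_backup"),  // path is illustrative
      &backup_engine);
  assert(s.ok());
  // With the WAL disabled, flush first so that unflushed memtable
  // entries are captured by the backup.
  s = backup_engine->CreateNewBackup(db, /*flush_before_backup=*/true);
  assert(s.ok());
  delete backup_engine;
}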
diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/date_tiered_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/date_tiered_db.h deleted file mode 100644 index f259b05a8a..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/date_tiered_db.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include -#include - -#include "rocksdb/db.h" - -namespace rocksdb { - -// Date tiered database is a wrapper of DB that implements -// a simplified DateTieredCompactionStrategy by using multiple column famillies -// as time windows. -// -// DateTieredDB provides an interface similar to DB, but it assumes that user -// provides keys with last 8 bytes encoded as timestamp in seconds. DateTieredDB -// is assigned with a TTL to declare when data should be deleted. -// -// DateTieredDB hides column families layer from standard RocksDB instance. It -// uses multiple column families to manage time series data, each containing a -// specific range of time. Column families are named by its maximum possible -// timestamp. A column family is created automatically when data newer than -// latest timestamp of all existing column families. The time range of a column -// family is configurable by `column_family_interval`. By doing this, we -// guarantee that compaction will only happen in a column family. -// -// DateTieredDB is assigned with a TTL. When all data in a column family are -// expired (CF_Timestamp <= CUR_Timestamp - TTL), we directly drop the whole -// column family. -// -// TODO(jhli): This is only a simplified version of DTCS. In a complete DTCS, -// time windows can be merged over time, so that older time windows will have -// larger time range. Also, compaction are executed only for adjacent SST files -// to guarantee there is no time overlap between SST files. - -class DateTieredDB { - public: - // Open a DateTieredDB whose name is `dbname`. - // Similar to DB::Open(), created database object is stored in dbptr. - // - // Two parameters can be configured: `ttl` to specify the length of time that - // keys should exist in the database, and `column_family_interval` to specify - // the time range of a column family interval. - // - // Open a read only database if read only is set as true. - // TODO(jhli): Should use an option object that includes ttl and - // column_family_interval. - static Status Open(const Options& options, const std::string& dbname, - DateTieredDB** dbptr, int64_t ttl, - int64_t column_family_interval, bool read_only = false); - - explicit DateTieredDB() {} - - virtual ~DateTieredDB() {} - - // Wrapper for Put method. Similar to DB::Put(), but column family to be - // inserted is decided by the timestamp in keys, i.e. the last 8 bytes of user - // key. If key is already obsolete, it will not be inserted. - // - // When client put a key value pair in DateTieredDB, it assumes last 8 bytes - // of keys are encoded as timestamp. Timestamp is a 64-bit signed integer - // encoded as the number of seconds since 1970-01-01 00:00:00 (UTC) (Same as - // Env::GetCurrentTime()). Timestamp should be encoded in big endian. - virtual Status Put(const WriteOptions& options, const Slice& key, - const Slice& val) = 0; - - // Wrapper for Get method. Similar to DB::Get() but column family is decided - // by timestamp in keys. If key is already obsolete, it will not be found. - virtual Status Get(const ReadOptions& options, const Slice& key, - std::string* value) = 0; - - // Wrapper for Delete method. Similar to DB::Delete() but column family is - // decided by timestamp in keys. If key is already obsolete, return NotFound - // status. - virtual Status Delete(const WriteOptions& options, const Slice& key) = 0; - - // Wrapper for KeyMayExist method. Similar to DB::KeyMayExist() but column - // family is decided by timestamp in keys. Return false when key is already - // obsolete. - virtual bool KeyMayExist(const ReadOptions& options, const Slice& key, - std::string* value, bool* value_found = nullptr) = 0; - - // Wrapper for Merge method. Similar to DB::Merge() but column family is - // decided by timestamp in keys. - virtual Status Merge(const WriteOptions& options, const Slice& key, - const Slice& value) = 0; - - // Create an iterator that hides low level details. This iterator internally - // merge results from all active time series column families. Note that - // column families are not deleted until all data are obsolete, so this - // iterator can possibly access obsolete key value pairs. - virtual Iterator* NewIterator(const ReadOptions& opts) = 0; - - // Explicitly drop column families in which all keys are obsolete. This - // process is also inplicitly done in Put() operation. - virtual Status DropObsoleteColumnFamilies() = 0; - - static const uint64_t kTSLength = sizeof(int64_t); // size of timestamp -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/db_ttl.h b/thirdparty/rocksdb/include/rocksdb/utilities/db_ttl.h index 7c9c0cc55a..227796cbe2 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/db_ttl.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/db_ttl.h @@ -9,8 +9,8 @@ #include <string> #include <vector> -#include "rocksdb/utilities/stackable_db.h" #include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" namespace rocksdb { @@ -60,6 +60,10 @@ class DBWithTTL : public StackableDB { DBWithTTL** dbptr, std::vector<int32_t> ttls, bool read_only = false); + virtual void SetTtl(int32_t ttl) = 0; + + virtual void SetTtl(ColumnFamilyHandle* h, int32_t ttl) = 0; + protected: explicit DBWithTTL(DB* db) : StackableDB(db) {} }; diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/debug.h b/thirdparty/rocksdb/include/rocksdb/utilities/debug.h index bc5b9bf03d..50645423d0 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/debug.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/debug.h @@ -31,9 +31,13 @@ struct KeyVersion { }; // Returns listing of all versions of keys in the provided user key range. -// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`]. +// The range is inclusive-inclusive, i.e., [`begin_key`, `end_key`], or +// until `max_num_ikeys` keys have been returned. Since all returned keys +// are copied to memory, the memory usage may be huge if the range covers +// too many keys; `max_num_ikeys` can be used to cap the memory usage. // The result is inserted into the provided vector, `key_versions`.
Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, + size_t max_num_ikeys, std::vector* key_versions); } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/document_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/document_db.h deleted file mode 100644 index 3668a50b9d..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/document_db.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/utilities/stackable_db.h" -#include "rocksdb/utilities/json_document.h" -#include "rocksdb/db.h" - -namespace rocksdb { - -// IMPORTANT: DocumentDB is a work in progress. It is unstable and we might -// change the API without warning. Talk to RocksDB team before using this in -// production ;) - -// DocumentDB is a layer on top of RocksDB that provides a very simple JSON API. -// When creating a DB, you specify a list of indexes you want to keep on your -// data. You can insert a JSON document to the DB, which is automatically -// indexed. Every document added to the DB needs to have "_id" field which is -// automatically indexed and is an unique primary key. All other indexes are -// non-unique. - -// NOTE: field names in the JSON are NOT allowed to start with '$' or -// contain '.'. We don't currently enforce that rule, but will start behaving -// badly. - -// Cursor is what you get as a result of executing query. To get all -// results from a query, call Next() on a Cursor while Valid() returns true -class Cursor { - public: - Cursor() = default; - virtual ~Cursor() {} - - virtual bool Valid() const = 0; - virtual void Next() = 0; - // Lifecycle of the returned JSONDocument is until the next Next() call - virtual const JSONDocument& document() const = 0; - virtual Status status() const = 0; - - private: - // No copying allowed - Cursor(const Cursor&); - void operator=(const Cursor&); -}; - -struct DocumentDBOptions { - int background_threads = 4; - uint64_t memtable_size = 128 * 1024 * 1024; // 128 MB - uint64_t cache_size = 1 * 1024 * 1024 * 1024; // 1 GB -}; - -// TODO(icanadi) Add `JSONDocument* info` parameter to all calls that can be -// used by the caller to get more information about the call execution (number -// of dropped records, number of updated records, etc.) -class DocumentDB : public StackableDB { - public: - struct IndexDescriptor { - // Currently, you can only define an index on a single field. To specify an - // index on a field X, set index description to JSON "{X: 1}" - // Currently the value needs to be 1, which means ascending. - // In the future, we plan to also support indexes on multiple keys, where - // you could mix ascending sorting (1) with descending sorting indexes (-1) - JSONDocument* description; - std::string name; - }; - - // Open DocumentDB with specified indexes. The list of indexes has to be - // complete, i.e. include all indexes present in the DB, except the primary - // key index. - // Otherwise, Open() will return an error - static Status Open(const DocumentDBOptions& options, const std::string& name, - const std::vector& indexes, - DocumentDB** db, bool read_only = false); - - explicit DocumentDB(DB* db) : StackableDB(db) {} - - // Create a new index. 
It will stop all writes for the duration of the call. - // All current documents in the DB are scanned and corresponding index entries - // are created - virtual Status CreateIndex(const WriteOptions& write_options, - const IndexDescriptor& index) = 0; - - // Drop an index. Client is responsible to make sure that index is not being - // used by currently executing queries - virtual Status DropIndex(const std::string& name) = 0; - - // Insert a document to the DB. The document needs to have a primary key "_id" - // which can either be a string or an integer. Otherwise the write will fail - // with InvalidArgument. - virtual Status Insert(const WriteOptions& options, - const JSONDocument& document) = 0; - - // Deletes all documents matching a filter atomically - virtual Status Remove(const ReadOptions& read_options, - const WriteOptions& write_options, - const JSONDocument& query) = 0; - - // Does this sequence of operations: - // 1. Find all documents matching a filter - // 2. For all documents, atomically: - // 2.1. apply the update operators - // 2.2. update the secondary indexes - // - // Currently only $set update operator is supported. - // Syntax is: {$set: {key1: value1, key2: value2, etc...}} - // This operator will change a document's key1 field to value1, key2 to - // value2, etc. New values will be set even if a document didn't have an entry - // for the specified key. - // - // You can not change a primary key of a document. - // - // Update example: Update({id: {$gt: 5}, $index: id}, {$set: {enabled: true}}) - virtual Status Update(const ReadOptions& read_options, - const WriteOptions& write_options, - const JSONDocument& filter, - const JSONDocument& updates) = 0; - - // query has to be an array in which every element is an operator. Currently - // only $filter operator is supported. Syntax of $filter operator is: - // {$filter: {key1: condition1, key2: condition2, etc.}} where conditions can - // be either: - // 1) a single value in which case the condition is equality condition, or - // 2) a defined operators, like {$gt: 4}, which will match all documents that - // have key greater than 4. - // - // Supported operators are: - // 1) $gt -- greater than - // 2) $gte -- greater than or equal - // 3) $lt -- less than - // 4) $lte -- less than or equal - // If you want the filter to use an index, you need to specify it like this: - // {$filter: {...(conditions)..., $index: index_name}} - // - // Example query: - // * [{$filter: {name: John, age: {$gte: 18}, $index: age}}] - // will return all Johns whose age is greater or equal to 18 and it will use - // index "age" to satisfy the query. - virtual Cursor* Query(const ReadOptions& read_options, - const JSONDocument& query) = 0; -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/env_librados.h b/thirdparty/rocksdb/include/rocksdb/utilities/env_librados.h index 272365f0c6..7be75878d9 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/env_librados.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/env_librados.h @@ -2,8 +2,8 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_UTILITIES_ENV_LIBRADOS_H -#define ROCKSDB_UTILITIES_ENV_LIBRADOS_H + +#pragma once #include #include @@ -172,5 +172,4 @@ class EnvLibrados : public EnvWrapper { librados::IoCtx* _GetIoctx(const std::string& prefix); friend class LibradosWritableFile; }; -} -#endif +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/env_mirror.h b/thirdparty/rocksdb/include/rocksdb/utilities/env_mirror.h index ffd175ae5e..6d513fc791 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/env_mirror.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/env_mirror.h @@ -19,8 +19,8 @@ #ifndef ROCKSDB_LITE -#include #include +#include #include #include "rocksdb/env.h" @@ -31,37 +31,32 @@ class RandomAccessFileMirror; class WritableFileMirror; class EnvMirror : public EnvWrapper { - Env* a_, *b_; + Env *a_, *b_; bool free_a_, free_b_; public: - EnvMirror(Env* a, Env* b, bool free_a=false, bool free_b=false) - : EnvWrapper(a), - a_(a), - b_(b), - free_a_(free_a), - free_b_(free_b) {} + EnvMirror(Env* a, Env* b, bool free_a = false, bool free_b = false) + : EnvWrapper(a), a_(a), b_(b), free_a_(free_a), free_b_(free_b) {} ~EnvMirror() { - if (free_a_) - delete a_; - if (free_b_) - delete b_; + if (free_a_) delete a_; + if (free_b_) delete b_; } - Status NewSequentialFile(const std::string& f, unique_ptr* r, + Status NewSequentialFile(const std::string& f, + std::unique_ptr* r, const EnvOptions& options) override; Status NewRandomAccessFile(const std::string& f, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override; - Status NewWritableFile(const std::string& f, unique_ptr* r, + Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override; Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, - unique_ptr* r, + std::unique_ptr* r, const EnvOptions& options) override; virtual Status NewDirectory(const std::string& name, - unique_ptr* result) override { - unique_ptr br; + std::unique_ptr* result) override { + std::unique_ptr br; Status as = a_->NewDirectory(name, result); Status bs = b_->NewDirectory(name, &br); assert(as == bs); @@ -73,6 +68,11 @@ class EnvMirror : public EnvWrapper { assert(as == bs); return as; } +#if defined(_MSC_VER) +#pragma warning(push) +// logical operation on address of string constant +#pragma warning(disable : 4130) +#endif Status GetChildren(const std::string& dir, std::vector* r) override { std::vector ar, br; @@ -87,6 +87,9 @@ class EnvMirror : public EnvWrapper { *r = ar; return as; } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif Status DeleteFile(const std::string& f) override { Status as = a_->DeleteFile(f); Status bs = b_->DeleteFile(f); @@ -148,12 +151,12 @@ class EnvMirror : public EnvWrapper { class FileLockMirror : public FileLock { public: - FileLock* a_, *b_; + FileLock *a_, *b_; FileLockMirror(FileLock* a, FileLock* b) : a_(a), b_(b) {} }; Status LockFile(const std::string& f, FileLock** l) override { - FileLock* al, *bl; + FileLock *al, *bl; Status as = a_->LockFile(f, &al); Status bs = b_->LockFile(f, &bl); assert(as == bs); diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/geo_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/geo_db.h deleted file mode 100644 index 408774c599..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/geo_db.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
-// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// - -#ifndef ROCKSDB_LITE -#pragma once -#include -#include - -#include "rocksdb/utilities/stackable_db.h" -#include "rocksdb/status.h" - -namespace rocksdb { - -// -// Configurable options needed for setting up a Geo database -// -struct GeoDBOptions { - // Backup info and error messages will be written to info_log - // if non-nullptr. - // Default: nullptr - Logger* info_log; - - explicit GeoDBOptions(Logger* _info_log = nullptr):info_log(_info_log) { } -}; - -// -// A position in the earth's geoid -// -class GeoPosition { - public: - double latitude; - double longitude; - - explicit GeoPosition(double la = 0, double lo = 0) : - latitude(la), longitude(lo) { - } -}; - -// -// Description of an object on the Geoid. It is located by a GPS location, -// and is identified by the id. The value associated with this object is -// an opaque string 'value'. Different objects identified by unique id's -// can have the same gps-location associated with them. -// -class GeoObject { - public: - GeoPosition position; - std::string id; - std::string value; - - GeoObject() {} - - GeoObject(const GeoPosition& pos, const std::string& i, - const std::string& val) : - position(pos), id(i), value(val) { - } -}; - -class GeoIterator { - public: - GeoIterator() = default; - virtual ~GeoIterator() {} - virtual void Next() = 0; - virtual bool Valid() const = 0; - virtual const GeoObject& geo_object() = 0; - virtual Status status() const = 0; -}; - -// -// Stack your DB with GeoDB to be able to get geo-spatial support -// -class GeoDB : public StackableDB { - public: - // GeoDBOptions have to be the same as the ones used in a previous - // incarnation of the DB - // - // GeoDB owns the pointer `DB* db` now. You should not delete it or - // use it after the invocation of GeoDB - // GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} - GeoDB(DB* db, const GeoDBOptions& options) : StackableDB(db) {} - virtual ~GeoDB() {} - - // Insert a new object into the location database. The object is - // uniquely identified by the id. If an object with the same id already - // exists in the db, then the old one is overwritten by the new - // object being inserted here. - virtual Status Insert(const GeoObject& object) = 0; - - // Retrieve the value of the object located at the specified GPS - // location and is identified by the 'id'. - virtual Status GetByPosition(const GeoPosition& pos, - const Slice& id, std::string* value) = 0; - - // Retrieve the value of the object identified by the 'id'. This method - // could be potentially slower than GetByPosition - virtual Status GetById(const Slice& id, GeoObject* object) = 0; - - // Delete the specified object - virtual Status Remove(const Slice& id) = 0; - - // Returns an iterator for the items within a circular radius from the - // specified gps location. If 'number_of_values' is specified, - // then the iterator is capped to that number of objects. - // The radius is specified in 'meters'. 
- virtual GeoIterator* SearchRadial(const GeoPosition& pos, - double radius, - int number_of_values = INT_MAX) = 0; -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/json_document.h b/thirdparty/rocksdb/include/rocksdb/utilities/json_document.h deleted file mode 100644 index 5d841f9515..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/json_document.h +++ /dev/null @@ -1,195 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include -#include -#include -#include -#include -#include - -#include "rocksdb/slice.h" - -// We use JSONDocument for DocumentDB API -// Implementation inspired by folly::dynamic, rapidjson and fbson - -namespace fbson { - class FbsonValue; - class ObjectVal; - template - class FbsonWriterT; - class FbsonOutStream; - typedef FbsonWriterT FbsonWriter; -} // namespace fbson - -namespace rocksdb { - -// NOTE: none of this is thread-safe -class JSONDocument { - public: - // return nullptr on parse failure - static JSONDocument* ParseJSON(const char* json); - - enum Type { - kNull, - kArray, - kBool, - kDouble, - kInt64, - kObject, - kString, - }; - - /* implicit */ JSONDocument(); // null - /* implicit */ JSONDocument(bool b); - /* implicit */ JSONDocument(double d); - /* implicit */ JSONDocument(int8_t i); - /* implicit */ JSONDocument(int16_t i); - /* implicit */ JSONDocument(int32_t i); - /* implicit */ JSONDocument(int64_t i); - /* implicit */ JSONDocument(const std::string& s); - /* implicit */ JSONDocument(const char* s); - // constructs JSONDocument of specific type with default value - explicit JSONDocument(Type _type); - - JSONDocument(const JSONDocument& json_document); - - JSONDocument(JSONDocument&& json_document); - - Type type() const; - - // REQUIRES: IsObject() - bool Contains(const std::string& key) const; - // REQUIRES: IsObject() - // Returns non-owner object - JSONDocument operator[](const std::string& key) const; - - // REQUIRES: IsArray() == true || IsObject() == true - size_t Count() const; - - // REQUIRES: IsArray() - // Returns non-owner object - JSONDocument operator[](size_t i) const; - - JSONDocument& operator=(JSONDocument jsonDocument); - - bool IsNull() const; - bool IsArray() const; - bool IsBool() const; - bool IsDouble() const; - bool IsInt64() const; - bool IsObject() const; - bool IsString() const; - - // REQUIRES: IsBool() == true - bool GetBool() const; - // REQUIRES: IsDouble() == true - double GetDouble() const; - // REQUIRES: IsInt64() == true - int64_t GetInt64() const; - // REQUIRES: IsString() == true - std::string GetString() const; - - bool operator==(const JSONDocument& rhs) const; - - bool operator!=(const JSONDocument& rhs) const; - - JSONDocument Copy() const; - - bool IsOwner() const; - - std::string DebugString() const; - - private: - class ItemsIteratorGenerator; - - public: - // REQUIRES: IsObject() - ItemsIteratorGenerator Items() const; - - // appends serialized object to dst - void Serialize(std::string* dst) const; - // returns nullptr if Slice doesn't represent valid serialized JSONDocument - static JSONDocument* Deserialize(const Slice& src); - - private: - friend class JSONDocumentBuilder; - - JSONDocument(fbson::FbsonValue* val, bool makeCopy); - - void InitFromValue(const 
fbson::FbsonValue* val); - - // iteration on objects - class const_item_iterator { - private: - class Impl; - public: - typedef std::pair value_type; - explicit const_item_iterator(Impl* impl); - const_item_iterator(const_item_iterator&&); - const_item_iterator& operator++(); - bool operator!=(const const_item_iterator& other); - value_type operator*(); - ~const_item_iterator(); - private: - friend class ItemsIteratorGenerator; - std::unique_ptr it_; - }; - - class ItemsIteratorGenerator { - public: - explicit ItemsIteratorGenerator(const fbson::ObjectVal& object); - const_item_iterator begin() const; - - const_item_iterator end() const; - - private: - const fbson::ObjectVal& object_; - }; - - std::unique_ptr data_; - mutable fbson::FbsonValue* value_; - - // Our serialization format's first byte specifies the encoding version. That - // way, we can easily change our format while providing backwards - // compatibility. This constant specifies the current version of the - // serialization format - static const char kSerializationFormatVersion; -}; - -class JSONDocumentBuilder { - public: - JSONDocumentBuilder(); - - explicit JSONDocumentBuilder(fbson::FbsonOutStream* out); - - void Reset(); - - bool WriteStartArray(); - - bool WriteEndArray(); - - bool WriteStartObject(); - - bool WriteEndObject(); - - bool WriteKeyValue(const std::string& key, const JSONDocument& value); - - bool WriteJSONDocument(const JSONDocument& value); - - JSONDocument GetJSONDocument(); - - ~JSONDocumentBuilder(); - - private: - std::unique_ptr writer_; -}; - -} // namespace rocksdb - -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd.h b/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd.h index b9eb1035fb..57ab88a34e 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd.h @@ -96,6 +96,12 @@ class LDBCommand { ldb_options_ = ldb_options; } + const std::map<std::string, std::string>& TEST_GetOptionMap() { + return option_map_; + } + + const std::vector<std::string>& TEST_GetFlags() { return flags_; } + virtual bool NoDBOpen() { return false; } virtual ~LDBCommand() { CloseDB(); } @@ -210,6 +216,15 @@ class LDBCommand { bool ParseStringOption(const std::map<std::string, std::string>& options, const std::string& option, std::string* value); + /** + * Returns the value of the specified option as a boolean. + * default_val is used if the option is not found in options. + * Throws an exception if the value of the option is not + * "true" or "false" (case insensitive). + */ + bool ParseBooleanOption(const std::map<std::string, std::string>& options, + const std::string& option, bool default_val); + Options options_; std::vector<ColumnFamilyDescriptor> column_families_; LDBOptions ldb_options_; @@ -229,15 +244,6 @@ class LDBCommand { bool IsValueHex(const std::map<std::string, std::string>& options, const std::vector<std::string>& flags); - /** - * Returns the value of the specified option as a boolean. - * default_val is used if the option is not found in options. - * Throws an exception if the value of the option is not - * "true" or "false" (case insensitive). - */ - bool ParseBooleanOption(const std::map<std::string, std::string>& options, - const std::string& option, bool default_val); - /** * Converts val to a boolean. * val must be either true or false (case insensitive).
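The documented contract of ParseBooleanOption above (default when absent, exception on anything but "true"/"false", case insensitive) can be illustrated with a self-contained sketch; ParseBooleanOptionSketch is a hypothetical stand-in, not the LDBCommand implementation:

#include <algorithm>
#include <cctype>
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical re-implementation of the documented contract.
bool ParseBooleanOptionSketch(
    const std::map<std::string, std::string>& options,
    const std::string& option, bool default_val) {
  auto it = options.find(option);
  if (it == options.end()) return default_val;  // option absent
  std::string v = it->second;
  std::transform(v.begin(), v.end(), v.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  if (v == "true") return true;
  if (v == "false") return false;
  throw std::runtime_error(option + " must be 'true' or 'false'");
}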
diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h b/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h index 5ddc6feb69..85c219542d 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/ldb_cmd_execute_result.h @@ -12,26 +12,28 @@ namespace rocksdb { class LDBCommandExecuteResult { -public: + public: enum State { - EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, + EXEC_NOT_STARTED = 0, + EXEC_SUCCEED = 1, + EXEC_FAILED = 2, }; LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), message_("") {} - LDBCommandExecuteResult(State state, std::string& msg) : - state_(state), message_(msg) {} + LDBCommandExecuteResult(State state, std::string& msg) + : state_(state), message_(msg) {} std::string ToString() { std::string ret; switch (state_) { - case EXEC_SUCCEED: - break; - case EXEC_FAILED: - ret.append("Failed: "); - break; - case EXEC_NOT_STARTED: - ret.append("Not started: "); + case EXEC_SUCCEED: + break; + case EXEC_FAILED: + ret.append("Failed: "); + break; + case EXEC_NOT_STARTED: + ret.append("Not started: "); } if (!message_.empty()) { ret.append(message_); @@ -44,17 +46,11 @@ class LDBCommandExecuteResult { message_ = ""; } - bool IsSucceed() { - return state_ == EXEC_SUCCEED; - } + bool IsSucceed() { return state_ == EXEC_SUCCEED; } - bool IsNotStarted() { - return state_ == EXEC_NOT_STARTED; - } + bool IsNotStarted() { return state_ == EXEC_NOT_STARTED; } - bool IsFailed() { - return state_ == EXEC_FAILED; - } + bool IsFailed() { return state_ == EXEC_FAILED; } static LDBCommandExecuteResult Succeed(std::string msg) { return LDBCommandExecuteResult(EXEC_SUCCEED, msg); @@ -64,7 +60,7 @@ class LDBCommandExecuteResult { return LDBCommandExecuteResult(EXEC_FAILED, msg); } -private: + private: State state_; std::string message_; @@ -72,4 +68,4 @@ class LDBCommandExecuteResult { bool operator!=(const LDBCommandExecuteResult&); }; -} +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_compaction_filter.h b/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_compaction_filter.h deleted file mode 100644 index a7af592d8c..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_compaction_filter.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2016, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#if defined(LUA) && !defined(ROCKSDB_LITE) -// lua headers -extern "C" { -#include -#include -#include -} - -#include -#include -#include - -#include "rocksdb/compaction_filter.h" -#include "rocksdb/env.h" -#include "rocksdb/slice.h" -#include "rocksdb/utilities/lua/rocks_lua_custom_library.h" -#include "rocksdb/utilities/lua/rocks_lua_util.h" - -namespace rocksdb { -namespace lua { - -struct RocksLuaCompactionFilterOptions { - // The lua script in string that implements all necessary CompactionFilter - // virtual functions. The specified lua_script must implement the following - // functions, which are Name and Filter, as described below. - // - // 0. The Name function simply returns a string representing the name of - // the lua script. If there's any erorr in the Name function, an - // empty string will be used. 
- // --- Example - // function Name() - // return "DefaultLuaCompactionFilter" - // end - // - // - // 1. The script must contains a function called Filter, which implements - // CompactionFilter::Filter() , takes three input arguments, and returns - // three values as the following API: - // - // function Filter(level, key, existing_value) - // ... - // return is_filtered, is_changed, new_value - // end - // - // Note that if ignore_value is set to true, then Filter should implement - // the following API: - // - // function Filter(level, key) - // ... - // return is_filtered - // end - // - // If there're any error in the Filter() function, then it will keep - // the input key / value pair. - // - // -- Input - // The function must take three arguments (integer, string, string), - // which map to "level", "key", and "existing_value" passed from - // RocksDB. - // - // -- Output - // The function must return three values (boolean, boolean, string). - // - is_filtered: if the first return value is true, then it indicates - // the input key / value pair should be filtered. - // - is_changed: if the second return value is true, then it indicates - // the existing_value needs to be changed, and the resulting value - // is stored in the third return value. - // - new_value: if the second return value is true, then this third - // return value stores the new value of the input key / value pair. - // - // -- Examples - // -- a filter that keeps all key-value pairs - // function Filter(level, key, existing_value) - // return false, false, "" - // end - // - // -- a filter that keeps all keys and change their values to "Rocks" - // function Filter(level, key, existing_value) - // return false, true, "Rocks" - // end - - std::string lua_script; - - // If set to true, then existing_value will not be passed to the Filter - // function, and the Filter function only needs to return a single boolean - // flag indicating whether to filter out this key or not. - // - // function Filter(level, key) - // ... - // return is_filtered - // end - bool ignore_value = false; - - // A boolean flag to determine whether to ignore snapshots. - bool ignore_snapshots = false; - - // When specified a non-null pointer, the first "error_limit_per_filter" - // errors of each CompactionFilter that is lua related will be included - // in this log. - std::shared_ptr error_log; - - // The number of errors per CompactionFilter will be printed - // to error_log. - int error_limit_per_filter = 1; - - // A string to luaL_reg array map that allows the Lua CompactionFilter - // to use custom C library. The string will be used as the library - // name in Lua. - std::vector> libraries; - - /////////////////////////////////////////////////////////////////////////// - // NOT YET SUPPORTED - // The name of the Lua function in "lua_script" that implements - // CompactionFilter::FilterMergeOperand(). The function must take - // three input arguments (integer, string, string), which map to "level", - // "key", and "operand" passed from the RocksDB. In addition, the - // function must return a single boolean value, indicating whether - // to filter the input key / operand. - // - // DEFAULT: the default implementation always returns false. 
- // @see CompactionFilter::FilterMergeOperand -}; - -class RocksLuaCompactionFilterFactory : public CompactionFilterFactory { - public: - explicit RocksLuaCompactionFilterFactory( - const RocksLuaCompactionFilterOptions opt); - - virtual ~RocksLuaCompactionFilterFactory() {} - - std::unique_ptr CreateCompactionFilter( - const CompactionFilter::Context& context) override; - - // Change the Lua script so that the next compaction after this - // function call will use the new Lua script. - void SetScript(const std::string& new_script); - - // Obtain the current Lua script - std::string GetScript(); - - const char* Name() const override; - - private: - RocksLuaCompactionFilterOptions opt_; - std::string name_; - // A lock to protect "opt_" to make it dynamically changeable. - std::mutex opt_mutex_; -}; - -// A wrapper class that invokes Lua script to perform CompactionFilter -// functions. -class RocksLuaCompactionFilter : public rocksdb::CompactionFilter { - public: - explicit RocksLuaCompactionFilter(const RocksLuaCompactionFilterOptions& opt) - : options_(opt), - lua_state_wrapper_(opt.lua_script, opt.libraries), - error_count_(0), - name_("") {} - - virtual bool Filter(int level, const Slice& key, const Slice& existing_value, - std::string* new_value, - bool* value_changed) const override; - // Not yet supported - virtual bool FilterMergeOperand(int level, const Slice& key, - const Slice& operand) const override { - return false; - } - virtual bool IgnoreSnapshots() const override; - virtual const char* Name() const override; - - protected: - void LogLuaError(const char* format, ...) const; - - RocksLuaCompactionFilterOptions options_; - LuaStateWrapper lua_state_wrapper_; - mutable int error_count_; - mutable std::string name_; -}; - -} // namespace lua -} // namespace rocksdb -#endif // defined(LUA) && !defined(ROCKSDB_LITE) diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h b/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h index 3ca8b32f3e..471ae12a6d 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/lua/rocks_lua_custom_library.h @@ -36,7 +36,7 @@ class RocksLuaCustomLibrary { // and pushed on the top of the lua_State. This custom setup function // allows developers to put additional table or constant values inside // the same table / namespace. - virtual void CustomSetup(lua_State* L) const {} + virtual void CustomSetup(lua_State* /*L*/) const {} }; } // namespace lua } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/object_registry.h b/thirdparty/rocksdb/include/rocksdb/utilities/object_registry.h index b046ba7c1f..86a51b92ea 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/object_registry.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/object_registry.h @@ -27,8 +27,8 @@ namespace rocksdb { template T* NewCustomObject(const std::string& target, std::unique_ptr* res_guard); -// Returns a new T when called with a string. Populates the unique_ptr argument -// if granting ownership to caller. +// Returns a new T when called with a string. Populates the std::unique_ptr +// argument if granting ownership to caller. 
template <typename T> using FactoryFunc = std::function<T*(const std::string&, std::unique_ptr<T>*)>; diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h index 02917ff583..28b24083e2 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/optimistic_transaction_db.h @@ -11,6 +11,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" +#include "rocksdb/utilities/stackable_db.h" namespace rocksdb { @@ -30,7 +31,7 @@ struct OptimisticTransactionOptions { const Comparator* cmp = BytewiseComparator(); }; -class OptimisticTransactionDB { +class OptimisticTransactionDB : public StackableDB { public: // Open an OptimisticTransactionDB similar to DB::Open(). static Status Open(const Options& options, const std::string& dbname, @@ -57,18 +58,12 @@ class OptimisticTransactionDB { OptimisticTransactionOptions(), Transaction* old_txn = nullptr) = 0; - // Return the underlying Database that was opened - virtual DB* GetBaseDB() = 0; + OptimisticTransactionDB(const OptimisticTransactionDB&) = delete; + void operator=(const OptimisticTransactionDB&) = delete; protected: // To Create an OptimisticTransactionDB, call Open() - explicit OptimisticTransactionDB(DB* db) {} - OptimisticTransactionDB() {} - - private: - // No copying allowed - OptimisticTransactionDB(const OptimisticTransactionDB&); - void operator=(const OptimisticTransactionDB&); + explicit OptimisticTransactionDB(DB* db) : StackableDB(db) {} }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/options_util.h b/thirdparty/rocksdb/include/rocksdb/utilities/options_util.h index d02c574104..d97b394eac 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/options_util.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/options_util.h @@ -33,6 +33,12 @@ namespace rocksdb { // * merge_operator // * compaction_filter // +// Users can also choose to load a customized comparator and/or merge_operator +// through the object registry: +// * comparator needs to be registered through Registrar<const Comparator> +// * merge operator needs to be registered through +// Registrar<std::shared_ptr<MergeOperator>>. +// // For table_factory, this function further supports deserializing // BlockBasedTableFactory and its BlockBasedTableOptions except the // pointer options of BlockBasedTableOptions (flush_block_policy_factory, @@ -58,7 +64,8 @@ namespace rocksdb { Status LoadLatestOptions(const std::string& dbpath, Env* env, DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* cf_descs, - bool ignore_unknown_options = false); + bool ignore_unknown_options = false, + std::shared_ptr<Cache>* cache = {}); // Similar to LoadLatestOptions, this function constructs the DBOptions // and ColumnFamilyDescriptors based on the specified RocksDB Options file. @@ -67,7 +74,8 @@ Status LoadLatestOptions(const std::string& dbpath, Env* env, Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* cf_descs, - bool ignore_unknown_options = false); + bool ignore_unknown_options = false, + std::shared_ptr<Cache>* cache = {}); // Returns the latest options file name under the specified db path.
Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/sim_cache.h b/thirdparty/rocksdb/include/rocksdb/utilities/sim_cache.h index f29fd5e8f6..bc2a7bc13d 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/sim_cache.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/sim_cache.h @@ -73,7 +73,8 @@ class SimCache : public Cache { // stop logging to the file automatically after reaching a specific size in // bytes, a values of 0 disable this feature virtual Status StartActivityLogging(const std::string& activity_log_file, - Env* env, uint64_t max_logging_size = 0) = 0; + Env* env, + uint64_t max_logging_size = 0) = 0; // Stop cache activity logging if any virtual void StopActivityLogging() = 0; diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/spatial_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/spatial_db.h deleted file mode 100644 index 477b77cf62..0000000000 --- a/thirdparty/rocksdb/include/rocksdb/utilities/spatial_db.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once -#ifndef ROCKSDB_LITE - -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/slice.h" -#include "rocksdb/utilities/stackable_db.h" - -namespace rocksdb { -namespace spatial { - -// NOTE: SpatialDB is experimental and we might change its API without warning. -// Please talk to us before developing against SpatialDB API. -// -// SpatialDB is a support for spatial indexes built on top of RocksDB. -// When creating a new SpatialDB, clients specifies a list of spatial indexes to -// build on their data. Each spatial index is defined by the area and -// granularity. If you're storing map data, different spatial index -// granularities can be used for different zoom levels. -// -// Each element inserted into SpatialDB has: -// * a bounding box, which determines how will the element be indexed -// * string blob, which will usually be WKB representation of the polygon -// (http://en.wikipedia.org/wiki/Well-known_text) -// * feature set, which is a map of key-value pairs, where value can be null, -// int, double, bool, string -// * a list of indexes to insert the element in -// -// Each query is executed on a single spatial index. Query guarantees that it -// will return all elements intersecting the specified bounding box, but it -// might also return some extra non-intersecting elements. 
- -// Variant is a class that can be many things: null, bool, int, double or string -// It is used to store different value types in FeatureSet (see below) -struct Variant { - // Don't change the values here, they are persisted on disk - enum Type { - kNull = 0x0, - kBool = 0x1, - kInt = 0x2, - kDouble = 0x3, - kString = 0x4, - }; - - Variant() : type_(kNull) {} - /* implicit */ Variant(bool b) : type_(kBool) { data_.b = b; } - /* implicit */ Variant(uint64_t i) : type_(kInt) { data_.i = i; } - /* implicit */ Variant(double d) : type_(kDouble) { data_.d = d; } - /* implicit */ Variant(const std::string& s) : type_(kString) { - new (&data_.s) std::string(s); - } - - Variant(const Variant& v) : type_(v.type_) { Init(v, data_); } - - Variant& operator=(const Variant& v); - - Variant(Variant&& rhs) : type_(kNull) { *this = std::move(rhs); } - - Variant& operator=(Variant&& v); - - ~Variant() { Destroy(type_, data_); } - - Type type() const { return type_; } - bool get_bool() const { return data_.b; } - uint64_t get_int() const { return data_.i; } - double get_double() const { return data_.d; } - const std::string& get_string() const { return *GetStringPtr(data_); } - - bool operator==(const Variant& other) const; - bool operator!=(const Variant& other) const { return !(*this == other); } - - private: - Type type_; - - union Data { - bool b; - uint64_t i; - double d; - // Current version of MS compiler not C++11 compliant so can not put - // std::string - // however, even then we still need the rest of the maintenance. - char s[sizeof(std::string)]; - } data_; - - // Avoid type_punned aliasing problem - static std::string* GetStringPtr(Data& d) { - void* p = d.s; - return reinterpret_cast(p); - } - - static const std::string* GetStringPtr(const Data& d) { - const void* p = d.s; - return reinterpret_cast(p); - } - - static void Init(const Variant&, Data&); - - static void Destroy(Type t, Data& d) { - if (t == kString) { - using std::string; - GetStringPtr(d)->~string(); - } - } -}; - -// FeatureSet is a map of key-value pairs. One feature set is associated with -// each element in SpatialDB. It can be used to add rich data about the element. -class FeatureSet { - private: - typedef std::unordered_map map; - - public: - class iterator { - public: - /* implicit */ iterator(const map::const_iterator itr) : itr_(itr) {} - iterator& operator++() { - ++itr_; - return *this; - } - bool operator!=(const iterator& other) { return itr_ != other.itr_; } - bool operator==(const iterator& other) { return itr_ == other.itr_; } - map::value_type operator*() { return *itr_; } - - private: - map::const_iterator itr_; - }; - FeatureSet() = default; - - FeatureSet* Set(const std::string& key, const Variant& value); - bool Contains(const std::string& key) const; - // REQUIRES: Contains(key) - const Variant& Get(const std::string& key) const; - iterator Find(const std::string& key) const; - - iterator begin() const { return map_.begin(); } - iterator end() const { return map_.end(); } - - void Clear(); - size_t Size() const { return map_.size(); } - - void Serialize(std::string* output) const; - // REQUIRED: empty FeatureSet - bool Deserialize(const Slice& input); - - std::string DebugString() const; - - private: - map map_; -}; - -// BoundingBox is a helper structure for defining rectangles representing -// bounding boxes of spatial elements. 
-template -struct BoundingBox { - T min_x, min_y, max_x, max_y; - BoundingBox() = default; - BoundingBox(T _min_x, T _min_y, T _max_x, T _max_y) - : min_x(_min_x), min_y(_min_y), max_x(_max_x), max_y(_max_y) {} - - bool Intersects(const BoundingBox& a) const { - return !(min_x > a.max_x || min_y > a.max_y || a.min_x > max_x || - a.min_y > max_y); - } -}; - -struct SpatialDBOptions { - uint64_t cache_size = 1 * 1024 * 1024 * 1024LL; // 1GB - int num_threads = 16; - bool bulk_load = true; -}; - -// Cursor is used to return data from the query to the client. To get all the -// data from the query, just call Next() while Valid() is true -class Cursor { - public: - Cursor() = default; - virtual ~Cursor() {} - - virtual bool Valid() const = 0; - // REQUIRES: Valid() - virtual void Next() = 0; - - // Lifetime of the underlying storage until the next call to Next() - // REQUIRES: Valid() - virtual const Slice blob() = 0; - // Lifetime of the underlying storage until the next call to Next() - // REQUIRES: Valid() - virtual const FeatureSet& feature_set() = 0; - - virtual Status status() const = 0; - - private: - // No copying allowed - Cursor(const Cursor&); - void operator=(const Cursor&); -}; - -// SpatialIndexOptions defines a spatial index that will be built on the data -struct SpatialIndexOptions { - // Spatial indexes are referenced by names - std::string name; - // An area that is indexed. If the element is not intersecting with spatial - // index's bbox, it will not be inserted into the index - BoundingBox bbox; - // tile_bits control the granularity of the spatial index. Each dimension of - // the bbox will be split into (1 << tile_bits) tiles, so there will be a - // total of (1 << tile_bits)^2 tiles. It is recommended to configure a size of - // each tile to be approximately the size of the query on that spatial index - uint32_t tile_bits; - SpatialIndexOptions() {} - SpatialIndexOptions(const std::string& _name, - const BoundingBox& _bbox, uint32_t _tile_bits) - : name(_name), bbox(_bbox), tile_bits(_tile_bits) {} -}; - -class SpatialDB : public StackableDB { - public: - // Creates the SpatialDB with specified list of indexes. - // REQUIRED: db doesn't exist - static Status Create(const SpatialDBOptions& options, const std::string& name, - const std::vector& spatial_indexes); - - // Open the existing SpatialDB. The resulting db object will be returned - // through db parameter. - // REQUIRED: db was created using SpatialDB::Create - static Status Open(const SpatialDBOptions& options, const std::string& name, - SpatialDB** db, bool read_only = false); - - explicit SpatialDB(DB* db) : StackableDB(db) {} - - // Insert the element into the DB. Element will be inserted into specified - // spatial_indexes, based on specified bbox. - // REQUIRES: spatial_indexes.size() > 0 - virtual Status Insert(const WriteOptions& write_options, - const BoundingBox& bbox, const Slice& blob, - const FeatureSet& feature_set, - const std::vector& spatial_indexes) = 0; - - // Calling Compact() after inserting a bunch of elements should speed up - // reading. This is especially useful if you use SpatialDBOptions::bulk_load - // Num threads determines how many threads we'll use for compactions. Setting - // this to bigger number will use more IO and CPU, but finish faster - virtual Status Compact(int num_threads = 1) = 0; - - // Query the specified spatial_index. Query will return all elements that - // intersect bbox, but it may also return some extra elements. 
- virtual Cursor* Query(const ReadOptions& read_options, - const BoundingBox& bbox, - const std::string& spatial_index) = 0; -}; - -} // namespace spatial -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/stackable_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/stackable_db.h index 991de90aab..8fef9b3e85 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/stackable_db.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/stackable_db.h @@ -4,6 +4,7 @@ #pragma once #include <map> +#include <memory> #include <string> #include "rocksdb/db.h" @@ -12,22 +13,30 @@ #undef DeleteFile #endif - namespace rocksdb { // This class contains APIs to stack rocksdb wrappers. Eg. Stack TTL over base db class StackableDB : public DB { public: - // StackableDB is the owner of db now! + // StackableDB takes sole ownership of the underlying db. explicit StackableDB(DB* db) : db_(db) {} + // StackableDB takes shared ownership of the underlying db. + explicit StackableDB(std::shared_ptr<DB> db) + : db_(db.get()), shared_db_ptr_(db) {} + ~StackableDB() { - delete db_; + if (shared_db_ptr_ == nullptr) { + delete db_; + } else { + assert(shared_db_ptr_.get() == db_); + } + db_ = nullptr; } - virtual DB* GetBaseDB() { - return db_; - } + virtual Status Close() override { return db_->Close(); } + + virtual DB* GetBaseDB() { return db_; } virtual DB* GetRootDB() override { return db_->GetRootDB(); } @@ -95,6 +104,12 @@ class StackableDB : public DB { return db_->IngestExternalFile(column_family, external_files, options); } + using DB::IngestExternalFiles; + virtual Status IngestExternalFiles( + const std::vector<IngestExternalFileArg>& args) override { + return db_->IngestExternalFiles(args); + } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } using DB::KeyMayExist; @@ -126,10 +141,8 @@ class StackableDB : public DB { return db_->Merge(options, column_family, key, value); } - - virtual Status Write(const WriteOptions& opts, WriteBatch* updates) - override { - return db_->Write(opts, updates); + virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override { + return db_->Write(opts, updates); } using DB::NewIterator; @@ -145,10 +158,7 @@ class StackableDB : public DB { return db_->NewIterators(options, column_families, iterators); } - - virtual const Snapshot* GetSnapshot() override { - return db_->GetSnapshot(); - } + virtual const Snapshot* GetSnapshot() override { return db_->GetSnapshot(); } virtual void ReleaseSnapshot(const Snapshot* snapshot) override { return db_->ReleaseSnapshot(snapshot); @@ -160,9 +170,9 @@ class StackableDB : public DB { const Slice& property, std::string* value) override { return db_->GetProperty(column_family, property, value); } - virtual bool GetMapProperty(ColumnFamilyHandle* column_family, - const Slice& property, - std::map<std::string, std::string>* value) override { + virtual bool GetMapProperty( + ColumnFamilyHandle* column_family, const Slice& property, + std::map<std::string, std::string>* value) override { return db_->GetMapProperty(column_family, property, value); } @@ -179,12 +189,10 @@ } using DB::GetApproximateSizes; - virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, - const Range* r, int n, uint64_t* sizes, - uint8_t include_flags - = INCLUDE_FILES) override { - return db_->GetApproximateSizes(column_family, r, n, sizes, - include_flags); + virtual void GetApproximateSizes( + ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes, + uint8_t include_flags = INCLUDE_FILES) override { + return
db_->GetApproximateSizes(column_family, r, n, sizes, include_flags); } using DB::GetApproximateMemTableStats; @@ -206,11 +214,13 @@ virtual Status CompactFiles( const CompactionOptions& compact_options, ColumnFamilyHandle* column_family, - const std::vector<std::string>& input_file_names, - const int output_level, const int output_path_id = -1) override { - return db_->CompactFiles( - compact_options, column_family, input_file_names, - output_level, output_path_id); + const std::vector<std::string>& input_file_names, const int output_level, + const int output_path_id = -1, + std::vector<std::string>* const output_file_names = nullptr, + CompactionJobInfo* compaction_job_info = nullptr) override { + return db_->CompactFiles(compact_options, column_family, input_file_names, + output_level, output_path_id, output_file_names, + compaction_job_info); } virtual Status PauseBackgroundWork() override { @@ -231,24 +241,20 @@ } using DB::MaxMemCompactionLevel; - virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) - override { + virtual int MaxMemCompactionLevel( + ColumnFamilyHandle* column_family) override { return db_->MaxMemCompactionLevel(column_family); } using DB::Level0StopWriteTrigger; - virtual int Level0StopWriteTrigger(ColumnFamilyHandle* column_family) - override { + virtual int Level0StopWriteTrigger( + ColumnFamilyHandle* column_family) override { return db_->Level0StopWriteTrigger(column_family); } - virtual const std::string& GetName() const override { - return db_->GetName(); - } + virtual const std::string& GetName() const override { return db_->GetName(); } - virtual Env* GetEnv() const override { - return db_->GetEnv(); - } + virtual Env* GetEnv() const override { return db_->GetEnv(); } using DB::GetOptions; virtual Options GetOptions(ColumnFamilyHandle* column_family) const override { @@ -265,13 +271,20 @@ ColumnFamilyHandle* column_family) override { return db_->Flush(fopts, column_family); } - - virtual Status SyncWAL() override { - return db_->SyncWAL(); + virtual Status Flush( + const FlushOptions& fopts, + const std::vector<ColumnFamilyHandle*>& column_families) override { + return db_->Flush(fopts, column_families); } + virtual Status SyncWAL() override { return db_->SyncWAL(); } + virtual Status FlushWAL(bool sync) override { return db_->FlushWAL(sync); } + virtual Status LockWAL() override { return db_->LockWAL(); } + + virtual Status UnlockWAL() override { return db_->UnlockWAL(); } + #ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { @@ -287,9 +300,8 @@ db_->GetLiveFilesMetaData(metadata); } - virtual void GetColumnFamilyMetaData( - ColumnFamilyHandle *column_family, - ColumnFamilyMetaData* cf_meta) override { + virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) override { db_->GetColumnFamilyMetaData(column_family, cf_meta); } @@ -297,13 +309,18 @@ virtual Status GetLiveFiles(std::vector<std::string>& vec, uint64_t* mfs, bool flush_memtable = true) override { - return db_->GetLiveFiles(vec, mfs, flush_memtable); + return db_->GetLiveFiles(vec, mfs, flush_memtable); } virtual SequenceNumber GetLatestSequenceNumber() const override { return db_->GetLatestSequenceNumber(); } + virtual bool SetPreserveDeletesSequenceNumber( + SequenceNumber seqnum) override { + return db_->SetPreserveDeletesSequenceNumber(seqnum); + } + virtual Status GetSortedWalFiles(VectorLogPtr& files) override { return
db_->GetSortedWalFiles(files); } @@ -347,7 +364,7 @@ class StackableDB : public DB { } virtual Status GetUpdatesSince( - SequenceNumber seq_number, unique_ptr<TransactionLogIterator>* iter, + SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter, const TransactionLogIterator::ReadOptions& read_options) override { return db_->GetUpdatesSince(seq_number, iter, read_options); } @@ -369,6 +386,7 @@ class StackableDB : public DB { protected: DB* db_; + std::shared_ptr<DB> shared_db_ptr_; }; -} // namespace rocksdb +} // namespace rocksdb diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/table_properties_collectors.h b/thirdparty/rocksdb/include/rocksdb/utilities/table_properties_collectors.h index 0f8827037b..bb350bcf9c 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/table_properties_collectors.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/table_properties_collectors.h @@ -5,12 +5,58 @@ #pragma once #ifndef ROCKSDB_LITE +#include <atomic> #include <memory> #include "rocksdb/table_properties.h" namespace rocksdb { +// A factory of a table property collector that marks an SST +// file as need-compaction when it observes at least "D" deletion +// entries in any "N" consecutive entries. +class CompactOnDeletionCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual ~CompactOnDeletionCollectorFactory() {} + + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override; + + // Change the value of sliding_window_size "N" + // Setting it to 0 disables the delete triggered compaction + void SetWindowSize(size_t sliding_window_size) { + sliding_window_size_.store(sliding_window_size); + } + + // Change the value of deletion_trigger "D" + void SetDeletionTrigger(size_t deletion_trigger) { + deletion_trigger_.store(deletion_trigger); + } + + virtual const char* Name() const override { + return "CompactOnDeletionCollector"; + } + + private: + friend std::shared_ptr<CompactOnDeletionCollectorFactory> + NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger); + // A factory of a table property collector that marks an SST + // file as need-compaction when it observes at least "D" deletion + // entries in any "N" consecutive entries. + // + // @param sliding_window_size "N" + // @param deletion_trigger "D" + CompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger) + : sliding_window_size_(sliding_window_size), + deletion_trigger_(deletion_trigger) {} + + std::atomic<size_t> sliding_window_size_; + std::atomic<size_t> deletion_trigger_; +}; + // Creates a factory of a table property collector that marks an SST // file as need-compaction when it observes at least "D" deletion // entries in any "N" consecutive entries. @@ -20,10 +66,9 @@ namespace rocksdb { // than the specified size. // @param deletion_trigger "D". Note that even when "N" is changed, // the specified number for "D" will not be changed. 
-extern std::shared_ptr - NewCompactOnDeletionCollectorFactory( - size_t sliding_window_size, - size_t deletion_trigger); +extern std::shared_ptr +NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, + size_t deletion_trigger); } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/transaction.h b/thirdparty/rocksdb/include/rocksdb/utilities/transaction.h index a3519739c2..ce67248227 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/transaction.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/transaction.h @@ -118,7 +118,7 @@ class Transaction { // longer be valid and should be discarded after a call to ClearSnapshot(). virtual void ClearSnapshot() = 0; - // Prepare the current transation for 2PC + // Prepare the current transaction for 2PC virtual Status Prepare() = 0; // Write all batched keys to the db atomically. @@ -152,6 +152,12 @@ class Transaction { // If there is no previous call to SetSavePoint(), returns Status::NotFound() virtual Status RollbackToSavePoint() = 0; + // Pop the most recent save point. + // If there is no previous call to SetSavePoint(), Status::NotFound() + // will be returned. + // Otherwise returns Status::OK(). + virtual Status PopSavePoint() = 0; + // This function is similar to DB::Get() except it will also read pending // changes in this transaction. Currently, this function will return // Status::MergeInProgress if the most recent write to the queried key in @@ -169,8 +175,8 @@ class Transaction { ColumnFamilyHandle* column_family, const Slice& key, std::string* value) = 0; - // An overload of the the above method that receives a PinnableSlice - // For backward compatiblity a default implementation is provided + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { @@ -202,8 +208,10 @@ class Transaction { // Read this key and ensure that this transaction will only // be able to be committed if this key is not written outside this // transaction after it has first been read (or after the snapshot if a - // snapshot is set in this transaction). The transaction behavior is the - // same regardless of whether the key exists or not. + // snapshot is set in this transaction and do_validate is true). If + // do_validate is false, ReadOptions::snapshot is expected to be nullptr so + // that GetForUpdate returns the latest committed value. The transaction + // behavior is the same regardless of whether the key exists or not. // // Note: Currently, this function will return Status::MergeInProgress // if the most recent write to the queried key in this batch is a Merge. 
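The do_validate parameter documented in the hunk above changes how GetForUpdate interacts with the transaction snapshot. A minimal, hypothetical usage sketch follows (not part of the diff; it assumes a pessimistic TransactionDB opened elsewhere, and the key and function names are illustrative only):

#include <string>
#include "rocksdb/utilities/transaction.h"
#include "rocksdb/utilities/transaction_db.h"

// Sketch: read a key with and without snapshot validation inside a transaction.
void GetForUpdateSketch(rocksdb::TransactionDB* txn_db) {
  rocksdb::WriteOptions write_options;
  rocksdb::ReadOptions read_options;
  rocksdb::Transaction* txn = txn_db->BeginTransaction(write_options);
  std::string value;
  // Default (do_validate = true): the read is validated against the snapshot,
  // if one is set, and the key stays locked by this transaction.
  rocksdb::Status s = txn->GetForUpdate(read_options, "key", &value);
  // do_validate = false: ReadOptions::snapshot is expected to be nullptr, and
  // the latest committed value is returned without snapshot validation.
  s = txn->GetForUpdate(read_options, txn_db->DefaultColumnFamily(), "key",
                        &value, /*exclusive=*/true, /*do_validate=*/false);
  if (s.ok()) {
    s = txn->Commit();
  }
  delete txn;
}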
@@ -228,26 +236,31 @@ class Transaction { virtual Status GetForUpdate(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, - bool exclusive = true) = 0; + bool exclusive = true, + const bool do_validate = true) = 0; - // An overload of the the above method that receives a PinnableSlice - // For backward compatiblity a default implementation is provided + // An overload of the above method that receives a PinnableSlice + // For backward compatibility a default implementation is provided virtual Status GetForUpdate(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, - bool exclusive = true) { + bool exclusive = true, + const bool do_validate = true) { if (pinnable_val == nullptr) { std::string* null_str = nullptr; - return GetForUpdate(options, key, null_str); + return GetForUpdate(options, column_family, key, null_str, exclusive, + do_validate); } else { - auto s = GetForUpdate(options, key, pinnable_val->GetSelf()); + auto s = GetForUpdate(options, column_family, key, + pinnable_val->GetSelf(), exclusive, do_validate); pinnable_val->PinSelf(); return s; } } virtual Status GetForUpdate(const ReadOptions& options, const Slice& key, - std::string* value, bool exclusive = true) = 0; + std::string* value, bool exclusive = true, + const bool do_validate = true) = 0; virtual std::vector MultiGetForUpdate( const ReadOptions& options, @@ -280,6 +293,9 @@ class Transaction { // functions in WriteBatch, but will also do conflict checking on the // keys being written. // + // assume_tracked=true expects the key be already tracked. If valid then it + // skips ValidateSnapshot. Returns error otherwise. + // // If this Transaction was created on an OptimisticTransactionDB, these // functions should always return Status::OK(). // @@ -292,28 +308,33 @@ class Transaction { // (See max_write_buffer_number_to_maintain) // or other errors on unexpected failures. 
virtual Status Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; + const Slice& value, const bool assume_tracked = false) = 0; virtual Status Put(const Slice& key, const Slice& value) = 0; virtual Status Put(ColumnFamilyHandle* column_family, const SliceParts& key, - const SliceParts& value) = 0; + const SliceParts& value, + const bool assume_tracked = false) = 0; virtual Status Put(const SliceParts& key, const SliceParts& value) = 0; virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value) = 0; + const Slice& value, + const bool assume_tracked = false) = 0; virtual Status Merge(const Slice& key, const Slice& value) = 0; - virtual Status Delete(ColumnFamilyHandle* column_family, - const Slice& key) = 0; + virtual Status Delete(ColumnFamilyHandle* column_family, const Slice& key, + const bool assume_tracked = false) = 0; virtual Status Delete(const Slice& key) = 0; virtual Status Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; + const SliceParts& key, + const bool assume_tracked = false) = 0; virtual Status Delete(const SliceParts& key) = 0; virtual Status SingleDelete(ColumnFamilyHandle* column_family, - const Slice& key) = 0; + const Slice& key, + const bool assume_tracked = false) = 0; virtual Status SingleDelete(const Slice& key) = 0; virtual Status SingleDelete(ColumnFamilyHandle* column_family, - const SliceParts& key) = 0; + const SliceParts& key, + const bool assume_tracked = false) = 0; virtual Status SingleDelete(const SliceParts& key) = 0; // PutUntracked() will write a Put to the batch of operations to be committed @@ -321,9 +342,9 @@ class Transaction { // gets committed successfully. But unlike Transaction::Put(), // no conflict checking will be done for this key. // - // If this Transaction was created on a TransactionDB, this function will - // still acquire locks necessary to make sure this write doesn't cause - // conflicts in other transactions and may return Status::Busy(). + // If this Transaction was created on a PessimisticTransactionDB, this + // function will still acquire locks necessary to make sure this write doesn't + // cause conflicts in other transactions and may return Status::Busy(). virtual Status PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) = 0; virtual Status PutUntracked(const Slice& key, const Slice& value) = 0; @@ -344,6 +365,10 @@ class Transaction { virtual Status DeleteUntracked(ColumnFamilyHandle* column_family, const SliceParts& key) = 0; virtual Status DeleteUntracked(const SliceParts& key) = 0; + virtual Status SingleDeleteUntracked(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + + virtual Status SingleDeleteUntracked(const Slice& key) = 0; // Similar to WriteBatch::PutLogData virtual void PutLogData(const Slice& blob) = 0; @@ -364,7 +389,7 @@ class Transaction { virtual void EnableIndexing() = 0; // Returns the number of distinct Keys being tracked by this transaction. - // If this transaction was created by a TransactinDB, this is the number of + // If this transaction was created by a TransactionDB, this is the number of // keys that are currently locked by this transaction. // If this transaction was created by an OptimisticTransactionDB, this is the // number of keys that need to be checked for conflicts at commit time. 
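The assume_tracked flag added to the Put/Merge/Delete family above, together with the new untracked SingleDelete, gives three levels of conflict checking. A hedged sketch, assuming txn and txn_db as in the previous example (keys and values are placeholders):

// Tracked write: conflict-checked, and the key is locked by the transaction.
rocksdb::Status s = txn->Put("k1", "v1");
// assume_tracked = true asserts the key was already locked, e.g. via a prior
// GetForUpdate, so the implementation may skip snapshot validation.
s = txn->Put(txn_db->DefaultColumnFamily(), "k1", "v2", /*assume_tracked=*/true);
// Untracked writes skip conflict tracking; on a PessimisticTransactionDB they
// still take the necessary locks and may return Status::Busy().
s = txn->PutUntracked("k2", "v3");
s = txn->SingleDeleteUntracked("k2");  // untracked SingleDelete added above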
@@ -436,8 +461,8 @@ class Transaction { virtual bool IsDeadlockDetect() const { return false; } - virtual std::vector GetWaitingTxns(uint32_t* column_family_id, - std::string* key) const { + virtual std::vector GetWaitingTxns( + uint32_t* /*column_family_id*/, std::string* /*key*/) const { assert(false); return std::vector(); } @@ -456,9 +481,17 @@ class Transaction { TransactionState GetState() const { return txn_state_; } void SetState(TransactionState state) { txn_state_ = state; } + // NOTE: Experimental feature + // The globally unique id with which the transaction is identified. This id + // might or might not be set depending on the implementation. Similarly the + // implementation decides the point in lifetime of a transaction at which it + // assigns the id. Although currently it is the case, the id is not guaranteed + // to remain the same across restarts. + uint64_t GetId() { return id_; } + protected: - explicit Transaction(const TransactionDB* db) {} - Transaction() {} + explicit Transaction(const TransactionDB* /*db*/) {} + Transaction() : log_number_(0), txn_state_(STARTED) {} // the log in which the prepared section for this txn resides // (for two phase commit) @@ -468,7 +501,15 @@ class Transaction { // Execution status of the transaction. std::atomic txn_state_; + uint64_t id_ = 0; + virtual void SetId(uint64_t id) { + assert(id_ == 0); + id_ = id; + } + private: + friend class PessimisticTransactionDB; + friend class WriteUnpreparedTxnDB; // No copying allowed Transaction(const Transaction&); void operator=(const Transaction&); diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/transaction_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/transaction_db.h index 77043897a7..6c4346ff3e 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/transaction_db.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/transaction_db.h @@ -75,7 +75,7 @@ struct TransactionDBOptions { // expiration set. int64_t default_lock_timeout = 1000; // 1 second - // If set, the TransactionDB will use this implemenation of a mutex and + // If set, the TransactionDB will use this implementation of a mutex and // condition variable for all transaction locking instead of the default // mutex/condvar implementation. std::shared_ptr custom_mutex_factory; @@ -85,6 +85,24 @@ struct TransactionDBOptions { // before the commit phase. The DB then needs to provide the mechanisms to // tell apart committed from uncommitted data. TxnDBWritePolicy write_policy = TxnDBWritePolicy::WRITE_COMMITTED; + + // TODO(myabandeh): remove this option + // Note: this is a temporary option as a hot fix in rollback of writeprepared + // txns in myrocks. MyRocks uses merge operands for autoinc column id without + // however obtaining locks. This breaks the assumption behind the rollback + // logic in myrocks. This hack of simply not rolling back merge operands works + // for the special way that myrocks uses this operands. + bool rollback_merge_operands = false; + + private: + // 128 entries + size_t wp_snapshot_cache_bits = static_cast(7); + // 8m entry, 64MB size + size_t wp_commit_cache_bits = static_cast(23); + + friend class WritePreparedTxnDB; + friend class WritePreparedTransactionTestBase; + friend class MySQLStyleTransactionTest; }; struct TransactionOptions { @@ -97,12 +115,18 @@ struct TransactionOptions { // Status::Busy. The user should retry their transaction. 
bool deadlock_detect = false; + // If set, it states that the CommitTimeWriteBatch represents the latest state + // of the application, has only one sub-batch, i.e., no duplicate keys, and is + // meant to be used later during recovery. It enables an optimization to + // postpone updating the memtable with CommitTimeWriteBatch to only + // SwitchMemtable or recovery. + bool use_only_the_last_commit_time_batch_for_recovery = false; + // TODO(agiardullo): TransactionDB does not yet support comparators that allow // two non-equal keys to be equivalent. Ie, cmp->Compare(a,b) should only // return 0 if // a.compare(b) returns 0. - // If positive, specifies the wait timeout in milliseconds when // a transaction attempts to lock a key. // @@ -122,6 +146,29 @@ struct TransactionOptions { // The maximum number of bytes used for the write batch. 0 means no limit. size_t max_write_batch_size = 0; + + // Skip concurrency control. This could be used as an optimization if the + // application knows that the transaction would not have any conflict with + // concurrent transactions. It could also be used during recovery if (i) the + // application guarantees no conflict between prepared transactions in the + // WAL, and (ii) the application guarantees that recovered transactions will + // be rolled back/committed before new transactions start. + // Default: false + bool skip_concurrency_control = false; +}; + +// The per-write optimizations that do not involve transactions. The +// TransactionDB implementation might or might not make use of the specified +// optimizations. +struct TransactionDBWriteOptimizations { + // If true, the application guarantees that the key-set in the write batch + // does not conflict with any concurrent transaction, and hence the + // concurrency control mechanism can be skipped for this write. + bool skip_concurrency_control = false; + // If true, the application guarantees that there are no duplicate keys in + // the write batch, and any employed mechanism to handle duplicate keys can + // be skipped. + bool skip_duplicate_key_check = false; }; struct KeyLockInfo { @@ -133,27 +180,41 @@ struct KeyLockInfo { struct DeadlockInfo { TransactionID m_txn_id; uint32_t m_cf_id; - std::string m_waiting_key; bool m_exclusive; + std::string m_waiting_key; }; struct DeadlockPath { std::vector<DeadlockInfo> path; bool limit_exceeded; + int64_t deadlock_time; - explicit DeadlockPath(std::vector<DeadlockInfo> path_entry) - : path(path_entry), limit_exceeded(false) {} + explicit DeadlockPath(std::vector<DeadlockInfo> path_entry, + const int64_t& dl_time) + : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} // empty path, limit exceeded constructor and default constructor - explicit DeadlockPath(bool limit = false) : path(0), limit_exceeded(limit) {} + explicit DeadlockPath(const int64_t& dl_time = 0, bool limit = false) + : path(0), limit_exceeded(limit), deadlock_time(dl_time) {} bool empty() { return path.empty() && !limit_exceeded; } }; class TransactionDB : public StackableDB { public: + // Optimized version of ::Write that receives more optimization requests such + // as skip_concurrency_control. + using StackableDB::Write; + virtual Status Write(const WriteOptions& opts, + const TransactionDBWriteOptimizations&, + WriteBatch* updates) { + // The default implementation ignores TransactionDBWriteOptimizations and + // falls back to the un-optimized version of ::Write + return Write(opts, updates); + } // Open a TransactionDB similar to DB::Open(). 
// Internally call PrepareWrap() and WrapDB() + // If the return status is not ok, then dbptr is set to nullptr. static Status Open(const Options& options, const TransactionDBOptions& txn_db_options, const std::string& dbname, TransactionDB** dbptr); @@ -164,27 +225,29 @@ class TransactionDB : public StackableDB { const std::vector<ColumnFamilyDescriptor>& column_families, std::vector<ColumnFamilyHandle*>* handles, TransactionDB** dbptr); - // The following functions are used to open a TransactionDB internally using - // an opened DB or StackableDB. - // 1. Call prepareWrap(), passing an empty std::vector to - // compaction_enabled_cf_indices. - // 2. Open DB or Stackable DB with db_options and column_families passed to - // prepareWrap() // Note: PrepareWrap() may change parameters, make copies before the // invocation if needed. - // 3. Call Wrap*DB() with compaction_enabled_cf_indices in step 1 and handles - // of the opened DB/StackableDB in step 2 static void PrepareWrap(DBOptions* db_options, std::vector<ColumnFamilyDescriptor>* column_families, std::vector<size_t>* compaction_enabled_cf_indices); + // If the return status is not ok, then dbptr will be set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. static Status WrapDB(DB* db, const TransactionDBOptions& txn_db_options, const std::vector<size_t>& compaction_enabled_cf_indices, const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); + // If the return status is not ok, then dbptr will be set to nullptr. The + // input db parameter might or might not be deleted as a result of the + // failure. If it is properly deleted it will be set to nullptr. If the return + // status is ok, the ownership of db is transferred to dbptr. static Status WrapStackableDB( StackableDB* db, const TransactionDBOptions& txn_db_options, const std::vector<size_t>& compaction_enabled_cf_indices, const std::vector<ColumnFamilyHandle*>& handles, TransactionDB** dbptr); + // Since the destructor in StackableDB is virtual, this destructor is virtual + // too. The root db will be deleted by the base's destructor. ~TransactionDB() override {} // Starts a new Transaction. 
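The TransactionDBWriteOptimizations hook shown earlier lets the caller vouch for properties the DB would otherwise have to verify. A speculative sketch of the optimized Write path (not part of the diff; txn_db is assumed from the earlier examples, and the guarantees are the caller's responsibility):

rocksdb::WriteBatch batch;
batch.Put("k1", "v1");
rocksdb::TransactionDBWriteOptimizations optimizations;
optimizations.skip_concurrency_control = true;  // caller guarantees no conflicts
optimizations.skip_duplicate_key_check = true;  // caller guarantees no duplicate keys
rocksdb::Status s = txn_db->Write(rocksdb::WriteOptions(), optimizations, &batch);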
@@ -213,6 +276,7 @@ class TransactionDB : public StackableDB { protected: // To create a TransactionDB, call Open() + // The ownership of db is transferred to the base StackableDB explicit TransactionDB(DB* db) : StackableDB(db) {} private: diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/utility_db.h b/thirdparty/rocksdb/include/rocksdb/utilities/utility_db.h index a34a638980..3008fee1a9 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/utility_db.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/utility_db.h @@ -4,12 +4,12 @@ #pragma once #ifndef ROCKSDB_LITE -#include <vector> #include <string> +#include <vector> -#include "rocksdb/utilities/stackable_db.h" -#include "rocksdb/utilities/db_ttl.h" #include "rocksdb/db.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/utilities/stackable_db.h" namespace rocksdb { @@ -22,14 +22,12 @@ class UtilityDB { #if defined(__GNUC__) || defined(__clang__) __attribute__((deprecated)) #elif _WIN32 - __declspec(deprecated) + __declspec(deprecated) #endif - static Status OpenTtlDB(const Options& options, - const std::string& name, - StackableDB** dbptr, - int32_t ttl = 0, - bool read_only = false); + static Status + OpenTtlDB(const Options& options, const std::string& name, + StackableDB** dbptr, int32_t ttl = 0, bool read_only = false); }; -} // namespace rocksdb +} // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/include/rocksdb/utilities/write_batch_with_index.h b/thirdparty/rocksdb/include/rocksdb/utilities/write_batch_with_index.h index 24d8f30aa5..d25b9513ba 100644 --- a/thirdparty/rocksdb/include/rocksdb/utilities/write_batch_with_index.h +++ b/thirdparty/rocksdb/include/rocksdb/utilities/write_batch_with_index.h @@ -27,6 +27,7 @@ namespace rocksdb { class ColumnFamilyHandle; class Comparator; class DB; +class ReadCallback; struct ReadOptions; struct DBOptions; @@ -154,6 +155,12 @@ class WriteBatchWithIndex : public WriteBatchBase { // The returned iterator should be deleted by the caller. // The base_iterator is now 'owned' by the returned iterator. Deleting the // returned iterator will also delete the base_iterator. + // + // Updating the write batch with the current key of the iterator is not safe. + // We strongly recommend users not to do it. It will invalidate the current + // key() and value() of the iterator. This invalidation happens even before + // the write batch update finishes. The state may recover after Next() is + // called. 
Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, Iterator* base_iterator); // default column family @@ -187,7 +194,7 @@ class WriteBatchWithIndex : public WriteBatchBase { Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, std::string* value); - // An overload of the the above method that receives a PinnableSlice + // An overload of the above method that receives a PinnableSlice Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, const Slice& key, PinnableSlice* value); @@ -195,7 +202,7 @@ class WriteBatchWithIndex : public WriteBatchBase { ColumnFamilyHandle* column_family, const Slice& key, std::string* value); - // An overload of the the above method that receives a PinnableSlice + // An overload of the above method that receives a PinnableSlice Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value); @@ -224,8 +231,21 @@ class WriteBatchWithIndex : public WriteBatchBase { Status PopSavePoint() override; void SetMaxBytes(size_t max_bytes) override; + size_t GetDataSize() const; private: + friend class PessimisticTransactionDB; + friend class WritePreparedTxn; + friend class WriteUnpreparedTxn; + friend class WriteBatchWithIndex_SubBatchCnt_Test; + // Returns the number of sub-batches inside the write batch. A sub-batch + // starts right before inserting a key that is a duplicate of a key in the + // last sub-batch. + size_t SubBatchCnt(); + + Status GetFromBatchAndDB(DB* db, const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, ReadCallback* callback); struct Rep; std::unique_ptr rep; }; diff --git a/thirdparty/rocksdb/include/rocksdb/version.h b/thirdparty/rocksdb/include/rocksdb/version.h index b48732d75f..d72f6b649d 100644 --- a/thirdparty/rocksdb/include/rocksdb/version.h +++ b/thirdparty/rocksdb/include/rocksdb/version.h @@ -4,9 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#define ROCKSDB_MAJOR 5 -#define ROCKSDB_MINOR 8 -#define ROCKSDB_PATCH 6 +#define ROCKSDB_MAJOR 6 +#define ROCKSDB_MINOR 1 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff --git a/thirdparty/rocksdb/include/rocksdb/wal_filter.h b/thirdparty/rocksdb/include/rocksdb/wal_filter.h index 686fa49989..e25746dba4 100644 --- a/thirdparty/rocksdb/include/rocksdb/wal_filter.h +++ b/thirdparty/rocksdb/include/rocksdb/wal_filter.h @@ -4,8 +4,9 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#include + #include +#include namespace rocksdb { @@ -33,7 +34,7 @@ class WalFilter { virtual ~WalFilter() {} // Provide ColumnFamily->LogNumber map to filter - // so that filter can determine whether a log number applies to a given + // so that filter can determine whether a log number applies to a given // column family (i.e. that log hasn't been flushed to SST already for the // column family). 
// We also pass in name->id map as only name is known during @@ -44,8 +45,8 @@ class WalFilter { // @params cf_name_id_map column_family_name to column_family_id map virtual void ColumnFamilyLogNumberMap( - const std::map& cf_lognumber_map, - const std::map& cf_name_id_map) {} + const std::map& /*cf_lognumber_map*/, + const std::map& /*cf_name_id_map*/) {} // LogRecord is invoked for each log record encountered for all the logs // during replay on logs on recovery. This method can be used to: @@ -75,21 +76,19 @@ class WalFilter { // @returns Processing option for the current record. // Please see WalProcessingOption enum above for // details. - virtual WalProcessingOption LogRecordFound(unsigned long long log_number, - const std::string& log_file_name, - const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) { + virtual WalProcessingOption LogRecordFound( + unsigned long long /*log_number*/, const std::string& /*log_file_name*/, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { // Default implementation falls back to older function for compatibility return LogRecord(batch, new_batch, batch_changed); } - // Please see the comments for LogRecord above. This function is for - // compatibility only and contains a subset of parameters. + // Please see the comments for LogRecord above. This function is for + // compatibility only and contains a subset of parameters. // New code should use the function above. - virtual WalProcessingOption LogRecord(const WriteBatch& batch, - WriteBatch* new_batch, - bool* batch_changed) const { + virtual WalProcessingOption LogRecord(const WriteBatch& /*batch*/, + WriteBatch* /*new_batch*/, + bool* /*batch_changed*/) const { return WalProcessingOption::kContinueProcessing; } diff --git a/thirdparty/rocksdb/include/rocksdb/write_batch.h b/thirdparty/rocksdb/include/rocksdb/write_batch.h index 336391ead5..8782d08f1f 100644 --- a/thirdparty/rocksdb/include/rocksdb/write_batch.h +++ b/thirdparty/rocksdb/include/rocksdb/write_batch.h @@ -22,13 +22,12 @@ // non-const method, all threads accessing the same WriteBatch must use // external synchronization. -#ifndef STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ -#define STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ +#pragma once +#include #include #include #include -#include #include "rocksdb/status.h" #include "rocksdb/write_batch_base.h" @@ -217,8 +216,9 @@ class WriteBatch : public WriteBatchBase { } virtual void SingleDelete(const Slice& /*key*/) {} - virtual Status DeleteRangeCF(uint32_t column_family_id, - const Slice& begin_key, const Slice& end_key) { + virtual Status DeleteRangeCF(uint32_t /*column_family_id*/, + const Slice& /*begin_key*/, + const Slice& /*end_key*/) { return Status::InvalidArgument("DeleteRangeCF not implemented"); } @@ -242,20 +242,24 @@ class WriteBatch : public WriteBatchBase { // The default implementation of LogData does nothing. 
virtual void LogData(const Slice& blob); - virtual Status MarkBeginPrepare() { + virtual Status MarkBeginPrepare(bool = false) { return Status::InvalidArgument("MarkBeginPrepare() handler not defined."); } - virtual Status MarkEndPrepare(const Slice& xid) { + virtual Status MarkEndPrepare(const Slice& /*xid*/) { return Status::InvalidArgument("MarkEndPrepare() handler not defined."); } - virtual Status MarkRollback(const Slice& xid) { + virtual Status MarkNoop(bool /*empty_batch*/) { + return Status::InvalidArgument("MarkNoop() handler not defined."); + } + + virtual Status MarkRollback(const Slice& /*xid*/) { return Status::InvalidArgument( "MarkRollbackPrepare() handler not defined."); } - virtual Status MarkCommit(const Slice& xid) { + virtual Status MarkCommit(const Slice& /*xid*/) { return Status::InvalidArgument("MarkCommit() handler not defined."); } @@ -263,6 +267,11 @@ class WriteBatch : public WriteBatchBase { // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. virtual bool Continue(); + + protected: + friend class WriteBatch; + virtual bool WriteAfterCommit() const { return true; } + virtual bool WriteBeforePrepare() const { return false; } }; Status Iterate(Handler* handler) const; @@ -307,9 +316,10 @@ // Constructor with a serialized string object explicit WriteBatch(const std::string& rep); + explicit WriteBatch(std::string&& rep); WriteBatch(const WriteBatch& src); - WriteBatch(WriteBatch&& src); + WriteBatch(WriteBatch&& src) noexcept; WriteBatch& operator=(const WriteBatch& src); WriteBatch& operator=(WriteBatch&& src); @@ -323,6 +333,10 @@ private: friend class WriteBatchInternal; friend class LocalSavePoint; + // TODO(myabandeh): this is needed for a hack to collapse the write batch and + // remove duplicate keys. Remove it when the hack is replaced with a proper + // solution. + friend class WriteBatchWithIndex; SavePoints* save_points_; // When sending a WriteBatch through WriteImpl we might want to @@ -339,6 +353,12 @@ // Maximum size of rep_. size_t max_bytes_; + // Is the content of the batch the application's latest state that is meant + // only to be used for recovery? Refer to + // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery for + // more details. + bool is_latest_persistent_state_ = false; + protected: std::string rep_; // See comment in write_batch.cc for the format of rep_ @@ -346,5 +366,3 @@ }; } // namespace rocksdb - -#endif // STORAGE_ROCKSDB_INCLUDE_WRITE_BATCH_H_ diff --git a/thirdparty/rocksdb/include/rocksdb/write_batch_base.h b/thirdparty/rocksdb/include/rocksdb/write_batch_base.h index 3e6d011bd5..a7747a7c84 100644 --- a/thirdparty/rocksdb/include/rocksdb/write_batch_base.h +++ b/thirdparty/rocksdb/include/rocksdb/write_batch_base.h @@ -20,7 +20,7 @@ struct SliceParts; // Abstract base class that defines the basic interface for a write batch. // See WriteBatch for a basic implementation and WriteBatchWithIndex for an -// indexed implemenation. +// indexed implementation. 
class WriteBatchBase { public: virtual ~WriteBatchBase() {} @@ -69,7 +69,7 @@ class WriteBatchBase { const SliceParts& key); virtual Status SingleDelete(const SliceParts& key); - // If the database contains mappings in the range ["begin_key", "end_key"], + // If the database contains mappings in the range ["begin_key", "end_key"), // erase them. Else do nothing. virtual Status DeleteRange(ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key) = 0; diff --git a/thirdparty/rocksdb/include/rocksdb/write_buffer_manager.h b/thirdparty/rocksdb/include/rocksdb/write_buffer_manager.h index 856cf4b246..dea904c187 100644 --- a/thirdparty/rocksdb/include/rocksdb/write_buffer_manager.h +++ b/thirdparty/rocksdb/include/rocksdb/write_buffer_manager.h @@ -30,6 +30,8 @@ class WriteBufferManager { bool enabled() const { return buffer_size_ != 0; } + bool cost_to_cache() const { return cache_rep_ != nullptr; } + // Only valid if enabled() size_t memory_usage() const { return memory_used_.load(std::memory_order_relaxed); diff --git a/thirdparty/rocksdb/issue_template.md b/thirdparty/rocksdb/issue_template.md new file mode 100644 index 0000000000..3e16c99a61 --- /dev/null +++ b/thirdparty/rocksdb/issue_template.md @@ -0,0 +1,7 @@ +> Note: Please use Issues only for bug reports. For questions, discussions, feature requests, etc. post to dev group: https://www.facebook.com/groups/rocksdb.dev + +### Expected behavior + +### Actual behavior + +### Steps to reproduce the behavior diff --git a/thirdparty/rocksdb/java/CMakeLists.txt b/thirdparty/rocksdb/java/CMakeLists.txt index d67896c2cd..360951834a 100644 --- a/thirdparty/rocksdb/java/CMakeLists.txt +++ b/thirdparty/rocksdb/java/CMakeLists.txt @@ -1,14 +1,22 @@ -cmake_minimum_required(VERSION 2.6) +cmake_minimum_required(VERSION 3.4) set(JNI_NATIVE_SOURCES rocksjni/backupablejni.cc rocksjni/backupenginejni.cc + rocksjni/cassandra_compactionfilterjni.cc + rocksjni/cassandra_value_operator.cc rocksjni/checkpoint.cc rocksjni/clock_cache.cc rocksjni/columnfamilyhandle.cc rocksjni/compaction_filter.cc + rocksjni/compaction_filter_factory.cc + rocksjni/compaction_filter_factory_jnicallback.cc + rocksjni/compaction_job_info.cc + rocksjni/compaction_job_stats.cc + rocksjni/compaction_options.cc rocksjni/compaction_options_fifo.cc rocksjni/compaction_options_universal.cc + rocksjni/compact_range_options.cc rocksjni/comparator.cc rocksjni/comparatorjnicallback.cc rocksjni/compression_options.cc @@ -17,53 +25,81 @@ set(JNI_NATIVE_SOURCES rocksjni/filter.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc + rocksjni/jnicallback.cc rocksjni/loggerjnicallback.cc rocksjni/lru_cache.cc + rocksjni/memory_util.cc rocksjni/memtablejni.cc rocksjni/merge_operator.cc + rocksjni/native_comparator_wrapper_test.cc + rocksjni/optimistic_transaction_db.cc + rocksjni/optimistic_transaction_options.cc rocksjni/options.cc + rocksjni/options_util.cc + rocksjni/persistent_cache.cc rocksjni/ratelimiterjni.cc rocksjni/remove_emptyvalue_compactionfilterjni.cc - rocksjni/cassandra_compactionfilterjni.cc rocksjni/restorejni.cc + rocksjni/rocks_callback_object.cc rocksjni/rocksdb_exception_test.cc rocksjni/rocksjni.cc rocksjni/slice.cc rocksjni/snapshot.cc + rocksjni/sst_file_manager.cc rocksjni/sst_file_writerjni.cc rocksjni/statistics.cc rocksjni/statisticsjni.cc rocksjni/table.cc + rocksjni/table_filter.cc + rocksjni/table_filter_jnicallback.cc + rocksjni/thread_status.cc + rocksjni/trace_writer.cc + rocksjni/trace_writer_jnicallback.cc + 
rocksjni/transaction.cc + rocksjni/transaction_db.cc + rocksjni/transaction_db_options.cc rocksjni/transaction_log.cc + rocksjni/transaction_notifier.cc + rocksjni/transaction_notifier_jnicallback.cc + rocksjni/transaction_options.cc rocksjni/ttl.cc + rocksjni/wal_filter.cc + rocksjni/wal_filter_jnicallback.cc rocksjni/write_batch.cc + rocksjni/writebatchhandlerjnicallback.cc rocksjni/write_batch_test.cc rocksjni/write_batch_with_index.cc - rocksjni/writebatchhandlerjnicallback.cc + rocksjni/write_buffer_manager.cc ) set(NATIVE_JAVA_CLASSES org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractCompactionFilterFactory org.rocksdb.AbstractComparator org.rocksdb.AbstractImmutableNativeReference org.rocksdb.AbstractNativeReference org.rocksdb.AbstractRocksIterator org.rocksdb.AbstractSlice - org.rocksdb.AbstractWriteBatch + org.rocksdb.AbstractTableFilter + org.rocksdb.AbstractTraceWriter + org.rocksdb.AbstractTransactionNotifier + org.rocksdb.AbstractWalFilter org.rocksdb.BackupableDBOptions org.rocksdb.BackupEngine - org.rocksdb.BackupEngineTest org.rocksdb.BlockBasedTableConfig org.rocksdb.BloomFilter - org.rocksdb.Cache org.rocksdb.CassandraCompactionFilter org.rocksdb.CassandraValueMergeOperator org.rocksdb.Checkpoint org.rocksdb.ClockCache org.rocksdb.ColumnFamilyHandle org.rocksdb.ColumnFamilyOptions + org.rocksdb.CompactionJobInfo + org.rocksdb.CompactionJobStats + org.rocksdb.CompactionOptions org.rocksdb.CompactionOptionsFIFO org.rocksdb.CompactionOptionsUniversal + org.rocksdb.CompactRangeOptions org.rocksdb.Comparator org.rocksdb.ComparatorOptions org.rocksdb.CompressionOptions @@ -72,25 +108,30 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.DirectSlice org.rocksdb.Env org.rocksdb.EnvOptions - org.rocksdb.ExternalSstFileInfo org.rocksdb.Filter org.rocksdb.FlushOptions org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HdfsEnv org.rocksdb.IngestExternalFileOptions org.rocksdb.Logger org.rocksdb.LRUCache + org.rocksdb.MemoryUtil org.rocksdb.MemTableConfig - org.rocksdb.MergeOperator + org.rocksdb.NativeComparatorWrapper org.rocksdb.NativeLibraryLoader + org.rocksdb.OptimisticTransactionDB + org.rocksdb.OptimisticTransactionOptions org.rocksdb.Options + org.rocksdb.OptionsUtil + org.rocksdb.PersistentCache org.rocksdb.PlainTableConfig org.rocksdb.RateLimiter org.rocksdb.ReadOptions org.rocksdb.RemoveEmptyValueCompactionFilter org.rocksdb.RestoreOptions + org.rocksdb.RocksCallbackObject org.rocksdb.RocksDB - org.rocksdb.RocksDBExceptionTest org.rocksdb.RocksEnv org.rocksdb.RocksIterator org.rocksdb.RocksIteratorInterface @@ -100,25 +141,40 @@ set(NATIVE_JAVA_CLASSES org.rocksdb.SkipListMemTableConfig org.rocksdb.Slice org.rocksdb.Snapshot - org.rocksdb.SnapshotTest + org.rocksdb.SstFileManager org.rocksdb.SstFileWriter org.rocksdb.Statistics org.rocksdb.StringAppendOperator org.rocksdb.TableFormatConfig + org.rocksdb.ThreadStatus + org.rocksdb.TimedEnv + org.rocksdb.Transaction + org.rocksdb.TransactionDB + org.rocksdb.TransactionDBOptions org.rocksdb.TransactionLogIterator + org.rocksdb.TransactionOptions org.rocksdb.TtlDB + org.rocksdb.UInt64AddOperator org.rocksdb.VectorMemTableConfig org.rocksdb.WBWIRocksIterator org.rocksdb.WriteBatch org.rocksdb.WriteBatch.Handler - org.rocksdb.WriteBatchTest - org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBatchInterface org.rocksdb.WriteBatchWithIndex org.rocksdb.WriteOptions + org.rocksdb.NativeComparatorWrapperTest + org.rocksdb.RocksDBExceptionTest + org.rocksdb.SnapshotTest + 
org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper + org.rocksdb.WriteBufferManager ) -include_directories($ENV{JAVA_HOME}/include) -include_directories($ENV{JAVA_HOME}/include/win32) +include(FindJava) +include(UseJava) +include(FindJNI) + +include_directories(${JNI_INCLUDE_DIRS}) include_directories(${PROJECT_SOURCE_DIR}/java) set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) @@ -128,7 +184,183 @@ set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) -set(JAVA_TESTCLASSPATH "${JAVA_JUNIT_JAR}\;${JAVA_HAMCR_JAR}\;${JAVA_MOCKITO_JAR}\;${JAVA_CGLIB_JAR}\;${JAVA_ASSERTJ_JAR}") +set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} ${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) + +add_jar( + rocksdbjni_classes + SOURCES + src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java + src/main/java/org/rocksdb/AbstractCompactionFilter.java + src/main/java/org/rocksdb/AbstractComparator.java + src/main/java/org/rocksdb/AbstractImmutableNativeReference.java + src/main/java/org/rocksdb/AbstractMutableOptions.java + src/main/java/org/rocksdb/AbstractNativeReference.java + src/main/java/org/rocksdb/AbstractRocksIterator.java + src/main/java/org/rocksdb/AbstractSlice.java + src/main/java/org/rocksdb/AbstractTableFilter.java + src/main/java/org/rocksdb/AbstractTraceWriter.java + src/main/java/org/rocksdb/AbstractTransactionNotifier.java + src/main/java/org/rocksdb/AbstractWalFilter.java + src/main/java/org/rocksdb/AbstractWriteBatch.java + src/main/java/org/rocksdb/AccessHint.java + src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/BackupableDBOptions.java + src/main/java/org/rocksdb/BackupEngine.java + src/main/java/org/rocksdb/BackupInfo.java + src/main/java/org/rocksdb/BlockBasedTableConfig.java + src/main/java/org/rocksdb/BloomFilter.java + src/main/java/org/rocksdb/BuiltinComparator.java + src/main/java/org/rocksdb/Cache.java + src/main/java/org/rocksdb/CassandraCompactionFilter.java + src/main/java/org/rocksdb/CassandraValueMergeOperator.java + src/main/java/org/rocksdb/Checkpoint.java + src/main/java/org/rocksdb/ChecksumType.java + src/main/java/org/rocksdb/ClockCache.java + src/main/java/org/rocksdb/ColumnFamilyDescriptor.java + src/main/java/org/rocksdb/ColumnFamilyHandle.java + src/main/java/org/rocksdb/ColumnFamilyMetaData.java + src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/ColumnFamilyOptions.java + src/main/java/org/rocksdb/CompactionJobInfo.java + src/main/java/org/rocksdb/CompactionJobStats.java + src/main/java/org/rocksdb/CompactionOptions.java + src/main/java/org/rocksdb/CompactionOptionsFIFO.java + src/main/java/org/rocksdb/CompactionOptionsUniversal.java + src/main/java/org/rocksdb/CompactionPriority.java + src/main/java/org/rocksdb/CompactionReason.java + src/main/java/org/rocksdb/CompactRangeOptions.java + src/main/java/org/rocksdb/CompactionStopStyle.java + src/main/java/org/rocksdb/CompactionStyle.java + src/main/java/org/rocksdb/Comparator.java + src/main/java/org/rocksdb/ComparatorOptions.java + src/main/java/org/rocksdb/ComparatorType.java + src/main/java/org/rocksdb/CompressionOptions.java + src/main/java/org/rocksdb/CompressionType.java + 
src/main/java/org/rocksdb/DataBlockIndexType.java + src/main/java/org/rocksdb/DBOptionsInterface.java + src/main/java/org/rocksdb/DBOptions.java + src/main/java/org/rocksdb/DbPath.java + src/main/java/org/rocksdb/DirectComparator.java + src/main/java/org/rocksdb/DirectSlice.java + src/main/java/org/rocksdb/EncodingType.java + src/main/java/org/rocksdb/Env.java + src/main/java/org/rocksdb/EnvOptions.java + src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FlushOptions.java + src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java + src/main/java/org/rocksdb/HashSkipListMemTableConfig.java + src/main/java/org/rocksdb/HdfsEnv.java + src/main/java/org/rocksdb/HistogramData.java + src/main/java/org/rocksdb/HistogramType.java + src/main/java/org/rocksdb/IndexType.java + src/main/java/org/rocksdb/InfoLogLevel.java + src/main/java/org/rocksdb/IngestExternalFileOptions.java + src/main/java/org/rocksdb/LevelMetaData.java + src/main/java/org/rocksdb/LiveFileMetaData.java + src/main/java/org/rocksdb/LogFile.java + src/main/java/org/rocksdb/Logger.java + src/main/java/org/rocksdb/LRUCache.java + src/main/java/org/rocksdb/MemoryUsageType.java + src/main/java/org/rocksdb/MemoryUtil.java + src/main/java/org/rocksdb/MemTableConfig.java + src/main/java/org/rocksdb/MergeOperator.java + src/main/java/org/rocksdb/MutableColumnFamilyOptions.java + src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java + src/main/java/org/rocksdb/MutableDBOptions.java + src/main/java/org/rocksdb/MutableDBOptionsInterface.java + src/main/java/org/rocksdb/MutableOptionKey.java + src/main/java/org/rocksdb/MutableOptionValue.java + src/main/java/org/rocksdb/NativeComparatorWrapper.java + src/main/java/org/rocksdb/NativeLibraryLoader.java + src/main/java/org/rocksdb/OperationStage.java + src/main/java/org/rocksdb/OperationType.java + src/main/java/org/rocksdb/OptimisticTransactionDB.java + src/main/java/org/rocksdb/OptimisticTransactionOptions.java + src/main/java/org/rocksdb/Options.java + src/main/java/org/rocksdb/OptionsUtil.java + src/main/java/org/rocksdb/PersistentCache.java + src/main/java/org/rocksdb/PlainTableConfig.java + src/main/java/org/rocksdb/Priority.java + src/main/java/org/rocksdb/Range.java + src/main/java/org/rocksdb/RateLimiter.java + src/main/java/org/rocksdb/RateLimiterMode.java + src/main/java/org/rocksdb/ReadOptions.java + src/main/java/org/rocksdb/ReadTier.java + src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java + src/main/java/org/rocksdb/RestoreOptions.java + src/main/java/org/rocksdb/RocksCallbackObject.java + src/main/java/org/rocksdb/RocksDBException.java + src/main/java/org/rocksdb/RocksDB.java + src/main/java/org/rocksdb/RocksEnv.java + src/main/java/org/rocksdb/RocksIteratorInterface.java + src/main/java/org/rocksdb/RocksIterator.java + src/main/java/org/rocksdb/RocksMemEnv.java + src/main/java/org/rocksdb/RocksMutableObject.java + src/main/java/org/rocksdb/RocksObject.java + src/main/java/org/rocksdb/SizeApproximationFlag.java + src/main/java/org/rocksdb/SkipListMemTableConfig.java + src/main/java/org/rocksdb/Slice.java + src/main/java/org/rocksdb/Snapshot.java + src/main/java/org/rocksdb/SstFileManager.java + src/main/java/org/rocksdb/SstFileMetaData.java + src/main/java/org/rocksdb/SstFileWriter.java + src/main/java/org/rocksdb/StateType.java + src/main/java/org/rocksdb/StatisticsCollectorCallback.java + src/main/java/org/rocksdb/StatisticsCollector.java + src/main/java/org/rocksdb/Statistics.java + 
src/main/java/org/rocksdb/StatsCollectorInput.java + src/main/java/org/rocksdb/StatsLevel.java + src/main/java/org/rocksdb/Status.java + src/main/java/org/rocksdb/StringAppendOperator.java + src/main/java/org/rocksdb/TableFilter.java + src/main/java/org/rocksdb/TableProperties.java + src/main/java/org/rocksdb/TableFormatConfig.java + src/main/java/org/rocksdb/ThreadType.java + src/main/java/org/rocksdb/ThreadStatus.java + src/main/java/org/rocksdb/TickerType.java + src/main/java/org/rocksdb/TimedEnv.java + src/main/java/org/rocksdb/TraceOptions.java + src/main/java/org/rocksdb/TraceWriter.java + src/main/java/org/rocksdb/TransactionalDB.java + src/main/java/org/rocksdb/TransactionalOptions.java + src/main/java/org/rocksdb/TransactionDB.java + src/main/java/org/rocksdb/TransactionDBOptions.java + src/main/java/org/rocksdb/Transaction.java + src/main/java/org/rocksdb/TransactionLogIterator.java + src/main/java/org/rocksdb/TransactionOptions.java + src/main/java/org/rocksdb/TtlDB.java + src/main/java/org/rocksdb/TxnDBWritePolicy.java + src/main/java/org/rocksdb/VectorMemTableConfig.java + src/main/java/org/rocksdb/WalFileType.java + src/main/java/org/rocksdb/WalFilter.java + src/main/java/org/rocksdb/WalProcessingOption.java + src/main/java/org/rocksdb/WALRecoveryMode.java + src/main/java/org/rocksdb/WBWIRocksIterator.java + src/main/java/org/rocksdb/WriteBatchInterface.java + src/main/java/org/rocksdb/WriteBatch.java + src/main/java/org/rocksdb/WriteBatchWithIndex.java + src/main/java/org/rocksdb/WriteOptions.java + src/main/java/org/rocksdb/WriteBufferManager.java + src/main/java/org/rocksdb/util/BytewiseComparator.java + src/main/java/org/rocksdb/util/DirectBytewiseComparator.java + src/main/java/org/rocksdb/util/Environment.java + src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java + src/main/java/org/rocksdb/util/SizeUnit.java + src/test/java/org/rocksdb/BackupEngineTest.java + src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java + src/test/java/org/rocksdb/NativeComparatorWrapperTest.java + src/test/java/org/rocksdb/PlatformRandomHelper.java + src/test/java/org/rocksdb/RocksDBExceptionTest.java + src/test/java/org/rocksdb/RocksMemoryResource.java + src/test/java/org/rocksdb/SnapshotTest.java + src/main/java/org/rocksdb/UInt64AddOperator.java + src/test/java/org/rocksdb/WriteBatchTest.java + src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java + src/test/java/org/rocksdb/util/WriteBatchGetter.java + INCLUDE_JARS ${JAVA_TESTCLASSPATH} +) if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) @@ -192,16 +424,35 @@ if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) endif() -if(WIN32) - set(JAVAC cmd /c javac) - set(JAVAH cmd /c javah) -else() - set(JAVAC javac) - set(JAVAH javah) +set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) + +file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) +create_javah( + TARGET rocksdbjni_headers + CLASSES ${NATIVE_JAVA_CLASSES} + CLASSPATH rocksdbjni_classes ${JAVA_TESTCLASSPATH} + OUTPUT_DIR ${JNI_OUTPUT_DIR} +) + +if(NOT MSVC) + set_property(TARGET ${ROCKSDB_STATIC_LIB} PROPERTY POSITION_INDEPENDENT_CODE ON) endif() -execute_process(COMMAND ${JAVAC} ${JAVAC_ARGS} -cp ${JAVA_TESTCLASSPATH} -d ${PROJECT_SOURCE_DIR}/java/classes ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/util/*.java ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/*.java ${PROJECT_SOURCE_DIR}/java/src/test/java/org/rocksdb/*.java) -execute_process(COMMAND ${JAVAH} -cp 
${PROJECT_SOURCE_DIR}/java/classes -d ${PROJECT_SOURCE_DIR}/java/include -jni ${NATIVE_JAVA_CLASSES}) -add_library(rocksdbjni${ARTIFACT_SUFFIX} SHARED ${JNI_NATIVE_SOURCES}) -set_target_properties(rocksdbjni${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdbjni${ARTIFACT_SUFFIX}.pdb") -target_link_libraries(rocksdbjni${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX} ${LIBS}) +set(ROCKSDBJNI_STATIC_LIB rocksdbjni${ARTIFACT_SUFFIX}) +add_library(${ROCKSDBJNI_STATIC_LIB} ${JNI_NATIVE_SOURCES}) +add_dependencies(${ROCKSDBJNI_STATIC_LIB} rocksdbjni_headers) +target_link_libraries(${ROCKSDBJNI_STATIC_LIB} ${ROCKSDB_STATIC_LIB} ${LIBS}) + +if(NOT MINGW) + set(ROCKSDBJNI_SHARED_LIB rocksdbjni-shared${ARTIFACT_SUFFIX}) + add_library(${ROCKSDBJNI_SHARED_LIB} SHARED ${JNI_NATIVE_SOURCES}) + add_dependencies(${ROCKSDBJNI_SHARED_LIB} rocksdbjni_headers) + target_link_libraries(${ROCKSDBJNI_SHARED_LIB} ${ROCKSDB_STATIC_LIB} ${LIBS}) + + set_target_properties( + ${ROCKSDBJNI_SHARED_LIB} + PROPERTIES + COMPILE_PDB_OUTPUT_DIRECTORY ${CMAKE_CFG_INTDIR} + COMPILE_PDB_NAME ${ROCKSDBJNI_STATIC_LIB}.pdb + ) +endif() diff --git a/thirdparty/rocksdb/java/Makefile b/thirdparty/rocksdb/java/Makefile index b29447bd8a..efc9d2b4e1 100644 --- a/thirdparty/rocksdb/java/Makefile +++ b/thirdparty/rocksdb/java/Makefile @@ -1,6 +1,10 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ - org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractCompactionFilterFactory\ org.rocksdb.AbstractSlice\ + org.rocksdb.AbstractTableFilter\ + org.rocksdb.AbstractTraceWriter\ + org.rocksdb.AbstractTransactionNotifier\ + org.rocksdb.AbstractWalFilter\ org.rocksdb.BackupEngine\ org.rocksdb.BackupableDBOptions\ org.rocksdb.BlockBasedTableConfig\ @@ -11,8 +15,12 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.CassandraValueMergeOperator\ org.rocksdb.ColumnFamilyHandle\ org.rocksdb.ColumnFamilyOptions\ + org.rocksdb.CompactionJobInfo\ + org.rocksdb.CompactionJobStats\ + org.rocksdb.CompactionOptions\ org.rocksdb.CompactionOptionsFIFO\ org.rocksdb.CompactionOptionsUniversal\ + org.rocksdb.CompactRangeOptions\ org.rocksdb.Comparator\ org.rocksdb.ComparatorOptions\ org.rocksdb.CompressionOptions\ @@ -26,35 +34,54 @@ NATIVE_JAVA_CLASSES = org.rocksdb.AbstractCompactionFilter\ org.rocksdb.IngestExternalFileOptions\ org.rocksdb.HashLinkedListMemTableConfig\ org.rocksdb.HashSkipListMemTableConfig\ + org.rocksdb.HdfsEnv\ org.rocksdb.Logger\ org.rocksdb.LRUCache\ + org.rocksdb.MemoryUsageType\ + org.rocksdb.MemoryUtil\ org.rocksdb.MergeOperator\ + org.rocksdb.NativeComparatorWrapper\ + org.rocksdb.OptimisticTransactionDB\ + org.rocksdb.OptimisticTransactionOptions\ org.rocksdb.Options\ + org.rocksdb.OptionsUtil\ + org.rocksdb.PersistentCache\ org.rocksdb.PlainTableConfig\ org.rocksdb.RateLimiter\ org.rocksdb.ReadOptions\ org.rocksdb.RemoveEmptyValueCompactionFilter\ org.rocksdb.RestoreOptions\ + org.rocksdb.RocksCallbackObject\ org.rocksdb.RocksDB\ org.rocksdb.RocksEnv\ org.rocksdb.RocksIterator\ org.rocksdb.RocksMemEnv\ org.rocksdb.SkipListMemTableConfig\ org.rocksdb.Slice\ + org.rocksdb.SstFileManager\ org.rocksdb.SstFileWriter\ org.rocksdb.Statistics\ + org.rocksdb.ThreadStatus\ + org.rocksdb.TimedEnv\ + org.rocksdb.Transaction\ + org.rocksdb.TransactionDB\ + org.rocksdb.TransactionDBOptions\ + org.rocksdb.TransactionOptions\ org.rocksdb.TransactionLogIterator\ org.rocksdb.TtlDB\ org.rocksdb.VectorMemTableConfig\ org.rocksdb.Snapshot\ org.rocksdb.StringAppendOperator\ + 
org.rocksdb.UInt64AddOperator\ org.rocksdb.WriteBatch\ org.rocksdb.WriteBatch.Handler\ org.rocksdb.WriteOptions\ org.rocksdb.WriteBatchWithIndex\ + org.rocksdb.WriteBufferManager\ org.rocksdb.WBWIRocksIterator NATIVE_JAVA_TEST_CLASSES = org.rocksdb.RocksDBExceptionTest\ + org.rocksdb.NativeComparatorWrapperTest.NativeStringComparatorWrapper\ org.rocksdb.WriteBatchTest\ org.rocksdb.WriteBatchTestInternalHelper @@ -77,6 +104,10 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.ClockCacheTest\ org.rocksdb.ColumnFamilyOptionsTest\ org.rocksdb.ColumnFamilyTest\ + org.rocksdb.CompactionFilterFactoryTest\ + org.rocksdb.CompactionJobInfoTest\ + org.rocksdb.CompactionJobStatsTest\ + org.rocksdb.CompactionOptionsTest\ org.rocksdb.CompactionOptionsFIFOTest\ org.rocksdb.CompactionOptionsUniversalTest\ org.rocksdb.CompactionPriorityTest\ @@ -89,6 +120,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.DirectComparatorTest\ org.rocksdb.DirectSliceTest\ org.rocksdb.EnvOptionsTest\ + org.rocksdb.HdfsEnvTest\ org.rocksdb.IngestExternalFileOptionsTest\ org.rocksdb.util.EnvironmentTest\ org.rocksdb.FilterTest\ @@ -96,12 +128,19 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.InfoLogLevelTest\ org.rocksdb.KeyMayExistTest\ org.rocksdb.LoggerTest\ - org.rocksdb.LRUCacheTest\ + org.rocksdb.LRUCacheTest\ + org.rocksdb.MemoryUtilTest\ org.rocksdb.MemTableTest\ org.rocksdb.MergeTest\ org.rocksdb.MixedOptionsTest\ org.rocksdb.MutableColumnFamilyOptionsTest\ + org.rocksdb.MutableDBOptionsTest\ + org.rocksdb.NativeComparatorWrapperTest\ org.rocksdb.NativeLibraryLoaderTest\ + org.rocksdb.OptimisticTransactionTest\ + org.rocksdb.OptimisticTransactionDBTest\ + org.rocksdb.OptimisticTransactionOptionsTest\ + org.rocksdb.OptionsUtilTest\ org.rocksdb.OptionsTest\ org.rocksdb.PlainTableConfigTest\ org.rocksdb.RateLimiterTest\ @@ -109,17 +148,25 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.ReadOptionsTest\ org.rocksdb.RocksDBTest\ org.rocksdb.RocksDBExceptionTest\ - org.rocksdb.RocksEnvTest\ + org.rocksdb.DefaultEnvTest\ org.rocksdb.RocksIteratorTest\ org.rocksdb.RocksMemEnvTest\ org.rocksdb.util.SizeUnitTest\ org.rocksdb.SliceTest\ org.rocksdb.SnapshotTest\ + org.rocksdb.SstFileManagerTest\ org.rocksdb.SstFileWriterTest\ + org.rocksdb.TableFilterTest\ + org.rocksdb.TimedEnvTest\ + org.rocksdb.TransactionTest\ + org.rocksdb.TransactionDBTest\ + org.rocksdb.TransactionOptionsTest\ + org.rocksdb.TransactionDBOptionsTest\ org.rocksdb.TransactionLogIteratorTest\ org.rocksdb.TtlDBTest\ org.rocksdb.StatisticsTest\ org.rocksdb.StatisticsCollectorTest\ + org.rocksdb.WalFilterTest\ org.rocksdb.WALRecoveryModeTest\ org.rocksdb.WriteBatchHandlerTest\ org.rocksdb.WriteBatchTest\ @@ -205,6 +252,20 @@ column_family_sample: java java $(JAVA_ARGS) -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni $(AM_V_at)@rm -rf /tmp/rocksdbjni +transaction_sample: java + $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) + $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/TransactionSample.java + $(AM_V_at)@rm -rf /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) TransactionSample /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/rocksdbjni + +optimistic_transaction_sample: java + $(AM_V_GEN)mkdir -p $(SAMPLES_MAIN_CLASSES) + $(AM_V_at)javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/OptimisticTransactionSample.java + 
$(AM_V_at)@rm -rf /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) OptimisticTransactionSample /tmp/rocksdbjni + $(AM_V_at)@rm -rf /tmp/rocksdbjni + resolve_test_deps: test -d "$(JAVA_TEST_LIBDIR)" || mkdir -p "$(JAVA_TEST_LIBDIR)" test -s "$(JAVA_JUNIT_JAR)" || cp $(MVN_LOCAL)/junit/junit/4.12/junit-4.12.jar $(JAVA_TEST_LIBDIR) || curl -k -L -o $(JAVA_JUNIT_JAR) $(SEARCH_REPO_URL)junit/junit/4.12/junit-4.12.jar diff --git a/thirdparty/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/thirdparty/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 8af6d2edfb..67f6a5cc05 100644 --- a/thirdparty/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/thirdparty/rocksdb/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -493,7 +493,7 @@ private void prepareOptions(Options options) throws RocksDBException { options.setCreateIfMissing(false); } if (useMemenv_) { - options.setEnv(new RocksMemEnv()); + options.setEnv(new RocksMemEnv(Env.getDefault())); } switch (memtable_) { case "skip_list": @@ -543,6 +543,7 @@ private void prepareOptions(Options options) throws RocksDBException { (Integer)flags_.get(Flag.max_background_compactions)); options.setMaxBackgroundFlushes( (Integer)flags_.get(Flag.max_background_flushes)); + options.setMaxBackgroundJobs((Integer) flags_.get(Flag.max_background_jobs)); options.setMaxOpenFiles( (Integer)flags_.get(Flag.open_files)); options.setUseFsync( @@ -1116,6 +1117,14 @@ private enum Flag { return Integer.parseInt(value); } }, + max_background_jobs(defaultOptions_.maxBackgroundJobs(), + "The maximum number of concurrent background jobs\n" + + "\tthat can occur in parallel.") { + @Override + public Object parseValue(String value) { + return Integer.parseInt(value); + } + }, /* TODO(yhchiang): enable the following compaction_style((int32_t) defaultOptions_.compactionStyle(), "style of compaction: level-based vs universal.") { diff --git a/thirdparty/rocksdb/java/crossbuild/build-linux-centos.sh b/thirdparty/rocksdb/java/crossbuild/build-linux-centos.sh index 2832eed8b8..c532398f66 100755 --- a/thirdparty/rocksdb/java/crossbuild/build-linux-centos.sh +++ b/thirdparty/rocksdb/java/crossbuild/build-linux-centos.sh @@ -9,7 +9,7 @@ sudo rm -f /etc/yum/vars/releasever sudo yum -y install epel-release # install all required packages for rocksdb that are available through yum -sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel +sudo yum -y install openssl java-1.7.0-openjdk-devel zlib-devel bzip2-devel lz4-devel snappy-devel libzstd-devel jemalloc-devel # install gcc/g++ 4.8.2 from tru/devtools-2 sudo wget -O /etc/yum.repos.d/devtools-2.repo https://people.centos.org/tru/devtools-2/devtools-2.repo @@ -26,7 +26,6 @@ export JAVA_HOME=/usr/lib/jvm/java-1.7.0 # build rocksdb cd /rocksdb scl enable devtoolset-2 'make jclean clean' -scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' +scl enable devtoolset-2 'PORTABLE=1 make -j8 rocksdbjavastatic' cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build - diff --git a/thirdparty/rocksdb/java/crossbuild/docker-build-linux-centos.sh b/thirdparty/rocksdb/java/crossbuild/docker-build-linux-centos.sh index 44a8bfe06d..d894b14a2e 100755 --- a/thirdparty/rocksdb/java/crossbuild/docker-build-linux-centos.sh +++ 
b/thirdparty/rocksdb/java/crossbuild/docker-build-linux-centos.sh @@ -1,11 +1,28 @@ #!/usr/bin/env bash set -e +#set -x rm -rf /rocksdb-local cp -r /rocksdb-host /rocksdb-local cd /rocksdb-local -scl enable devtoolset-2 'make jclean clean' -scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic' + +# Use scl devtoolset if available (i.e. CentOS <7) +if hash scl 2>/dev/null; then + if scl --list | grep -q 'devtoolset-7'; then + scl enable devtoolset-7 'make jclean clean' + scl enable devtoolset-7 'PORTABLE=1 make -j6 rocksdbjavastatic' + elif scl --list | grep -q 'devtoolset-2'; then + scl enable devtoolset-2 'make jclean clean' + scl enable devtoolset-2 'PORTABLE=1 make -j6 rocksdbjavastatic' + else + echo "Could not find devtoolset" + exit 1; + fi +else + make jclean clean + PORTABLE=1 make -j6 rocksdbjavastatic +fi + cp java/target/librocksdbjni-linux*.so java/target/rocksdbjni-*-linux*.jar /rocksdb-host/java/target diff --git a/thirdparty/rocksdb/java/jdb_bench.sh b/thirdparty/rocksdb/java/jdb_bench.sh index 9665de785e..0a07fa8e2f 100755 --- a/thirdparty/rocksdb/java/jdb_bench.sh +++ b/thirdparty/rocksdb/java/jdb_bench.sh @@ -1,3 +1,4 @@ +# shellcheck disable=SC2148 PLATFORM=64 if [ `getconf LONG_BIT` != "64" ] then @@ -7,4 +8,5 @@ fi ROCKS_JAR=`find target -name rocksdbjni*.jar` echo "Running benchmark in $PLATFORM-Bit mode." +# shellcheck disable=SC2068 java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@ diff --git a/thirdparty/rocksdb/java/rocksjni/backupablejni.cc b/thirdparty/rocksdb/java/rocksjni/backupablejni.cc index 28db2b0210..c5ac30377c 100644 --- a/thirdparty/rocksdb/java/rocksjni/backupablejni.cc +++ b/thirdparty/rocksdb/java/rocksjni/backupablejni.cc @@ -7,15 +7,15 @@ // calling c++ rocksdb::BackupEnginge and rocksdb::BackupableDBOptions methods // from Java side. 
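Every bridge file that follows uses the same handle-passing idiom: the Java wrapper stores the address of a C++ object in a long field and passes it back into each native call, and parameters a function does not use are reduced to anonymous /*env*/ and /*jobj*/ comments. A minimal sketch of the idiom, with an invented NativeOptions type and Java_example_* names that are not part of the patch:

    #include <jni.h>

    struct NativeOptions {
      bool sync = false;
    };

    // Java keeps the returned jlong in a field and hands it back to every
    // subsequent native call for this object.
    extern "C" jlong Java_example_Options_create(JNIEnv* /*env*/, jclass /*jcls*/) {
      return reinterpret_cast<jlong>(new NativeOptions());
    }

    extern "C" void Java_example_Options_setSync(JNIEnv* /*env*/, jobject /*jobj*/,
                                                 jlong jhandle, jboolean flag) {
      reinterpret_cast<NativeOptions*>(jhandle)->sync = static_cast<bool>(flag);
    }

    // disposeInternal() in the patch is the same idea: recover the pointer,
    // then delete it.
    extern "C" void Java_example_Options_dispose(JNIEnv* /*env*/, jobject /*jobj*/,
                                                 jlong jhandle) {
      delete reinterpret_cast<NativeOptions*>(jhandle);
    }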
+#include #include #include -#include #include #include #include "include/org_rocksdb_BackupableDBOptions.h" -#include "rocksjni/portal.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksjni/portal.h" /////////////////////////////////////////////////////////////////////////// // BackupDBOptions @@ -26,9 +26,9 @@ * Signature: (Ljava/lang/String;)J */ jlong Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( - JNIEnv* env, jclass jcls, jstring jpath) { + JNIEnv* env, jclass /*jcls*/, jstring jpath) { const char* cpath = env->GetStringUTFChars(jpath, nullptr); - if(cpath == nullptr) { + if (cpath == nullptr) { // exception thrown: OutOfMemoryError return 0; } @@ -42,8 +42,9 @@ jlong Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( * Method: backupDir * Signature: (J)Ljava/lang/String; */ -jstring Java_org_rocksdb_BackupableDBOptions_backupDir( - JNIEnv* env, jobject jopt, jlong jhandle) { +jstring Java_org_rocksdb_BackupableDBOptions_backupDir(JNIEnv* env, + jobject /*jopt*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return env->NewStringUTF(bopt->backup_dir.c_str()); } @@ -54,7 +55,7 @@ jstring Java_org_rocksdb_BackupableDBOptions_backupDir( * Signature: (JJ)V */ void Java_org_rocksdb_BackupableDBOptions_setBackupEnv( - JNIEnv* env, jobject jopt, jlong jhandle, jlong jrocks_env_handle) { + JNIEnv* /*env*/, jobject /*jopt*/, jlong jhandle, jlong jrocks_env_handle) { auto* bopt = reinterpret_cast(jhandle); auto* rocks_env = reinterpret_cast(jrocks_env_handle); bopt->backup_env = rocks_env; @@ -65,8 +66,10 @@ void Java_org_rocksdb_BackupableDBOptions_setBackupEnv( * Method: setShareTableFiles * Signature: (JZ)V */ -void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->share_table_files = flag; } @@ -76,8 +79,9 @@ void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles( * Method: shareTableFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->share_table_files; } @@ -87,11 +91,13 @@ jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles( * Method: setInfoLog * Signature: (JJ)V */ -void Java_org_rocksdb_BackupableDBOptions_setInfoLog( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jlogger_handle) { +void Java_org_rocksdb_BackupableDBOptions_setInfoLog(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jlong /*jlogger_handle*/) { auto* bopt = reinterpret_cast(jhandle); auto* sptr_logger = - reinterpret_cast *>(jhandle); + reinterpret_cast*>(jhandle); bopt->info_log = sptr_logger->get(); } @@ -100,8 +106,10 @@ void Java_org_rocksdb_BackupableDBOptions_setInfoLog( * Method: setSync * Signature: (JZ)V */ -void Java_org_rocksdb_BackupableDBOptions_setSync( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_BackupableDBOptions_setSync(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->sync = flag; } @@ -111,8 +119,9 @@ void Java_org_rocksdb_BackupableDBOptions_setSync( * Method: sync * Signature: (J)Z */ -jboolean 
Java_org_rocksdb_BackupableDBOptions_sync( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_BackupableDBOptions_sync(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->sync; } @@ -122,8 +131,10 @@ jboolean Java_org_rocksdb_BackupableDBOptions_sync( * Method: setDestroyOldData * Signature: (JZ)V */ -void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->destroy_old_data = flag; } @@ -133,8 +144,9 @@ void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( * Method: destroyOldData * Signature: (J)Z */ -jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->destroy_old_data; } @@ -144,8 +156,10 @@ jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( * Method: setBackupLogFiles * Signature: (JZ)V */ -void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { +void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle, + jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->backup_log_files = flag; } @@ -155,8 +169,9 @@ void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( * Method: backupLogFiles * Signature: (J)Z */ -jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->backup_log_files; } @@ -167,7 +182,8 @@ jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( * Signature: (JJ)V */ void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jbackup_rate_limit) { auto* bopt = reinterpret_cast(jhandle); bopt->backup_rate_limit = jbackup_rate_limit; } @@ -177,8 +193,9 @@ void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( * Method: backupRateLimit * Signature: (J)J */ -jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { +jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->backup_rate_limit; } @@ -189,10 +206,12 @@ jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( * Signature: (JJ)V */ void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimiter( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jrate_limiter_handle) { auto* bopt = reinterpret_cast(jhandle); auto* sptr_rate_limiter = - reinterpret_cast *>(jrate_limiter_handle); + reinterpret_cast*>( + jrate_limiter_handle); bopt->backup_rate_limiter = *sptr_rate_limiter; } @@ -202,7 +221,8 @@ void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimiter( * Signature: (JJ)V */ void 
Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrestore_rate_limit) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jrestore_rate_limit) { auto* bopt = reinterpret_cast(jhandle); bopt->restore_rate_limit = jrestore_rate_limit; } @@ -212,8 +232,9 @@ void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( * Method: restoreRateLimit * Signature: (J)J */ -jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { +jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->restore_rate_limit; } @@ -224,10 +245,12 @@ jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( * Signature: (JJ)V */ void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimiter( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jrate_limiter_handle) { auto* bopt = reinterpret_cast(jhandle); auto* sptr_rate_limiter = - reinterpret_cast *>(jrate_limiter_handle); + reinterpret_cast*>( + jrate_limiter_handle); bopt->restore_rate_limiter = *sptr_rate_limiter; } @@ -237,7 +260,7 @@ void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimiter( * Signature: (JZ)V */ void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean flag) { auto* bopt = reinterpret_cast(jhandle); bopt->share_files_with_checksum = flag; } @@ -248,7 +271,7 @@ void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( * Signature: (J)Z */ jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return bopt->share_files_with_checksum; } @@ -259,10 +282,10 @@ jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( * Signature: (JI)V */ void Java_org_rocksdb_BackupableDBOptions_setMaxBackgroundOperations( - JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_operations) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jint max_background_operations) { auto* bopt = reinterpret_cast(jhandle); - bopt->max_background_operations = - static_cast(max_background_operations); + bopt->max_background_operations = static_cast(max_background_operations); } /* @@ -271,7 +294,7 @@ void Java_org_rocksdb_BackupableDBOptions_setMaxBackgroundOperations( * Signature: (J)I */ jint Java_org_rocksdb_BackupableDBOptions_maxBackgroundOperations( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return static_cast(bopt->max_background_operations); } @@ -282,7 +305,7 @@ jint Java_org_rocksdb_BackupableDBOptions_maxBackgroundOperations( * Signature: (JJ)V */ void Java_org_rocksdb_BackupableDBOptions_setCallbackTriggerIntervalSize( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jcallback_trigger_interval_size) { auto* bopt = reinterpret_cast(jhandle); bopt->callback_trigger_interval_size = @@ -295,7 +318,7 @@ void Java_org_rocksdb_BackupableDBOptions_setCallbackTriggerIntervalSize( * Signature: (J)J */ jlong Java_org_rocksdb_BackupableDBOptions_callbackTriggerIntervalSize( - JNIEnv* env, jobject 
jobj, jlong jhandle) { + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); return static_cast(bopt->callback_trigger_interval_size); } @@ -305,8 +328,9 @@ jlong Java_org_rocksdb_BackupableDBOptions_callbackTriggerIntervalSize( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_BackupableDBOptions_disposeInternal( - JNIEnv* env, jobject jopt, jlong jhandle) { +void Java_org_rocksdb_BackupableDBOptions_disposeInternal(JNIEnv* /*env*/, + jobject /*jopt*/, + jlong jhandle) { auto* bopt = reinterpret_cast(jhandle); assert(bopt != nullptr); delete bopt; diff --git a/thirdparty/rocksdb/java/rocksjni/backupenginejni.cc b/thirdparty/rocksdb/java/rocksjni/backupenginejni.cc index 004de976cb..e62b0b4f0f 100644 --- a/thirdparty/rocksdb/java/rocksjni/backupenginejni.cc +++ b/thirdparty/rocksdb/java/rocksjni/backupenginejni.cc @@ -18,16 +18,15 @@ * Method: open * Signature: (JJ)J */ -jlong Java_org_rocksdb_BackupEngine_open( - JNIEnv* env, jclass jcls, jlong env_handle, - jlong backupable_db_options_handle) { +jlong Java_org_rocksdb_BackupEngine_open(JNIEnv* env, jclass /*jcls*/, + jlong env_handle, + jlong backupable_db_options_handle) { auto* rocks_env = reinterpret_cast(env_handle); - auto* backupable_db_options = - reinterpret_cast( + auto* backupable_db_options = reinterpret_cast( backupable_db_options_handle); rocksdb::BackupEngine* backup_engine; - auto status = rocksdb::BackupEngine::Open(rocks_env, - *backupable_db_options, &backup_engine); + auto status = rocksdb::BackupEngine::Open(rocks_env, *backupable_db_options, + &backup_engine); if (status.ok()) { return reinterpret_cast(backup_engine); @@ -43,12 +42,42 @@ jlong Java_org_rocksdb_BackupEngine_open( * Signature: (JJZ)V */ void Java_org_rocksdb_BackupEngine_createNewBackup( - JNIEnv* env, jobject jbe, jlong jbe_handle, jlong db_handle, + JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle, jboolean jflush_before_backup) { auto* db = reinterpret_cast(db_handle); auto* backup_engine = reinterpret_cast(jbe_handle); - auto status = backup_engine->CreateNewBackup(db, - static_cast(jflush_before_backup)); + auto status = backup_engine->CreateNewBackup( + db, static_cast(jflush_before_backup)); + + if (status.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); +} + +/* + * Class: org_rocksdb_BackupEngine + * Method: createNewBackupWithMetadata + * Signature: (JJLjava/lang/String;Z)V + */ +void Java_org_rocksdb_BackupEngine_createNewBackupWithMetadata( + JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jlong db_handle, + jstring japp_metadata, jboolean jflush_before_backup) { + auto* db = reinterpret_cast(db_handle); + auto* backup_engine = reinterpret_cast(jbe_handle); + + jboolean has_exception = JNI_FALSE; + std::string app_metadata = + rocksdb::JniUtil::copyStdString(env, japp_metadata, &has_exception); + if (has_exception == JNI_TRUE) { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, "Could not copy jstring to std::string"); + return; + } + + auto status = backup_engine->CreateNewBackupWithMetadata( + db, app_metadata, static_cast(jflush_before_backup)); if (status.ok()) { return; @@ -62,8 +91,9 @@ void Java_org_rocksdb_BackupEngine_createNewBackup( * Method: getBackupInfo * Signature: (J)Ljava/util/List; */ -jobject Java_org_rocksdb_BackupEngine_getBackupInfo( - JNIEnv* env, jobject jbe, jlong jbe_handle) { +jobject Java_org_rocksdb_BackupEngine_getBackupInfo(JNIEnv* env, + jobject /*jbe*/, + jlong jbe_handle) { auto* backup_engine = 
reinterpret_cast(jbe_handle); std::vector backup_infos; backup_engine->GetBackupInfo(&backup_infos); @@ -75,24 +105,25 @@ jobject Java_org_rocksdb_BackupEngine_getBackupInfo( * Method: getCorruptedBackups * Signature: (J)[I */ -jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups( - JNIEnv* env, jobject jbe, jlong jbe_handle) { +jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups(JNIEnv* env, + jobject /*jbe*/, + jlong jbe_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); std::vector backup_ids; backup_engine->GetCorruptedBackups(&backup_ids); // store backupids in int array std::vector int_backup_ids(backup_ids.begin(), backup_ids.end()); - + // Store ints in java array // Its ok to loose precision here (64->32) jsize ret_backup_ids_size = static_cast(backup_ids.size()); jintArray ret_backup_ids = env->NewIntArray(ret_backup_ids_size); - if(ret_backup_ids == nullptr) { + if (ret_backup_ids == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids.data()); + int_backup_ids.data()); return ret_backup_ids; } @@ -101,8 +132,8 @@ jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups( * Method: garbageCollect * Signature: (J)V */ -void Java_org_rocksdb_BackupEngine_garbageCollect( - JNIEnv* env, jobject jbe, jlong jbe_handle) { +void Java_org_rocksdb_BackupEngine_garbageCollect(JNIEnv* env, jobject /*jbe*/, + jlong jbe_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); auto status = backup_engine->GarbageCollect(); @@ -118,12 +149,12 @@ void Java_org_rocksdb_BackupEngine_garbageCollect( * Method: purgeOldBackups * Signature: (JI)V */ -void Java_org_rocksdb_BackupEngine_purgeOldBackups( - JNIEnv* env, jobject jbe, jlong jbe_handle, jint jnum_backups_to_keep) { +void Java_org_rocksdb_BackupEngine_purgeOldBackups(JNIEnv* env, jobject /*jbe*/, + jlong jbe_handle, + jint jnum_backups_to_keep) { auto* backup_engine = reinterpret_cast(jbe_handle); - auto status = - backup_engine-> - PurgeOldBackups(static_cast(jnum_backups_to_keep)); + auto status = backup_engine->PurgeOldBackups( + static_cast(jnum_backups_to_keep)); if (status.ok()) { return; @@ -137,8 +168,9 @@ void Java_org_rocksdb_BackupEngine_purgeOldBackups( * Method: deleteBackup * Signature: (JI)V */ -void Java_org_rocksdb_BackupEngine_deleteBackup( - JNIEnv* env, jobject jbe, jlong jbe_handle, jint jbackup_id) { +void Java_org_rocksdb_BackupEngine_deleteBackup(JNIEnv* env, jobject /*jbe*/, + jlong jbe_handle, + jint jbackup_id) { auto* backup_engine = reinterpret_cast(jbe_handle); auto status = backup_engine->DeleteBackup(static_cast(jbackup_id)); @@ -156,26 +188,25 @@ void Java_org_rocksdb_BackupEngine_deleteBackup( * Signature: (JILjava/lang/String;Ljava/lang/String;J)V */ void Java_org_rocksdb_BackupEngine_restoreDbFromBackup( - JNIEnv* env, jobject jbe, jlong jbe_handle, jint jbackup_id, + JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jint jbackup_id, jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); const char* db_dir = env->GetStringUTFChars(jdb_dir, nullptr); - if(db_dir == nullptr) { + if (db_dir == nullptr) { // exception thrown: OutOfMemoryError return; } const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr); - if(wal_dir == nullptr) { + if (wal_dir == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseStringUTFChars(jdb_dir, db_dir); return; } auto* restore_options = 
reinterpret_cast(jrestore_options_handle); - auto status = - backup_engine->RestoreDBFromBackup( - static_cast(jbackup_id), db_dir, wal_dir, - *restore_options); + auto status = backup_engine->RestoreDBFromBackup( + static_cast(jbackup_id), db_dir, wal_dir, + *restore_options); env->ReleaseStringUTFChars(jwal_dir, wal_dir); env->ReleaseStringUTFChars(jdb_dir, db_dir); @@ -193,25 +224,24 @@ void Java_org_rocksdb_BackupEngine_restoreDbFromBackup( * Signature: (JLjava/lang/String;Ljava/lang/String;J)V */ void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup( - JNIEnv* env, jobject jbe, jlong jbe_handle, jstring jdb_dir, + JNIEnv* env, jobject /*jbe*/, jlong jbe_handle, jstring jdb_dir, jstring jwal_dir, jlong jrestore_options_handle) { auto* backup_engine = reinterpret_cast(jbe_handle); const char* db_dir = env->GetStringUTFChars(jdb_dir, nullptr); - if(db_dir == nullptr) { + if (db_dir == nullptr) { // exception thrown: OutOfMemoryError return; } const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr); - if(wal_dir == nullptr) { + if (wal_dir == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseStringUTFChars(jdb_dir, db_dir); return; } auto* restore_options = reinterpret_cast(jrestore_options_handle); - auto status = - backup_engine->RestoreDBFromLatestBackup(db_dir, wal_dir, - *restore_options); + auto status = backup_engine->RestoreDBFromLatestBackup(db_dir, wal_dir, + *restore_options); env->ReleaseStringUTFChars(jwal_dir, wal_dir); env->ReleaseStringUTFChars(jdb_dir, db_dir); @@ -228,8 +258,9 @@ void Java_org_rocksdb_BackupEngine_restoreDbFromLatestBackup( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_BackupEngine_disposeInternal( - JNIEnv* env, jobject jbe, jlong jbe_handle) { +void Java_org_rocksdb_BackupEngine_disposeInternal(JNIEnv* /*env*/, + jobject /*jbe*/, + jlong jbe_handle) { auto* be = reinterpret_cast(jbe_handle); assert(be != nullptr); delete be; diff --git a/thirdparty/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc b/thirdparty/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc index 9d77559ab5..799e25e3f4 100644 --- a/thirdparty/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/cassandra_compactionfilterjni.cc @@ -11,12 +11,13 @@ /* * Class: org_rocksdb_CassandraCompactionFilter * Method: createNewCassandraCompactionFilter0 - * Signature: ()J + * Signature: (ZI)J */ jlong Java_org_rocksdb_CassandraCompactionFilter_createNewCassandraCompactionFilter0( - JNIEnv* env, jclass jcls, jboolean purge_ttl_on_expiration) { - auto* compaction_filter = - new rocksdb::cassandra::CassandraCompactionFilter(purge_ttl_on_expiration); + JNIEnv* /*env*/, jclass /*jcls*/, jboolean purge_ttl_on_expiration, + jint gc_grace_period_in_seconds) { + auto* compaction_filter = new rocksdb::cassandra::CassandraCompactionFilter( + purge_ttl_on_expiration, gc_grace_period_in_seconds); // set the native handle to our native compaction filter return reinterpret_cast(compaction_filter); } diff --git a/thirdparty/rocksdb/java/rocksjni/cassandra_value_operator.cc b/thirdparty/rocksdb/java/rocksjni/cassandra_value_operator.cc index aa58eccc24..73b3dcc637 100644 --- a/thirdparty/rocksdb/java/rocksjni/cassandra_value_operator.cc +++ b/thirdparty/rocksdb/java/rocksjni/cassandra_value_operator.cc @@ -3,33 +3,35 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
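The Signature comments updated above are JNI method descriptors and must stay in sync with the C++ parameter list: the CassandraCompactionFilter factory moved from (Z)J to (ZI)J when it gained the GC grace period argument. A sketch of how the descriptor reads, using an invented Java_example_Filter_create name:

    #include <jni.h>

    // Signature: (ZI)J
    //             ^^   ^-- returns jlong (the new object's address)
    //             |+------ I: jint     gc_grace_period_in_seconds
    //             +------- Z: jboolean purge_ttl_on_expiration
    extern "C" jlong Java_example_Filter_create(JNIEnv* /*env*/, jclass /*jcls*/,
                                                jboolean purge_ttl_on_expiration,
                                                jint gc_grace_period_in_seconds) {
      // The real bridge constructs the native filter with both values, e.g.
      // new rocksdb::cassandra::CassandraCompactionFilter(
      //     purge_ttl_on_expiration, gc_grace_period_in_seconds);
      (void)purge_ttl_on_expiration;
      (void)gc_grace_period_in_seconds;
      return 0;  // placeholder for the sketch
    }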
+#include #include #include -#include -#include #include +#include #include "include/org_rocksdb_CassandraValueMergeOperator.h" -#include "rocksjni/portal.h" #include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" -#include "rocksdb/memtablerep.h" #include "rocksdb/table.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/merge_operator.h" +#include "rocksjni/portal.h" #include "utilities/cassandra/merge_operator.h" /* * Class: org_rocksdb_CassandraValueMergeOperator * Method: newSharedCassandraValueMergeOperator - * Signature: ()J + * Signature: (II)J */ -jlong Java_org_rocksdb_CassandraValueMergeOperator_newSharedCassandraValueMergeOperator -(JNIEnv* env, jclass jclazz) { - auto* sptr_string_append_op = new std::shared_ptr( - rocksdb::CassandraValueMergeOperator::CreateSharedInstance()); - return reinterpret_cast(sptr_string_append_op); +jlong Java_org_rocksdb_CassandraValueMergeOperator_newSharedCassandraValueMergeOperator( + JNIEnv* /*env*/, jclass /*jclazz*/, jint gcGracePeriodInSeconds, + jint operands_limit) { + auto* op = new std::shared_ptr( + new rocksdb::cassandra::CassandraValueMergeOperator( + gcGracePeriodInSeconds, operands_limit)); + return reinterpret_cast(op); } /* @@ -38,8 +40,8 @@ jlong Java_org_rocksdb_CassandraValueMergeOperator_newSharedCassandraValueMergeO * Signature: (J)V */ void Java_org_rocksdb_CassandraValueMergeOperator_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto* sptr_string_append_op = - reinterpret_cast* >(jhandle); - delete sptr_string_append_op; // delete std::shared_ptr + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* op = + reinterpret_cast*>(jhandle); + delete op; } diff --git a/thirdparty/rocksdb/java/rocksjni/checkpoint.cc b/thirdparty/rocksdb/java/rocksjni/checkpoint.cc index 426f5d029e..f67f016296 100644 --- a/thirdparty/rocksdb/java/rocksjni/checkpoint.cc +++ b/thirdparty/rocksdb/java/rocksjni/checkpoint.cc @@ -6,22 +6,23 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::Checkpoint methods from Java side. 
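Note the ownership idiom in the merge-operator bridge above: the jlong handle addresses a heap-allocated std::shared_ptr, so disposeInternal deletes the shared_ptr (dropping one reference) rather than the operator itself, which an Options instance on the C++ side may still hold. A condensed sketch, with MergeOp standing in for rocksdb::MergeOperator:

    #include <jni.h>
    #include <memory>

    struct MergeOp {};  // stand-in for rocksdb::MergeOperator

    extern "C" jlong create_op(JNIEnv* /*env*/, jclass /*jcls*/) {
      // The handle owns a shared_ptr, not the operator directly.
      auto* sptr = new std::shared_ptr<MergeOp>(std::make_shared<MergeOp>());
      return reinterpret_cast<jlong>(sptr);
    }

    extern "C" void dispose_op(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
      // Deleting the shared_ptr releases this reference; the operator is
      // destroyed only once no other C++ owner holds it.
      delete reinterpret_cast<std::shared_ptr<MergeOp>*>(jhandle);
    }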
+#include #include #include -#include #include #include "include/org_rocksdb_Checkpoint.h" -#include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/utilities/checkpoint.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_Checkpoint * Method: newCheckpoint * Signature: (J)J */ -jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* env, - jclass jclazz, jlong jdb_handle) { +jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* /*env*/, + jclass /*jclazz*/, + jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); rocksdb::Checkpoint* checkpoint; rocksdb::Checkpoint::Create(db, &checkpoint); @@ -33,8 +34,9 @@ jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* env, * Method: dispose * Signature: (J)V */ -void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* env, jobject jobj, - jlong jhandle) { +void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* checkpoint = reinterpret_cast(jhandle); assert(checkpoint != nullptr); delete checkpoint; @@ -45,24 +47,21 @@ void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* env, jobject jobj, * Method: createCheckpoint * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_Checkpoint_createCheckpoint( - JNIEnv* env, jobject jobj, jlong jcheckpoint_handle, - jstring jcheckpoint_path) { - const char* checkpoint_path = env->GetStringUTFChars( - jcheckpoint_path, 0); - if(checkpoint_path == nullptr) { +void Java_org_rocksdb_Checkpoint_createCheckpoint(JNIEnv* env, jobject /*jobj*/, + jlong jcheckpoint_handle, + jstring jcheckpoint_path) { + const char* checkpoint_path = env->GetStringUTFChars(jcheckpoint_path, 0); + if (checkpoint_path == nullptr) { // exception thrown: OutOfMemoryError return; } - auto* checkpoint = reinterpret_cast( - jcheckpoint_handle); - rocksdb::Status s = checkpoint->CreateCheckpoint( - checkpoint_path); - + auto* checkpoint = reinterpret_cast(jcheckpoint_handle); + rocksdb::Status s = checkpoint->CreateCheckpoint(checkpoint_path); + env->ReleaseStringUTFChars(jcheckpoint_path, checkpoint_path); - + if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } diff --git a/thirdparty/rocksdb/java/rocksjni/clock_cache.cc b/thirdparty/rocksdb/java/rocksjni/clock_cache.cc index 0a4d7b28d6..b1cf0855ed 100644 --- a/thirdparty/rocksdb/java/rocksjni/clock_cache.cc +++ b/thirdparty/rocksdb/java/rocksjni/clock_cache.cc @@ -17,12 +17,11 @@ * Signature: (JIZ)J */ jlong Java_org_rocksdb_ClockCache_newClockCache( - JNIEnv* env, jclass jcls, jlong jcapacity, jint jnum_shard_bits, + JNIEnv* /*env*/, jclass /*jcls*/, jlong jcapacity, jint jnum_shard_bits, jboolean jstrict_capacity_limit) { auto* sptr_clock_cache = new std::shared_ptr(rocksdb::NewClockCache( - static_cast(jcapacity), - static_cast(jnum_shard_bits), + static_cast(jcapacity), static_cast(jnum_shard_bits), static_cast(jstrict_capacity_limit))); return reinterpret_cast(sptr_clock_cache); } @@ -32,9 +31,10 @@ jlong Java_org_rocksdb_ClockCache_newClockCache( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ClockCache_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { +void Java_org_rocksdb_ClockCache_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* sptr_clock_cache = - reinterpret_cast *>(jhandle); + reinterpret_cast*>(jhandle); delete sptr_clock_cache; // delete std::shared_ptr } diff --git a/thirdparty/rocksdb/java/rocksjni/columnfamilyhandle.cc 
b/thirdparty/rocksdb/java/rocksjni/columnfamilyhandle.cc index 6e40a7e010..ed28057386 100644 --- a/thirdparty/rocksdb/java/rocksjni/columnfamilyhandle.cc +++ b/thirdparty/rocksdb/java/rocksjni/columnfamilyhandle.cc @@ -3,24 +3,70 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -// This file implements the "bridge" between Java and C++ and enables -// calling c++ rocksdb::Iterator methods from Java side. +// This file implements the "bridge" between Java and C++ for +// rocksdb::ColumnFamilyHandle. +#include #include #include -#include #include "include/org_rocksdb_ColumnFamilyHandle.h" #include "rocksjni/portal.h" +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: getName + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_ColumnFamilyHandle_getName(JNIEnv* env, + jobject /*jobj*/, + jlong jhandle) { + auto* cfh = reinterpret_cast(jhandle); + std::string cf_name = cfh->GetName(); + return rocksdb::JniUtil::copyBytes(env, cf_name); +} + +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: getID + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyHandle_getID(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* cfh = reinterpret_cast(jhandle); + const int32_t id = cfh->GetID(); + return static_cast(id); +} + +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: getDescriptor + * Signature: (J)Lorg/rocksdb/ColumnFamilyDescriptor; + */ +jobject Java_org_rocksdb_ColumnFamilyHandle_getDescriptor(JNIEnv* env, + jobject /*jobj*/, + jlong jhandle) { + auto* cfh = reinterpret_cast(jhandle); + rocksdb::ColumnFamilyDescriptor desc; + rocksdb::Status s = cfh->GetDescriptor(&desc); + if (s.ok()) { + return rocksdb::ColumnFamilyDescriptorJni::construct(env, &desc); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } +} + /* * Class: org_rocksdb_ColumnFamilyHandle * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - auto* cfh = reinterpret_cast(handle); +void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* cfh = reinterpret_cast(jhandle); assert(cfh != nullptr); delete cfh; } diff --git a/thirdparty/rocksdb/java/rocksjni/compact_range_options.cc b/thirdparty/rocksdb/java/rocksjni/compact_range_options.cc new file mode 100644 index 0000000000..cc9ac859e8 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compact_range_options.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactRangeOptions. 
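For orientation, the new file wraps rocksdb::CompactRangeOptions, whose fields are consumed by DB::CompactRange on the C++ side. A sketch of that consumption (the values set are illustrative, not defaults):

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    void compact_whole_db(rocksdb::DB* db) {
      rocksdb::CompactRangeOptions opts;
      opts.exclusive_manual_compaction = true;  // surfaced below via setExclusiveManualCompaction
      opts.change_level = true;                 // surfaced below via setChangeLevel
      opts.target_level = 1;                    // surfaced below via setTargetLevel
      // Null begin/end slices mean "compact the entire key range".
      db->CompactRange(opts, nullptr, nullptr);
    }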
+ +#include + +#include "include/org_rocksdb_CompactRangeOptions.h" +#include "rocksdb/options.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: newCompactRangeOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( + JNIEnv* /*env*/, jclass /*jclazz*/) { + auto* options = new rocksdb::CompactRangeOptions(); + return reinterpret_cast(options); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: exclusiveManualCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->exclusive_manual_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setExclusiveManualCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { + auto* options = + reinterpret_cast(jhandle); + options->exclusive_manual_compaction = static_cast(exclusive_manual_compaction); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: bottommostLevelCompaction + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return rocksdb::BottommostLevelCompactionJni::toJavaBottommostLevelCompaction( + options->bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setBottommostLevelCompaction + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jint bottommost_level_compaction) { + auto* options = reinterpret_cast(jhandle); + options->bottommost_level_compaction = + rocksdb::BottommostLevelCompactionJni::toCppBottommostLevelCompaction(bottommost_level_compaction); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: changeLevel + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setChangeLevel + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setChangeLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { + auto* options = reinterpret_cast(jhandle); + options->change_level = static_cast(change_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetLevel + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_level) { + auto* options = reinterpret_cast(jhandle); + options->target_level = static_cast(target_level); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: targetPathId + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_targetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, 
jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setTargetPathId + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setTargetPathId + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint target_path_id) { + auto* options = reinterpret_cast(jhandle); + options->target_path_id = static_cast(target_path_id); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: allowWriteStall + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->allow_write_stall); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setAllowWriteStall + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean allow_write_stall) { + auto* options = reinterpret_cast(jhandle); + options->allow_write_stall = static_cast(allow_write_stall); +} + + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: maxSubcompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return static_cast(options->max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setMaxSubcompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions + (JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { + auto* options = reinterpret_cast(jhandle); + options->max_subcompactions = static_cast(max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_CompactRangeOptions_disposeInternal( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + delete options; +} diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_filter.cc b/thirdparty/rocksdb/java/rocksjni/compaction_filter.cc index 72de46b3fb..263bae05ee 100644 --- a/thirdparty/rocksdb/java/rocksjni/compaction_filter.cc +++ b/thirdparty/rocksdb/java/rocksjni/compaction_filter.cc @@ -18,8 +18,9 @@ * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_AbstractCompactionFilter_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_AbstractCompactionFilter_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { auto* cf = reinterpret_cast(handle); assert(cf != nullptr); delete cf; diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory.cc b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory.cc new file mode 100644 index 0000000000..2ef0a77462 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactionFilterFactory. 
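The factory bridge below stores the Java-backed factory behind a heap-allocated std::shared_ptr because RocksDB itself takes shared ownership through ColumnFamilyOptions. A sketch of where that shared_ptr ultimately lands (the install helper is invented for illustration):

    #include <memory>
    #include <utility>

    #include "rocksdb/compaction_filter.h"
    #include "rocksdb/options.h"

    void install(rocksdb::ColumnFamilyOptions* cf_opts,
                 std::shared_ptr<rocksdb::CompactionFilterFactory> factory) {
      // RocksDB keeps its own reference for as long as the column family
      // lives, so the JNI handle and the database can release independently.
      cf_opts->compaction_filter_factory = std::move(factory);
    }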
+ +#include +#include + +#include "include/org_rocksdb_AbstractCompactionFilterFactory.h" +#include "rocksjni/compaction_filter_factory_jnicallback.h" + +/* + * Class: org_rocksdb_AbstractCompactionFilterFactory + * Method: createNewCompactionFilterFactory0 + * Signature: ()J + */ +jlong Java_org_rocksdb_AbstractCompactionFilterFactory_createNewCompactionFilterFactory0( + JNIEnv* env, jobject jobj) { + auto* cff = new rocksdb::CompactionFilterFactoryJniCallback(env, jobj); + auto* ptr_sptr_cff = + new std::shared_ptr(cff); + return reinterpret_cast(ptr_sptr_cff); +} + +/* + * Class: org_rocksdb_AbstractCompactionFilterFactory + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_AbstractCompactionFilterFactory_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* ptr_sptr_cff = reinterpret_cast< + std::shared_ptr*>(jhandle); + delete ptr_sptr_cff; +} diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc new file mode 100644 index 0000000000..c727a3e02f --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::CompactionFilterFactory. + +#include "rocksjni/compaction_filter_factory_jnicallback.h" +#include "rocksjni/portal.h" + +namespace rocksdb { +CompactionFilterFactoryJniCallback::CompactionFilterFactoryJniCallback( + JNIEnv* env, jobject jcompaction_filter_factory) + : JniCallback(env, jcompaction_filter_factory) { + + // Note: The name of a CompactionFilterFactory will not change during + // it's lifetime, so we cache it in a global var + jmethodID jname_method_id = + AbstractCompactionFilterFactoryJni::getNameMethodId(env); + if(jname_method_id == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return; + } + + jstring jname = + (jstring)env->CallObjectMethod(m_jcallback_obj, jname_method_id); + if(env->ExceptionCheck()) { + // exception thrown + return; + } + jboolean has_exception = JNI_FALSE; + m_name = JniUtil::copyString(env, jname, &has_exception); // also releases jname + if (has_exception == JNI_TRUE) { + // exception thrown + return; + } + + m_jcreate_compaction_filter_methodid = + AbstractCompactionFilterFactoryJni::getCreateCompactionFilterMethodId(env); + if(m_jcreate_compaction_filter_methodid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return; + } +} + +const char* CompactionFilterFactoryJniCallback::Name() const { + return m_name.get(); +} + +std::unique_ptr CompactionFilterFactoryJniCallback::CreateCompactionFilter( + const CompactionFilter::Context& context) { + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + assert(env != nullptr); + + jlong addr_compaction_filter = env->CallLongMethod(m_jcallback_obj, + m_jcreate_compaction_filter_methodid, + static_cast(context.is_full_compaction), + static_cast(context.is_manual_compaction)); + + if(env->ExceptionCheck()) { + // exception thrown from CallLongMethod + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return nullptr; + } + + auto* cff = 
reinterpret_cast(addr_compaction_filter); + + releaseJniEnv(attached_thread); + + return std::unique_ptr(cff); +} + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h new file mode 100644 index 0000000000..10802edfd1 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compaction_filter_factory_jnicallback.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::CompactionFilterFactory. + +#ifndef JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_ + +#include +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksjni/jnicallback.h" + +namespace rocksdb { + +class CompactionFilterFactoryJniCallback : public JniCallback, public CompactionFilterFactory { + public: + CompactionFilterFactoryJniCallback( + JNIEnv* env, jobject jcompaction_filter_factory); + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context); + virtual const char* Name() const; + + private: + std::unique_ptr m_name; + jmethodID m_jcreate_compaction_filter_methodid; +}; + +} //namespace rocksdb + +#endif // JAVA_ROCKSJNI_COMPACTION_FILTER_FACTORY_JNICALLBACK_H_ diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_job_info.cc b/thirdparty/rocksdb/java/rocksjni/compaction_job_info.cc new file mode 100644 index 0000000000..6af6efcb85 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compaction_job_info.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactionJobInfo. 
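The getJniEnv()/releaseJniEnv() pair used by the callback above hides a detail worth spelling out: compactions run on RocksDB background threads that are not attached to the JVM, so the callback must attach before invoking Java and detach afterwards. A simplified sketch of that mechanism, assuming jvm_, jobj_ and mid_ members cached at construction time (the real logic lives in the JniCallback base class):

    #include <jni.h>

    struct CallbackSketch {
      JavaVM* jvm_;    // cached from env->GetJavaVM() at construction
      jobject jobj_;   // global reference to the Java factory object
      jmethodID mid_;  // cached createCompactionFilter method id

      jlong invoke(jboolean full, jboolean manual) {
        JNIEnv* env = nullptr;
        const bool must_attach =
            jvm_->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK;
        if (must_attach) {
          jvm_->AttachCurrentThread(reinterpret_cast<void**>(&env), nullptr);
        }
        // Java returns the address of a freshly constructed native filter.
        jlong addr = env->CallLongMethod(jobj_, mid_, full, manual);
        if (env->ExceptionCheck()) {
          env->ExceptionDescribe();
          addr = 0;
        }
        if (must_attach) {
          jvm_->DetachCurrentThread();
        }
        return addr;
      }
    };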
+ +#include + +#include "include/org_rocksdb_CompactionJobInfo.h" +#include "rocksdb/listener.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: newCompactionJobInfo + * Signature: ()J + */ +jlong Java_org_rocksdb_CompactionJobInfo_newCompactionJobInfo( + JNIEnv*, jclass) { + auto* compact_job_info = new rocksdb::CompactionJobInfo(); + return reinterpret_cast(compact_job_info); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_CompactionJobInfo_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + delete compact_job_info; +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: columnFamilyName + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_CompactionJobInfo_columnFamilyName( + JNIEnv* env, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::JniUtil::copyBytes( + env, compact_job_info->cf_name); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: status + * Signature: (J)Lorg/rocksdb/Status; + */ +jobject Java_org_rocksdb_CompactionJobInfo_status( + JNIEnv* env, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::StatusJni::construct( + env, compact_job_info->status); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: threadId + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompactionJobInfo_threadId( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return static_cast(compact_job_info->thread_id); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: jobId + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactionJobInfo_jobId( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return static_cast(compact_job_info->job_id); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: baseInputLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactionJobInfo_baseInputLevel( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return static_cast(compact_job_info->base_input_level); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: outputLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_CompactionJobInfo_outputLevel( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return static_cast(compact_job_info->output_level); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: inputFiles + * Signature: (J)[Ljava/lang/String; + */ +jobjectArray Java_org_rocksdb_CompactionJobInfo_inputFiles( + JNIEnv* env, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::JniUtil::toJavaStrings( + env, &compact_job_info->input_files); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: outputFiles + * Signature: (J)[Ljava/lang/String; + */ +jobjectArray Java_org_rocksdb_CompactionJobInfo_outputFiles( + JNIEnv* env, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::JniUtil::toJavaStrings( + env, &compact_job_info->output_files); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: tableProperties + * Signature: (J)Ljava/util/Map; + */ +jobject Java_org_rocksdb_CompactionJobInfo_tableProperties( + JNIEnv* env, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + auto* 
map = &compact_job_info->table_properties; + + jobject jhash_map = rocksdb::HashMapJni::construct( + env, static_cast(map->size())); + if (jhash_map == nullptr) { + // exception occurred + return nullptr; + } + + const rocksdb::HashMapJni::FnMapKV, jobject, jobject> fn_map_kv = + [env](const std::pair>& kv) { + jstring jkey = rocksdb::JniUtil::toJavaString(env, &(kv.first), false); + if (env->ExceptionCheck()) { + // an error occurred + return std::unique_ptr>(nullptr); + } + + jobject jtable_properties = rocksdb::TablePropertiesJni::fromCppTableProperties( + env, *(kv.second.get())); + if (env->ExceptionCheck()) { + // an error occurred + env->DeleteLocalRef(jkey); + return std::unique_ptr>(nullptr); + } + + return std::unique_ptr>( + new std::pair(static_cast(jkey), jtable_properties)); + }; + + if (!rocksdb::HashMapJni::putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) { + // exception occurred + return nullptr; + } + + return jhash_map; +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: compactionReason + * Signature: (J)B + */ +jbyte Java_org_rocksdb_CompactionJobInfo_compactionReason( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::CompactionReasonJni::toJavaCompactionReason( + compact_job_info->compaction_reason); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: compression + * Signature: (J)B + */ +jbyte Java_org_rocksdb_CompactionJobInfo_compression( + JNIEnv*, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + return rocksdb::CompressionTypeJni::toJavaCompressionType( + compact_job_info->compression); +} + +/* + * Class: org_rocksdb_CompactionJobInfo + * Method: stats + * Signature: (J)J + */ +jlong Java_org_rocksdb_CompactionJobInfo_stats( + JNIEnv *, jclass, jlong jhandle) { + auto* compact_job_info = + reinterpret_cast(jhandle); + auto* stats = new rocksdb::CompactionJobStats(); + stats->Add(compact_job_info->stats); + return reinterpret_cast(stats); +} diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_job_stats.cc b/thirdparty/rocksdb/java/rocksjni/compaction_job_stats.cc new file mode 100644 index 0000000000..7d13dd12f9 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/compaction_job_stats.cc @@ -0,0 +1,361 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::CompactionJobStats. 
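The tableProperties conversion above leans on the HashMapJni/FnMapKV helpers from rocksjni/portal.h. Stripped of those helpers, the underlying JNI traffic looks roughly like this sketch, which copies a C++ string map into a java.util.HashMap (error handling elided for brevity):

    #include <jni.h>

    #include <map>
    #include <string>

    jobject to_java_map(JNIEnv* env, const std::map<std::string, std::string>& m) {
      jclass cls = env->FindClass("java/util/HashMap");
      jmethodID ctor = env->GetMethodID(cls, "<init>", "(I)V");
      jmethodID put = env->GetMethodID(
          cls, "put", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;");
      jobject jmap = env->NewObject(cls, ctor, static_cast<jint>(m.size()));
      for (const auto& kv : m) {
        jstring k = env->NewStringUTF(kv.first.c_str());
        jstring v = env->NewStringUTF(kv.second.c_str());
        // put() returns the previous value; free every local ref promptly,
        // just as the lambda above deletes jkey on its error path.
        jobject prev = env->CallObjectMethod(jmap, put, k, v);
        if (prev != nullptr) {
          env->DeleteLocalRef(prev);
        }
        env->DeleteLocalRef(k);
        env->DeleteLocalRef(v);
      }
      return jmap;
    }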
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionJobStats.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksjni/portal.h"
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: newCompactionJobStats
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_newCompactionJobStats(
+    JNIEnv*, jclass) {
+  auto* compact_job_stats = new rocksdb::CompactionJobStats();
+  return reinterpret_cast<jlong>(compact_job_stats);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionJobStats_disposeInternal(
+    JNIEnv *, jobject, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  delete compact_job_stats;
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: reset
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionJobStats_reset(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  compact_job_stats->Reset();
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: add
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionJobStats_add(
+    JNIEnv*, jclass, jlong jhandle, jlong jother_handle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  auto* other_compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jother_handle);
+  compact_job_stats->Add(*other_compact_job_stats);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: elapsedMicros
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_elapsedMicros(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->elapsed_micros);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numInputRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_records);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numInputFiles
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputFiles(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(compact_job_stats->num_input_files);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numInputFilesAtOutputLevel
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputFilesAtOutputLevel(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_input_files_at_output_level);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numOutputRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numOutputRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_output_records);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numOutputFiles
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numOutputFiles(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_output_files);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: isManualCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionJobStats_isManualCompaction(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  if (compact_job_stats->is_manual_compaction) {
+    return JNI_TRUE;
+  } else {
+    return JNI_FALSE;
+  }
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: totalInputBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputBytes(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->total_input_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: totalOutputBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalOutputBytes(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->total_output_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numRecordsReplaced
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numRecordsReplaced(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_records_replaced);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: totalInputRawKeyBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputRawKeyBytes(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->total_input_raw_key_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: totalInputRawValueBytes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_totalInputRawValueBytes(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->total_input_raw_value_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numInputDeletionRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numInputDeletionRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_input_deletion_records);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numExpiredDeletionRecords
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numExpiredDeletionRecords(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_expired_deletion_records);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numCorruptKeys
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numCorruptKeys(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_corrupt_keys);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: fileWriteNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileWriteNanos(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->file_write_nanos);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: fileRangeSyncNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileRangeSyncNanos(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->file_range_sync_nanos);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: fileFsyncNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_fileFsyncNanos(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->file_fsync_nanos);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: filePrepareWriteNanos
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_filePrepareWriteNanos(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->file_prepare_write_nanos);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: smallestOutputKeyPrefix
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_CompactionJobStats_smallestOutputKeyPrefix(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return rocksdb::JniUtil::copyBytes(env,
+      compact_job_stats->smallest_output_key_prefix);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: largestOutputKeyPrefix
+ * Signature: (J)[B
+ */
+jbyteArray Java_org_rocksdb_CompactionJobStats_largestOutputKeyPrefix(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return rocksdb::JniUtil::copyBytes(env,
+      compact_job_stats->largest_output_key_prefix);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numSingleDelFallthru
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numSingleDelFallthru(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_single_del_fallthru);
+}
+
+/*
+ * Class: org_rocksdb_CompactionJobStats
+ * Method: numSingleDelMismatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionJobStats_numSingleDelMismatch(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_job_stats =
+      reinterpret_cast<rocksdb::CompactionJobStats*>(jhandle);
+  return static_cast<jlong>(
+      compact_job_stats->num_single_del_mismatch);
+}
\ No newline at end of file
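Every method in this new CompactionJobStats bridge follows the same handle convention used throughout rocksjni: Java keeps the C++ object's address in a long field, and each native call casts that handle back before touching the object. A minimal sketch of the pattern outside of JNI (the Stats struct and function names below are illustrative only, not part of this patch):

#include <cstdint>

struct Stats { uint64_t elapsed_micros = 0; };

// "Constructor": heap-allocate and return the address as an opaque
// 64-bit handle (the value the Java side would store in a long field).
int64_t NewStats() { return reinterpret_cast<int64_t>(new Stats()); }

// Accessor: cast the handle back to the concrete type before reading.
uint64_t ElapsedMicros(int64_t handle) {
  return reinterpret_cast<Stats*>(handle)->elapsed_micros;
}

// "disposeInternal": must be called exactly once per handle.
void DisposeStats(int64_t handle) { delete reinterpret_cast<Stats*>(handle); }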
diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_options.cc b/thirdparty/rocksdb/java/rocksjni/compaction_options.cc
new file mode 100644
index 0000000000..6aaabea736
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/compaction_options.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::CompactionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_CompactionOptions.h"
+#include "rocksdb/options.h"
+#include "rocksjni/portal.h"
+
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: newCompactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_CompactionOptions_newCompactionOptions(
+    JNIEnv*, jclass) {
+  auto* compact_opts = new rocksdb::CompactionOptions();
+  return reinterpret_cast<jlong>(compact_opts);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_CompactionOptions_disposeInternal(
+    JNIEnv *, jobject, jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  delete compact_opts;
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: compression
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_CompactionOptions_compression(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  return rocksdb::CompressionTypeJni::toJavaCompressionType(
+      compact_opts->compression);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: setCompression
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_CompactionOptions_setCompression(
+    JNIEnv*, jclass, jlong jhandle, jbyte jcompression_type_value) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  compact_opts->compression =
+      rocksdb::CompressionTypeJni::toCppCompressionType(
+          jcompression_type_value);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: outputFileSizeLimit
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_CompactionOptions_outputFileSizeLimit(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  return static_cast<jlong>(
+      compact_opts->output_file_size_limit);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: setOutputFileSizeLimit
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_CompactionOptions_setOutputFileSizeLimit(
+    JNIEnv*, jclass, jlong jhandle, jlong joutput_file_size_limit) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  compact_opts->output_file_size_limit =
+      static_cast<uint64_t>(joutput_file_size_limit);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: maxSubcompactions
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompactionOptions_maxSubcompactions(
+    JNIEnv*, jclass, jlong jhandle) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  return static_cast<jint>(
+      compact_opts->max_subcompactions);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptions
+ * Method: setMaxSubcompactions
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompactionOptions_setMaxSubcompactions(
+    JNIEnv*, jclass, jlong jhandle, jint jmax_subcompactions) {
+  auto* compact_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jhandle);
+  compact_opts->max_subcompactions =
+      static_cast<uint32_t>(jmax_subcompactions);
+}
\ No newline at end of file
diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_options_fifo.cc b/thirdparty/rocksdb/java/rocksjni/compaction_options_fifo.cc
index ef04d81c64..b7c445fd6f 100644
--- a/thirdparty/rocksdb/java/rocksjni/compaction_options_fifo.cc
+++ b/thirdparty/rocksdb/java/rocksjni/compaction_options_fifo.cc
@@ -17,7 +17,7 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO(
-    JNIEnv* env, jclass jcls) {
+    JNIEnv*, jclass) {
   const auto* opt = new rocksdb::CompactionOptionsFIFO();
   return reinterpret_cast<jlong>(opt);
 }
@@ -28,7 +28,7 @@ jlong Java_org_rocksdb_CompactionOptionsFIFO_newCompactionOptionsFIFO(
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) {
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>(jhandle);
   opt->max_table_files_size = static_cast<uint64_t>(jmax_table_files_size);
 }
@@ -39,17 +39,39 @@ void Java_org_rocksdb_CompactionOptionsFIFO_setMaxTableFilesSize(
  * Signature: (J)J
  */
 jlong Java_org_rocksdb_CompactionOptionsFIFO_maxTableFilesSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>(jhandle);
   return static_cast<jlong>(opt->max_table_files_size);
 }
 
+/*
+ * Class: org_rocksdb_CompactionOptionsFIFO
+ * Method: setAllowCompaction
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompactionOptionsFIFO_setAllowCompaction(
+    JNIEnv*, jobject, jlong jhandle, jboolean allow_compaction) {
+  auto* opt = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>(jhandle);
+  opt->allow_compaction = static_cast<bool>(allow_compaction);
+}
+
+/*
+ * Class: org_rocksdb_CompactionOptionsFIFO
+ * Method: allowCompaction
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompactionOptionsFIFO_allowCompaction(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>(jhandle);
+  return static_cast<jboolean>(opt->allow_compaction);
+}
+
 /*
  * Class: org_rocksdb_CompactionOptionsFIFO
  * Method: disposeInternal
  * Signature: (J)V
  */
 void Java_org_rocksdb_CompactionOptionsFIFO_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   delete reinterpret_cast<rocksdb::CompactionOptionsFIFO*>(jhandle);
 }
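The new getter/setter pair above exposes CompactionOptionsFIFO::allow_compaction to Java. On the native side the same knob is plain struct assignment; a hedged sketch of equivalent C++ usage (the values are arbitrary examples, not defaults from this patch):

#include "rocksdb/options.h"

rocksdb::Options MakeFifoOptions() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleFIFO;
  // The two fields this JNI file bridges: a size cap for all table files,
  // and whether FIFO may also run intra-level compactions.
  options.compaction_options_fifo.max_table_files_size = 1024 * 1024 * 1024;
  options.compaction_options_fifo.allow_compaction = true;
  return options;
}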
diff --git a/thirdparty/rocksdb/java/rocksjni/compaction_options_universal.cc b/thirdparty/rocksdb/java/rocksjni/compaction_options_universal.cc
index d397db8e43..7ca519885d 100644
--- a/thirdparty/rocksdb/java/rocksjni/compaction_options_universal.cc
+++ b/thirdparty/rocksdb/java/rocksjni/compaction_options_universal.cc
@@ -18,7 +18,7 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_CompactionOptionsUniversal_newCompactionOptionsUniversal(
-    JNIEnv* env, jclass jcls) {
+    JNIEnv*, jclass) {
   const auto* opt = new rocksdb::CompactionOptionsUniversal();
   return reinterpret_cast<jlong>(opt);
 }
@@ -29,7 +29,7 @@ jlong Java_org_rocksdb_CompactionOptionsUniversal_newCompactionOptionsUniversal(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jsize_ratio) {
+    JNIEnv*, jobject, jlong jhandle, jint jsize_ratio) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->size_ratio = static_cast<unsigned int>(jsize_ratio);
 }
@@ -40,7 +40,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setSizeRatio(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return static_cast<jint>(opt->size_ratio);
 }
@@ -51,7 +51,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_sizeRatio(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jmin_merge_width) {
+    JNIEnv*, jobject, jlong jhandle, jint jmin_merge_width) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->min_merge_width = static_cast<unsigned int>(jmin_merge_width);
 }
@@ -62,7 +62,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMinMergeWidth(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return static_cast<jint>(opt->min_merge_width);
 }
@@ -73,7 +73,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_minMergeWidth(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_merge_width) {
+    JNIEnv*, jobject, jlong jhandle, jint jmax_merge_width) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->max_merge_width = static_cast<unsigned int>(jmax_merge_width);
 }
@@ -84,7 +84,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxMergeWidth(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return static_cast<jint>(opt->max_merge_width);
 }
@@ -95,8 +95,7 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_maxMergeWidth(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setMaxSizeAmplificationPercent(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint jmax_size_amplification_percent) {
+    JNIEnv*, jobject, jlong jhandle, jint jmax_size_amplification_percent) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->max_size_amplification_percent =
       static_cast<unsigned int>(jmax_size_amplification_percent);
@@ -108,7 +107,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setMaxSizeAmplificationPercent(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return static_cast<jint>(opt->max_size_amplification_percent);
 }
@@ -119,7 +118,8 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_maxSizeAmplificationPercent(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jcompression_size_percent) {
+    JNIEnv*, jobject, jlong jhandle,
+    jint jcompression_size_percent) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->compression_size_percent =
       static_cast<int>(jcompression_size_percent);
@@ -131,7 +131,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setCompressionSizePercent(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompactionOptionsUniversal_compressionSizePercent(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return static_cast<jint>(opt->compression_size_percent);
 }
@@ -142,11 +142,10 @@ jint Java_org_rocksdb_CompactionOptionsUniversal_compressionSizePercent(
  * Signature: (JB)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle(
-    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jstop_style_value) {
+    JNIEnv*, jobject, jlong jhandle, jbyte jstop_style_value) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
-  opt->stop_style =
-      rocksdb::CompactionStopStyleJni::toCppCompactionStopStyle(
-          jstop_style_value);
+  opt->stop_style = rocksdb::CompactionStopStyleJni::toCppCompactionStopStyle(
+      jstop_style_value);
 }
 
 /*
@@ -155,7 +154,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setStopStyle(
  * Signature: (J)B
  */
 jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return rocksdb::CompactionStopStyleJni::toJavaCompactionStopStyle(
       opt->stop_style);
@@ -167,7 +166,7 @@ jbyte Java_org_rocksdb_CompactionOptionsUniversal_stopStyle(
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_setAllowTrivialMove(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_trivial_move) {
+    JNIEnv*, jobject, jlong jhandle, jboolean jallow_trivial_move) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   opt->allow_trivial_move = static_cast<bool>(jallow_trivial_move);
 }
@@ -178,7 +177,7 @@ void Java_org_rocksdb_CompactionOptionsUniversal_setAllowTrivialMove(
  * Signature: (J)Z
  */
 jboolean Java_org_rocksdb_CompactionOptionsUniversal_allowTrivialMove(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
   return opt->allow_trivial_move;
 }
@@ -189,6 +188,6 @@ jboolean Java_org_rocksdb_CompactionOptionsUniversal_allowTrivialMove(
  * Signature: (J)V
  */
 void Java_org_rocksdb_CompactionOptionsUniversal_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   delete reinterpret_cast<rocksdb::CompactionOptionsUniversal*>(jhandle);
 }
diff --git a/thirdparty/rocksdb/java/rocksjni/comparator.cc b/thirdparty/rocksdb/java/rocksjni/comparator.cc
index 5955d0bf75..13f8feb6f3 100644
--- a/thirdparty/rocksdb/java/rocksjni/comparator.cc
+++ b/thirdparty/rocksdb/java/rocksjni/comparator.cc
@@ -6,33 +6,18 @@
 // This file implements the "bridge" between Java and C++ for
 // rocksdb::Comparator.
 
+#include <jni.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <jni.h>
-#include <string>
 #include <functional>
+#include <string>
 
-#include "include/org_rocksdb_AbstractComparator.h"
 #include "include/org_rocksdb_Comparator.h"
 #include "include/org_rocksdb_DirectComparator.h"
+#include "include/org_rocksdb_NativeComparatorWrapper.h"
 #include "rocksjni/comparatorjnicallback.h"
 #include "rocksjni/portal.h"
 
-// <editor-fold desc="org.rocksdb.AbstractComparator>
 /*
@@ -40,12 +25,12 @@ void Java_org_rocksdb_AbstractComparator_disposeInternal(
  * Method: createNewComparator0
  * Signature: ()J
  */
-jlong Java_org_rocksdb_Comparator_createNewComparator0(
-    JNIEnv* env, jobject jobj, jlong copt_handle) {
-  const rocksdb::ComparatorJniCallbackOptions* copt =
-      reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
-  const rocksdb::ComparatorJniCallback* c =
-      new rocksdb::ComparatorJniCallback(env, jobj, copt);
+jlong Java_org_rocksdb_Comparator_createNewComparator0(JNIEnv* env,
+                                                       jobject jobj,
+                                                       jlong copt_handle) {
+  auto* copt =
+      reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
+  auto* c = new rocksdb::ComparatorJniCallback(env, jobj, copt);
   return reinterpret_cast<jlong>(c);
 }
 // </editor-fold>
@@ -59,10 +44,20 @@ jlong Java_org_rocksdb_Comparator_createNewComparator0(
  */
 jlong Java_org_rocksdb_DirectComparator_createNewDirectComparator0(
     JNIEnv* env, jobject jobj, jlong copt_handle) {
-  const rocksdb::ComparatorJniCallbackOptions* copt =
-      reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
-  const rocksdb::DirectComparatorJniCallback* c =
-      new rocksdb::DirectComparatorJniCallback(env, jobj, copt);
+  auto* copt =
+      reinterpret_cast<rocksdb::ComparatorJniCallbackOptions*>(copt_handle);
+  auto* c = new rocksdb::DirectComparatorJniCallback(env, jobj, copt);
   return reinterpret_cast<jlong>(c);
 }
+
+/*
+ * Class: org_rocksdb_NativeComparatorWrapper
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_NativeComparatorWrapper_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jcomparator_handle) {
+  auto* comparator = reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+  delete comparator;
+}
 // </editor-fold>
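The new NativeComparatorWrapper entry point lets Java dispose of a purely native Comparator by handle, avoiding the per-call JNI round trips that a Java-implemented comparator incurs. A hedged sketch of what a C++-side comparator registered this way might look like (this class is illustrative, not part of the patch):

#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"

// A native comparator that orders slices by length first, then bytewise.
// Java would only hold its address as a long handle; every Compare() call
// stays entirely in C++.
class LengthFirstComparator : public rocksdb::Comparator {
 public:
  const char* Name() const override { return "LengthFirstComparator"; }
  int Compare(const rocksdb::Slice& a,
              const rocksdb::Slice& b) const override {
    if (a.size() != b.size()) return a.size() < b.size() ? -1 : 1;
    return a.compare(b);
  }
  // No-op key-shortening hooks keep this sketch minimal.
  void FindShortestSeparator(std::string*,
                             const rocksdb::Slice&) const override {}
  void FindShortSuccessor(std::string*) const override {}
};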
diff --git a/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.cc b/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.cc
index 73ab46ad21..5b4d11b020 100644
--- a/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.cc
+++ b/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.cc
@@ -13,24 +13,9 @@ namespace rocksdb {
 BaseComparatorJniCallback::BaseComparatorJniCallback(
     JNIEnv* env, jobject jComparator,
     const ComparatorJniCallbackOptions* copt)
-    : mtx_compare(new port::Mutex(copt->use_adaptive_mutex)),
+    : JniCallback(env, jComparator),
+      mtx_compare(new port::Mutex(copt->use_adaptive_mutex)),
       mtx_findShortestSeparator(new port::Mutex(copt->use_adaptive_mutex)) {
-  // Note: Comparator methods may be accessed by multiple threads,
-  // so we ref the jvm not the env
-  const jint rs = env->GetJavaVM(&m_jvm);
-  if(rs != JNI_OK) {
-    // exception thrown
-    return;
-  }
-
-  // Note: we want to access the Java Comparator instance
-  // across multiple method calls, so we create a global ref
-  assert(jComparator != nullptr);
-  m_jComparator = env->NewGlobalRef(jComparator);
-  if(m_jComparator == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return;
-  }
 
   // Note: The name of a Comparator will not change during it's lifetime,
   // so we cache it in a global var
@@ -39,7 +24,7 @@ BaseComparatorJniCallback::BaseComparatorJniCallback(
     // exception thrown: NoSuchMethodException or OutOfMemoryError
     return;
   }
-  jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId);
+  jstring jsName = (jstring)env->CallObjectMethod(m_jcallback_obj, jNameMethodId);
   if(env->ExceptionCheck()) {
     // exception thrown
     return;
@@ -74,18 +59,18 @@ BaseComparatorJniCallback::BaseComparatorJniCallback(
 }
 
 const char* BaseComparatorJniCallback::Name() const {
-  return m_name.c_str();
+  return m_name.get();
 }
 
 int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
   // TODO(adamretter): slice objects can potentially be cached using thread
   // local variables to avoid locking. Could make this configurable depending on
   // performance.
-  mtx_compare->Lock();
+  mtx_compare.get()->Lock();
 
   bool pending_exception =
       AbstractSliceJni::setHandle(env, m_jSliceA, &a, JNI_FALSE);
@@ -94,7 +79,7 @@ int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
       // exception thrown from setHandle or descendant
       env->ExceptionDescribe();  // print out exception to stderr
     }
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return 0;
   }
@@ -105,15 +90,15 @@ int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
       // exception thrown from setHandle or descendant
       env->ExceptionDescribe();  // print out exception to stderr
     }
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return 0;
   }
-
+
   jint result =
-      env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA,
+      env->CallIntMethod(m_jcallback_obj, m_jCompareMethodId, m_jSliceA,
           m_jSliceB);
 
-  mtx_compare->Unlock();
+  mtx_compare.get()->Unlock();
 
   if(env->ExceptionCheck()) {
     // exception thrown from CallIntMethod
@@ -121,19 +106,19 @@ int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const {
     result = 0;  // we could not get a result from java callback so use 0
   }
 
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 
   return result;
 }
 
 void BaseComparatorJniCallback::FindShortestSeparator(
-    std::string* start, const Slice& limit) const {
+  std::string* start, const Slice& limit) const {
   if (start == nullptr) {
     return;
   }
 
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
   const char* startUtf = start->c_str();
@@ -143,21 +128,21 @@ void BaseComparatorJniCallback::FindShortestSeparator(
     if(env->ExceptionCheck()) {
       env->ExceptionDescribe();  // print out exception to stderr
     }
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
   if(env->ExceptionCheck()) {
     // exception thrown: OutOfMemoryError
     env->ExceptionDescribe();  // print out exception to stderr
     env->DeleteLocalRef(jsStart);
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
 
   // TODO(adamretter): slice object can potentially be cached using thread local
   // variable to avoid locking. Could make this configurable depending on
   // performance.
-  mtx_findShortestSeparator->Lock();
+  mtx_findShortestSeparator.get()->Lock();
 
   bool pending_exception =
       AbstractSliceJni::setHandle(env, m_jSliceLimit, &limit, JNI_FALSE);
@@ -169,21 +154,21 @@ void BaseComparatorJniCallback::FindShortestSeparator(
     if(jsStart != nullptr) {
       env->DeleteLocalRef(jsStart);
     }
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
 
   jstring jsResultStart =
-      (jstring)env->CallObjectMethod(m_jComparator,
+      (jstring)env->CallObjectMethod(m_jcallback_obj,
       m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit);
 
-  mtx_findShortestSeparator->Unlock();
+  mtx_findShortestSeparator.get()->Unlock();
 
   if(env->ExceptionCheck()) {
     // exception thrown from CallObjectMethod
     env->ExceptionDescribe();  // print out exception to stderr
     env->DeleteLocalRef(jsStart);
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
 
@@ -192,29 +177,29 @@ void BaseComparatorJniCallback::FindShortestSeparator(
   if (jsResultStart != nullptr) {
     // update start with result
     jboolean has_exception = JNI_FALSE;
-    std::string result = JniUtil::copyString(env, jsResultStart,
+    std::unique_ptr<char[]> result_start = JniUtil::copyString(env, jsResultStart,
         &has_exception);  // also releases jsResultStart
     if (has_exception == JNI_TRUE) {
       if (env->ExceptionCheck()) {
         env->ExceptionDescribe();  // print out exception to stderr
       }
-      JniUtil::releaseJniEnv(m_jvm, attached_thread);
+      releaseJniEnv(attached_thread);
       return;
     }
 
-    *start = result;
+    start->assign(result_start.get());
   }
-
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 }
 
-void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const {
+void BaseComparatorJniCallback::FindShortSuccessor(
+    std::string* key) const {
   if (key == nullptr) {
     return;
   }
 
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
   const char* keyUtf = key->c_str();
@@ -224,25 +209,25 @@ void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const {
     if(env->ExceptionCheck()) {
       env->ExceptionDescribe();  // print out exception to stderr
     }
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   } else if(env->ExceptionCheck()) {
     // exception thrown: OutOfMemoryError
     env->ExceptionDescribe();  // print out exception to stderr
     env->DeleteLocalRef(jsKey);
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
 
   jstring jsResultKey =
-      (jstring)env->CallObjectMethod(m_jComparator,
+      (jstring)env->CallObjectMethod(m_jcallback_obj,
       m_jFindShortSuccessorMethodId, jsKey);
 
   if(env->ExceptionCheck()) {
     // exception thrown from CallObjectMethod
     env->ExceptionDescribe();  // print out exception to stderr
     env->DeleteLocalRef(jsKey);
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
     return;
   }
 
@@ -251,31 +236,20 @@ void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const {
   if (jsResultKey != nullptr) {
     // updates key with result, also releases jsResultKey.
     jboolean has_exception = JNI_FALSE;
-    std::string result = JniUtil::copyString(env, jsResultKey, &has_exception);
+    std::unique_ptr<char[]> result_key = JniUtil::copyString(env, jsResultKey,
+        &has_exception);  // also releases jsResultKey
     if (has_exception == JNI_TRUE) {
       if (env->ExceptionCheck()) {
        env->ExceptionDescribe();  // print out exception to stderr
      }
-      JniUtil::releaseJniEnv(m_jvm, attached_thread);
+      releaseJniEnv(attached_thread);
      return;
    }
 
-    *key = result;
-  }
-
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
-}
-
-BaseComparatorJniCallback::~BaseComparatorJniCallback() {
-  jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
-  assert(env != nullptr);
-
-  if(m_jComparator != nullptr) {
-    env->DeleteGlobalRef(m_jComparator);
+    key->assign(result_key.get());
   }
 
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 }
 
 ComparatorJniCallback::ComparatorJniCallback(
@@ -303,7 +277,7 @@ ComparatorJniCallback::ComparatorJniCallback(
 
 ComparatorJniCallback::~ComparatorJniCallback() {
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
   if(m_jSliceA != nullptr) {
@@ -318,7 +292,7 @@ ComparatorJniCallback::~ComparatorJniCallback() {
     env->DeleteGlobalRef(m_jSliceLimit);
   }
 
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 }
 
 DirectComparatorJniCallback::DirectComparatorJniCallback(
@@ -346,7 +320,7 @@ DirectComparatorJniCallback::DirectComparatorJniCallback(
 
 DirectComparatorJniCallback::~DirectComparatorJniCallback() {
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
   if(m_jSliceA != nullptr) {
@@ -361,6 +335,6 @@ DirectComparatorJniCallback::~DirectComparatorJniCallback() {
     env->DeleteGlobalRef(m_jSliceLimit);
   }
 
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 }
 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.h b/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.h
index a753008b33..0aa9cc0af8 100644
--- a/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.h
+++ b/thirdparty/rocksdb/java/rocksjni/comparatorjnicallback.h
@@ -10,7 +10,9 @@
 #define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_
 
 #include <jni.h>
+#include <memory>
 #include <string>
+#include "rocksjni/jnicallback.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/slice.h"
 #include "port/port.h"
@@ -44,12 +46,11 @@ struct ComparatorJniCallbackOptions {
  * introduce independent locking in regions of each of those methods
  * via the mutexs mtx_compare and mtx_findShortestSeparator respectively
  */
-class BaseComparatorJniCallback : public Comparator {
+class BaseComparatorJniCallback : public JniCallback, public Comparator {
  public:
     BaseComparatorJniCallback(
       JNIEnv* env, jobject jComparator,
       const ComparatorJniCallbackOptions* copt);
-    virtual ~BaseComparatorJniCallback();
     virtual const char* Name() const;
     virtual int Compare(const Slice& a, const Slice& b) const;
     virtual void FindShortestSeparator(
@@ -58,17 +59,15 @@ class BaseComparatorJniCallback : public Comparator {
 
  private:
     // used for synchronisation in compare method
-    port::Mutex* mtx_compare;
+    std::unique_ptr<port::Mutex> mtx_compare;
     // used for synchronisation in findShortestSeparator method
-    port::Mutex* mtx_findShortestSeparator;
-    jobject m_jComparator;
-    std::string m_name;
+    std::unique_ptr<port::Mutex> mtx_findShortestSeparator;
+    std::unique_ptr<const char[]> m_name;
     jmethodID m_jCompareMethodId;
     jmethodID m_jFindShortestSeparatorMethodId;
     jmethodID m_jFindShortSuccessorMethodId;
 
  protected:
-    JavaVM* m_jvm;
     jobject m_jSliceA;
     jobject m_jSliceB;
     jobject m_jSliceLimit;
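The header change swaps the raw Mutex pointers and the cached name for unique_ptr members, which is why the hand-written destructor above could be deleted outright. A minimal illustration of the same refactor shape (the types here are stand-ins, not the RocksDB classes):

#include <memory>
#include <mutex>

class Before {
  std::mutex* mtx_ = new std::mutex();  // must be deleted by hand
 public:
  ~Before() { delete mtx_; }  // easy to forget, or to double-free
};

class After {
  std::unique_ptr<std::mutex> mtx_ = std::make_unique<std::mutex>();
  // No user-written destructor needed; ownership is explicit in the type.
};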
diff --git a/thirdparty/rocksdb/java/rocksjni/compression_options.cc b/thirdparty/rocksdb/java/rocksjni/compression_options.cc
index 7d5af645ae..f0155eb335 100644
--- a/thirdparty/rocksdb/java/rocksjni/compression_options.cc
+++ b/thirdparty/rocksdb/java/rocksjni/compression_options.cc
@@ -17,7 +17,7 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions(
-    JNIEnv* env, jclass jcls) {
+    JNIEnv*, jclass) {
   const auto* opt = new rocksdb::CompressionOptions();
   return reinterpret_cast<jlong>(opt);
 }
@@ -28,7 +28,7 @@ jlong Java_org_rocksdb_CompressionOptions_newCompressionOptions(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompressionOptions_setWindowBits(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jwindow_bits) {
+    JNIEnv*, jobject, jlong jhandle, jint jwindow_bits) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   opt->window_bits = static_cast<int>(jwindow_bits);
 }
@@ -39,7 +39,7 @@ void Java_org_rocksdb_CompressionOptions_setWindowBits(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompressionOptions_windowBits(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   return static_cast<jint>(opt->window_bits);
 }
@@ -50,7 +50,7 @@ jint Java_org_rocksdb_CompressionOptions_windowBits(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompressionOptions_setLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jlevel) {
+    JNIEnv*, jobject, jlong jhandle, jint jlevel) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   opt->level = static_cast<int>(jlevel);
 }
@@ -61,7 +61,7 @@ void Java_org_rocksdb_CompressionOptions_setLevel(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompressionOptions_level(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   return static_cast<jint>(opt->level);
 }
@@ -72,7 +72,7 @@ jint Java_org_rocksdb_CompressionOptions_level(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompressionOptions_setStrategy(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jstrategy) {
+    JNIEnv*, jobject, jlong jhandle, jint jstrategy) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   opt->strategy = static_cast<int>(jstrategy);
 }
@@ -83,7 +83,7 @@ void Java_org_rocksdb_CompressionOptions_setStrategy(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompressionOptions_strategy(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   return static_cast<jint>(opt->strategy);
 }
@@ -94,9 +94,9 @@ jint Java_org_rocksdb_CompressionOptions_strategy(
  * Signature: (JI)V
  */
 void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_dict_bytes) {
+    JNIEnv*, jobject, jlong jhandle, jint jmax_dict_bytes) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
-  opt->max_dict_bytes = static_cast<int>(jmax_dict_bytes);
+  opt->max_dict_bytes = static_cast<uint32_t>(jmax_dict_bytes);
 }
 
 /*
@@ -105,17 +105,60 @@ void Java_org_rocksdb_CompressionOptions_setMaxDictBytes(
  * Signature: (J)I
  */
 jint Java_org_rocksdb_CompressionOptions_maxDictBytes(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
   return static_cast<jint>(opt->max_dict_bytes);
 }
 
+/*
+ * Class: org_rocksdb_CompressionOptions
+ * Method: setZstdMaxTrainBytes
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_CompressionOptions_setZstdMaxTrainBytes(
+    JNIEnv*, jobject, jlong jhandle, jint jzstd_max_train_bytes) {
+  auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
+  opt->zstd_max_train_bytes = static_cast<uint32_t>(jzstd_max_train_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompressionOptions
+ * Method: zstdMaxTrainBytes
+ * Signature: (J)I
+ */
+jint Java_org_rocksdb_CompressionOptions_zstdMaxTrainBytes(
+    JNIEnv *, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
+  return static_cast<jint>(opt->zstd_max_train_bytes);
+}
+
+/*
+ * Class: org_rocksdb_CompressionOptions
+ * Method: setEnabled
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_CompressionOptions_setEnabled(
+    JNIEnv*, jobject, jlong jhandle, jboolean jenabled) {
+  auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
+  opt->enabled = jenabled == JNI_TRUE;
+}
+
+/*
+ * Class: org_rocksdb_CompressionOptions
+ * Method: enabled
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_CompressionOptions_enabled(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* opt = reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
+  return static_cast<jboolean>(opt->enabled);
+}
 /*
  * Class: org_rocksdb_CompressionOptions
  * Method: disposeInternal
  * Signature: (J)V
  */
 void Java_org_rocksdb_CompressionOptions_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   delete reinterpret_cast<rocksdb::CompressionOptions*>(jhandle);
 }
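The new setZstdMaxTrainBytes accessor exposes ZSTD dictionary training: RocksDB samples up to that many bytes when building each compression dictionary. A hedged native-side sketch of enabling it together with the existing max_dict_bytes knob (the values are arbitrary examples):

#include "rocksdb/options.h"

rocksdb::Options MakeZstdDictOptions() {
  rocksdb::Options options;
  options.compression = rocksdb::kZSTD;
  // Cap the final dictionary at 16 KiB and feed the trainer up to
  // 100x that much sampled input, per the usual ZSTD guidance.
  options.compression_opts.max_dict_bytes = 16 * 1024;
  options.compression_opts.zstd_max_train_bytes = 100 * 16 * 1024;
  return options;
}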
diff --git a/thirdparty/rocksdb/java/rocksjni/env.cc b/thirdparty/rocksdb/java/rocksjni/env.cc
index dc949a07fa..ed54bd36a0 100644
--- a/thirdparty/rocksdb/java/rocksjni/env.cc
+++ b/thirdparty/rocksdb/java/rocksjni/env.cc
@@ -6,10 +6,16 @@
 // This file implements the "bridge" between Java and C++ and enables
 // calling c++ rocksdb::Env methods from Java side.
 
+#include <jni.h>
+#include <vector>
+
+#include "portal.h"
+#include "rocksdb/env.h"
 #include "include/org_rocksdb_Env.h"
+#include "include/org_rocksdb_HdfsEnv.h"
 #include "include/org_rocksdb_RocksEnv.h"
 #include "include/org_rocksdb_RocksMemEnv.h"
-#include "rocksdb/env.h"
+#include "include/org_rocksdb_TimedEnv.h"
 
 /*
  * Class: org_rocksdb_Env
@@ -17,55 +23,143 @@
  * Signature: ()J
  */
 jlong Java_org_rocksdb_Env_getDefaultEnvInternal(
-    JNIEnv* env, jclass jclazz) {
+    JNIEnv*, jclass) {
   return reinterpret_cast<jlong>(rocksdb::Env::Default());
 }
 
+/*
+ * Class: org_rocksdb_RocksEnv
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksEnv_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* e = reinterpret_cast<rocksdb::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
+
 /*
  * Class: org_rocksdb_Env
  * Method: setBackgroundThreads
- * Signature: (JII)V
+ * Signature: (JIB)V
  */
 void Java_org_rocksdb_Env_setBackgroundThreads(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jint num, jint priority) {
+    JNIEnv*, jobject, jlong jhandle, jint jnum, jbyte jpriority_value) {
   auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
-  switch (priority) {
-    case org_rocksdb_Env_FLUSH_POOL:
-      rocks_env->SetBackgroundThreads(num, rocksdb::Env::Priority::LOW);
-      break;
-    case org_rocksdb_Env_COMPACTION_POOL:
-      rocks_env->SetBackgroundThreads(num, rocksdb::Env::Priority::HIGH);
-      break;
-  }
+  rocks_env->SetBackgroundThreads(static_cast<int>(jnum),
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class: org_rocksdb_Env
+ * Method: getBackgroundThreads
+ * Signature: (JB)I
+ */
+jint Java_org_rocksdb_Env_getBackgroundThreads(
+    JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  const int num = rocks_env->GetBackgroundThreads(
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+  return static_cast<jint>(num);
 }
 
 /*
- * Class: org_rocksdb_sEnv
+ * Class: org_rocksdb_Env
  * Method: getThreadPoolQueueLen
- * Signature: (JI)I
+ * Signature: (JB)I
  */
 jint Java_org_rocksdb_Env_getThreadPoolQueueLen(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint pool_id) {
+    JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  const int queue_len = rocks_env->GetThreadPoolQueueLen(
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+  return static_cast<jint>(queue_len);
+}
+
+/*
+ * Class: org_rocksdb_Env
+ * Method: incBackgroundThreadsIfNeeded
+ * Signature: (JIB)V
+ */
+void Java_org_rocksdb_Env_incBackgroundThreadsIfNeeded(
+    JNIEnv*, jobject, jlong jhandle, jint jnum, jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  rocks_env->IncBackgroundThreadsIfNeeded(static_cast<int>(jnum),
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class: org_rocksdb_Env
+ * Method: lowerThreadPoolIOPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Env_lowerThreadPoolIOPriority(
+    JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  rocks_env->LowerThreadPoolIOPriority(
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class: org_rocksdb_Env
+ * Method: lowerThreadPoolCPUPriority
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_Env_lowerThreadPoolCPUPriority(
+    JNIEnv*, jobject, jlong jhandle, jbyte jpriority_value) {
+  auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
+  rocks_env->LowerThreadPoolCPUPriority(
+      rocksdb::PriorityJni::toCppPriority(jpriority_value));
+}
+
+/*
+ * Class: org_rocksdb_Env
+ * Method: getThreadList
+ * Signature: (J)[Lorg/rocksdb/ThreadStatus;
+ */
+jobjectArray Java_org_rocksdb_Env_getThreadList(
+    JNIEnv* env, jobject, jlong jhandle) {
   auto* rocks_env = reinterpret_cast<rocksdb::Env*>(jhandle);
-  switch (pool_id) {
-    case org_rocksdb_RocksEnv_FLUSH_POOL:
-      return rocks_env->GetThreadPoolQueueLen(rocksdb::Env::Priority::LOW);
-    case org_rocksdb_RocksEnv_COMPACTION_POOL:
-      return rocks_env->GetThreadPoolQueueLen(rocksdb::Env::Priority::HIGH);
+  std::vector<rocksdb::ThreadStatus> thread_status;
+  rocksdb::Status s = rocks_env->GetThreadList(&thread_status);
+  if (!s.ok()) {
+    // error, throw exception
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
   }
-  return 0;
+
+  // object[]
+  const jsize len = static_cast<jsize>(thread_status.size());
+  jobjectArray jthread_status =
+      env->NewObjectArray(len, rocksdb::ThreadStatusJni::getJClass(env), nullptr);
+  if (jthread_status == nullptr) {
+    // an exception occurred
+    return nullptr;
+  }
+  for (jsize i = 0; i < len; ++i) {
+    jobject jts =
+        rocksdb::ThreadStatusJni::construct(env, &(thread_status[i]));
+    env->SetObjectArrayElement(jthread_status, i, jts);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jthread_status);
+      return nullptr;
+    }
+  }
+
+  return jthread_status;
 }
 
 /*
  * Class: org_rocksdb_RocksMemEnv
  * Method: createMemEnv
- * Signature: ()J
+ * Signature: (J)J
  */
 jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(
-    JNIEnv* env, jclass jclazz) {
-  return reinterpret_cast<jlong>(rocksdb::NewMemEnv(
-      rocksdb::Env::Default()));
+    JNIEnv*, jclass, jlong jbase_env_handle) {
+  auto* base_env = reinterpret_cast<rocksdb::Env*>(jbase_env_handle);
+  return reinterpret_cast<jlong>(rocksdb::NewMemEnv(base_env));
 }
 
 /*
@@ -74,8 +168,67 @@ jlong Java_org_rocksdb_RocksMemEnv_createMemEnv(
  * Signature: (J)V
  */
 void Java_org_rocksdb_RocksMemEnv_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* e = reinterpret_cast<rocksdb::Env*>(jhandle);
   assert(e != nullptr);
   delete e;
 }
+
+/*
+ * Class: org_rocksdb_HdfsEnv
+ * Method: createHdfsEnv
+ * Signature: (Ljava/lang/String;)J
+ */
+jlong Java_org_rocksdb_HdfsEnv_createHdfsEnv(
+    JNIEnv* env, jclass, jstring jfsname) {
+  jboolean has_exception = JNI_FALSE;
+  auto fsname = rocksdb::JniUtil::copyStdString(env, jfsname, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return 0;
+  }
+  rocksdb::Env* hdfs_env;
+  rocksdb::Status s = rocksdb::NewHdfsEnv(&hdfs_env, fsname);
+  if (!s.ok()) {
+    // error occurred
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+  return reinterpret_cast<jlong>(hdfs_env);
+}
+
+/*
+ * Class: org_rocksdb_HdfsEnv
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_HdfsEnv_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* e = reinterpret_cast<rocksdb::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
+
+/*
+ * Class: org_rocksdb_TimedEnv
+ * Method: createTimedEnv
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TimedEnv_createTimedEnv(
+    JNIEnv*, jclass, jlong jbase_env_handle) {
+  auto* base_env = reinterpret_cast<rocksdb::Env*>(jbase_env_handle);
+  return reinterpret_cast<jlong>(rocksdb::NewTimedEnv(base_env));
+}
+
+/*
+ * Class: org_rocksdb_TimedEnv
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TimedEnv_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* e = reinterpret_cast<rocksdb::Env*>(jhandle);
+  assert(e != nullptr);
+  delete e;
+}
+
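The reworked Env bridge passes a priority byte straight through PriorityJni instead of switching over two hard-coded pool constants, so every Env::Priority becomes reachable from Java. A hedged native sketch of the thread-pool tuning these entry points now expose (thread counts are arbitrary examples):

#include "rocksdb/env.h"

void TuneThreadPools() {
  rocksdb::Env* env = rocksdb::Env::Default();
  // Reserve two threads for flushes (HIGH pool) and four for
  // compactions (LOW pool)...
  env->SetBackgroundThreads(2, rocksdb::Env::Priority::HIGH);
  env->SetBackgroundThreads(4, rocksdb::Env::Priority::LOW);
  // ...and drop the I/O priority of the compaction pool so foreground
  // reads are less affected.
  env->LowerThreadPoolIOPriority(rocksdb::Env::Priority::LOW);
}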
diff --git a/thirdparty/rocksdb/java/rocksjni/env_options.cc b/thirdparty/rocksdb/java/rocksjni/env_options.cc
index 538b0b69f7..9ed330183c 100644
--- a/thirdparty/rocksdb/java/rocksjni/env_options.cc
+++ b/thirdparty/rocksdb/java/rocksjni/env_options.cc
@@ -32,104 +32,115 @@
  * Method: newEnvOptions
  * Signature: ()J
  */
-jlong Java_org_rocksdb_EnvOptions_newEnvOptions(JNIEnv *env, jclass jcls) {
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__(
+    JNIEnv*, jclass) {
   auto *env_opt = new rocksdb::EnvOptions();
   return reinterpret_cast<jlong>(env_opt);
 }
 
+/*
+ * Class: org_rocksdb_EnvOptions
+ * Method: newEnvOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_EnvOptions_newEnvOptions__J(
+    JNIEnv*, jclass, jlong jdboptions_handle) {
+  auto* db_options =
+      reinterpret_cast<rocksdb::DBOptions*>(jdboptions_handle);
+  auto* env_opt = new rocksdb::EnvOptions(*db_options);
+  return reinterpret_cast<jlong>(env_opt);
+}
+
 /*
  * Class: org_rocksdb_EnvOptions
  * Method: disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_EnvOptions_disposeInternal(JNIEnv *env, jobject jobj,
-                                                 jlong jhandle) {
-  auto* eo = reinterpret_cast<rocksdb::EnvOptions *>(jhandle);
+void Java_org_rocksdb_EnvOptions_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto *eo = reinterpret_cast<rocksdb::EnvOptions *>(jhandle);
   assert(eo != nullptr);
   delete eo;
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: setUseDirectReads
+ * Method: setUseMmapReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseDirectReads(JNIEnv *env, jobject jobj,
-                                                   jlong jhandle,
-                                                   jboolean use_direct_reads) {
-  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
+void Java_org_rocksdb_EnvOptions_setUseMmapReads(
+    JNIEnv*, jobject, jlong jhandle, jboolean use_mmap_reads) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: useDirectReads
+ * Method: useMmapReads
 * Signature: (J)Z
 */
-jboolean Java_org_rocksdb_EnvOptions_useDirectReads(JNIEnv *env, jobject jobj,
-                                                    jlong jhandle) {
-  return ENV_OPTIONS_GET(jhandle, use_direct_reads);
+jboolean Java_org_rocksdb_EnvOptions_useMmapReads(
+    JNIEnv*, jobject, jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_mmap_reads);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: setUseDirectWrites
+ * Method: setUseMmapWrites
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
-    JNIEnv *env, jobject jobj, jlong jhandle, jboolean use_direct_writes) {
-  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
+void Java_org_rocksdb_EnvOptions_setUseMmapWrites(
+    JNIEnv*, jobject, jlong jhandle, jboolean use_mmap_writes) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: useDirectWrites
 * Signature: (J)Z
 */
-jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(JNIEnv *env, jobject jobj,
-                                                     jlong jhandle) {
-  return ENV_OPTIONS_GET(jhandle, use_direct_writes);
+jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(
+    JNIEnv*, jobject, jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: setUseMmapReads
+ * Method: setUseDirectReads
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapReads(JNIEnv *env, jobject jobj,
-                                                 jlong jhandle,
-                                                 jboolean use_mmap_reads) {
-  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_reads);
+void Java_org_rocksdb_EnvOptions_setUseDirectReads(
+    JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_reads);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: useMmapReads
+ * Method: useDirectReads
 * Signature: (J)Z
 */
-jboolean Java_org_rocksdb_EnvOptions_useMmapReads(JNIEnv *env, jobject jobj,
-                                                  jlong jhandle) {
-  return ENV_OPTIONS_GET(jhandle, use_mmap_reads);
+jboolean Java_org_rocksdb_EnvOptions_useDirectReads(
+    JNIEnv*, jobject, jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_direct_reads);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: setUseMmapWrites
+ * Method: setUseDirectWrites
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setUseMmapWrites(JNIEnv *env, jobject jobj,
-                                                  jlong jhandle,
-                                                  jboolean use_mmap_writes) {
-  ENV_OPTIONS_SET_BOOL(jhandle, use_mmap_writes);
+void Java_org_rocksdb_EnvOptions_setUseDirectWrites(
+    JNIEnv*, jobject, jlong jhandle, jboolean use_direct_writes) {
+  ENV_OPTIONS_SET_BOOL(jhandle, use_direct_writes);
 }
 
 /*
  * Class: org_rocksdb_EnvOptions
- * Method: useMmapWrites
+ * Method: useDirectWrites
 * Signature: (J)Z
 */
-jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *env, jobject jobj,
-                                                   jlong jhandle) {
-  return ENV_OPTIONS_GET(jhandle, use_mmap_writes);
+jboolean Java_org_rocksdb_EnvOptions_useDirectWrites(
+    JNIEnv*, jobject, jlong jhandle) {
+  return ENV_OPTIONS_GET(jhandle, use_direct_writes);
 }
 
 /*
@@ -137,9 +148,8 @@ jboolean Java_org_rocksdb_EnvOptions_useMmapWrites(JNIEnv *env, jobject jobj,
  * Method: setAllowFallocate
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *env, jobject jobj,
-                                                   jlong jhandle,
-                                                   jboolean allow_fallocate) {
+void Java_org_rocksdb_EnvOptions_setAllowFallocate(
+    JNIEnv*, jobject, jlong jhandle, jboolean allow_fallocate) {
   ENV_OPTIONS_SET_BOOL(jhandle, allow_fallocate);
 }
 
@@ -148,8 +158,8 @@ void Java_org_rocksdb_EnvOptions_setAllowFallocate(JNIEnv *env, jobject jobj,
  * Method: allowFallocate
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *env, jobject jobj,
-                                                    jlong jhandle) {
+jboolean Java_org_rocksdb_EnvOptions_allowFallocate(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, allow_fallocate);
 }
 
@@ -158,9 +168,8 @@ jboolean Java_org_rocksdb_EnvOptions_allowFallocate(JNIEnv *env, jobject jobj,
  * Method: setSetFdCloexec
  * Signature: (JZ)V
  */
-void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *env, jobject jobj,
                                                  jlong jhandle,
-                                                 jboolean set_fd_cloexec) {
+void Java_org_rocksdb_EnvOptions_setSetFdCloexec(
+    JNIEnv*, jobject, jlong jhandle, jboolean set_fd_cloexec) {
   ENV_OPTIONS_SET_BOOL(jhandle, set_fd_cloexec);
 }
 
@@ -169,8 +178,8 @@ void Java_org_rocksdb_EnvOptions_setSetFdCloexec(JNIEnv *env, jobject jobj,
  * Method: setFdCloexec
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *env, jobject jobj,
                                                  jlong jhandle) {
+jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, set_fd_cloexec);
 }
 
@@ -179,9 +188,8 @@ jboolean Java_org_rocksdb_EnvOptions_setFdCloexec(JNIEnv *env, jobject jobj,
  * Method: setBytesPerSync
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *env, jobject jobj,
                                                  jlong jhandle,
-                                                 jlong bytes_per_sync) {
+void Java_org_rocksdb_EnvOptions_setBytesPerSync(
+    JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) {
   ENV_OPTIONS_SET_UINT64_T(jhandle, bytes_per_sync);
 }
 
@@ -190,8 +198,8 @@ void Java_org_rocksdb_EnvOptions_setBytesPerSync(JNIEnv *env, jobject jobj,
  * Method: bytesPerSync
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *env, jobject jobj,
                                               jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_bytesPerSync(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, bytes_per_sync);
 }
 
@@ -201,8 +209,7 @@ jlong Java_org_rocksdb_EnvOptions_bytesPerSync(JNIEnv *env, jobject jobj,
  * Signature: (JZ)V
  */
 void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
-    JNIEnv *env, jobject jobj, jlong jhandle,
-    jboolean fallocate_with_keep_size) {
+    JNIEnv*, jobject, jlong jhandle, jboolean fallocate_with_keep_size) {
   ENV_OPTIONS_SET_BOOL(jhandle, fallocate_with_keep_size);
 }
 
@@ -211,9 +218,8 @@ void Java_org_rocksdb_EnvOptions_setFallocateWithKeepSize(
  * Method: fallocateWithKeepSize
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *env,
                                                            jobject jobj,
                                                            jlong jhandle) {
+jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, fallocate_with_keep_size);
 }
 
@@ -223,7 +229,7 @@ jboolean Java_org_rocksdb_EnvOptions_fallocateWithKeepSize(JNIEnv *env,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
-    JNIEnv *env, jobject jobj, jlong jhandle, jlong compaction_readahead_size) {
+    JNIEnv*, jobject, jlong jhandle, jlong compaction_readahead_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, compaction_readahead_size);
 }
 
@@ -232,9 +238,8 @@ void Java_org_rocksdb_EnvOptions_setCompactionReadaheadSize(
  * Method: compactionReadaheadSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *env,
                                                           jobject jobj,
                                                           jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, compaction_readahead_size);
 }
 
@@ -244,8 +249,7 @@ jlong Java_org_rocksdb_EnvOptions_compactionReadaheadSize(JNIEnv *env,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize(
-    JNIEnv *env, jobject jobj, jlong jhandle,
-    jlong random_access_max_buffer_size) {
+    JNIEnv*, jobject, jlong jhandle, jlong random_access_max_buffer_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, random_access_max_buffer_size);
 }
 
@@ -254,9 +258,8 @@ void Java_org_rocksdb_EnvOptions_setRandomAccessMaxBufferSize(
  * Method: randomAccessMaxBufferSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *env,
                                                             jobject jobj,
                                                             jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, random_access_max_buffer_size);
 }
 
@@ -266,8 +269,7 @@ jlong Java_org_rocksdb_EnvOptions_randomAccessMaxBufferSize(JNIEnv *env,
  * Signature: (JJ)V
  */
 void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
-    JNIEnv *env, jobject jobj, jlong jhandle,
-    jlong writable_file_max_buffer_size) {
+    JNIEnv*, jobject, jlong jhandle, jlong writable_file_max_buffer_size) {
   ENV_OPTIONS_SET_SIZE_T(jhandle, writable_file_max_buffer_size);
 }
 
@@ -276,9 +278,8 @@ void Java_org_rocksdb_EnvOptions_setWritableFileMaxBufferSize(
  * Method: writableFileMaxBufferSize
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *env,
                                                             jobject jobj,
                                                             jlong jhandle) {
+jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(
+    JNIEnv*, jobject, jlong jhandle) {
   return ENV_OPTIONS_GET(jhandle, writable_file_max_buffer_size);
 }
 
@@ -287,11 +288,10 @@ jlong Java_org_rocksdb_EnvOptions_writableFileMaxBufferSize(JNIEnv *env,
  * Method: setRateLimiter
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_EnvOptions_setRateLimiter(JNIEnv *env, jobject jobj,
                                                 jlong jhandle,
                                                 jlong rl_handle) {
-  auto* sptr_rate_limiter =
+void Java_org_rocksdb_EnvOptions_setRateLimiter(
+    JNIEnv*, jobject, jlong jhandle, jlong rl_handle) {
+  auto *sptr_rate_limiter =
      reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter> *>(rl_handle);
-  auto* env_opt = reinterpret_cast<rocksdb::EnvOptions *>(jhandle);
+  auto *env_opt = reinterpret_cast<rocksdb::EnvOptions *>(jhandle);
   env_opt->rate_limiter = sptr_rate_limiter->get();
 }
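setRateLimiter stores a raw pointer obtained from a shared_ptr that the Java RateLimiter object owns, so that Java object must outlive any EnvOptions using it. A hedged native sketch of wiring the same objects together:

#include <memory>
#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"

rocksdb::EnvOptions MakeRateLimitedEnvOptions(
    const std::shared_ptr<rocksdb::RateLimiter>& limiter) {
  rocksdb::EnvOptions env_opts;
  // EnvOptions holds a non-owning pointer; the shared_ptr passed in must
  // stay alive for as long as these options are in use.
  env_opts.rate_limiter = limiter.get();
  return env_opts;
}

// Usage: std::shared_ptr<rocksdb::RateLimiter> rl(
//            rocksdb::NewGenericRateLimiter(10 * 1024 * 1024));
//        auto opts = MakeRateLimitedEnvOptions(rl);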
diff --git a/thirdparty/rocksdb/java/rocksjni/filter.cc b/thirdparty/rocksdb/java/rocksjni/filter.cc
index 7b186b8943..5e9c63643d 100644
--- a/thirdparty/rocksdb/java/rocksjni/filter.cc
+++ b/thirdparty/rocksdb/java/rocksjni/filter.cc
@@ -6,15 +6,15 @@
 // This file implements the "bridge" between Java and C++ for
 // rocksdb::FilterPolicy.
 
+#include <jni.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <jni.h>
 #include <string>
 
-#include "include/org_rocksdb_Filter.h"
 #include "include/org_rocksdb_BloomFilter.h"
-#include "rocksjni/portal.h"
+#include "include/org_rocksdb_Filter.h"
 #include "rocksdb/filter_policy.h"
+#include "rocksjni/portal.h"
 
 /*
  * Class: org_rocksdb_BloomFilter
@@ -22,11 +22,10 @@
  * Signature: (IZ)J
  */
 jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter(
-    JNIEnv* env, jclass jcls, jint bits_per_key,
+    JNIEnv* /*env*/, jclass /*jcls*/, jint bits_per_key,
     jboolean use_block_base_builder) {
-  auto* sptr_filter =
-      new std::shared_ptr<const rocksdb::FilterPolicy>(
-          rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder));
+  auto* sptr_filter = new std::shared_ptr<const rocksdb::FilterPolicy>(
+      rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder));
   return reinterpret_cast<jlong>(sptr_filter);
 }
 
@@ -35,9 +34,9 @@ jlong Java_org_rocksdb_BloomFilter_createNewBloomFilter(
  * Method: disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_Filter_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+void Java_org_rocksdb_Filter_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle) {
   auto* handle =
-      reinterpret_cast<std::shared_ptr<const rocksdb::FilterPolicy> *>(jhandle);
+      reinterpret_cast<std::shared_ptr<const rocksdb::FilterPolicy>*>(jhandle);
   delete handle;  // delete std::shared_ptr
 }
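Unlike the plain options classes above, the Filter handle points at a heap-allocated shared_ptr rather than the policy itself, so the Java Filter and any table options referencing the policy share ownership; disposeInternal deletes only one owner. A hedged sketch of why that layout works (function names are illustrative):

#include <cstdint>
#include <memory>
#include "rocksdb/filter_policy.h"

int64_t NewBloomHandle() {
  // The handle owns a shared_ptr on the heap; copies of that shared_ptr
  // (e.g. held by BlockBasedTableOptions) keep the policy alive even
  // after the Java object is disposed.
  auto* sptr = new std::shared_ptr<const rocksdb::FilterPolicy>(
      rocksdb::NewBloomFilterPolicy(10 /* bits per key */, false));
  return reinterpret_cast<int64_t>(sptr);
}

void DisposeBloomHandle(int64_t handle) {
  // Deletes one owner; the FilterPolicy itself is freed only when the
  // last shared_ptr copy goes away.
  delete reinterpret_cast<std::shared_ptr<const rocksdb::FilterPolicy>*>(
      handle);
}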
Java_org_rocksdb_IngestExternalFileOptions_setSnapshotConsistency( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jsnapshot_consistency) { + JNIEnv*, jobject, jlong jhandle, jboolean jsnapshot_consistency) { auto* options = reinterpret_cast(jhandle); options->snapshot_consistency = static_cast(jsnapshot_consistency); @@ -94,7 +93,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setSnapshotConsistency( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowGlobalSeqNo( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->allow_global_seqno); @@ -106,7 +105,7 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowGlobalSeqNo( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setAllowGlobalSeqNo( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_global_seqno) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_global_seqno) { auto* options = reinterpret_cast(jhandle); options->allow_global_seqno = static_cast(jallow_global_seqno); @@ -118,7 +117,7 @@ void Java_org_rocksdb_IngestExternalFileOptions_setAllowGlobalSeqNo( * Signature: (J)Z */ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowBlockingFlush( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return static_cast(options->allow_blocking_flush); @@ -130,19 +129,67 @@ jboolean Java_org_rocksdb_IngestExternalFileOptions_allowBlockingFlush( * Signature: (JZ)V */ void Java_org_rocksdb_IngestExternalFileOptions_setAllowBlockingFlush( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_blocking_flush) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_blocking_flush) { auto* options = reinterpret_cast(jhandle); options->allow_blocking_flush = static_cast(jallow_blocking_flush); } +/* + * Class: org_rocksdb_IngestExternalFileOptions + * Method: ingestBehind + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_IngestExternalFileOptions_ingestBehind( + JNIEnv*, jobject, jlong jhandle) { + auto* options = + reinterpret_cast(jhandle); + return options->ingest_behind == JNI_TRUE; +} + +/* + * Class: org_rocksdb_IngestExternalFileOptions + * Method: setIngestBehind + * Signature: (JZ)V + */ +void Java_org_rocksdb_IngestExternalFileOptions_setIngestBehind( + JNIEnv*, jobject, jlong jhandle, jboolean jingest_behind) { + auto* options = + reinterpret_cast(jhandle); + options->ingest_behind = jingest_behind == JNI_TRUE; +} + +/* + * Class: org_rocksdb_IngestExternalFileOptions + * Method: writeGlobalSeqno + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_IngestExternalFileOptions_writeGlobalSeqno( + JNIEnv*, jobject, jlong jhandle) { + auto* options = + reinterpret_cast(jhandle); + return options->write_global_seqno == JNI_TRUE; +} + +/* + * Class: org_rocksdb_IngestExternalFileOptions + * Method: setWriteGlobalSeqno + * Signature: (JZ)V + */ +JNIEXPORT void JNICALL Java_org_rocksdb_IngestExternalFileOptions_setWriteGlobalSeqno( + JNIEnv*, jobject, jlong jhandle, jboolean jwrite_global_seqno) { + auto* options = + reinterpret_cast(jhandle); + options->write_global_seqno = jwrite_global_seqno == JNI_TRUE; +} + /* * Class: org_rocksdb_IngestExternalFileOptions * Method: disposeInternal * Signature: (J)V */ void Java_org_rocksdb_IngestExternalFileOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* options = 
diff --git a/thirdparty/rocksdb/java/rocksjni/iterator.cc b/thirdparty/rocksdb/java/rocksjni/iterator.cc
index 3ac9d5033f..18daeb8161 100644
--- a/thirdparty/rocksdb/java/rocksjni/iterator.cc
+++ b/thirdparty/rocksdb/java/rocksjni/iterator.cc
@@ -6,21 +6,22 @@
 // This file implements the "bridge" between Java and C++ and enables
 // calling c++ rocksdb::Iterator methods from Java side.
 
+#include <jni.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <jni.h>
 
 #include "include/org_rocksdb_RocksIterator.h"
-#include "rocksjni/portal.h"
 #include "rocksdb/iterator.h"
+#include "rocksjni/portal.h"
 
 /*
  * Class:     org_rocksdb_RocksIterator
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksIterator_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_disposeInternal(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
   auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
   assert(it != nullptr);
   delete it;
@@ -31,8 +32,9 @@ void Java_org_rocksdb_RocksIterator_disposeInternal(
  * Method:    isValid0
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_RocksIterator_isValid0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jboolean Java_org_rocksdb_RocksIterator_isValid0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong handle) {
   return reinterpret_cast<rocksdb::Iterator*>(handle)->Valid();
 }
 
@@ -41,8 +43,9 @@ jboolean Java_org_rocksdb_RocksIterator_isValid0(
  * Method:    seekToFirst0
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksIterator_seekToFirst0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_seekToFirst0(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong handle) {
   reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToFirst();
 }
 
@@ -51,8 +54,9 @@ void Java_org_rocksdb_RocksIterator_seekToFirst0(
  * Method:    seekToLast0
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksIterator_seekToLast0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_seekToLast0(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong handle) {
   reinterpret_cast<rocksdb::Iterator*>(handle)->SeekToLast();
 }
 
@@ -61,8 +65,8 @@ void Java_org_rocksdb_RocksIterator_seekToLast0(
  * Method:    next0
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksIterator_next0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
   reinterpret_cast<rocksdb::Iterator*>(handle)->Next();
 }
 
@@ -71,8 +75,8 @@ void Java_org_rocksdb_RocksIterator_next0(
  * Method:    prev0
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksIterator_prev0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
   reinterpret_cast<rocksdb::Iterator*>(handle)->Prev();
 }
 
@@ -81,17 +85,16 @@ void Java_org_rocksdb_RocksIterator_prev0(
  * Method:    seek0
 * Signature: (J[BI)V
 */
-void Java_org_rocksdb_RocksIterator_seek0(
-    JNIEnv* env, jobject jobj, jlong handle,
-    jbyteArray jtarget, jint jtarget_len) {
+void Java_org_rocksdb_RocksIterator_seek0(JNIEnv* env, jobject /*jobj*/,
+                                          jlong handle, jbyteArray jtarget,
+                                          jint jtarget_len) {
   jbyte* target = env->GetByteArrayElements(jtarget, nullptr);
-  if(target == nullptr) {
+  if (target == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
-  rocksdb::Slice target_slice(
-      reinterpret_cast<char*>(target), jtarget_len);
+  rocksdb::Slice target_slice(reinterpret_cast<char*>(target), jtarget_len);
 
   auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
   it->Seek(target_slice);
@@ -99,13 +102,36 @@ void Java_org_rocksdb_RocksIterator_seek0(
   env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
 }
 
+/*
+ * Class:     org_rocksdb_RocksIterator
+ * Method:    seekForPrev0
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_RocksIterator_seekForPrev0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle,
+                                                 jbyteArray jtarget,
+                                                 jint jtarget_len) {
+  jbyte* target = env->GetByteArrayElements(jtarget, nullptr);
+  if (target == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  rocksdb::Slice target_slice(reinterpret_cast<char*>(target), jtarget_len);
+
+  auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
+  it->SeekForPrev(target_slice);
+
+  env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT);
+}
+
 /*
  * Class:     org_rocksdb_RocksIterator
  * Method:    status0
 * Signature: (J)V
 */
-void Java_org_rocksdb_RocksIterator_status0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_RocksIterator_status0(JNIEnv* env, jobject /*jobj*/,
+                                            jlong handle) {
   auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
   rocksdb::Status s = it->status();
@@ -121,18 +147,19 @@ void Java_org_rocksdb_RocksIterator_status0(
  * Method:    key0
 * Signature: (J)[B
 */
-jbyteArray Java_org_rocksdb_RocksIterator_key0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jbyteArray Java_org_rocksdb_RocksIterator_key0(JNIEnv* env, jobject /*jobj*/,
+                                               jlong handle) {
   auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
   rocksdb::Slice key_slice = it->key();
 
   jbyteArray jkey = env->NewByteArray(static_cast<jsize>(key_slice.size()));
-  if(jkey == nullptr) {
+  if (jkey == nullptr) {
     // exception thrown: OutOfMemoryError
     return nullptr;
   }
-  env->SetByteArrayRegion(jkey, 0, static_cast<jsize>(key_slice.size()),
-      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
+  env->SetByteArrayRegion(
+      jkey, 0, static_cast<jsize>(key_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(key_slice.data())));
   return jkey;
 }
 
@@ -141,18 +168,19 @@ jbyteArray Java_org_rocksdb_RocksIterator_key0(
  * Method:    value0
 * Signature: (J)[B
 */
-jbyteArray Java_org_rocksdb_RocksIterator_value0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jbyteArray Java_org_rocksdb_RocksIterator_value0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle) {
   auto* it = reinterpret_cast<rocksdb::Iterator*>(handle);
   rocksdb::Slice value_slice = it->value();
 
   jbyteArray jkeyValue =
       env->NewByteArray(static_cast<jsize>(value_slice.size()));
-  if(jkeyValue == nullptr) {
+  if (jkeyValue == nullptr) {
     // exception thrown: OutOfMemoryError
     return nullptr;
   }
-  env->SetByteArrayRegion(jkeyValue, 0, static_cast<jsize>(value_slice.size()),
-      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
+  env->SetByteArrayRegion(
+      jkeyValue, 0, static_cast<jsize>(value_slice.size()),
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value_slice.data())));
   return jkeyValue;
 }
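
seek0 and the new seekForPrev0 above share one idiom: the Java byte[] is borrowed with GetByteArrayElements, wrapped in a non-owning rocksdb::Slice for the duration of the call, and released with JNI_ABORT so nothing is copied back into the Java array. A rough stand-alone analogue using std::string_view as the non-owning view (names here are illustrative, not part of the library):

    #include <iostream>
    #include <string_view>
    #include <vector>

    // The borrowed buffer is only viewed, never owned and never written back,
    // which is what releasing with JNI_ABORT expresses on the JNI side.
    void seek_like(const std::vector<char>& borrowed_java_bytes) {
      std::string_view target_slice(borrowed_java_bytes.data(),
                                    borrowed_java_bytes.size());
      std::cout << "seek target: " << target_slice << '\n';
    }  // borrow ends here

    int main() {
      std::vector<char> key{'k', 'e', 'y', '1'};
      seek_like(key);
      return 0;
    }
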
diff --git a/thirdparty/rocksdb/java/rocksjni/jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/jnicallback.cc
new file mode 100644
index 0000000000..f72eecd4c2
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/jnicallback.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject
+
+#include <assert.h>
+#include "rocksjni/jnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+JniCallback::JniCallback(JNIEnv* env, jobject jcallback_obj) {
+  // Note: jcallback_obj may be accessed by multiple threads,
+  // so we ref the jvm not the env
+  const jint rs = env->GetJavaVM(&m_jvm);
+  if(rs != JNI_OK) {
+    // exception thrown
+    return;
+  }
+
+  // Note: we may want to access the Java callback object instance
+  // across multiple method calls, so we create a global ref
+  assert(jcallback_obj != nullptr);
+  m_jcallback_obj = env->NewGlobalRef(jcallback_obj);
+  if(jcallback_obj == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+}
+
+JNIEnv* JniCallback::getJniEnv(jboolean* attached) const {
+  return JniUtil::getJniEnv(m_jvm, attached);
+}
+
+void JniCallback::releaseJniEnv(jboolean& attached) const {
+  JniUtil::releaseJniEnv(m_jvm, attached);
+}
+
+JniCallback::~JniCallback() {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  if(m_jcallback_obj != nullptr) {
+    env->DeleteGlobalRef(m_jcallback_obj);
+  }
+
+  releaseJniEnv(attached_thread);
+}
+// @lint-ignore TXT4 T25377293 Grandfathered in
+}  // namespace rocksdb
\ No newline at end of file
diff --git a/thirdparty/rocksdb/java/rocksjni/jnicallback.h b/thirdparty/rocksdb/java/rocksjni/jnicallback.h
new file mode 100644
index 0000000000..940ecf064d
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/jnicallback.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// JNI Callbacks from C++ to sub-classes or org.rocksdb.RocksCallbackObject
+
+#ifndef JAVA_ROCKSJNI_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_JNICALLBACK_H_
+
+#include <jni.h>
+
+namespace rocksdb {
+  class JniCallback {
+   public:
+    JniCallback(JNIEnv* env, jobject jcallback_obj);
+    virtual ~JniCallback();
+
+   protected:
+    JavaVM* m_jvm;
+    jobject m_jcallback_obj;
+    JNIEnv* getJniEnv(jboolean* attached) const;
+    void releaseJniEnv(jboolean& attached) const;
+  };
+}
+
+// @lint-ignore TXT4 T25377293 Grandfathered in
+#endif  // JAVA_ROCKSJNI_JNICALLBACK_H_
\ No newline at end of file
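
JniCallback caches a JavaVM* rather than a JNIEnv* because a JNIEnv* is only valid on the thread that obtained it, while RocksDB may invoke callbacks from any of its background threads. A minimal sketch of what a getJniEnv/releaseJniEnv pair typically does (the standard JNI attach pattern, not the vendored JniUtil code itself):

    #include <jni.h>

    JNIEnv* get_env(JavaVM* jvm, jboolean* attached) {
      *attached = JNI_FALSE;
      JNIEnv* env = nullptr;
      const jint rs = jvm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
      if (rs == JNI_EDETACHED) {
        // This native thread is unknown to the JVM: attach it on demand.
        if (jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), nullptr) ==
            JNI_OK) {
          *attached = JNI_TRUE;
        }
      }
      return env;
    }

    void release_env(JavaVM* jvm, jboolean attached) {
      // Only detach threads this code attached itself; detaching a
      // Java-owned thread would be an error.
      if (attached == JNI_TRUE) {
        jvm->DetachCurrentThread();
      }
    }
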
diff --git a/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.cc b/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.cc
index 09140ed709..61571e9871 100644
--- a/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.cc
+++ b/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.cc
@@ -8,119 +8,102 @@
 
 #include "include/org_rocksdb_Logger.h"
 
-#include "rocksjni/loggerjnicallback.h"
-#include "rocksjni/portal.h"
 #include <cstdarg>
 #include <cstdio>
+#include "rocksjni/loggerjnicallback.h"
+#include "rocksjni/portal.h"
 
 namespace rocksdb {
 
-LoggerJniCallback::LoggerJniCallback(
-    JNIEnv* env, jobject jlogger) {
-  // Note: Logger methods may be accessed by multiple threads,
-  // so we ref the jvm not the env
-  const jint rs = env->GetJavaVM(&m_jvm);
-  if(rs != JNI_OK) {
-    // exception thrown
-    return;
-  }
-
-  // Note: we want to access the Java Logger instance
-  // across multiple method calls, so we create a global ref
-  assert(jlogger != nullptr);
-  m_jLogger = env->NewGlobalRef(jlogger);
-  if(m_jLogger == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return;
-  }
+LoggerJniCallback::LoggerJniCallback(JNIEnv* env, jobject jlogger)
+    : JniCallback(env, jlogger) {
   m_jLogMethodId = LoggerJni::getLogMethodId(env);
-  if(m_jLogMethodId == nullptr) {
+  if (m_jLogMethodId == nullptr) {
     // exception thrown: NoSuchMethodException or OutOfMemoryError
     return;
   }
 
   jobject jdebug_level = InfoLogLevelJni::DEBUG_LEVEL(env);
-  if(jdebug_level == nullptr) {
+  if (jdebug_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jdebug_level = env->NewGlobalRef(jdebug_level);
-  if(m_jdebug_level == nullptr) {
+  if (m_jdebug_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   jobject jinfo_level = InfoLogLevelJni::INFO_LEVEL(env);
-  if(jinfo_level == nullptr) {
+  if (jinfo_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jinfo_level = env->NewGlobalRef(jinfo_level);
-  if(m_jinfo_level == nullptr) {
+  if (m_jinfo_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   jobject jwarn_level = InfoLogLevelJni::WARN_LEVEL(env);
-  if(jwarn_level == nullptr) {
+  if (jwarn_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jwarn_level = env->NewGlobalRef(jwarn_level);
-  if(m_jwarn_level == nullptr) {
+  if (m_jwarn_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   jobject jerror_level = InfoLogLevelJni::ERROR_LEVEL(env);
-  if(jerror_level == nullptr) {
+  if (jerror_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jerror_level = env->NewGlobalRef(jerror_level);
-  if(m_jerror_level == nullptr) {
+  if (m_jerror_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   jobject jfatal_level = InfoLogLevelJni::FATAL_LEVEL(env);
-  if(jfatal_level == nullptr) {
+  if (jfatal_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jfatal_level = env->NewGlobalRef(jfatal_level);
-  if(m_jfatal_level == nullptr) {
+  if (m_jfatal_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   jobject jheader_level = InfoLogLevelJni::HEADER_LEVEL(env);
-  if(jheader_level == nullptr) {
+  if (jheader_level == nullptr) {
     // exception thrown: NoSuchFieldError, ExceptionInInitializerError
     // or OutOfMemoryError
     return;
   }
   m_jheader_level = env->NewGlobalRef(jheader_level);
-  if(m_jheader_level == nullptr) {
+  if (m_jheader_level == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 }
 
-void LoggerJniCallback::Logv(const char* format, va_list ap) {
+void LoggerJniCallback::Logv(const char* /*format*/, va_list /*ap*/) {
   // We implement this method because it is virtual but we don't
   // use it because we need to know about the log level.
 }
 
-void LoggerJniCallback::Logv(const InfoLogLevel log_level,
-    const char* format, va_list ap) {
+void LoggerJniCallback::Logv(const InfoLogLevel log_level, const char* format,
+                             va_list ap) {
   if (GetInfoLogLevel() <= log_level) {
-    // determine InfoLogLevel java enum instance
     jobject jlog_level;
     switch (log_level) {
@@ -153,45 +136,47 @@ void LoggerJniCallback::Logv(const InfoLogLevel log_level,
 
     // pass msg to java callback handler
     jboolean attached_thread = JNI_FALSE;
-    JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+    JNIEnv* env = getJniEnv(&attached_thread);
     assert(env != nullptr);
 
     jstring jmsg = env->NewStringUTF(msg.get());
-    if(jmsg == nullptr) {
+    if (jmsg == nullptr) {
       // unable to construct string
-      if(env->ExceptionCheck()) {
-        env->ExceptionDescribe(); // print out exception to stderr
+      if (env->ExceptionCheck()) {
+        env->ExceptionDescribe();  // print out exception to stderr
       }
-      JniUtil::releaseJniEnv(m_jvm, attached_thread);
+      releaseJniEnv(attached_thread);
       return;
     }
-    if(env->ExceptionCheck()) {
+    if (env->ExceptionCheck()) {
       // exception thrown: OutOfMemoryError
-      env->ExceptionDescribe(); // print out exception to stderr
+      env->ExceptionDescribe();  // print out exception to stderr
       env->DeleteLocalRef(jmsg);
-      JniUtil::releaseJniEnv(m_jvm, attached_thread);
+      releaseJniEnv(attached_thread);
      return;
     }
 
-    env->CallVoidMethod(m_jLogger, m_jLogMethodId, jlog_level, jmsg);
-    if(env->ExceptionCheck()) {
+    env->CallVoidMethod(m_jcallback_obj, m_jLogMethodId, jlog_level, jmsg);
+    if (env->ExceptionCheck()) {
       // exception thrown
-      env->ExceptionDescribe(); // print out exception to stderr
+      env->ExceptionDescribe();  // print out exception to stderr
       env->DeleteLocalRef(jmsg);
-      JniUtil::releaseJniEnv(m_jvm, attached_thread);
+      releaseJniEnv(attached_thread);
       return;
     }
 
    env->DeleteLocalRef(jmsg);
-    JniUtil::releaseJniEnv(m_jvm, attached_thread);
+    releaseJniEnv(attached_thread);
   }
 }
 
-std::unique_ptr<char[]> LoggerJniCallback::format_str(const char* format, va_list ap) const {
+std::unique_ptr<char[]> LoggerJniCallback::format_str(const char* format,
+                                                      va_list ap) const {
   va_list ap_copy;
   va_copy(ap_copy, ap);
-  const size_t required = vsnprintf(nullptr, 0, format, ap_copy) + 1; // Extra space for '\0'
+  const size_t required =
+      vsnprintf(nullptr, 0, format, ap_copy) + 1;  // Extra space for '\0'
  va_end(ap_copy);
 
   std::unique_ptr<char[]> buf(new char[required]);
@@ -202,41 +187,36 @@ std::unique_ptr<char[]> LoggerJniCallback::format_str(const char* format, va_lis
   return buf;
 }
-
 LoggerJniCallback::~LoggerJniCallback() {
   jboolean attached_thread = JNI_FALSE;
-  JNIEnv* env = JniUtil::getJniEnv(m_jvm, &attached_thread);
+  JNIEnv* env = getJniEnv(&attached_thread);
   assert(env != nullptr);
 
-  if(m_jLogger != nullptr) {
-    env->DeleteGlobalRef(m_jLogger);
-  }
-
-  if(m_jdebug_level != nullptr) {
+  if (m_jdebug_level != nullptr) {
     env->DeleteGlobalRef(m_jdebug_level);
   }
 
-  if(m_jinfo_level != nullptr) {
+  if (m_jinfo_level != nullptr) {
    env->DeleteGlobalRef(m_jinfo_level);
   }
 
-  if(m_jwarn_level != nullptr) {
+  if (m_jwarn_level != nullptr) {
     env->DeleteGlobalRef(m_jwarn_level);
   }
 
-  if(m_jerror_level != nullptr) {
+  if (m_jerror_level != nullptr) {
     env->DeleteGlobalRef(m_jerror_level);
   }
 
-  if(m_jfatal_level != nullptr) {
+  if (m_jfatal_level != nullptr) {
     env->DeleteGlobalRef(m_jfatal_level);
   }
 
-  if(m_jheader_level != nullptr) {
+  if (m_jheader_level != nullptr) {
     env->DeleteGlobalRef(m_jheader_level);
   }
 
-  JniUtil::releaseJniEnv(m_jvm, attached_thread);
+  releaseJniEnv(attached_thread);
 }
 
 }  // namespace rocksdb
@@ -246,8 +226,8 @@ LoggerJniCallback::~LoggerJniCallback() {
  * Method:    createNewLoggerOptions
 * Signature: (J)J
 */
-jlong Java_org_rocksdb_Logger_createNewLoggerOptions(
-    JNIEnv* env, jobject jobj, jlong joptions) {
+jlong Java_org_rocksdb_Logger_createNewLoggerOptions(JNIEnv* env, jobject jobj,
+                                                     jlong joptions) {
   auto* sptr_logger = new std::shared_ptr<rocksdb::LoggerJniCallback>(
       new rocksdb::LoggerJniCallback(env, jobj));
@@ -263,10 +243,11 @@ jlong Java_org_rocksdb_Logger_createNewLoggerOptions(
  * Method:    createNewLoggerDbOptions
 * Signature: (J)J
 */
-jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions(
-    JNIEnv* env, jobject jobj, jlong jdb_options) {
+jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions(JNIEnv* env,
+                                                       jobject jobj,
+                                                       jlong jdb_options) {
   auto* sptr_logger = new std::shared_ptr<rocksdb::LoggerJniCallback>(
-      new rocksdb::LoggerJniCallback(env, jobj));
+      new rocksdb::LoggerJniCallback(env, jobj));
 
   // set log level
   auto* db_options = reinterpret_cast<rocksdb::DBOptions*>(jdb_options);
@@ -280,12 +261,12 @@ jlong Java_org_rocksdb_Logger_createNewLoggerDbOptions(
  * Method:    setInfoLogLevel
 * Signature: (JB)V
 */
-void Java_org_rocksdb_Logger_setInfoLogLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) {
+void Java_org_rocksdb_Logger_setInfoLogLevel(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle, jbyte jlog_level) {
   auto* handle =
-      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
-  handle->get()->
-      SetInfoLogLevel(static_cast<rocksdb::InfoLogLevel>(jlog_level));
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback>*>(jhandle);
+  handle->get()->SetInfoLogLevel(
+      static_cast<rocksdb::InfoLogLevel>(jlog_level));
 }
 
 /*
@@ -293,10 +274,10 @@ void Java_org_rocksdb_Logger_setInfoLogLevel(
  * Method:    infoLogLevel
 * Signature: (J)B
 */
-jbyte Java_org_rocksdb_Logger_infoLogLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+jbyte Java_org_rocksdb_Logger_infoLogLevel(JNIEnv* /*env*/, jobject /*jobj*/,
+                                           jlong jhandle) {
   auto* handle =
-      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback>*>(jhandle);
   return static_cast<jbyte>(handle->get()->GetInfoLogLevel());
 }
 
@@ -305,9 +286,9 @@ jbyte Java_org_rocksdb_Logger_infoLogLevel(
  * Method:    disposeInternal
 * Signature: (J)V
 */
-void Java_org_rocksdb_Logger_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+void Java_org_rocksdb_Logger_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong jhandle) {
   auto* handle =
-      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>(jhandle);
+      reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback>*>(jhandle);
   delete handle;  // delete std::shared_ptr
 }
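
format_str above sizes the message with a first vsnprintf(nullptr, 0, ...) call and only then allocates, which avoids both truncation and oversized buffers. A self-contained version of the same two-pass idiom (the variadic wrapper is illustrative; the vendored method takes a va_list directly):

    #include <cstdarg>
    #include <cstdio>
    #include <memory>

    std::unique_ptr<char[]> format_str(const char* format, ...) {
      va_list ap;
      va_start(ap, format);
      va_list ap_copy;
      va_copy(ap_copy, ap);
      // Pass 1: a null buffer makes vsnprintf report the required length only.
      const size_t required = vsnprintf(nullptr, 0, format, ap_copy) + 1;  // '\0'
      va_end(ap_copy);

      // Pass 2: format for real into an exactly-sized buffer.
      std::unique_ptr<char[]> buf(new char[required]);
      vsnprintf(buf.get(), required, format, ap);
      va_end(ap);
      return buf;
    }

    int main() {
      const auto msg = format_str("compacted %d files at level %d", 4, 0);
      std::puts(msg.get());
      return 0;
    }
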
diff --git a/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.h b/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.h
index 2db85975d6..80c5a19833 100644
--- a/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.h
+++ b/thirdparty/rocksdb/java/rocksjni/loggerjnicallback.h
@@ -12,15 +12,16 @@
 #include <jni.h>
 #include <memory>
 #include <string>
+#include "rocksjni/jnicallback.h"
 #include "port/port.h"
 #include "rocksdb/env.h"
 
 namespace rocksdb {
 
-  class LoggerJniCallback : public Logger {
+  class LoggerJniCallback : public JniCallback, public Logger {
    public:
      LoggerJniCallback(JNIEnv* env, jobject jLogger);
-     virtual ~LoggerJniCallback();
+     ~LoggerJniCallback();
 
      using Logger::SetInfoLogLevel;
      using Logger::GetInfoLogLevel;
@@ -34,8 +35,6 @@ namespace rocksdb {
          const char* format, va_list ap);
 
    private:
-    JavaVM* m_jvm;
-    jobject m_jLogger;
     jmethodID m_jLogMethodId;
     jobject m_jdebug_level;
     jobject m_jinfo_level;
diff --git a/thirdparty/rocksdb/java/rocksjni/lru_cache.cc b/thirdparty/rocksdb/java/rocksjni/lru_cache.cc
index 16582689e7..2424bc8e01 100644
--- a/thirdparty/rocksdb/java/rocksjni/lru_cache.cc
+++ b/thirdparty/rocksdb/java/rocksjni/lru_cache.cc
@@ -16,13 +16,14 @@
  * Method:    newLRUCache
 * Signature: (JIZD)J
 */
-jlong Java_org_rocksdb_LRUCache_newLRUCache(
-    JNIEnv* env, jclass jcls, jlong jcapacity, jint jnum_shard_bits,
-    jboolean jstrict_capacity_limit, jdouble jhigh_pri_pool_ratio) {
+jlong Java_org_rocksdb_LRUCache_newLRUCache(JNIEnv* /*env*/, jclass /*jcls*/,
+                                            jlong jcapacity,
+                                            jint jnum_shard_bits,
+                                            jboolean jstrict_capacity_limit,
+                                            jdouble jhigh_pri_pool_ratio) {
   auto* sptr_lru_cache =
       new std::shared_ptr<rocksdb::Cache>(rocksdb::NewLRUCache(
-          static_cast<size_t>(jcapacity),
-          static_cast<int>(jnum_shard_bits),
+          static_cast<size_t>(jcapacity), static_cast<int>(jnum_shard_bits),
           static_cast<bool>(jstrict_capacity_limit),
           static_cast<double>(jhigh_pri_pool_ratio)));
   return reinterpret_cast<jlong>(sptr_lru_cache);
@@ -33,9 +34,10 @@ jlong Java_org_rocksdb_LRUCache_newLRUCache(
  * Method:    disposeInternal
 * Signature: (J)V
 */
-void Java_org_rocksdb_LRUCache_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+void Java_org_rocksdb_LRUCache_disposeInternal(JNIEnv* /*env*/,
+                                               jobject /*jobj*/,
+                                               jlong jhandle) {
   auto* sptr_lru_cache =
-      reinterpret_cast<std::shared_ptr<rocksdb::Cache> *>(jhandle);
+      reinterpret_cast<std::shared_ptr<rocksdb::Cache>*>(jhandle);
   delete sptr_lru_cache;  // delete std::shared_ptr
 }
diff --git a/thirdparty/rocksdb/java/rocksjni/memory_util.cc b/thirdparty/rocksdb/java/rocksjni/memory_util.cc
new file mode 100644
index 0000000000..0438502139
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/memory_util.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <jni.h>
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "include/org_rocksdb_MemoryUtil.h"
+
+#include "rocksjni/portal.h"
+
+#include "rocksdb/utilities/memory_util.h"
+
+
+/*
+ * Class:     org_rocksdb_MemoryUtil
+ * Method:    getApproximateMemoryUsageByType
+ * Signature: ([J[J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_MemoryUtil_getApproximateMemoryUsageByType(
+    JNIEnv *env, jclass /*jclazz*/, jlongArray jdb_handles, jlongArray jcache_handles) {
+
+  std::vector<rocksdb::DB*> dbs;
+  jsize db_handle_count = env->GetArrayLength(jdb_handles);
+  if(db_handle_count > 0) {
+    jlong *ptr_jdb_handles = env->GetLongArrayElements(jdb_handles, nullptr);
+    if (ptr_jdb_handles == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    for (jsize i = 0; i < db_handle_count; i++) {
+      dbs.push_back(reinterpret_cast<rocksdb::DB*>(ptr_jdb_handles[i]));
+    }
+    env->ReleaseLongArrayElements(jdb_handles, ptr_jdb_handles, JNI_ABORT);
+  }
+
+  std::unordered_set<const rocksdb::Cache*> cache_set;
+  jsize cache_handle_count = env->GetArrayLength(jcache_handles);
+  if(cache_handle_count > 0) {
+    jlong *ptr_jcache_handles = env->GetLongArrayElements(jcache_handles, nullptr);
+    if (ptr_jcache_handles == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    for (jsize i = 0; i < cache_handle_count; i++) {
+      auto *cache_ptr =
+          reinterpret_cast<std::shared_ptr<rocksdb::Cache> *>(ptr_jcache_handles[i]);
+      cache_set.insert(cache_ptr->get());
+    }
+    env->ReleaseLongArrayElements(jcache_handles, ptr_jcache_handles, JNI_ABORT);
+  }
+
+  std::map<rocksdb::MemoryUtil::UsageType, uint64_t> usage_by_type;
+  if(rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, &usage_by_type) != rocksdb::Status::OK()) {
+    // Non-OK status
+    return nullptr;
+  }
+
+  jobject jusage_by_type = rocksdb::HashMapJni::construct(
+      env, static_cast<uint32_t>(usage_by_type.size()));
+  if (jusage_by_type == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+  const rocksdb::HashMapJni::FnMapKV<const rocksdb::MemoryUtil::UsageType, const uint64_t>
+      fn_map_kv =
+      [env](const std::pair<rocksdb::MemoryUtil::UsageType, uint64_t>& pair) {
+        // Construct key
+        const jobject jusage_type =
+            rocksdb::ByteJni::valueOf(env, rocksdb::MemoryUsageTypeJni::toJavaMemoryUsageType(pair.first));
+        if (jusage_type == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        // Construct value
+        const jobject jusage_value =
+            rocksdb::LongJni::valueOf(env, pair.second);
+        if (jusage_value == nullptr) {
+          // an error occurred
+          return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+        }
+        // Construct and return pointer to pair of jobjects
+        return std::unique_ptr<std::pair<jobject, jobject>>(
+            new std::pair<jobject, jobject>(jusage_type,
+                                            jusage_value));
+      };
+
+  if (!rocksdb::HashMapJni::putAll(env, jusage_by_type, usage_by_type.begin(),
+                                   usage_by_type.end(), fn_map_kv)) {
+    // exception occcurred
+    jusage_by_type = nullptr;
+  }
+
+  return jusage_by_type;
+
+}
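
The new MemoryUtil binding above is a thin wrapper over the C++ utility of the same name; the JNI code's main job is marshalling handles in and a java.util.HashMap out. Roughly, the underlying call looks like this (a usage sketch, assuming the build ships rocksdb/utilities/memory_util.h):

    #include <cstdint>
    #include <map>
    #include <unordered_set>
    #include <vector>

    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/utilities/memory_util.h"

    rocksdb::Status report_memory_usage(
        const std::vector<rocksdb::DB*>& dbs,
        const std::unordered_set<const rocksdb::Cache*>& caches) {
      std::map<rocksdb::MemoryUtil::UsageType, uint64_t> usage_by_type;
      // Fills usage_by_type with entries such as kMemTableTotal and
      // kTableReadersTotal, in bytes.
      return rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(
          dbs, caches, &usage_by_type);
    }
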
-#include "rocksjni/portal.h" -#include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h" -#include "include/org_rocksdb_VectorMemTableConfig.h" +#include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_SkipListMemTableConfig.h" +#include "include/org_rocksdb_VectorMemTableConfig.h" #include "rocksdb/memtablerep.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_HashSkipListMemTableConfig @@ -18,13 +18,12 @@ * Signature: (JII)J */ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject jobj, jlong jbucket_count, - jint jheight, jint jbranching_factor) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); + JNIEnv* env, jobject /*jobj*/, jlong jbucket_count, jint jheight, + jint jbranching_factor) { + rocksdb::Status s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jbucket_count); if (s.ok()) { return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - static_cast(jbucket_count), - static_cast(jheight), + static_cast(jbucket_count), static_cast(jheight), static_cast(jbranching_factor))); } rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); @@ -37,13 +36,13 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( * Signature: (JJIZI)J */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject jobj, jlong jbucket_count, jlong jhuge_page_tlb_size, - jint jbucket_entries_logging_threshold, + JNIEnv* env, jobject /*jobj*/, jlong jbucket_count, + jlong jhuge_page_tlb_size, jint jbucket_entries_logging_threshold, jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) { rocksdb::Status statusBucketCount = - rocksdb::check_if_jlong_fits_size_t(jbucket_count); + rocksdb::JniUtil::check_if_jlong_fits_size_t(jbucket_count); rocksdb::Status statusHugePageTlb = - rocksdb::check_if_jlong_fits_size_t(jhuge_page_tlb_size); + rocksdb::JniUtil::check_if_jlong_fits_size_t(jhuge_page_tlb_size); if (statusBucketCount.ok() && statusHugePageTlb.ok()) { return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( static_cast(jbucket_count), @@ -52,8 +51,8 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( static_cast(jif_log_bucket_dist_when_flash), static_cast(jthreshold_use_skiplist))); } - rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, - !statusBucketCount.ok()?statusBucketCount:statusHugePageTlb); + rocksdb::IllegalArgumentExceptionJni::ThrowNew( + env, !statusBucketCount.ok() ? 
 
@@ -63,11 +62,11 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
 * Signature: (J)J
 */
 jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
-    JNIEnv* env, jobject jobj, jlong jreserved_size) {
-  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jreserved_size);
+    JNIEnv* env, jobject /*jobj*/, jlong jreserved_size) {
+  rocksdb::Status s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jreserved_size);
   if (s.ok()) {
-    return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
-        static_cast<size_t>(jreserved_size)));
+    return reinterpret_cast<jlong>(
+        new rocksdb::VectorRepFactory(static_cast<size_t>(jreserved_size)));
   }
   rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
   return 0;
@@ -79,11 +78,11 @@ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
 * Signature: (J)J
 */
 jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0(
-    JNIEnv* env, jobject jobj, jlong jlookahead) {
-  rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jlookahead);
+    JNIEnv* env, jobject /*jobj*/, jlong jlookahead) {
+  rocksdb::Status s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jlookahead);
   if (s.ok()) {
-    return reinterpret_cast<jlong>(new rocksdb::SkipListFactory(
-        static_cast<size_t>(jlookahead)));
+    return reinterpret_cast<jlong>(
+        new rocksdb::SkipListFactory(static_cast<size_t>(jlookahead)));
   }
   rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
   return 0;
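
Every memtable factory above funnels its Java long through JniUtil::check_if_jlong_fits_size_t before narrowing, because jlong is a signed 64-bit value while size_t may be only 32 bits wide. The check amounts to this (an illustrative restatement, not the vendored helper):

    #include <cstdint>
    #include <limits>

    // A jlong (int64_t) fits in size_t only if it is non-negative and no
    // larger than SIZE_MAX; on 32-bit targets the second test does real work.
    bool jlong_fits_size_t(int64_t jvalue) {
      return jvalue >= 0 &&
             static_cast<uint64_t>(jvalue) <=
                 static_cast<uint64_t>(std::numeric_limits<size_t>::max());
    }
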
diff --git a/thirdparty/rocksdb/java/rocksjni/merge_operator.cc b/thirdparty/rocksdb/java/rocksjni/merge_operator.cc
index 1b94382ef0..e06a06f7e3 100644
--- a/thirdparty/rocksdb/java/rocksjni/merge_operator.cc
+++ b/thirdparty/rocksdb/java/rocksjni/merge_operator.cc
@@ -6,32 +6,33 @@
 // This file implements the "bridge" between Java and C++
 // for rocksdb::MergeOperator.
 
+#include <jni.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <memory>
-#include <jni.h>
 #include <string>
 
 #include "include/org_rocksdb_StringAppendOperator.h"
-#include "rocksjni/portal.h"
+#include "include/org_rocksdb_UInt64AddOperator.h"
 #include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
 #include "rocksdb/options.h"
+#include "rocksdb/slice_transform.h"
 #include "rocksdb/statistics.h"
-#include "rocksdb/memtablerep.h"
 #include "rocksdb/table.h"
-#include "rocksdb/slice_transform.h"
-#include "rocksdb/merge_operator.h"
+#include "rocksjni/portal.h"
 #include "utilities/merge_operators.h"
 
 /*
  * Class:     org_rocksdb_StringAppendOperator
  * Method:    newSharedStringAppendOperator
- * Signature: ()J
+ * Signature: (C)J
 */
-jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator
-(JNIEnv* env, jclass jclazz) {
+jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator(
+    JNIEnv* /*env*/, jclass /*jclazz*/, jchar jdelim) {
   auto* sptr_string_append_op = new std::shared_ptr<rocksdb::MergeOperator>(
-      rocksdb::MergeOperators::CreateFromStringId("stringappend"));
+      rocksdb::MergeOperators::CreateStringAppendOperator((char)jdelim));
   return reinterpret_cast<jlong>(sptr_string_append_op);
 }
 
@@ -40,9 +41,35 @@ jlong Java_org_rocksdb_StringAppendOperator_newSharedStringAppendOperator
  * Method:    disposeInternal
 * Signature: (J)V
 */
-void Java_org_rocksdb_StringAppendOperator_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+void Java_org_rocksdb_StringAppendOperator_disposeInternal(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
   auto* sptr_string_append_op =
-      reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>* >(jhandle);
+      reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>(jhandle);
   delete sptr_string_append_op;  // delete std::shared_ptr
 }
+
+/*
+ * Class:     org_rocksdb_UInt64AddOperator
+ * Method:    newSharedUInt64AddOperator
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_UInt64AddOperator_newSharedUInt64AddOperator(
+    JNIEnv* /*env*/, jclass /*jclazz*/) {
+  auto* sptr_uint64_add_op = new std::shared_ptr<rocksdb::MergeOperator>(
+      rocksdb::MergeOperators::CreateUInt64AddOperator());
+  return reinterpret_cast<jlong>(sptr_uint64_add_op);
+}
+
+/*
+ * Class:     org_rocksdb_UInt64AddOperator
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_UInt64AddOperator_disposeInternal(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* sptr_uint64_add_op =
+      reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>(jhandle);
+  delete sptr_uint64_add_op;  // delete std::shared_ptr
+}
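
The signature change from ()J to (C)J above means the Java side now passes its delimiter down instead of settling for the registry default: CreateFromStringId("stringappend") is hard-wired to ',', while CreateStringAppendOperator takes the character explicitly (the diff itself calls it with (char)jdelim). Configuring the same operator directly from C++ looks like:

    #include "rocksdb/options.h"
    #include "utilities/merge_operators.h"

    // Sketch: attach a string-append merge operator that joins values with
    // the given delimiter, mirroring the new JNI entry point.
    void use_string_append(rocksdb::Options* options, char delim) {
      options->merge_operator =
          rocksdb::MergeOperators::CreateStringAppendOperator(delim);
    }
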
+ +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" + +#include "include/org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper.h" + +namespace rocksdb { + +class NativeComparatorWrapperTestStringComparator : public Comparator { + const char* Name() const { + return "NativeComparatorWrapperTestStringComparator"; + } + + int Compare(const Slice& a, const Slice& b) const { + return a.ToString().compare(b.ToString()); + } + + void FindShortestSeparator(std::string* /*start*/, + const Slice& /*limit*/) const { + return; + } + + void FindShortSuccessor(std::string* /*key*/) const { return; } +}; +} // namespace rocksdb + +/* + * Class: org_rocksdb_NativeComparatorWrapperTest_NativeStringComparatorWrapper + * Method: newStringComparator + * Signature: ()J + */ +jlong Java_org_rocksdb_NativeComparatorWrapperTest_00024NativeStringComparatorWrapper_newStringComparator( + JNIEnv* /*env*/, jobject /*jobj*/) { + auto* comparator = new rocksdb::NativeComparatorWrapperTestStringComparator(); + return reinterpret_cast(comparator); +} diff --git a/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_db.cc b/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_db.cc new file mode 100644 index 0000000000..1505ff9895 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_db.cc @@ -0,0 +1,278 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::TransactionDB. + +#include + +#include "include/org_rocksdb_OptimisticTransactionDB.h" + +#include "rocksdb/options.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" + +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_OptimisticTransactionDB + * Method: open + * Signature: (JLjava/lang/String;)J + */ +jlong Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2( + JNIEnv* env, jclass, jlong joptions_handle, jstring jdb_path) { + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + auto* options = reinterpret_cast(joptions_handle); + rocksdb::OptimisticTransactionDB* otdb = nullptr; + rocksdb::Status s = + rocksdb::OptimisticTransactionDB::Open(*options, db_path, &otdb); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + return reinterpret_cast(otdb); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; + } +} + +/* + * Class: org_rocksdb_OptimisticTransactionDB + * Method: open + * Signature: (JLjava/lang/String;[[B[J)[J + */ +jlongArray +Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J( + JNIEnv* env, jclass, jlong jdb_options_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options_handles) { + const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); + if (db_path == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + std::vector column_families; + const jsize len_cols = env->GetArrayLength(jcolumn_names); + if (len_cols > 0) { + if (env->EnsureLocalCapacity(len_cols) != 0) { + // out of memory + env->ReleaseStringUTFChars(jdb_path, db_path); + return nullptr; + } + + jlong* jco = 
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    open
+ * Signature: (JLjava/lang/String;[[B[J)[J
+ */
+jlongArray
+Java_org_rocksdb_OptimisticTransactionDB_open__JLjava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jdb_options_handle, jstring jdb_path,
+    jobjectArray jcolumn_names, jlongArray jcolumn_options_handles) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  if (len_cols > 0) {
+    if (env->EnsureLocalCapacity(len_cols) != 0) {
+      // out of memory
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+
+    jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr);
+    if (jco == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+
+    for (int i = 0; i < len_cols; i++) {
+      const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+        env->ReleaseStringUTFChars(jdb_path, db_path);
+        return nullptr;
+      }
+
+      const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn);
+      const jsize jcf_name_len = env->GetArrayLength(jcn_ba);
+      if (env->EnsureLocalCapacity(jcf_name_len) != 0) {
+        // out of memory
+        env->DeleteLocalRef(jcn);
+        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+        env->ReleaseStringUTFChars(jdb_path, db_path);
+        return nullptr;
+      }
+
+      jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr);
+      if (jcf_name == nullptr) {
+        // exception thrown: OutOfMemoryError
+        env->DeleteLocalRef(jcn);
+        env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+        env->ReleaseStringUTFChars(jdb_path, db_path);
+        return nullptr;
+      }
+
+      const std::string cf_name(reinterpret_cast<char*>(jcf_name),
+                                jcf_name_len);
+      const rocksdb::ColumnFamilyOptions* cf_options =
+          reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[i]);
+      column_families.push_back(
+          rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options));
+
+      env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT);
+      env->DeleteLocalRef(jcn);
+    }
+    env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+  }
+
+  auto* db_options = reinterpret_cast<rocksdb::DBOptions*>(jdb_options_handle);
+  std::vector<rocksdb::ColumnFamilyHandle*> handles;
+  rocksdb::OptimisticTransactionDB* otdb = nullptr;
+  const rocksdb::Status s = rocksdb::OptimisticTransactionDB::Open(
+      *db_options, db_path, column_families, &handles, &otdb);
+
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  // check if open operation was successful
+  if (s.ok()) {
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+    std::unique_ptr<jlong[]> results =
+        std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+    results[0] = reinterpret_cast<jlong>(otdb);
+    for (int i = 1; i <= len_cols; i++) {
+      results[i] = reinterpret_cast<jlong>(handles[i - 1]);
+    }
+
+    jlongArray jresults = env->NewLongArray(resultsLen);
+    if (jresults == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return nullptr;
+    }
+    return jresults;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  return nullptr;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionDB_disposeInternal(
+    JNIEnv *, jobject, jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  assert(optimistic_txn_db != nullptr);
+  delete optimistic_txn_db;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionDB_closeDatabase(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  assert(optimistic_txn_db != nullptr);
+  rocksdb::Status s = optimistic_txn_db->Close();
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  rocksdb::Transaction* txn =
+      optimistic_txn_db->BeginTransaction(*write_options);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction__JJJ(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jwrite_options_handle, jlong joptimistic_txn_options_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* optimistic_txn_options =
+      reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(
+          joptimistic_txn_options_handle);
+  rocksdb::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, *optimistic_txn_options);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jold_txn_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* old_txn = reinterpret_cast<rocksdb::Transaction*>(jold_txn_handle);
+  rocksdb::OptimisticTransactionOptions optimistic_txn_options;
+  rocksdb::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, optimistic_txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_optimistic_txn
+  assert(txn == old_txn);
+
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    beginTransaction_withOld
+ * Signature: (JJJJ)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_beginTransaction_1withOld__JJJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong joptimistic_txn_options_handle, jlong jold_txn_handle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* optimistic_txn_options =
+      reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(
+          joptimistic_txn_options_handle);
+  auto* old_txn = reinterpret_cast<rocksdb::Transaction*>(jold_txn_handle);
+  rocksdb::Transaction* txn = optimistic_txn_db->BeginTransaction(
+      *write_options, *optimistic_txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_optimisic_txn
+  assert(txn == old_txn);
+
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionDB
+ * Method:    getBaseDB
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionDB_getBaseDB(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* optimistic_txn_db =
+      reinterpret_cast<rocksdb::OptimisticTransactionDB*>(jhandle);
+  return reinterpret_cast<jlong>(optimistic_txn_db->GetBaseDB());
+}
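
The bindings above mirror the C++ OptimisticTransactionDB API one-to-one, including the asserted guarantee that BeginTransaction reuses (rather than reallocates) a transaction passed in as old_txn. A hedged end-to-end usage sketch of that C++ API:

    #include <cassert>
    #include <string>

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/optimistic_transaction_db.h"
    #include "rocksdb/utilities/transaction.h"

    void optimistic_example(const std::string& path) {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::OptimisticTransactionDB* otdb = nullptr;
      rocksdb::Status s =
          rocksdb::OptimisticTransactionDB::Open(options, path, &otdb);
      assert(s.ok());

      rocksdb::WriteOptions write_options;
      rocksdb::Transaction* txn = otdb->BeginTransaction(write_options);
      txn->Put("key", "value");
      s = txn->Commit();  // under optimistic concurrency, conflicts surface here
      delete txn;
      delete otdb;
    }
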
diff --git a/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_options.cc b/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_options.cc
new file mode 100644
index 0000000000..3eee446673
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/optimistic_transaction_options.cc
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::OptimisticTransactionOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_OptimisticTransactionOptions.h"
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    newOptimisticTransactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_OptimisticTransactionOptions_newOptimisticTransactionOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  rocksdb::OptimisticTransactionOptions* opts =
+      new rocksdb::OptimisticTransactionOptions();
+  return reinterpret_cast<jlong>(opts);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    isSetSnapshot
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_OptimisticTransactionOptions_isSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts =
+      reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle);
+  return opts->set_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    setSetSnapshot
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_setSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) {
+  auto* opts =
+      reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle);
+  opts->set_snapshot = jset_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    setComparator
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_setComparator(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jcomparator_handle) {
+  auto* opts =
+      reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle);
+  opts->cmp = reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+}
+
+/*
+ * Class:     org_rocksdb_OptimisticTransactionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_OptimisticTransactionOptions_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  delete reinterpret_cast<rocksdb::OptimisticTransactionOptions*>(jhandle);
+}
+#include #include #include -#include #include #include -#include "include/org_rocksdb_Options.h" -#include "include/org_rocksdb_DBOptions.h" #include "include/org_rocksdb_ColumnFamilyOptions.h" -#include "include/org_rocksdb_WriteOptions.h" -#include "include/org_rocksdb_ReadOptions.h" #include "include/org_rocksdb_ComparatorOptions.h" +#include "include/org_rocksdb_DBOptions.h" #include "include/org_rocksdb_FlushOptions.h" +#include "include/org_rocksdb_Options.h" +#include "include/org_rocksdb_ReadOptions.h" +#include "include/org_rocksdb_WriteOptions.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" #include "rocksjni/statisticsjni.h" +#include "rocksjni/table_filter_jnicallback.h" +#include "rocksdb/comparator.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/options.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/slice_transform.h" #include "rocksdb/statistics.h" -#include "rocksdb/memtablerep.h" #include "rocksdb/table.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/rate_limiter.h" -#include "rocksdb/comparator.h" -#include "rocksdb/convenience.h" -#include "rocksdb/merge_operator.h" #include "utilities/merge_operators.h" /* @@ -40,7 +41,8 @@ * Method: newOptions * Signature: ()J */ -jlong Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jclass jcls) { +jlong Java_org_rocksdb_Options_newOptions__( + JNIEnv*, jclass) { auto* op = new rocksdb::Options(); return reinterpret_cast(op); } @@ -50,22 +52,34 @@ jlong Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jclass jcls) { * Method: newOptions * Signature: (JJ)J */ -jlong Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jclass jcls, - jlong jdboptions, jlong jcfoptions) { +jlong Java_org_rocksdb_Options_newOptions__JJ( + JNIEnv*, jclass, jlong jdboptions, jlong jcfoptions) { auto* dbOpt = reinterpret_cast(jdboptions); - auto* cfOpt = reinterpret_cast( - jcfoptions); + auto* cfOpt = + reinterpret_cast(jcfoptions); auto* op = new rocksdb::Options(*dbOpt, *cfOpt); return reinterpret_cast(op); } +/* + * Class: org_rocksdb_Options + * Method: copyOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_copyOptions( + JNIEnv*, jclass, jlong jhandle) { + auto new_opt = + new rocksdb::Options(*(reinterpret_cast(jhandle))); + return reinterpret_cast(new_opt); +} + /* * Class: org_rocksdb_Options * Method: disposeInternal * Signature: (J)V */ void Java_org_rocksdb_Options_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { + JNIEnv*, jobject, jlong handle) { auto* op = reinterpret_cast(handle); assert(op != nullptr); delete op; @@ -77,9 +91,9 @@ void Java_org_rocksdb_Options_disposeInternal( * Signature: (JI)V */ void Java_org_rocksdb_Options_setIncreaseParallelism( - JNIEnv * env, jobject jobj, jlong jhandle, jint totalThreads) { - reinterpret_cast - (jhandle)->IncreaseParallelism(static_cast(totalThreads)); + JNIEnv*, jobject, jlong jhandle, jint totalThreads) { + reinterpret_cast(jhandle)->IncreaseParallelism( + static_cast(totalThreads)); } /* @@ -88,7 +102,7 @@ void Java_org_rocksdb_Options_setIncreaseParallelism( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setCreateIfMissing( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + JNIEnv*, jobject, jlong jhandle, jboolean flag) { reinterpret_cast(jhandle)->create_if_missing = flag; } @@ -98,7 +112,7 @@ void Java_org_rocksdb_Options_setCreateIfMissing( * Signature: (J)Z */ jboolean 
 Java_org_rocksdb_Options_createIfMissing(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   return reinterpret_cast<rocksdb::Options*>(jhandle)->create_if_missing;
 }
 
@@ -108,9 +122,9 @@ jboolean Java_org_rocksdb_Options_createIfMissing(
 * Signature: (JZ)V
 */
 void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) {
-  reinterpret_cast<rocksdb::Options*>
-      (jhandle)->create_missing_column_families = flag;
+    JNIEnv*, jobject, jlong jhandle, jboolean flag) {
+  reinterpret_cast<rocksdb::Options*>(jhandle)->create_missing_column_families =
+      flag;
 }
 
 /*
@@ -119,9 +133,9 @@ void Java_org_rocksdb_Options_setCreateMissingColumnFamilies(
 * Signature: (J)Z
 */
 jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>
-      (jhandle)->create_missing_column_families;
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)
+      ->create_missing_column_families;
 }
 
 /*
@@ -130,7 +144,7 @@ jboolean Java_org_rocksdb_Options_createMissingColumnFamilies(
 * Signature: (JI)V
 */
 void Java_org_rocksdb_Options_setComparatorHandle__JI(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) {
+    JNIEnv*, jobject, jlong jhandle, jint builtinComparator) {
   switch (builtinComparator) {
     case 1:
       reinterpret_cast<rocksdb::Options*>(jhandle)->comparator =
@@ -146,12 +160,32 @@ void Java_org_rocksdb_Options_setComparatorHandle__JI(
 /*
  * Class:     org_rocksdb_Options
  * Method:    setComparatorHandle
- * Signature: (JJ)V
- */
-void Java_org_rocksdb_Options_setComparatorHandle__JJ(
-    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) {
-  reinterpret_cast<rocksdb::Options*>(jopt_handle)->comparator =
-      reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+ * Signature: (JJB)V
+ */
+void Java_org_rocksdb_Options_setComparatorHandle__JJB(
+    JNIEnv*, jobject, jlong jopt_handle, jlong jcomparator_handle,
+    jbyte jcomparator_type) {
+  rocksdb::Comparator* comparator = nullptr;
+  switch (jcomparator_type) {
+    // JAVA_COMPARATOR
+    case 0x0:
+      comparator =
+          reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle);
+      break;
+
+    // JAVA_DIRECT_COMPARATOR
+    case 0x1:
+      comparator = reinterpret_cast<rocksdb::DirectComparatorJniCallback*>(
+          jcomparator_handle);
+      break;
+
+    // JAVA_NATIVE_COMPARATOR_WRAPPER
+    case 0x2:
+      comparator = reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle);
+      break;
+  }
+  auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
+  opt->comparator = comparator;
 }
 
 /*
@@ -160,16 +194,16 @@ void Java_org_rocksdb_Options_setComparatorHandle__JJ(
 * Signature: (JJjava/lang/String)V
 */
 void Java_org_rocksdb_Options_setMergeOperatorName(
-    JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) {
+    JNIEnv* env, jobject, jlong jhandle, jstring jop_name) {
   const char* op_name = env->GetStringUTFChars(jop_name, nullptr);
-  if(op_name == nullptr) {
+  if (op_name == nullptr) {
     // exception thrown: OutOfMemoryError
     return;
   }
 
   auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
-  options->merge_operator = rocksdb::MergeOperators::CreateFromStringId(
-      op_name);
+  options->merge_operator =
+      rocksdb::MergeOperators::CreateFromStringId(op_name);
 
   env->ReleaseStringUTFChars(jop_name, op_name);
 }
@@ -180,10 +214,38 @@ void Java_org_rocksdb_Options_setMergeOperatorName(
 * Signature: (JJjava/lang/String)V
 */
 void Java_org_rocksdb_Options_setMergeOperator(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) {
+    JNIEnv*, jobject, jlong jhandle, jlong mergeOperatorHandle) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->merge_operator =
-      *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>
-          (mergeOperatorHandle));
+      *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>(
+          mergeOperatorHandle));
+}
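
setMergeOperatorName above resolves the operator through the MergeOperators registry, so any string id the registry knows can be configured from Java by name. The C++ equivalent (the ids named in the comment are examples, not an exhaustive list):

    #include <string>

    #include "rocksdb/options.h"
    #include "utilities/merge_operators.h"

    // Sketch: ids such as "stringappend" or "uint64add" resolve to operators;
    // unknown ids yield a null shared_ptr, leaving merge unsupported.
    void set_merge_operator_by_name(rocksdb::Options* options,
                                    const std::string& op_name) {
      options->merge_operator =
          rocksdb::MergeOperators::CreateFromStringId(op_name);
    }
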
+} + +/* + * Class: org_rocksdb_Options + * Method: setCompactionFilterHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setCompactionFilterHandle( + JNIEnv*, jobject, jlong jopt_handle, + jlong jcompactionfilter_handle) { + reinterpret_cast(jopt_handle)-> + compaction_filter = reinterpret_cast + (jcompactionfilter_handle); +} + +/* + * Class: org_rocksdb_Options + * Method: setCompactionFilterFactoryHandle + * Signature: (JJ)V + */ +void JNICALL Java_org_rocksdb_Options_setCompactionFilterFactoryHandle( + JNIEnv*, jobject, jlong jopt_handle, + jlong jcompactionfilterfactory_handle) { + auto* cff_factory = + reinterpret_cast *>( + jcompactionfilterfactory_handle); + reinterpret_cast(jopt_handle)-> + compaction_filter_factory = *cff_factory; } /* @@ -192,8 +254,9 @@ void Java_org_rocksdb_Options_setMergeOperator( * Signature: (JJ)I */ void Java_org_rocksdb_Options_setWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + JNIEnv* env, jobject, jlong jhandle, jlong jwrite_buffer_size) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jwrite_buffer_size); if (s.ok()) { reinterpret_cast(jhandle)->write_buffer_size = jwrite_buffer_size; @@ -202,13 +265,27 @@ void Java_org_rocksdb_Options_setWriteBufferSize( } } +/* + * Class: org_rocksdb_Options + * Method: setWriteBufferManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setWriteBufferManager( + JNIEnv*, jobject, jlong joptions_handle, + jlong jwrite_buffer_manager_handle) { + auto* write_buffer_manager = + reinterpret_cast *>(jwrite_buffer_manager_handle); + reinterpret_cast(joptions_handle)->write_buffer_manager = + *write_buffer_manager; +} + /* * Class: org_rocksdb_Options * Method: writeBufferSize * Signature: (J)J */ jlong Java_org_rocksdb_Options_writeBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->write_buffer_size; } @@ -218,9 +295,10 @@ jlong Java_org_rocksdb_Options_writeBufferSize( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( - JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) { + JNIEnv*, jobject, jlong jhandle, + jint jmax_write_buffer_number) { reinterpret_cast(jhandle)->max_write_buffer_number = - jmax_write_buffer_number; + jmax_write_buffer_number; } /* @@ -229,11 +307,10 @@ void Java_org_rocksdb_Options_setMaxWriteBufferNumber( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setStatistics( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jstatistics_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jstatistics_handle) { auto* opt = reinterpret_cast(jhandle); - auto* pSptr = - reinterpret_cast*>( - jstatistics_handle); + auto* pSptr = reinterpret_cast*>( + jstatistics_handle); opt->statistics = *pSptr; } @@ -243,7 +320,7 @@ void Java_org_rocksdb_Options_setStatistics( * Signature: (J)J */ jlong Java_org_rocksdb_Options_statistics( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); std::shared_ptr sptr = opt->statistics; if (sptr == nullptr) { @@ -261,7 +338,7 @@ jlong Java_org_rocksdb_Options_statistics( * Signature: (J)I */ jint Java_org_rocksdb_Options_maxWriteBufferNumber( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_write_buffer_number; } @@ -271,7 +348,7 @@ jint 
 Java_org_rocksdb_Options_maxWriteBufferNumber(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
 }
 
@@ -271,7 +348,7 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber(
 * Signature: (J)Z
 */
 jboolean Java_org_rocksdb_Options_errorIfExists(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   return reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists;
 }
 
@@ -281,7 +358,7 @@ jboolean Java_org_rocksdb_Options_errorIfExists(
 * Signature: (JZ)V
 */
 void Java_org_rocksdb_Options_setErrorIfExists(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) {
+    JNIEnv*, jobject, jlong jhandle, jboolean error_if_exists) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->error_if_exists =
       static_cast<bool>(error_if_exists);
 }
@@ -292,7 +369,7 @@ void Java_org_rocksdb_Options_setErrorIfExists(
 * Signature: (J)Z
 */
 jboolean Java_org_rocksdb_Options_paranoidChecks(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   return reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks;
 }
 
@@ -302,7 +379,7 @@ jboolean Java_org_rocksdb_Options_paranoidChecks(
 * Signature: (JZ)V
 */
 void Java_org_rocksdb_Options_setParanoidChecks(
-    JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) {
+    JNIEnv*, jobject, jlong jhandle, jboolean paranoid_checks) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_checks =
       static_cast<bool>(paranoid_checks);
 }
@@ -313,7 +390,7 @@ void Java_org_rocksdb_Options_setParanoidChecks(
 * Signature: (JJ)V
 */
 void Java_org_rocksdb_Options_setEnv(
-    JNIEnv* env, jobject jobj, jlong jhandle, jlong jenv) {
+    JNIEnv*, jobject, jlong jhandle, jlong jenv) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->env =
       reinterpret_cast<rocksdb::Env*>(jenv);
 }
@@ -324,8 +401,7 @@ void Java_org_rocksdb_Options_setEnv(
 * Signature: (JJ)V
 */
 void Java_org_rocksdb_Options_setMaxTotalWalSize(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jlong jmax_total_wal_size) {
+    JNIEnv*, jobject, jlong jhandle, jlong jmax_total_wal_size) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->max_total_wal_size =
       static_cast<jlong>(jmax_total_wal_size);
 }
@@ -336,9 +412,8 @@ void Java_org_rocksdb_Options_setMaxTotalWalSize(
 * Signature: (J)J
 */
 jlong Java_org_rocksdb_Options_maxTotalWalSize(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
-  return reinterpret_cast<rocksdb::Options*>(jhandle)->
-      max_total_wal_size;
+    JNIEnv*, jobject, jlong jhandle) {
+  return reinterpret_cast<rocksdb::Options*>(jhandle)->max_total_wal_size;
 }
 
 /*
@@ -347,7 +422,7 @@ jlong Java_org_rocksdb_Options_maxTotalWalSize(
 * Signature: (J)I
 */
 jint Java_org_rocksdb_Options_maxOpenFiles(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   return reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files;
 }
 
@@ -357,7 +432,7 @@ jint Java_org_rocksdb_Options_maxOpenFiles(
 * Signature: (JI)V
 */
 void Java_org_rocksdb_Options_setMaxOpenFiles(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) {
+    JNIEnv*, jobject, jlong jhandle, jint max_open_files) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files =
       static_cast<int>(max_open_files);
 }
@@ -368,7 +443,7 @@ void Java_org_rocksdb_Options_setMaxOpenFiles(
 * Signature: (JI)V
 */
 void Java_org_rocksdb_Options_setMaxFileOpeningThreads(
-    JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_file_opening_threads) {
+    JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) {
   reinterpret_cast<rocksdb::Options*>(jhandle)->max_file_opening_threads =
       static_cast<int>(jmax_file_opening_threads);
 }
@@ -379,7 +454,7 @@ void Java_org_rocksdb_Options_setMaxFileOpeningThreads(
 * Signature: (J)I
 */
 jint Java_org_rocksdb_Options_maxFileOpeningThreads(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle);
   return static_cast<jint>(opt->max_file_opening_threads);
 }
@@ jint Java_org_rocksdb_Options_maxFileOpeningThreads( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_useFsync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->use_fsync; } @@ -400,7 +475,7 @@ jboolean Java_org_rocksdb_Options_useFsync( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setUseFsync( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) { + JNIEnv*, jobject, jlong jhandle, jboolean use_fsync) { reinterpret_cast(jhandle)->use_fsync = static_cast(use_fsync); } @@ -411,34 +486,32 @@ void Java_org_rocksdb_Options_setUseFsync( * Signature: (J[Ljava/lang/String;[J)V */ void Java_org_rocksdb_Options_setDbPaths( - JNIEnv* env, jobject jobj, jlong jhandle, jobjectArray jpaths, + JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { std::vector db_paths; jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); - if(ptr_jtarget_size == nullptr) { - // exception thrown: OutOfMemoryError - return; + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; } jboolean has_exception = JNI_FALSE; const jsize len = env->GetArrayLength(jpaths); - for(jsize i = 0; i < len; i++) { - jobject jpath = reinterpret_cast(env-> - GetObjectArrayElement(jpaths, i)); - if(env->ExceptionCheck()) { + for (jsize i = 0; i < len; i++) { + jobject jpath = + reinterpret_cast(env->GetObjectArrayElement(jpaths, i)); + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } - std::string path = rocksdb::JniUtil::copyString( + std::string path = rocksdb::JniUtil::copyStdString( env, static_cast(jpath), &has_exception); env->DeleteLocalRef(jpath); - if(has_exception == JNI_TRUE) { - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); - return; + if (has_exception == JNI_TRUE) { + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; } jlong jtarget_size = ptr_jtarget_size[i]; @@ -459,7 +532,7 @@ void Java_org_rocksdb_Options_setDbPaths( * Signature: (J)J */ jlong Java_org_rocksdb_Options_dbPathsLen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_paths.size()); } @@ -470,32 +543,30 @@ jlong Java_org_rocksdb_Options_dbPathsLen( * Signature: (J[Ljava/lang/String;[J)V */ void Java_org_rocksdb_Options_dbPaths( - JNIEnv* env, jobject jobj, jlong jhandle, jobjectArray jpaths, + JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); - if(ptr_jtarget_size == nullptr) { - // exception thrown: OutOfMemoryError - return; + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; } auto* opt = reinterpret_cast(jhandle); const jsize len = env->GetArrayLength(jpaths); - for(jsize i = 0; i < len; i++) { + for (jsize i = 0; i < len; i++) { rocksdb::DbPath db_path = opt->db_paths[i]; jstring jpath = env->NewStringUTF(db_path.path.c_str()); - if(jpath == nullptr) { + if (jpath == nullptr) { // exception thrown: OutOfMemoryError - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } 
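// ---------------------------------------------------------------------------
// [Editor's aside, not part of the diff] Every wrapper in this file follows
// the same handle-passing pattern: the Java object stores the address of a
// heap-allocated native object in a long field, and each JNI entry point
// reinterpret_casts that jlong back to a pointer before touching a member.
// A minimal self-contained sketch of the pattern (function names here are
// illustrative, not RocksDB's):
#include <jni.h>
#include <rocksdb/options.h>

static jlong NewOptionsHandle() {
  // Ownership stays with the Java side, which must eventually call a dispose
  // method so the native pointer can be deleted.
  return reinterpret_cast<jlong>(new rocksdb::Options());
}

static void SetMaxOpenFilesSketch(jlong jhandle, jint max_open_files) {
  // Mirrors what Java_org_rocksdb_Options_setMaxOpenFiles does above.
  reinterpret_cast<rocksdb::Options*>(jhandle)->max_open_files =
      static_cast<int>(max_open_files);
}

static void DisposeOptionsHandle(jlong jhandle) {
  delete reinterpret_cast<rocksdb::Options*>(jhandle);
}
// ---------------------------------------------------------------------------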
env->SetObjectArrayElement(jpaths, i, jpath); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jpath); - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } @@ -511,7 +582,7 @@ void Java_org_rocksdb_Options_dbPaths( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_Options_dbLogDir( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle)->db_log_dir.c_str()); } @@ -522,9 +593,9 @@ jstring Java_org_rocksdb_Options_dbLogDir( * Signature: (JLjava/lang/String)V */ void Java_org_rocksdb_Options_setDbLogDir( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) { + JNIEnv* env, jobject, jlong jhandle, jstring jdb_log_dir) { const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr); - if(log_dir == nullptr) { + if (log_dir == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -538,7 +609,7 @@ void Java_org_rocksdb_Options_setDbLogDir( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_Options_walDir( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF( reinterpret_cast(jhandle)->wal_dir.c_str()); } @@ -549,9 +620,9 @@ jstring Java_org_rocksdb_Options_walDir( * Signature: (JLjava/lang/String)V */ void Java_org_rocksdb_Options_setWalDir( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) { + JNIEnv* env, jobject, jlong jhandle, jstring jwal_dir) { const char* wal_dir = env->GetStringUTFChars(jwal_dir, nullptr); - if(wal_dir == nullptr) { + if (wal_dir == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -565,7 +636,7 @@ void Java_org_rocksdb_Options_setWalDir( * Signature: (J)J */ jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->delete_obsolete_files_period_micros; } @@ -576,10 +647,9 @@ jlong Java_org_rocksdb_Options_deleteObsoleteFilesPeriodMicros( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros( - JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) { + JNIEnv*, jobject, jlong jhandle, jlong micros) { reinterpret_cast(jhandle) - ->delete_obsolete_files_period_micros = - static_cast(micros); + ->delete_obsolete_files_period_micros = static_cast(micros); } /* @@ -588,9 +658,9 @@ void Java_org_rocksdb_Options_setDeleteObsoleteFilesPeriodMicros( * Signature: (JI)V */ void Java_org_rocksdb_Options_setBaseBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast(jhandle) - ->base_background_compactions = static_cast(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast(jhandle)->base_background_compactions = + static_cast(max); } /* @@ -599,7 +669,7 @@ void Java_org_rocksdb_Options_setBaseBackgroundCompactions( * Signature: (J)I */ jint Java_org_rocksdb_Options_baseBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->base_background_compactions; } @@ -610,9 +680,9 @@ jint Java_org_rocksdb_Options_baseBackgroundCompactions( * Signature: (J)I */ jint Java_org_rocksdb_Options_maxBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - 
jhandle)->max_background_compactions; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->max_background_compactions; } /* @@ -621,9 +691,9 @@ jint Java_org_rocksdb_Options_maxBackgroundCompactions( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast(jhandle) - ->max_background_compactions = static_cast(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast(jhandle)->max_background_compactions = + static_cast(max); } /* @@ -632,9 +702,9 @@ void Java_org_rocksdb_Options_setMaxBackgroundCompactions( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxSubcompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast(jhandle) - ->max_subcompactions = static_cast(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast(jhandle)->max_subcompactions = + static_cast(max); } /* @@ -643,7 +713,7 @@ void Java_org_rocksdb_Options_setMaxSubcompactions( * Signature: (J)I */ jint Java_org_rocksdb_Options_maxSubcompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_subcompactions; } @@ -653,7 +723,7 @@ jint Java_org_rocksdb_Options_maxSubcompactions( * Signature: (J)I */ jint Java_org_rocksdb_Options_maxBackgroundFlushes( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_background_flushes; } @@ -663,18 +733,39 @@ jint Java_org_rocksdb_Options_maxBackgroundFlushes( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxBackgroundFlushes( - JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) { + JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) { reinterpret_cast(jhandle)->max_background_flushes = static_cast(max_background_flushes); } +/* + * Class: org_rocksdb_Options + * Method: maxBackgroundJobs + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_maxBackgroundJobs( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle)->max_background_jobs; +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxBackgroundJobs + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMaxBackgroundJobs( + JNIEnv*, jobject, jlong jhandle, jint max_background_jobs) { + reinterpret_cast(jhandle)->max_background_jobs = + static_cast(max_background_jobs); +} + /* * Class: org_rocksdb_Options * Method: maxLogFileSize * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxLogFileSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_log_file_size; } @@ -684,8 +775,8 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxLogFileSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + JNIEnv* env, jobject, jlong jhandle, jlong max_log_file_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(max_log_file_size); if (s.ok()) { reinterpret_cast(jhandle)->max_log_file_size = max_log_file_size; @@ -700,7 +791,7 @@ void Java_org_rocksdb_Options_setMaxLogFileSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_logFileTimeToRoll( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->log_file_time_to_roll; } @@ -710,9 +801,9 @@ jlong 
Java_org_rocksdb_Options_logFileTimeToRoll( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setLogFileTimeToRoll( - JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - log_file_time_to_roll); + JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(log_file_time_to_roll); if (s.ok()) { reinterpret_cast(jhandle)->log_file_time_to_roll = log_file_time_to_roll; @@ -727,7 +818,7 @@ void Java_org_rocksdb_Options_setLogFileTimeToRoll( * Signature: (J)J */ jlong Java_org_rocksdb_Options_keepLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->keep_log_file_num; } @@ -737,8 +828,8 @@ jlong Java_org_rocksdb_Options_keepLogFileNum( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setKeepLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + JNIEnv* env, jobject, jlong jhandle, jlong keep_log_file_num) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num); if (s.ok()) { reinterpret_cast(jhandle)->keep_log_file_num = keep_log_file_num; @@ -753,7 +844,7 @@ void Java_org_rocksdb_Options_setKeepLogFileNum( * Signature: (J)J */ jlong Java_org_rocksdb_Options_recycleLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->recycle_log_file_num; } @@ -763,9 +854,8 @@ jlong Java_org_rocksdb_Options_recycleLogFileNum( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setRecycleLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) { - rocksdb::Status s = - rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + JNIEnv* env, jobject, jlong jhandle, jlong recycle_log_file_num) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(recycle_log_file_num); if (s.ok()) { reinterpret_cast(jhandle)->recycle_log_file_num = recycle_log_file_num; @@ -780,7 +870,7 @@ void Java_org_rocksdb_Options_setRecycleLogFileNum( * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxManifestFileSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->max_manifest_file_size; } @@ -789,7 +879,7 @@ jlong Java_org_rocksdb_Options_maxManifestFileSize( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_Options_memTableFactoryName( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get(); @@ -811,7 +901,7 @@ jstring Java_org_rocksdb_Options_memTableFactoryName( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxManifestFileSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) { + JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) { reinterpret_cast(jhandle)->max_manifest_file_size = static_cast(max_manifest_file_size); } @@ -821,7 +911,7 @@ void Java_org_rocksdb_Options_setMaxManifestFileSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMemTableFactory( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { reinterpret_cast(jhandle)->memtable_factory.reset( reinterpret_cast(jfactory_handle)); } @@ -832,12 +922,25 @@ void 
Java_org_rocksdb_Options_setMemTableFactory( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setRateLimiter( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { - std::shared_ptr *pRateLimiter = - reinterpret_cast *>( + JNIEnv*, jobject, jlong jhandle, jlong jrate_limiter_handle) { + std::shared_ptr* pRateLimiter = + reinterpret_cast*>( jrate_limiter_handle); - reinterpret_cast(jhandle)-> - rate_limiter = *pRateLimiter; + reinterpret_cast(jhandle)->rate_limiter = *pRateLimiter; +} + +/* + * Class: org_rocksdb_Options + * Method: setSstFileManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setSstFileManager( + JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) { + auto* sptr_sst_file_manager = + reinterpret_cast*>( + jsst_file_manager_handle); + reinterpret_cast(jhandle)->sst_file_manager = + *sptr_sst_file_manager; } /* @@ -846,9 +949,9 @@ void Java_org_rocksdb_Options_setRateLimiter( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setLogger( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jlogger_handle) { -std::shared_ptr *pLogger = - reinterpret_cast *>( + JNIEnv*, jobject, jlong jhandle, jlong jlogger_handle) { + std::shared_ptr* pLogger = + reinterpret_cast*>( jlogger_handle); reinterpret_cast(jhandle)->info_log = *pLogger; } @@ -859,7 +962,7 @@ std::shared_ptr *pLogger = * Signature: (JB)V */ void Java_org_rocksdb_Options_setInfoLogLevel( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + JNIEnv*, jobject, jlong jhandle, jbyte jlog_level) { reinterpret_cast(jhandle)->info_log_level = static_cast(jlog_level); } @@ -870,7 +973,7 @@ void Java_org_rocksdb_Options_setInfoLogLevel( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_infoLogLevel( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->info_log_level); } @@ -881,7 +984,7 @@ jbyte Java_org_rocksdb_Options_infoLogLevel( * Signature: (J)I */ jint Java_org_rocksdb_Options_tableCacheNumshardbits( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->table_cache_numshardbits; } @@ -891,7 +994,7 @@ jint Java_org_rocksdb_Options_tableCacheNumshardbits( * Signature: (JI)V */ void Java_org_rocksdb_Options_setTableCacheNumshardbits( - JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) { + JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) { reinterpret_cast(jhandle)->table_cache_numshardbits = static_cast(table_cache_numshardbits); } @@ -901,10 +1004,9 @@ void Java_org_rocksdb_Options_setTableCacheNumshardbits( * Signature: (JI)V */ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( - JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { + JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform( - static_cast(jprefix_length))); + rocksdb::NewFixedPrefixTransform(static_cast(jprefix_length))); } /* @@ -912,10 +1014,9 @@ void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( * Signature: (JI)V */ void Java_org_rocksdb_Options_useCappedPrefixExtractor( - JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { + JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewCappedPrefixTransform( - static_cast(jprefix_length))); + rocksdb::NewCappedPrefixTransform(static_cast(jprefix_length))); } /* @@ -924,7 +1025,7 @@ 
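// ---------------------------------------------------------------------------
// [Editor's aside, not part of the diff] Two handle conventions appear in the
// hunks above: setMemTableFactory/setTableFactory receive a raw pointer and
// take ownership via reset(), while setRateLimiter, setSstFileManager, and
// setLogger receive the address of a std::shared_ptr and copy it, so the Java
// wrapper and the Options share ownership. Sketch of the shared case:
#include <jni.h>
#include <memory>
#include <rocksdb/options.h>
#include <rocksdb/rate_limiter.h>

static void SetRateLimiterSketch(jlong jopt_handle, jlong jlimiter_handle) {
  auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
  // The jlong holds the address of a shared_ptr, not of the RateLimiter
  // itself; copying it bumps the reference count instead of stealing
  // ownership, so disposing the Java-side RateLimiter later stays safe.
  auto* limiter =
      reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter>*>(jlimiter_handle);
  opt->rate_limiter = *limiter;
}
// ---------------------------------------------------------------------------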
void Java_org_rocksdb_Options_useCappedPrefixExtractor( * Signature: (J)J */ jlong Java_org_rocksdb_Options_walTtlSeconds( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->WAL_ttl_seconds; } @@ -934,7 +1035,7 @@ jlong Java_org_rocksdb_Options_walTtlSeconds( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWalTtlSeconds( - JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) { + JNIEnv*, jobject, jlong jhandle, jlong WAL_ttl_seconds) { reinterpret_cast(jhandle)->WAL_ttl_seconds = static_cast(WAL_ttl_seconds); } @@ -945,7 +1046,7 @@ void Java_org_rocksdb_Options_setWalTtlSeconds( * Signature: (J)J */ jlong Java_org_rocksdb_Options_walSizeLimitMB( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->WAL_size_limit_MB; } @@ -955,7 +1056,7 @@ jlong Java_org_rocksdb_Options_walSizeLimitMB( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWalSizeLimitMB( - JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) { + JNIEnv*, jobject, jlong jhandle, jlong WAL_size_limit_MB) { reinterpret_cast(jhandle)->WAL_size_limit_MB = static_cast(WAL_size_limit_MB); } @@ -966,7 +1067,7 @@ void Java_org_rocksdb_Options_setWalSizeLimitMB( * Signature: (J)J */ jlong Java_org_rocksdb_Options_manifestPreallocationSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->manifest_preallocation_size; } @@ -977,8 +1078,8 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setManifestPreallocationSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(preallocation_size); if (s.ok()) { reinterpret_cast(jhandle)->manifest_preallocation_size = preallocation_size; @@ -992,9 +1093,11 @@ void Java_org_rocksdb_Options_setManifestPreallocationSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setTableFactory( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { - reinterpret_cast(jhandle)->table_factory.reset( - reinterpret_cast(jfactory_handle)); + JNIEnv*, jobject, jlong jhandle, jlong jtable_factory_handle) { + auto* options = reinterpret_cast(jhandle); + auto* table_factory = + reinterpret_cast(jtable_factory_handle); + options->table_factory.reset(table_factory); } /* @@ -1003,7 +1106,7 @@ void Java_org_rocksdb_Options_setTableFactory( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_allowMmapReads( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->allow_mmap_reads; } @@ -1013,7 +1116,7 @@ jboolean Java_org_rocksdb_Options_allowMmapReads( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllowMmapReads( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) { + JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_reads) { reinterpret_cast(jhandle)->allow_mmap_reads = static_cast(allow_mmap_reads); } @@ -1024,7 +1127,7 @@ void Java_org_rocksdb_Options_setAllowMmapReads( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_allowMmapWrites( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->allow_mmap_writes; } @@ -1034,7 +1137,7 @@ 
jboolean Java_org_rocksdb_Options_allowMmapWrites( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllowMmapWrites( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) { + JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_writes) { reinterpret_cast(jhandle)->allow_mmap_writes = static_cast(allow_mmap_writes); } @@ -1044,8 +1147,8 @@ void Java_org_rocksdb_Options_setAllowMmapWrites( * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv* env, jobject jobj, - jlong jhandle) { +jboolean Java_org_rocksdb_Options_useDirectReads( + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->use_direct_reads; } @@ -1054,9 +1157,8 @@ jboolean Java_org_rocksdb_Options_useDirectReads(JNIEnv* env, jobject jobj, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv* env, jobject jobj, - jlong jhandle, - jboolean use_direct_reads) { +void Java_org_rocksdb_Options_setUseDirectReads( + JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) { reinterpret_cast(jhandle)->use_direct_reads = static_cast(use_direct_reads); } @@ -1067,7 +1169,7 @@ void Java_org_rocksdb_Options_setUseDirectReads(JNIEnv* env, jobject jobj, * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_useDirectIoForFlushAndCompaction( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction; } @@ -1078,7 +1180,7 @@ jboolean Java_org_rocksdb_Options_useDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean use_direct_io_for_flush_and_compaction) { reinterpret_cast(jhandle) ->use_direct_io_for_flush_and_compaction = @@ -1091,7 +1193,7 @@ void Java_org_rocksdb_Options_setUseDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllowFAllocate( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_fallocate) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_fallocate) { reinterpret_cast(jhandle)->allow_fallocate = static_cast(jallow_fallocate); } @@ -1102,7 +1204,7 @@ void Java_org_rocksdb_Options_setAllowFAllocate( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_allowFAllocate( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_fallocate); } @@ -1113,7 +1215,7 @@ jboolean Java_org_rocksdb_Options_allowFAllocate( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_isFdCloseOnExec( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->is_fd_close_on_exec; } @@ -1123,7 +1225,7 @@ jboolean Java_org_rocksdb_Options_isFdCloseOnExec( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setIsFdCloseOnExec( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) { + JNIEnv*, jobject, jlong jhandle, jboolean is_fd_close_on_exec) { reinterpret_cast(jhandle)->is_fd_close_on_exec = static_cast(is_fd_close_on_exec); } @@ -1134,7 +1236,7 @@ void Java_org_rocksdb_Options_setIsFdCloseOnExec( * Signature: (J)I */ jint Java_org_rocksdb_Options_statsDumpPeriodSec( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->stats_dump_period_sec; } @@ -1144,7 +1246,8 @@ jint 
Java_org_rocksdb_Options_statsDumpPeriodSec( * Signature: (JI)V */ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( - JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) { + JNIEnv*, jobject, jlong jhandle, + jint stats_dump_period_sec) { reinterpret_cast(jhandle)->stats_dump_period_sec = static_cast(stats_dump_period_sec); } @@ -1155,7 +1258,7 @@ void Java_org_rocksdb_Options_setStatsDumpPeriodSec( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_adviseRandomOnOpen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->advise_random_on_open; } @@ -1165,7 +1268,8 @@ jboolean Java_org_rocksdb_Options_adviseRandomOnOpen( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) { + JNIEnv*, jobject, jlong jhandle, + jboolean advise_random_on_open) { reinterpret_cast(jhandle)->advise_random_on_open = static_cast(advise_random_on_open); } @@ -1176,7 +1280,8 @@ void Java_org_rocksdb_Options_setAdviseRandomOnOpen( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDbWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jdb_write_buffer_size) { + JNIEnv*, jobject, jlong jhandle, + jlong jdb_write_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->db_write_buffer_size = static_cast(jdb_write_buffer_size); } @@ -1187,7 +1292,7 @@ void Java_org_rocksdb_Options_setDbWriteBufferSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_dbWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->db_write_buffer_size); } @@ -1198,7 +1303,8 @@ jlong Java_org_rocksdb_Options_dbWriteBufferSize( * Signature: (JB)V */ void Java_org_rocksdb_Options_setAccessHintOnCompactionStart( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jaccess_hint_value) { + JNIEnv*, jobject, jlong jhandle, + jbyte jaccess_hint_value) { auto* opt = reinterpret_cast(jhandle); opt->access_hint_on_compaction_start = rocksdb::AccessHintJni::toCppAccessHint(jaccess_hint_value); @@ -1210,7 +1316,7 @@ void Java_org_rocksdb_Options_setAccessHintOnCompactionStart( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return rocksdb::AccessHintJni::toJavaAccessHint( opt->access_hint_on_compaction_start); @@ -1222,7 +1328,7 @@ jbyte Java_org_rocksdb_Options_accessHintOnCompactionStart( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setNewTableReaderForCompactionInputs( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean jnew_table_reader_for_compaction_inputs) { auto* opt = reinterpret_cast(jhandle); opt->new_table_reader_for_compaction_inputs = @@ -1235,7 +1341,7 @@ void Java_org_rocksdb_Options_setNewTableReaderForCompactionInputs( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_newTableReaderForCompactionInputs( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->new_table_reader_for_compaction_inputs); } @@ -1246,7 +1352,8 @@ jboolean Java_org_rocksdb_Options_newTableReaderForCompactionInputs( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionReadaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong 
jcompaction_readahead_size) { + JNIEnv*, jobject, jlong jhandle, + jlong jcompaction_readahead_size) { auto* opt = reinterpret_cast(jhandle); opt->compaction_readahead_size = static_cast(jcompaction_readahead_size); @@ -1258,7 +1365,7 @@ void Java_org_rocksdb_Options_setCompactionReadaheadSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_compactionReadaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->compaction_readahead_size); } @@ -1269,8 +1376,7 @@ jlong Java_org_rocksdb_Options_compactionReadaheadSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jrandom_access_max_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->random_access_max_buffer_size = static_cast(jrandom_access_max_buffer_size); @@ -1282,7 +1388,7 @@ void Java_org_rocksdb_Options_setRandomAccessMaxBufferSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->random_access_max_buffer_size); } @@ -1293,7 +1399,7 @@ jlong Java_org_rocksdb_Options_randomAccessMaxBufferSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) { auto* opt = reinterpret_cast(jhandle); opt->writable_file_max_buffer_size = @@ -1306,7 +1412,7 @@ void Java_org_rocksdb_Options_setWritableFileMaxBufferSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_writableFileMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->writable_file_max_buffer_size); } @@ -1317,7 +1423,7 @@ jlong Java_org_rocksdb_Options_writableFileMaxBufferSize( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_useAdaptiveMutex( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->use_adaptive_mutex; } @@ -1327,7 +1433,7 @@ jboolean Java_org_rocksdb_Options_useAdaptiveMutex( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setUseAdaptiveMutex( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) { + JNIEnv*, jobject, jlong jhandle, jboolean use_adaptive_mutex) { reinterpret_cast(jhandle)->use_adaptive_mutex = static_cast(use_adaptive_mutex); } @@ -1338,7 +1444,7 @@ void Java_org_rocksdb_Options_setUseAdaptiveMutex( * Signature: (J)J */ jlong Java_org_rocksdb_Options_bytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->bytes_per_sync; } @@ -1348,7 +1454,7 @@ jlong Java_org_rocksdb_Options_bytesPerSync( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) { + JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) { reinterpret_cast(jhandle)->bytes_per_sync = static_cast(bytes_per_sync); } @@ -1359,7 +1465,7 @@ void Java_org_rocksdb_Options_setBytesPerSync( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWalBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jwal_bytes_per_sync) { + JNIEnv*, jobject, jlong jhandle, 
jlong jwal_bytes_per_sync) { reinterpret_cast(jhandle)->wal_bytes_per_sync = static_cast(jwal_bytes_per_sync); } @@ -1370,7 +1476,7 @@ void Java_org_rocksdb_Options_setWalBytesPerSync( * Signature: (J)J */ jlong Java_org_rocksdb_Options_walBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->wal_bytes_per_sync); } @@ -1381,8 +1487,7 @@ jlong Java_org_rocksdb_Options_walBytesPerSync( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnableThreadTracking( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jenable_thread_tracking) { + JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) { auto* opt = reinterpret_cast(jhandle); opt->enable_thread_tracking = static_cast(jenable_thread_tracking); } @@ -1393,7 +1498,7 @@ void Java_org_rocksdb_Options_setEnableThreadTracking( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_enableThreadTracking( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->enable_thread_tracking); } @@ -1404,7 +1509,7 @@ jboolean Java_org_rocksdb_Options_enableThreadTracking( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setDelayedWriteRate( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jdelayed_write_rate) { + JNIEnv*, jobject, jlong jhandle, jlong jdelayed_write_rate) { auto* opt = reinterpret_cast(jhandle); opt->delayed_write_rate = static_cast(jdelayed_write_rate); } @@ -1415,20 +1520,42 @@ void Java_org_rocksdb_Options_setDelayedWriteRate( * Signature: (J)J */ jlong Java_org_rocksdb_Options_delayedWriteRate( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->delayed_write_rate); } +/* + * Class: org_rocksdb_Options + * Method: setEnablePipelinedWrite + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setEnablePipelinedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) { + auto* opt = reinterpret_cast(jhandle); + opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: enablePipelinedWrite + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_enablePipelinedWrite( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->enable_pipelined_write); +} + /* * Class: org_rocksdb_Options * Method: setAllowConcurrentMemtableWrite * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow) { - reinterpret_cast(jhandle)-> - allow_concurrent_memtable_write = static_cast(allow); + JNIEnv*, jobject, jlong jhandle, jboolean allow) { + reinterpret_cast(jhandle) + ->allow_concurrent_memtable_write = static_cast(allow); } /* @@ -1437,9 +1564,9 @@ void Java_org_rocksdb_Options_setAllowConcurrentMemtableWrite( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - allow_concurrent_memtable_write; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->allow_concurrent_memtable_write; } /* @@ -1448,9 +1575,9 @@ jboolean Java_org_rocksdb_Options_allowConcurrentMemtableWrite( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setEnableWriteThreadAdaptiveYield( - JNIEnv* env, jobject jobj, jlong jhandle, 
jboolean yield) { - reinterpret_cast(jhandle)-> - enable_write_thread_adaptive_yield = static_cast(yield); + JNIEnv*, jobject, jlong jhandle, jboolean yield) { + reinterpret_cast(jhandle) + ->enable_write_thread_adaptive_yield = static_cast(yield); } /* @@ -1459,9 +1586,9 @@ void Java_org_rocksdb_Options_setEnableWriteThreadAdaptiveYield( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - enable_write_thread_adaptive_yield; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->enable_write_thread_adaptive_yield; } /* @@ -1470,9 +1597,9 @@ jboolean Java_org_rocksdb_Options_enableWriteThreadAdaptiveYield( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max) { - reinterpret_cast(jhandle)-> - write_thread_max_yield_usec = static_cast(max); + JNIEnv*, jobject, jlong jhandle, jlong max) { + reinterpret_cast(jhandle)->write_thread_max_yield_usec = + static_cast(max); } /* @@ -1481,9 +1608,9 @@ void Java_org_rocksdb_Options_setWriteThreadMaxYieldUsec( * Signature: (J)J */ jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - write_thread_max_yield_usec; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->write_thread_max_yield_usec; } /* @@ -1492,9 +1619,9 @@ jlong Java_org_rocksdb_Options_writeThreadMaxYieldUsec( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle, jlong slow) { - reinterpret_cast(jhandle)-> - write_thread_slow_yield_usec = static_cast(slow); + JNIEnv*, jobject, jlong jhandle, jlong slow) { + reinterpret_cast(jhandle)->write_thread_slow_yield_usec = + static_cast(slow); } /* @@ -1503,9 +1630,9 @@ void Java_org_rocksdb_Options_setWriteThreadSlowYieldUsec( * Signature: (J)J */ jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - write_thread_slow_yield_usec; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->write_thread_slow_yield_usec; } /* @@ -1514,7 +1641,7 @@ jlong Java_org_rocksdb_Options_writeThreadSlowYieldUsec( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_stats_update_on_db_open = @@ -1527,7 +1654,7 @@ void Java_org_rocksdb_Options_setSkipStatsUpdateOnDbOpen( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); } @@ -1538,11 +1665,11 @@ jboolean Java_org_rocksdb_Options_skipStatsUpdateOnDbOpen( * Signature: (JB)V */ void Java_org_rocksdb_Options_setWalRecoveryMode( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jwal_recovery_mode_value) { + JNIEnv*, jobject, jlong jhandle, + jbyte jwal_recovery_mode_value) { auto* opt = reinterpret_cast(jhandle); - opt->wal_recovery_mode = - rocksdb::WALRecoveryModeJni::toCppWALRecoveryMode( - jwal_recovery_mode_value); + opt->wal_recovery_mode = rocksdb::WALRecoveryModeJni::toCppWALRecoveryMode( + 
jwal_recovery_mode_value); } /* @@ -1551,7 +1678,7 @@ void Java_org_rocksdb_Options_setWalRecoveryMode( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_walRecoveryMode( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return rocksdb::WALRecoveryModeJni::toJavaWALRecoveryMode( opt->wal_recovery_mode); @@ -1563,7 +1690,7 @@ jbyte Java_org_rocksdb_Options_walRecoveryMode( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAllow2pc( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_2pc) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); } @@ -1573,7 +1700,8 @@ void Java_org_rocksdb_Options_setAllow2pc( * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_Options_allow2pc( + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -1584,20 +1712,34 @@ jboolean Java_org_rocksdb_Options_allow2pc(JNIEnv* env, jobject jobj, jlong jhan * Signature: (JJ)V */ void Java_org_rocksdb_Options_setRowCache( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrow_cache_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); - auto* row_cache = reinterpret_cast*>(jrow_cache_handle); + auto* row_cache = + reinterpret_cast*>(jrow_cache_handle); opt->row_cache = *row_cache; } + +/* + * Class: org_rocksdb_Options + * Method: setWalFilter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setWalFilter( + JNIEnv*, jobject, jlong jhandle, jlong jwal_filter_handle) { + auto* opt = reinterpret_cast(jhandle); + auto* wal_filter = + reinterpret_cast(jwal_filter_handle); + opt->wal_filter = wal_filter; +} + /* * Class: org_rocksdb_Options * Method: setFailIfOptionsFileError * Signature: (JZ)V */ void Java_org_rocksdb_Options_setFailIfOptionsFileError( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jfail_if_options_file_error) { + JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) { auto* opt = reinterpret_cast(jhandle); opt->fail_if_options_file_error = static_cast(jfail_if_options_file_error); @@ -1609,7 +1751,7 @@ void Java_org_rocksdb_Options_setFailIfOptionsFileError( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_failIfOptionsFileError( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); } @@ -1620,7 +1762,7 @@ jboolean Java_org_rocksdb_Options_failIfOptionsFileError( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setDumpMallocStats( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jdump_malloc_stats) { + JNIEnv*, jobject, jlong jhandle, jboolean jdump_malloc_stats) { auto* opt = reinterpret_cast(jhandle); opt->dump_malloc_stats = static_cast(jdump_malloc_stats); } @@ -1631,7 +1773,7 @@ void Java_org_rocksdb_Options_setDumpMallocStats( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_dumpMallocStats( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); } @@ -1642,10 +1784,10 @@ jboolean Java_org_rocksdb_Options_dumpMallocStats( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery( - JNIEnv* env, jobject jobj, jlong 
jhandle, - jboolean javoid_flush_during_recovery) { + JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) { auto* opt = reinterpret_cast(jhandle); - opt->avoid_flush_during_recovery = static_cast(javoid_flush_during_recovery); + opt->avoid_flush_during_recovery = + static_cast(javoid_flush_during_recovery); } /* @@ -1654,7 +1796,7 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringRecovery( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); } @@ -1665,10 +1807,10 @@ jboolean Java_org_rocksdb_Options_avoidFlushDuringRecovery( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean javoid_flush_during_shutdown) { + JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) { auto* opt = reinterpret_cast(jhandle); - opt->avoid_flush_during_shutdown = static_cast(javoid_flush_during_shutdown); + opt->avoid_flush_during_shutdown = + static_cast(javoid_flush_during_shutdown); } /* @@ -1677,17 +1819,127 @@ void Java_org_rocksdb_Options_setAvoidFlushDuringShutdown( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_avoidFlushDuringShutdown( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); } +/* + * Class: org_rocksdb_Options + * Method: setAllowIngestBehind + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAllowIngestBehind( + JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) { + auto* opt = reinterpret_cast(jhandle); + opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: allowIngestBehind + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_allowIngestBehind( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->allow_ingest_behind); +} + +/* + * Class: org_rocksdb_Options + * Method: setPreserveDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setPreserveDeletes( + JNIEnv*, jobject, jlong jhandle, jboolean jpreserve_deletes) { + auto* opt = reinterpret_cast(jhandle); + opt->preserve_deletes = jpreserve_deletes == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: preserveDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_preserveDeletes( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->preserve_deletes); +} + +/* + * Class: org_rocksdb_Options + * Method: setTwoWriteQueues + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setTwoWriteQueues( + JNIEnv*, jobject, jlong jhandle, jboolean jtwo_write_queues) { + auto* opt = reinterpret_cast(jhandle); + opt->two_write_queues = jtwo_write_queues == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: twoWriteQueues + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_twoWriteQueues( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->two_write_queues); +} + +/* + * Class: org_rocksdb_Options + * Method: setManualWalFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setManualWalFlush( + JNIEnv*, jobject, jlong jhandle, jboolean jmanual_wal_flush) { + auto* opt = reinterpret_cast(jhandle); + 
opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: manualWalFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_manualWalFlush( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->manual_wal_flush); +} + +/* + * Class: org_rocksdb_Options + * Method: setAtomicFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setAtomicFlush( + JNIEnv*, jobject, jlong jhandle, jboolean jatomic_flush) { + auto* opt = reinterpret_cast(jhandle); + opt->atomic_flush = jatomic_flush == JNI_TRUE; +} + +/* + * Class: org_rocksdb_Options + * Method: atomicFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_atomicFlush( + JNIEnv *, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->atomic_flush); +} + /* * Method: tableFactoryName * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_Options_tableFactoryName( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); rocksdb::TableFactory* tf = opt->table_factory.get(); @@ -1698,16 +1950,15 @@ jstring Java_org_rocksdb_Options_tableFactoryName( return env->NewStringUTF(tf->Name()); } - /* * Class: org_rocksdb_Options * Method: minWriteBufferNumberToMerge * Signature: (J)I */ jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->min_write_buffer_number_to_merge; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->min_write_buffer_number_to_merge; } /* @@ -1716,20 +1967,18 @@ jint Java_org_rocksdb_Options_minWriteBufferNumberToMerge( * Signature: (JI)V */ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jmin_write_buffer_number_to_merge) { - reinterpret_cast( - jhandle)->min_write_buffer_number_to_merge = - static_cast(jmin_write_buffer_number_to_merge); + JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) { + reinterpret_cast(jhandle) + ->min_write_buffer_number_to_merge = + static_cast(jmin_write_buffer_number_to_merge); } /* * Class: org_rocksdb_Options * Method: maxWriteBufferNumberToMaintain * Signature: (J)I */ -jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv* env, - jobject jobj, - jlong jhandle) { +jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain( + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain; } @@ -1740,7 +1989,7 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumberToMaintain(JNIEnv* env, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { reinterpret_cast(jhandle) ->max_write_buffer_number_to_maintain = @@ -1753,7 +2002,7 @@ void Java_org_rocksdb_Options_setMaxWriteBufferNumberToMaintain( * Signature: (JB)V */ void Java_org_rocksdb_Options_setCompressionType( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { auto* opts = reinterpret_cast(jhandle); opts->compression = rocksdb::CompressionTypeJni::toCppCompressionType( jcompression_type_value); @@ -1765,10 +2014,9 @@ void Java_org_rocksdb_Options_setCompressionType( * Signature: (J)B */ jbyte 
Java_org_rocksdb_Options_compressionType(
-    JNIEnv* env, jobject jobj, jlong jhandle) {
+    JNIEnv*, jobject, jlong jhandle) {
   auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle);
-  return rocksdb::CompressionTypeJni::toJavaCompressionType(
-      opts->compression);
+  return rocksdb::CompressionTypeJni::toJavaCompressionType(opts->compression);
 }
 
 /**
@@ -1779,28 +2027,30 @@ jbyte Java_org_rocksdb_Options_compressionType(
  * @param jcompression_levels A reference to a java byte array
  *     where each byte indicates a compression level
  *
- * @return A unique_ptr to the vector, or unique_ptr(nullptr) if a JNI exception occurs
+ * @return A std::unique_ptr to the vector, or std::unique_ptr(nullptr) if a JNI
+ *     exception occurs
  */
-std::unique_ptr<std::vector<rocksdb::CompressionType>> rocksdb_compression_vector_helper(
+std::unique_ptr<std::vector<rocksdb::CompressionType>> rocksdb_compression_vector_helper(
     JNIEnv* env, jbyteArray jcompression_levels) {
   jsize len = env->GetArrayLength(jcompression_levels);
   jbyte* jcompression_level =
       env->GetByteArrayElements(jcompression_levels, nullptr);
-  if(jcompression_level == nullptr) {
+  if (jcompression_level == nullptr) {
     // exception thrown: OutOfMemoryError
    return std::unique_ptr<std::vector<rocksdb::CompressionType>>();
  }

  auto* compression_levels = new std::vector<rocksdb::CompressionType>();
-  std::unique_ptr<std::vector<rocksdb::CompressionType>> uptr_compression_levels(compression_levels);
+  std::unique_ptr<std::vector<rocksdb::CompressionType>>
+      uptr_compression_levels(compression_levels);

-  for(jsize i = 0; i < len; i++) {
+  for (jsize i = 0; i < len; i++) {
    jbyte jcl = jcompression_level[i];
    compression_levels->push_back(static_cast<rocksdb::CompressionType>(jcl));
  }

  env->ReleaseByteArrayElements(jcompression_levels, jcompression_level,
-      JNI_ABORT);
+                                JNI_ABORT);

  return uptr_compression_levels;
}

@@ -1815,32 +2065,32 @@ std::unique_ptr<std::vector<rocksdb::CompressionType>> rocksdb_compression_vecto
  *
  * @return A jbytearray or nullptr if an exception occurs
  */
-jbyteArray rocksdb_compression_list_helper(JNIEnv* env,
-    std::vector<rocksdb::CompressionType> compression_levels) {
+jbyteArray rocksdb_compression_list_helper(
+    JNIEnv* env, std::vector<rocksdb::CompressionType> compression_levels) {
   const size_t len = compression_levels.size();
   jbyte* jbuf = new jbyte[len];
 
   for (size_t i = 0; i < len; i++) {
-      jbuf[i] = compression_levels[i];
+    jbuf[i] = compression_levels[i];
   }
 
   // insert in java array
   jbyteArray jcompression_levels = env->NewByteArray(static_cast<jsize>(len));
-  if(jcompression_levels == nullptr) {
-      // exception thrown: OutOfMemoryError
-      delete [] jbuf;
-      return nullptr;
+  if (jcompression_levels == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete[] jbuf;
+    return nullptr;
   }
   env->SetByteArrayRegion(jcompression_levels, 0, static_cast<jsize>(len),
-      jbuf);
-  if(env->ExceptionCheck()) {
-      // exception thrown: ArrayIndexOutOfBoundsException
-      env->DeleteLocalRef(jcompression_levels);
-      delete [] jbuf;
-      return nullptr;
+                          jbuf);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jcompression_levels);
+    delete[] jbuf;
+    return nullptr;
   }
 
-  delete [] jbuf;
+  delete[] jbuf;
 
   return jcompression_levels;
 }
 
@@ -1851,11 +2101,10 @@ jbyteArray rocksdb_compression_list_helper(JNIEnv* env,
  * Signature: (J[B)V
  */
 void Java_org_rocksdb_Options_setCompressionPerLevel(
-    JNIEnv* env, jobject jobj, jlong jhandle,
-    jbyteArray jcompressionLevels) {
+    JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) {
   auto uptr_compression_levels =
       rocksdb_compression_vector_helper(env, jcompressionLevels);
-  if(!uptr_compression_levels) {
+  if (!uptr_compression_levels) {
     // exception occurred
     return;
   }
@@ -1869,10 +2118,9 @@ void Java_org_rocksdb_Options_setCompressionPerLevel(
  * Signature: (J)[B
  */
 jbyteArray 
Java_org_rocksdb_Options_compressionPerLevel( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* options = reinterpret_cast(jhandle); - return rocksdb_compression_list_helper(env, - options->compression_per_level); + return rocksdb_compression_list_helper(env, options->compression_per_level); } /* @@ -1881,7 +2129,7 @@ jbyteArray Java_org_rocksdb_Options_compressionPerLevel( * Signature: (JB)V */ void Java_org_rocksdb_Options_setBottommostCompressionType( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { auto* options = reinterpret_cast(jhandle); options->bottommost_compression = rocksdb::CompressionTypeJni::toCppCompressionType( @@ -1894,23 +2142,37 @@ void Java_org_rocksdb_Options_setBottommostCompressionType( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_bottommostCompressionType( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* options = reinterpret_cast(jhandle); return rocksdb::CompressionTypeJni::toJavaCompressionType( options->bottommost_compression); } +/* + * Class: org_rocksdb_Options + * Method: setBottommostCompressionOptions + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setBottommostCompressionOptions( + JNIEnv*, jobject, jlong jhandle, + jlong jbottommost_compression_options_handle) { + auto* options = reinterpret_cast(jhandle); + auto* bottommost_compression_options = + reinterpret_cast( + jbottommost_compression_options_handle); + options->bottommost_compression_opts = *bottommost_compression_options; +} + /* * Class: org_rocksdb_Options * Method: setCompressionOptions * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompressionOptions( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jcompression_options_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) { auto* options = reinterpret_cast(jhandle); - auto* compression_options = - reinterpret_cast(jcompression_options_handle); + auto* compression_options = reinterpret_cast( + jcompression_options_handle); options->compression_opts = *compression_options; } @@ -1920,9 +2182,11 @@ void Java_org_rocksdb_Options_setCompressionOptions( * Signature: (JB)V */ void Java_org_rocksdb_Options_setCompactionStyle( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) { - reinterpret_cast(jhandle)->compaction_style = - static_cast(compaction_style); + JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_style) { + auto* options = reinterpret_cast(jhandle); + options->compaction_style = + rocksdb::CompactionStyleJni::toCppCompactionStyle( + jcompaction_style); } /* @@ -1931,8 +2195,10 @@ void Java_org_rocksdb_Options_setCompactionStyle( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_compactionStyle( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->compaction_style; + JNIEnv*, jobject, jlong jhandle) { + auto* options = reinterpret_cast(jhandle); + return rocksdb::CompactionStyleJni::toJavaCompactionStyle( + options->compaction_style); } /* @@ -1941,9 +2207,10 @@ jbyte Java_org_rocksdb_Options_compactionStyle( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { - reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast(jmax_table_files_size); + JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) { + 
reinterpret_cast(jhandle) + ->compaction_options_fifo.max_table_files_size = + static_cast(jmax_table_files_size); } /* @@ -1952,8 +2219,9 @@ void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->compaction_options_fifo.max_table_files_size; } /* @@ -1962,7 +2230,7 @@ jlong Java_org_rocksdb_Options_maxTableFilesSizeFIFO( * Signature: (J)I */ jint Java_org_rocksdb_Options_numLevels( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->num_levels; } @@ -1972,7 +2240,7 @@ jint Java_org_rocksdb_Options_numLevels( * Signature: (JI)V */ void Java_org_rocksdb_Options_setNumLevels( - JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) { + JNIEnv*, jobject, jlong jhandle, jint jnum_levels) { reinterpret_cast(jhandle)->num_levels = static_cast(jnum_levels); } @@ -1983,9 +2251,9 @@ void Java_org_rocksdb_Options_setNumLevels( * Signature: (J)I */ jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->level0_file_num_compaction_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->level0_file_num_compaction_trigger; } /* @@ -1994,11 +2262,11 @@ jint Java_org_rocksdb_Options_levelZeroFileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { - reinterpret_cast( - jhandle)->level0_file_num_compaction_trigger = - static_cast(jlevel0_file_num_compaction_trigger); + reinterpret_cast(jhandle) + ->level0_file_num_compaction_trigger = + static_cast(jlevel0_file_num_compaction_trigger); } /* @@ -2007,9 +2275,9 @@ void Java_org_rocksdb_Options_setLevelZeroFileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->level0_slowdown_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->level0_slowdown_writes_trigger; } /* @@ -2018,11 +2286,9 @@ jint Java_org_rocksdb_Options_levelZeroSlowdownWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_slowdown_writes_trigger) { - reinterpret_cast( - jhandle)->level0_slowdown_writes_trigger = - static_cast(jlevel0_slowdown_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast(jhandle)->level0_slowdown_writes_trigger = + static_cast(jlevel0_slowdown_writes_trigger); } /* @@ -2031,9 +2297,9 @@ void Java_org_rocksdb_Options_setLevelZeroSlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->level0_stop_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->level0_stop_writes_trigger; } /* @@ -2042,8 +2308,7 @@ jint Java_org_rocksdb_Options_levelZeroStopWritesTrigger( * Signature: (JI)V */ void 
Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_stop_writes_trigger) { + JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger = static_cast<int>(jlevel0_stop_writes_trigger); } @@ -2054,7 +2319,7 @@ void Java_org_rocksdb_Options_setLevelZeroStopWritesTrigger( * Signature: (J)J */ jlong Java_org_rocksdb_Options_targetFileSizeBase( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base; } @@ -2064,8 +2329,7 @@ jlong Java_org_rocksdb_Options_targetFileSizeBase( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setTargetFileSizeBase( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jtarget_file_size_base) { + JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) { reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base); } @@ -2076,9 +2340,9 @@ void Java_org_rocksdb_Options_setTargetFileSizeBase( * Signature: (J)I */ jint Java_org_rocksdb_Options_targetFileSizeMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->target_file_size_multiplier; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->target_file_size_multiplier; } /* @@ -2087,11 +2351,9 @@ jint Java_org_rocksdb_Options_targetFileSizeMultiplier( * Signature: (JI)V */ void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jtarget_file_size_multiplier) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->target_file_size_multiplier = - static_cast<int>(jtarget_file_size_multiplier); + JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) { + reinterpret_cast<rocksdb::Options*>(jhandle)->target_file_size_multiplier = + static_cast<int>(jtarget_file_size_multiplier); } /* @@ -2100,9 +2362,8 @@ void Java_org_rocksdb_Options_setTargetFileSizeMultiplier( * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxBytesForLevelBase( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->max_bytes_for_level_base; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->max_bytes_for_level_base; } /* @@ -2111,11 +2372,9 @@ jlong Java_org_rocksdb_Options_maxBytesForLevelBase( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelBase( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmax_bytes_for_level_base) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->max_bytes_for_level_base = - static_cast<int64_t>(jmax_bytes_for_level_base); + JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) { + reinterpret_cast<rocksdb::Options*>(jhandle)->max_bytes_for_level_base = + static_cast<int64_t>(jmax_bytes_for_level_base); } /* @@ -2124,9 +2383,9 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelBase( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->level_compaction_dynamic_level_bytes; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->level_compaction_dynamic_level_bytes; } /* @@ -2135,11 +2394,9 @@ jboolean Java_org_rocksdb_Options_levelCompactionDynamicLevelBytes( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jenable_dynamic_level_bytes) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->level_compaction_dynamic_level_bytes = - (jenable_dynamic_level_bytes); +
JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) { + reinterpret_cast<rocksdb::Options*>(jhandle) + ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes); } /* @@ -2147,11 +2404,10 @@ void Java_org_rocksdb_Options_setLevelCompactionDynamicLevelBytes( * Method: maxBytesForLevelMultiplier * Signature: (J)D */ -jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv* env, - jobject jobj, - jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->max_bytes_for_level_multiplier; +jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->max_bytes_for_level_multiplier; } /* @@ -2160,8 +2416,7 @@ jdouble Java_org_rocksdb_Options_maxBytesForLevelMultiplier(JNIEnv* env, * Signature: (JD)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle, - jdouble jmax_bytes_for_level_multiplier) { + JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { reinterpret_cast<rocksdb::Options*>(jhandle)->max_bytes_for_level_multiplier = static_cast<double>(jmax_bytes_for_level_multiplier); } @@ -2171,8 +2426,8 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplier( * Method: maxCompactionBytes * Signature: (J)I */ -jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv* env, jobject jobj, - jlong jhandle) { +jlong Java_org_rocksdb_Options_maxCompactionBytes( + JNIEnv*, jobject, jlong jhandle) { return static_cast<jlong>( reinterpret_cast<rocksdb::Options*>(jhandle)->max_compaction_bytes); } @@ -2183,7 +2438,7 @@ jlong Java_org_rocksdb_Options_maxCompactionBytes(JNIEnv* env, jobject jobj, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMaxCompactionBytes( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_compaction_bytes) { + JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) { reinterpret_cast<rocksdb::Options*>(jhandle)->max_compaction_bytes = static_cast<uint64_t>(jmax_compaction_bytes); } @@ -2194,7 +2449,7 @@ void Java_org_rocksdb_Options_setMaxCompactionBytes( * Signature: (J)J */ jlong Java_org_rocksdb_Options_arenaBlockSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size; } @@ -2204,8 +2459,8 @@ jlong Java_org_rocksdb_Options_arenaBlockSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setArenaBlockSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + JNIEnv* env, jobject, jlong jhandle, jlong jarena_block_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jarena_block_size); if (s.ok()) { reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size = jarena_block_size; @@ -2220,9 +2475,8 @@ void Java_org_rocksdb_Options_setArenaBlockSize( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_disableAutoCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->disable_auto_compactions; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->disable_auto_compactions; } /* @@ -2231,11 +2485,9 @@ jboolean Java_org_rocksdb_Options_disableAutoCompactions( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setDisableAutoCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jdisable_auto_compactions) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->disable_auto_compactions = - static_cast<bool>(jdisable_auto_compactions); + JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) { +
reinterpret_cast<rocksdb::Options*>(jhandle)->disable_auto_compactions = + static_cast<bool>(jdisable_auto_compactions); } /* @@ -2244,9 +2496,9 @@ void Java_org_rocksdb_Options_setDisableAutoCompactions( * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->max_sequential_skip_in_iterations; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->max_sequential_skip_in_iterations; } /* @@ -2255,11 +2507,11 @@ jlong Java_org_rocksdb_Options_maxSequentialSkipInIterations( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->max_sequential_skip_in_iterations = - static_cast<int64_t>(jmax_sequential_skip_in_iterations); + reinterpret_cast<rocksdb::Options*>(jhandle) + ->max_sequential_skip_in_iterations = + static_cast<int64_t>(jmax_sequential_skip_in_iterations); } /* @@ -2268,9 +2520,8 @@ void Java_org_rocksdb_Options_setMaxSequentialSkipInIterations( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_inplaceUpdateSupport( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->inplace_update_support; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->inplace_update_support; } /* @@ -2279,11 +2530,9 @@ jboolean Java_org_rocksdb_Options_inplaceUpdateSupport( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setInplaceUpdateSupport( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jinplace_update_support) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->inplace_update_support = - static_cast<bool>(jinplace_update_support); + JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) { + reinterpret_cast<rocksdb::Options*>(jhandle)->inplace_update_support = + static_cast<bool>(jinplace_update_support); } /* @@ -2292,9 +2541,8 @@ void Java_org_rocksdb_Options_setInplaceUpdateSupport( * Signature: (J)J */ jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->inplace_update_num_locks; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->inplace_update_num_locks; } /* @@ -2303,10 +2551,9 @@ jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jinplace_update_num_locks) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jinplace_update_num_locks); + JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jinplace_update_num_locks); if (s.ok()) { reinterpret_cast<rocksdb::Options*>(jhandle)->inplace_update_num_locks = jinplace_update_num_locks; @@ -2320,9 +2567,8 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( * Method: memtablePrefixBloomSizeRatio * Signature: (J)I */ -jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv* env, - jobject jobj, - jlong jhandle) { +jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio( + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::Options*>(jhandle) ->memtable_prefix_bloom_size_ratio; } @@ -2333,7 +2579,7 @@ jdouble Java_org_rocksdb_Options_memtablePrefixBloomSizeRatio(JNIEnv* env, * Signature: (JI)V */ void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jdouble
jmemtable_prefix_bloom_size_ratio) { reinterpret_cast<rocksdb::Options*>(jhandle) ->memtable_prefix_bloom_size_ratio = @@ -2346,7 +2592,7 @@ void Java_org_rocksdb_Options_setMemtablePrefixBloomSizeRatio( * Signature: (J)I */ jint Java_org_rocksdb_Options_bloomLocality( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality; } @@ -2356,7 +2602,7 @@ jint Java_org_rocksdb_Options_bloomLocality( * Signature: (JI)V */ void Java_org_rocksdb_Options_setBloomLocality( - JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + JNIEnv*, jobject, jlong jhandle, jint jbloom_locality) { reinterpret_cast<rocksdb::Options*>(jhandle)->bloom_locality = static_cast<int32_t>(jbloom_locality); } @@ -2367,7 +2613,7 @@ void Java_org_rocksdb_Options_setBloomLocality( * Signature: (J)J */ jlong Java_org_rocksdb_Options_maxSuccessiveMerges( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges; } @@ -2377,10 +2623,9 @@ jlong Java_org_rocksdb_Options_maxSuccessiveMerges( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmax_successive_merges) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jmax_successive_merges); + JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jmax_successive_merges); if (s.ok()) { reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges = jmax_successive_merges; @@ -2395,9 +2640,9 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_optimizeFiltersForHits( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->optimize_filters_for_hits; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->optimize_filters_for_hits; } /* @@ -2406,11 +2651,9 @@ jboolean Java_org_rocksdb_Options_optimizeFiltersForHits( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setOptimizeFiltersForHits( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean joptimize_filters_for_hits) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->optimize_filters_for_hits = - static_cast<bool>(joptimize_filters_for_hits); + JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) { + reinterpret_cast<rocksdb::Options*>(jhandle)->optimize_filters_for_hits = + static_cast<bool>(joptimize_filters_for_hits); } /* @@ -2419,7 +2662,7 @@ void Java_org_rocksdb_Options_setOptimizeFiltersForHits( * Signature: (J)V */ void Java_org_rocksdb_Options_optimizeForSmallDb( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { reinterpret_cast<rocksdb::Options*>(jhandle)->OptimizeForSmallDb(); } @@ -2429,10 +2672,9 @@ void Java_org_rocksdb_Options_optimizeForSmallDb( * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeForPointLookup( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong block_cache_size_mb) { - reinterpret_cast<rocksdb::Options*>(jhandle)-> - OptimizeForPointLookup(block_cache_size_mb); + JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) { + reinterpret_cast<rocksdb::Options*>(jhandle)->OptimizeForPointLookup( + block_cache_size_mb); } /* @@ -2441,10 +2683,9 @@ void Java_org_rocksdb_Options_optimizeForPointLookup( * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong memtable_memory_budget) { - reinterpret_cast<rocksdb::Options*>(jhandle)-> - OptimizeLevelStyleCompaction(memtable_memory_budget); + JNIEnv*, jobject, jlong
jhandle, jlong memtable_memory_budget) { + reinterpret_cast<rocksdb::Options*>(jhandle)->OptimizeLevelStyleCompaction( + memtable_memory_budget); } /* @@ -2453,10 +2694,9 @@ void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( * Signature: (JJ)V */ void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong memtable_memory_budget) { - reinterpret_cast<rocksdb::Options*>(jhandle)-> - OptimizeUniversalStyleCompaction(memtable_memory_budget); + JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + reinterpret_cast<rocksdb::Options*>(jhandle) + ->OptimizeUniversalStyleCompaction(memtable_memory_budget); } /* @@ -2465,9 +2705,8 @@ void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( * Signature: (J)V */ void Java_org_rocksdb_Options_prepareForBulkLoad( - JNIEnv* env, jobject jobj, jlong jhandle) { - reinterpret_cast<rocksdb::Options*>(jhandle)-> - PrepareForBulkLoad(); + JNIEnv*, jobject, jlong jhandle) { + reinterpret_cast<rocksdb::Options*>(jhandle)->PrepareForBulkLoad(); } /* @@ -2476,9 +2715,8 @@ void Java_org_rocksdb_Options_prepareForBulkLoad( * Signature: (J)J */ jlong Java_org_rocksdb_Options_memtableHugePageSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->memtable_huge_page_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_huge_page_size; } /* @@ -2487,14 +2725,12 @@ jlong Java_org_rocksdb_Options_memtableHugePageSize( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setMemtableHugePageSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmemtable_huge_page_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jmemtable_huge_page_size); + JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jmemtable_huge_page_size); if (s.ok()) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->memtable_huge_page_size = - jmemtable_huge_page_size; + reinterpret_cast<rocksdb::Options*>(jhandle)->memtable_huge_page_size = + jmemtable_huge_page_size; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -2506,9 +2742,9 @@ void Java_org_rocksdb_Options_setMemtableHugePageSize( * Signature: (J)J */ jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->soft_pending_compaction_bytes_limit; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->soft_pending_compaction_bytes_limit; } /* @@ -2517,10 +2753,11 @@ jlong Java_org_rocksdb_Options_softPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jsoft_pending_compaction_bytes_limit) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->soft_pending_compaction_bytes_limit = - static_cast<uint64_t>(jsoft_pending_compaction_bytes_limit); + JNIEnv*, jobject, jlong jhandle, + jlong jsoft_pending_compaction_bytes_limit) { + reinterpret_cast<rocksdb::Options*>(jhandle) + ->soft_pending_compaction_bytes_limit = + static_cast<uint64_t>(jsoft_pending_compaction_bytes_limit); } /* @@ -2529,9 +2766,9 @@ void Java_org_rocksdb_Options_setSoftPendingCompactionBytesLimit( * Signature: (J)J */ jlong Java_org_rocksdb_Options_hardPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->hard_pending_compaction_bytes_limit; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->hard_pending_compaction_bytes_limit; } /* @@ -2540,10 +2777,11 @@ jlong
Java_org_rocksdb_Options_hardPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jhard_pending_compaction_bytes_limit) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->hard_pending_compaction_bytes_limit = - static_cast<uint64_t>(jhard_pending_compaction_bytes_limit); + JNIEnv*, jobject, jlong jhandle, + jlong jhard_pending_compaction_bytes_limit) { + reinterpret_cast<rocksdb::Options*>(jhandle) + ->hard_pending_compaction_bytes_limit = + static_cast<uint64_t>(jhard_pending_compaction_bytes_limit); } /* @@ -2552,9 +2790,9 @@ void Java_org_rocksdb_Options_setHardPendingCompactionBytesLimit( * Signature: (J)I */ jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_file_num_compaction_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->level0_file_num_compaction_trigger; } /* @@ -2563,11 +2801,11 @@ jint Java_org_rocksdb_Options_level0FileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_file_num_compaction_trigger = - static_cast<int>(jlevel0_file_num_compaction_trigger); + reinterpret_cast<rocksdb::Options*>(jhandle) + ->level0_file_num_compaction_trigger = + static_cast<int>(jlevel0_file_num_compaction_trigger); } /* @@ -2576,9 +2814,9 @@ void Java_org_rocksdb_Options_setLevel0FileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_slowdown_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->level0_slowdown_writes_trigger; } /* @@ -2587,11 +2825,9 @@ jint Java_org_rocksdb_Options_level0SlowdownWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_slowdown_writes_trigger) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_slowdown_writes_trigger = - static_cast<int>(jlevel0_slowdown_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast<rocksdb::Options*>(jhandle)->level0_slowdown_writes_trigger = + static_cast<int>(jlevel0_slowdown_writes_trigger); } /* @@ -2600,9 +2836,9 @@ void Java_org_rocksdb_Options_setLevel0SlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_Options_level0StopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_stop_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle) + ->level0_stop_writes_trigger; } /* @@ -2611,11 +2847,9 @@ jint Java_org_rocksdb_Options_level0StopWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_Options_setLevel0StopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_stop_writes_trigger) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->level0_stop_writes_trigger = - static_cast<int>(jlevel0_stop_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + reinterpret_cast<rocksdb::Options*>(jhandle)->level0_stop_writes_trigger = + static_cast<int>(jlevel0_stop_writes_trigger); } /* @@ -2624,10 +2858,9 @@ void Java_org_rocksdb_Options_setLevel0StopWritesTrigger( * Signature: (J)[I */ jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional(
- JNIEnv* env, jobject jobj, jlong jhandle) { - auto mbflma = - reinterpret_cast<rocksdb::Options*>(jhandle)-> - max_bytes_for_level_multiplier_additional; + JNIEnv* env, jobject, jlong jhandle) { + auto mbflma = reinterpret_cast<rocksdb::Options*>(jhandle) + ->max_bytes_for_level_multiplier_additional; const size_t size = mbflma.size(); @@ -2638,21 +2871,21 @@ jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional( jsize jlen = static_cast<jsize>(size); jintArray result = env->NewIntArray(jlen); - if(result == nullptr) { - // exception thrown: OutOfMemoryError - delete [] additionals; - return nullptr; + if (result == nullptr) { + // exception thrown: OutOfMemoryError + delete[] additionals; + return nullptr; } env->SetIntArrayRegion(result, 0, jlen, additionals); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(result); - delete [] additionals; - return nullptr; + if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(result); + delete[] additionals; + return nullptr; } - delete [] additionals; + delete[] additionals; return result; } @@ -2663,12 +2896,12 @@ jintArray Java_org_rocksdb_Options_maxBytesForLevelMultiplierAdditional( * Signature: (J[I)V */ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint *additionals = - env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, nullptr); - if(additionals == nullptr) { + jint* additionals = env->GetIntArrayElements( + jmax_bytes_for_level_multiplier_additional, nullptr); + if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -2676,11 +2909,12 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( auto* opt = reinterpret_cast<rocksdb::Options*>(jhandle); opt->max_bytes_for_level_multiplier_additional.clear(); for (jsize i = 0; i < len; i++) { - opt->max_bytes_for_level_multiplier_additional.push_back(static_cast<int32_t>(additionals[i])); + opt->max_bytes_for_level_multiplier_additional.push_back( + static_cast<int32_t>(additionals[i])); } env->ReleaseIntArrayElements(jmax_bytes_for_level_multiplier_additional, - additionals, JNI_ABORT); + additionals, JNI_ABORT); } /* @@ -2689,9 +2923,8 @@ void Java_org_rocksdb_Options_setMaxBytesForLevelMultiplierAdditional( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_paranoidFileChecks( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::Options*>( - jhandle)->paranoid_file_checks; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_file_checks; } /* @@ -2700,10 +2933,9 @@ jboolean Java_org_rocksdb_Options_paranoidFileChecks( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setParanoidFileChecks( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jparanoid_file_checks) { - reinterpret_cast<rocksdb::Options*>( - jhandle)->paranoid_file_checks = - static_cast<bool>(jparanoid_file_checks); + JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) { + reinterpret_cast<rocksdb::Options*>(jhandle)->paranoid_file_checks = + static_cast<bool>(jparanoid_file_checks); } /* @@ -2712,11 +2944,11 @@ void Java_org_rocksdb_Options_setParanoidFileChecks( * Signature: (JB)V */ void Java_org_rocksdb_Options_setCompactionPriority( - JNIEnv* env, jobject jobj, jlong jhandle, - jbyte jcompaction_priority_value) { + JNIEnv*, jobject, jlong jhandle, jbyte
jcompaction_priority_value) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); opts->compaction_pri = - rocksdb::CompactionPriorityJni::toCppCompactionPriority(jcompaction_priority_value); + rocksdb::CompactionPriorityJni::toCppCompactionPriority( + jcompaction_priority_value); } /* @@ -2725,7 +2957,7 @@ void Java_org_rocksdb_Options_setCompactionPriority( * Signature: (J)B */ jbyte Java_org_rocksdb_Options_compactionPriority( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); return rocksdb::CompactionPriorityJni::toJavaCompactionPriority( opts->compaction_pri); @@ -2737,7 +2969,7 @@ jbyte Java_org_rocksdb_Options_compactionPriority( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setReportBgIoStats( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jreport_bg_io_stats) { + JNIEnv*, jobject, jlong jhandle, jboolean jreport_bg_io_stats) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); opts->report_bg_io_stats = static_cast<bool>(jreport_bg_io_stats); } @@ -2748,23 +2980,44 @@ void Java_org_rocksdb_Options_setReportBgIoStats( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_reportBgIoStats( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); return static_cast<jboolean>(opts->report_bg_io_stats); } +/* + * Class: org_rocksdb_Options + * Method: setTtl + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setTtl( + JNIEnv*, jobject, jlong jhandle, jlong jttl) { + auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); + opts->ttl = static_cast<uint64_t>(jttl); +} + +/* + * Class: org_rocksdb_Options + * Method: ttl + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_ttl( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); + return static_cast<jlong>(opts->ttl); +} + /* * Class: org_rocksdb_Options * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionOptionsUniversal( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_universal_handle) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); - auto* opts_uni = - reinterpret_cast<rocksdb::CompactionOptionsUniversal*>( - jcompaction_options_universal_handle); + auto* opts_uni = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>( + jcompaction_options_universal_handle); opts->compaction_options_universal = *opts_uni; } @@ -2774,11 +3027,10 @@ void Java_org_rocksdb_Options_setCompactionOptionsUniversal( * Signature: (JJ)V */ void Java_org_rocksdb_Options_setCompactionOptionsFIFO( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jcompaction_options_fifo_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); - auto* opts_fifo = - reinterpret_cast<rocksdb::CompactionOptionsFIFO*>( - jcompaction_options_fifo_handle); + auto* opts_fifo = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>( + jcompaction_options_fifo_handle); opts->compaction_options_fifo = *opts_fifo; } @@ -2788,8 +3040,7 @@ void Java_org_rocksdb_Options_setCompactionOptionsFIFO( * Signature: (JZ)V */ void Java_org_rocksdb_Options_setForceConsistencyChecks( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jforce_consistency_checks) { + JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); opts->force_consistency_checks = static_cast<bool>(jforce_consistency_checks); } @@ -2800,7 +3051,7 @@ void Java_org_rocksdb_Options_setForceConsistencyChecks( * Signature: (J)Z */ jboolean Java_org_rocksdb_Options_forceConsistencyChecks( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*,
jobject, jlong jhandle) { auto* opts = reinterpret_cast<rocksdb::Options*>(jhandle); return static_cast<jboolean>(opts->force_consistency_checks); } @@ -2814,20 +3065,44 @@ jboolean Java_org_rocksdb_Options_forceConsistencyChecks( * Signature: ()J */ jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( - JNIEnv* env, jclass jcls) { + JNIEnv*, jclass) { auto* op = new rocksdb::ColumnFamilyOptions(); return reinterpret_cast<jlong>(op); } /* * Class: org_rocksdb_ColumnFamilyOptions - * Method: getColumnFamilyOptionsFromProps + * Method: copyColumnFamilyOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_copyColumnFamilyOptions( + JNIEnv*, jclass, jlong jhandle) { + auto new_opt = new rocksdb::ColumnFamilyOptions( + *(reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle))); + return reinterpret_cast<jlong>(new_opt); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: newColumnFamilyOptionsFromOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptionsFromOptions( + JNIEnv*, jclass, jlong joptions_handle) { + auto new_opt = new rocksdb::ColumnFamilyOptions( + *reinterpret_cast<rocksdb::Options*>(joptions_handle)); + return reinterpret_cast<jlong>(new_opt); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: getColumnFamilyOptionsFromProps * Signature: (Ljava/util/String;)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( - JNIEnv* env, jclass jclazz, jstring jopt_string) { + JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); - if(opt_string == nullptr) { + if (opt_string == nullptr) { // exception thrown: OutOfMemoryError return 0; } @@ -2856,7 +3131,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( * Signature: (J)V */ void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { + JNIEnv*, jobject, jlong handle) { auto* cfo = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(handle); assert(cfo != nullptr); delete cfo; @@ -2868,9 +3143,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal( * Signature: (J)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb( - JNIEnv* env, jobject jobj, jlong jhandle) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - OptimizeForSmallDb(); + JNIEnv*, jobject, jlong jhandle) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->OptimizeForSmallDb(); } /* @@ -2879,10 +3154,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForSmallDb( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong block_cache_size_mb) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - OptimizeForPointLookup(block_cache_size_mb); + JNIEnv*, jobject, jlong jhandle, jlong block_cache_size_mb) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->OptimizeForPointLookup(block_cache_size_mb); } /* @@ -2891,10 +3165,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong memtable_memory_budget) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - OptimizeLevelStyleCompaction(memtable_memory_budget); + JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->OptimizeLevelStyleCompaction(memtable_memory_budget); } /* @@ -2903,10 +3176,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( - JNIEnv* env, jobject jobj, jlong
jhandle, - jlong memtable_memory_budget) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - OptimizeUniversalStyleCompaction(memtable_memory_budget); + JNIEnv*, jobject, jlong jhandle, jlong memtable_memory_budget) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->OptimizeUniversalStyleCompaction(memtable_memory_budget); } /* @@ -2915,7 +3187,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( - JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { + JNIEnv*, jobject, jlong jhandle, jint builtinComparator) { switch (builtinComparator) { case 1: reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->comparator = @@ -2931,12 +3203,32 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( /* * Class: org_rocksdb_ColumnFamilyOptions * Method: setComparatorHandle - * Signature: (JJ)V - */ -void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJ( - JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle)->comparator = - reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle); + * Signature: (JJB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJB( + JNIEnv*, jobject, jlong jopt_handle, jlong jcomparator_handle, + jbyte jcomparator_type) { + rocksdb::Comparator* comparator = nullptr; + switch (jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + comparator = + reinterpret_cast<rocksdb::ComparatorJniCallback*>(jcomparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + comparator = reinterpret_cast<rocksdb::DirectComparatorJniCallback*>( + jcomparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + comparator = reinterpret_cast<rocksdb::Comparator*>(jcomparator_handle); + break; + } + auto* opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle); + opt->comparator = comparator; } /* @@ -2945,10 +3237,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJ( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) { + JNIEnv* env, jobject, jlong jhandle, jstring jop_name) { auto* options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); const char* op_name = env->GetStringUTFChars(jop_name, nullptr); - if(op_name == nullptr) { + if (op_name == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -2964,10 +3256,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( * Signature: (JJjava/lang/String)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( - JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + JNIEnv*, jobject, jlong jhandle, jlong mergeOperatorHandle) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->merge_operator = - *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*> - (mergeOperatorHandle)); + *(reinterpret_cast<std::shared_ptr<rocksdb::MergeOperator>*>( + mergeOperatorHandle)); } /* @@ -2976,11 +3268,25 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle( - JNIEnv* env, jobject jobj, jlong jopt_handle, - jlong jcompactionfilter_handle) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle)-> - compaction_filter = reinterpret_cast<const rocksdb::CompactionFilter*> - (jcompactionfilter_handle); + JNIEnv*, jobject, jlong jopt_handle, jlong jcompactionfilter_handle) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle) + ->compaction_filter = + reinterpret_cast<const rocksdb::CompactionFilter*>(jcompactionfilter_handle); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompactionFilterFactoryHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterFactoryHandle( + JNIEnv*, jobject, jlong jopt_handle, +
jlong jcompactionfilterfactory_handle) { + auto* cff_factory = + reinterpret_cast<std::shared_ptr<rocksdb::CompactionFilterFactoryJniCallback>*>( + jcompactionfilterfactory_handle); + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jopt_handle) + ->compaction_filter_factory = *cff_factory; } /* @@ -2989,11 +3295,11 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionFilterHandle( * Signature: (JJ)I */ void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + JNIEnv* env, jobject, jlong jhandle, jlong jwrite_buffer_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jwrite_buffer_size); if (s.ok()) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - write_buffer_size = jwrite_buffer_size; + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->write_buffer_size = jwrite_buffer_size; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -3005,9 +3311,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - write_buffer_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->write_buffer_size; } /* @@ -3016,9 +3322,9 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( - JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - max_write_buffer_number = jmax_write_buffer_number; + JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_write_buffer_number = jmax_write_buffer_number; } /* @@ -3027,9 +3333,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - max_write_buffer_number; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_write_buffer_number; } /* @@ -3037,10 +3343,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - memtable_factory.reset( - reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle)); + JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->memtable_factory.reset( + reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle)); } /* @@ -3049,7 +3355,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get(); @@ -3070,10 +3376,10 @@ jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( - JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( - static_cast<int>(jprefix_length))); + JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->prefix_extractor.reset( +
rocksdb::NewFixedPrefixTransform(static_cast<int>(jprefix_length))); } /* @@ -3081,10 +3387,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor( - JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - prefix_extractor.reset(rocksdb::NewCappedPrefixTransform( - static_cast<int>(jprefix_length))); + JNIEnv*, jobject, jlong jhandle, jint jprefix_length) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->prefix_extractor.reset( + rocksdb::NewCappedPrefixTransform(static_cast<int>(jprefix_length))); } /* @@ -3092,10 +3398,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_useCappedPrefixExtractor( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - table_factory.reset(reinterpret_cast<rocksdb::TableFactory*>( - jfactory_handle)); + JNIEnv*, jobject, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->table_factory.reset( + reinterpret_cast<rocksdb::TableFactory*>(jfactory_handle)); } /* @@ -3103,7 +3408,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); rocksdb::TableFactory* tf = opt->table_factory.get(); @@ -3114,16 +3419,15 @@ jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName( return env->NewStringUTF(tf->Name()); } - /* * Class: org_rocksdb_ColumnFamilyOptions * Method: minWriteBufferNumberToMerge * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->min_write_buffer_number_to_merge; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->min_write_buffer_number_to_merge; } /* @@ -3132,11 +3436,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jmin_write_buffer_number_to_merge) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->min_write_buffer_number_to_merge = - static_cast<int>(jmin_write_buffer_number_to_merge); + JNIEnv*, jobject, jlong jhandle, jint jmin_write_buffer_number_to_merge) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->min_write_buffer_number_to_merge = + static_cast<int>(jmin_write_buffer_number_to_merge); } /* @@ -3145,7 +3448,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->max_write_buffer_number_to_maintain; } @@ -3156,7 +3459,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumberToMaintain( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint jmax_write_buffer_number_to_maintain) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->max_write_buffer_number_to_maintain = @@ -3169,7 +3472,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumberToMaintain( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( - JNIEnv* env, jobject jobj,
jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); cf_opts->compression = rocksdb::CompressionTypeJni::toCppCompressionType( jcompression_type_value); @@ -3181,7 +3484,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( * Signature: (J)B */ jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return rocksdb::CompressionTypeJni::toJavaCompressionType( cf_opts->compression); @@ -3193,14 +3496,13 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType( * Signature: (J[B)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( - JNIEnv* env, jobject jobj, jlong jhandle, - jbyteArray jcompressionLevels) { + JNIEnv* env, jobject, jlong jhandle, jbyteArray jcompressionLevels) { auto* options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); auto uptr_compression_levels = rocksdb_compression_vector_helper(env, jcompressionLevels); - if(!uptr_compression_levels) { - // exception occurred - return; + if (!uptr_compression_levels) { + // exception occurred + return; } options->compression_per_level = *(uptr_compression_levels.get()); } @@ -3211,10 +3513,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionPerLevel( * Signature: (J)[B */ jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return rocksdb_compression_list_helper(env, - cf_options->compression_per_level); + cf_options->compression_per_level); } /* @@ -3223,7 +3525,7 @@ jbyteArray Java_org_rocksdb_ColumnFamilyOptions_compressionPerLevel( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionType( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jcompression_type_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jcompression_type_value) { auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); cf_options->bottommost_compression = rocksdb::CompressionTypeJni::toCppCompressionType( @@ -3236,11 +3538,25 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionType( * Signature: (J)B */ jbyte Java_org_rocksdb_ColumnFamilyOptions_bottommostCompressionType( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return rocksdb::CompressionTypeJni::toJavaCompressionType( cf_options->bottommost_compression); } +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBottommostCompressionOptions + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBottommostCompressionOptions( + JNIEnv*, jobject, jlong jhandle, + jlong jbottommost_compression_options_handle) { + auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); + auto* bottommost_compression_options = + reinterpret_cast<rocksdb::CompressionOptions*>( + jbottommost_compression_options_handle); + cf_options->bottommost_compression_opts = *bottommost_compression_options; +} /* * Class: org_rocksdb_ColumnFamilyOptions @@ -3248,11 +3564,10 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_bottommostCompressionType( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionOptions( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jcompression_options_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jcompression_options_handle) { auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); -
auto* compression_options = - reinterpret_cast<rocksdb::CompressionOptions*>(jcompression_options_handle); + auto* compression_options = reinterpret_cast<rocksdb::CompressionOptions*>( + jcompression_options_handle); cf_options->compression_opts = *compression_options; } @@ -3262,9 +3577,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompressionOptions( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->compaction_style = - static_cast<rocksdb::CompactionStyle>(compaction_style); + JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_style) { + auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); + cf_options->compaction_style = + rocksdb::CompactionStyleJni::toCppCompactionStyle(jcompaction_style); } /* @@ -3273,9 +3589,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( * Signature: (J)B */ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*> - (jhandle)->compaction_style; + JNIEnv*, jobject, jlong jhandle) { + auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); + return rocksdb::CompactionStyleJni::toJavaCompactionStyle( + cf_options->compaction_style); } /* @@ -3284,9 +3601,10 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast<uint64_t>(jmax_table_files_size); + JNIEnv*, jobject, jlong jhandle, jlong jmax_table_files_size) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->compaction_options_fifo.max_table_files_size = + static_cast<uint64_t>(jmax_table_files_size); } /* @@ -3295,8 +3613,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->compaction_options_fifo.max_table_files_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->compaction_options_fifo.max_table_files_size; } /* @@ -3305,7 +3624,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxTableFilesSizeFIFO( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->num_levels; } @@ -3315,7 +3634,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels( - JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) { + JNIEnv*, jobject, jlong jhandle, jint jnum_levels) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->num_levels = static_cast<int>(jnum_levels); } @@ -3326,9 +3645,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_file_num_compaction_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_file_num_compaction_trigger; } /* @@ -3337,11 +3656,11 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint
jlevel0_file_num_compaction_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_file_num_compaction_trigger = - static_cast<int>(jlevel0_file_num_compaction_trigger); + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_file_num_compaction_trigger = + static_cast<int>(jlevel0_file_num_compaction_trigger); } /* @@ -3350,9 +3669,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_slowdown_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_slowdown_writes_trigger; } /* @@ -3361,11 +3680,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_slowdown_writes_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_slowdown_writes_trigger = - static_cast<int>(jlevel0_slowdown_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_slowdown_writes_trigger = + static_cast<int>(jlevel0_slowdown_writes_trigger); } /* @@ -3374,9 +3692,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_stop_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_stop_writes_trigger; } /* @@ -3385,11 +3703,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_stop_writes_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - level0_stop_writes_trigger = static_cast<int>( - jlevel0_stop_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_stop_writes_trigger = + static_cast<int>(jlevel0_stop_writes_trigger); } /* @@ -3398,9 +3715,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - target_file_size_base; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->target_file_size_base; } /* @@ -3409,10 +3726,9 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jtarget_file_size_base) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base); + JNIEnv*, jobject, jlong jhandle, jlong jtarget_file_size_base) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->target_file_size_base = static_cast<uint64_t>(jtarget_file_size_base); } /* @@ -3421,9 +3737,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->target_file_size_multiplier; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) +
->target_file_size_multiplier; } /* @@ -3432,11 +3748,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jtarget_file_size_multiplier) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->target_file_size_multiplier = - static_cast<int>(jtarget_file_size_multiplier); + JNIEnv*, jobject, jlong jhandle, jint jtarget_file_size_multiplier) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->target_file_size_multiplier = + static_cast<int>(jtarget_file_size_multiplier); } /* @@ -3445,9 +3760,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_bytes_for_level_base; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_bytes_for_level_base; } /* @@ -3456,11 +3771,10 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmax_bytes_for_level_base) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_bytes_for_level_base = - static_cast<int64_t>(jmax_bytes_for_level_base); + JNIEnv*, jobject, jlong jhandle, jlong jmax_bytes_for_level_base) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_bytes_for_level_base = + static_cast<int64_t>(jmax_bytes_for_level_base); } /* @@ -3469,9 +3783,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level_compaction_dynamic_level_bytes; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level_compaction_dynamic_level_bytes; } /* @@ -3480,11 +3794,9 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_levelCompactionDynamicLevelBytes( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jenable_dynamic_level_bytes) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level_compaction_dynamic_level_bytes = - (jenable_dynamic_level_bytes); + JNIEnv*, jobject, jlong jhandle, jboolean jenable_dynamic_level_bytes) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level_compaction_dynamic_level_bytes = (jenable_dynamic_level_bytes); } /* @@ -3493,9 +3805,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevelCompactionDynamicLevelBytes( * Signature: (J)D */ jdouble Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_bytes_for_level_multiplier; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_bytes_for_level_multiplier; } /* @@ -3504,8 +3816,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( * Signature: (JD)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( - JNIEnv* env, jobject jobj, jlong jhandle, - jdouble jmax_bytes_for_level_multiplier) { + JNIEnv*, jobject, jlong jhandle, jdouble jmax_bytes_for_level_multiplier) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->max_bytes_for_level_multiplier = static_cast<double>(jmax_bytes_for_level_multiplier); @@ -3516,9 +3827,8 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( * Method: maxCompactionBytes *
Signature: (J)I */ -jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv* env, - jobject jobj, - jlong jhandle) { +jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes( + JNIEnv*, jobject, jlong jhandle) { return static_cast<jlong>( reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->max_compaction_bytes); @@ -3530,7 +3840,7 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxCompactionBytes(JNIEnv* env, * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_compaction_bytes) { + JNIEnv*, jobject, jlong jhandle, jlong jmax_compaction_bytes) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->max_compaction_bytes = static_cast<uint64_t>(jmax_compaction_bytes); } @@ -3541,9 +3851,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxCompactionBytes( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - arena_block_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->arena_block_size; } /* @@ -3552,11 +3862,11 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + JNIEnv* env, jobject, jlong jhandle, jlong jarena_block_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jarena_block_size); if (s.ok()) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - arena_block_size = jarena_block_size; + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->arena_block_size = + jarena_block_size; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -3568,9 +3878,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->disable_auto_compactions; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->disable_auto_compactions; } /* @@ -3579,11 +3889,9 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jdisable_auto_compactions) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->disable_auto_compactions = - static_cast<bool>(jdisable_auto_compactions); + JNIEnv*, jobject, jlong jhandle, jboolean jdisable_auto_compactions) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->disable_auto_compactions = static_cast<bool>(jdisable_auto_compactions); } /* @@ -3592,9 +3900,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_sequential_skip_in_iterations; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_sequential_skip_in_iterations; } /* @@ -3603,11 +3911,11 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jlong jmax_sequential_skip_in_iterations) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_sequential_skip_in_iterations = -
static_cast<int64_t>(jmax_sequential_skip_in_iterations); + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_sequential_skip_in_iterations = + static_cast<int64_t>(jmax_sequential_skip_in_iterations); } /* @@ -3616,9 +3924,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->inplace_update_support; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->inplace_update_support; } /* @@ -3627,11 +3935,9 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jinplace_update_support) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->inplace_update_support = - static_cast<bool>(jinplace_update_support); + JNIEnv*, jobject, jlong jhandle, jboolean jinplace_update_support) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->inplace_update_support = static_cast<bool>(jinplace_update_support); } /* @@ -3640,9 +3946,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->inplace_update_num_locks; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->inplace_update_num_locks; } /* @@ -3651,13 +3957,12 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jinplace_update_num_locks) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jinplace_update_num_locks); + JNIEnv* env, jobject, jlong jhandle, jlong jinplace_update_num_locks) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jinplace_update_num_locks); if (s.ok()) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - inplace_update_num_locks = jinplace_update_num_locks; + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->inplace_update_num_locks = jinplace_update_num_locks; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -3669,7 +3974,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( * Signature: (J)I */ jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->memtable_prefix_bloom_size_ratio; } @@ -3680,7 +3985,7 @@ jdouble Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomSizeRatio( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jdouble jmemtable_prefix_bloom_size_ratio) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) ->memtable_prefix_bloom_size_ratio = @@ -3693,9 +3998,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomSizeRatio( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - bloom_locality; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->bloom_locality; }
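setArenaBlockSize, setInplaceUpdateNumLocks and the other size_t-backed setters all funnel the incoming jlong through JniUtil::check_if_jlong_fits_size_t before assigning, and throw IllegalArgumentException otherwise. A sketch of the idea behind such a check (an illustration, not the actual rocksdb::JniUtil implementation): jlong is a signed 64-bit value, while size_t may be narrower.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>

// Returns true when the Java long can be assigned to a size_t field losslessly.
static bool jlong_fits_size_t(int64_t jvalue) {
  // Negative values never fit; large values only fit if size_t is 64-bit.
  return jvalue >= 0 &&
         static_cast<uint64_t>(jvalue) <=
             static_cast<uint64_t>(std::numeric_limits<std::size_t>::max());
}

int main() {
  std::printf("%d\n", jlong_fits_size_t(4096));  // 1: safe to assign
  std::printf("%d\n", jlong_fits_size_t(-1));    // 0: binding would throw
  return 0;
}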
/* @@ -3704,7 +4009,7 @@ jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( - JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + JNIEnv*, jobject, jlong jhandle, jint jbloom_locality) { reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)->bloom_locality = static_cast<int32_t>(jbloom_locality); } @@ -3715,9 +4020,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - max_successive_merges; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_successive_merges; } /* @@ -3726,13 +4031,12 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmax_successive_merges) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jmax_successive_merges); + JNIEnv* env, jobject, jlong jhandle, jlong jmax_successive_merges) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jmax_successive_merges); if (s.ok()) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle)-> - max_successive_merges = jmax_successive_merges; + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_successive_merges = jmax_successive_merges; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -3744,9 +4048,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->optimize_filters_for_hits; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->optimize_filters_for_hits; } /* @@ -3755,11 +4059,10 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_optimizeFiltersForHits( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean joptimize_filters_for_hits) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->optimize_filters_for_hits = - static_cast<bool>(joptimize_filters_for_hits); + JNIEnv*, jobject, jlong jhandle, jboolean joptimize_filters_for_hits) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->optimize_filters_for_hits = + static_cast<bool>(joptimize_filters_for_hits); } /* @@ -3768,9 +4071,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setOptimizeFiltersForHits( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->memtable_huge_page_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->memtable_huge_page_size; } /* @@ -3779,15 +4082,12 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_memtableHugePageSize( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableHugePageSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmemtable_huge_page_size) { - - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - jmemtable_huge_page_size); + JNIEnv* env, jobject, jlong jhandle, jlong jmemtable_huge_page_size) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(jmemtable_huge_page_size); if (s.ok()) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->memtable_huge_page_size = - jmemtable_huge_page_size; + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->memtable_huge_page_size = jmemtable_huge_page_size; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -3799,9 +4099,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMemtableHugePageSize( * Signature: (J)J */ jlong
Java_org_rocksdb_ColumnFamilyOptions_softPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->soft_pending_compaction_bytes_limit; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->soft_pending_compaction_bytes_limit; } /* @@ -3810,10 +4110,11 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_softPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setSoftPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jsoft_pending_compaction_bytes_limit) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->soft_pending_compaction_bytes_limit = - static_cast<int64_t>(jsoft_pending_compaction_bytes_limit); + JNIEnv*, jobject, jlong jhandle, + jlong jsoft_pending_compaction_bytes_limit) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->soft_pending_compaction_bytes_limit = + static_cast<int64_t>(jsoft_pending_compaction_bytes_limit); } /* @@ -3822,9 +4123,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setSoftPendingCompactionBytesLimit( * Signature: (J)J */ jlong Java_org_rocksdb_ColumnFamilyOptions_hardPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->hard_pending_compaction_bytes_limit; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->hard_pending_compaction_bytes_limit; } /* @@ -3833,10 +4134,11 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_hardPendingCompactionBytesLimit( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setHardPendingCompactionBytesLimit( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jhard_pending_compaction_bytes_limit) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->hard_pending_compaction_bytes_limit = - static_cast<int64_t>(jhard_pending_compaction_bytes_limit); + JNIEnv*, jobject, jlong jhandle, + jlong jhard_pending_compaction_bytes_limit) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->hard_pending_compaction_bytes_limit = + static_cast<int64_t>(jhard_pending_compaction_bytes_limit); } /* @@ -3845,9 +4147,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setHardPendingCompactionBytesLimit( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_file_num_compaction_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_file_num_compaction_trigger; } /* @@ -3856,11 +4158,11 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0FileNumCompactionTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jint jlevel0_file_num_compaction_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_file_num_compaction_trigger = - static_cast<int32_t>(jlevel0_file_num_compaction_trigger); + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_file_num_compaction_trigger = + static_cast<int32_t>(jlevel0_file_num_compaction_trigger); } /* @@ -3869,9 +4171,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0FileNumCompactionTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0SlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_slowdown_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_slowdown_writes_trigger; } /* @@ -3880,11 +4182,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0SlowdownWritesTrigger( * Signature: (JI)V */ void
Java_org_rocksdb_ColumnFamilyOptions_setLevel0SlowdownWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_slowdown_writes_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_slowdown_writes_trigger = - static_cast<int32_t>(jlevel0_slowdown_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_slowdown_writes_trigger = + static_cast<int32_t>(jlevel0_slowdown_writes_trigger); } /* @@ -3893,9 +4194,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0SlowdownWritesTrigger( * Signature: (J)I */ jint Java_org_rocksdb_ColumnFamilyOptions_level0StopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_stop_writes_trigger; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_stop_writes_trigger; } /* @@ -3904,11 +4205,10 @@ jint Java_org_rocksdb_ColumnFamilyOptions_level0StopWritesTrigger( * Signature: (JI)V */ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger( - JNIEnv* env, jobject jobj, jlong jhandle, - jint jlevel0_stop_writes_trigger) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->level0_stop_writes_trigger = - static_cast<int32_t>(jlevel0_stop_writes_trigger); + JNIEnv*, jobject, jlong jhandle, jint jlevel0_stop_writes_trigger) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->level0_stop_writes_trigger = + static_cast<int32_t>(jlevel0_stop_writes_trigger); } /* @@ -3917,9 +4217,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setLevel0StopWritesTrigger( * Signature: (J)[I */ jintArray Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto mbflma = reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->max_bytes_for_level_multiplier_additional; + JNIEnv* env, jobject, jlong jhandle) { + auto mbflma = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->max_bytes_for_level_multiplier_additional; const size_t size = mbflma.size(); @@ -3930,20 +4230,20 @@ jintArray Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditio jsize jlen = static_cast<jsize>(size); jintArray result = env->NewIntArray(jlen); - if(result == nullptr) { + if (result == nullptr) { // exception thrown: OutOfMemoryError - delete [] additionals; + delete[] additionals; return nullptr; } env->SetIntArrayRegion(result, 0, jlen, additionals); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(result); - delete [] additionals; - return nullptr; + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(result); + delete[] additionals; + return nullptr; } - delete [] additionals; + delete[] additionals; return result; } @@ -3954,12 +4254,12 @@ jintArray Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplierAdditio * Signature: (J[I)V */ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditional( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv* env, jobject, jlong jhandle, jintArray jmax_bytes_for_level_multiplier_additional) { jsize len = env->GetArrayLength(jmax_bytes_for_level_multiplier_additional); - jint *additionals = + jint* additionals = env->GetIntArrayElements(jmax_bytes_for_level_multiplier_additional, 0); - if(additionals == nullptr) { + if (additionals == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -3967,11 +4267,12 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditiona auto* cf_opt = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); cf_opt->max_bytes_for_level_multiplier_additional.clear(); for (jsize i = 0; i < len; i++) { - cf_opt->max_bytes_for_level_multiplier_additional.push_back(static_cast<int32_t>(additionals[i])); + cf_opt->max_bytes_for_level_multiplier_additional.push_back( + static_cast<int32_t>(additionals[i])); } env->ReleaseIntArrayElements(jmax_bytes_for_level_multiplier_additional, - additionals, JNI_ABORT); + additionals, JNI_ABORT); }
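The reformatted setMaxBytesForLevelMultiplierAdditional above shows the standard JNI array discipline: GetIntArrayElements may hand back a copy, every exit path must release it, and JNI_ABORT means "discard, do not copy back", which is correct when the native side only reads. A JVM-free sketch of that discipline follows; JNIEnvStub is a stand-in, not a real JNIEnv.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int JNI_ABORT = 2;  // value from jni.h: free the copy, no write-back

struct JNIEnvStub {
  std::vector<int32_t> backing{2, 4, 8};
  // Real JNI may pin the array or return a copy; callers cannot tell which.
  int32_t* GetIntArrayElements() { return backing.data(); }
  void ReleaseIntArrayElements(int32_t* /*elems*/, int /*mode*/) {}
};

int main() {
  JNIEnvStub env;
  int32_t* additionals = env.GetIntArrayElements();
  if (additionals == nullptr) return 0;  // OutOfMemoryError would be pending
  std::vector<int> out;
  for (std::size_t i = 0; i < env.backing.size(); i++) {
    out.push_back(static_cast<int>(additionals[i]));
  }
  env.ReleaseIntArrayElements(additionals, JNI_ABORT);  // read-only use
  std::printf("%zu values copied\n", out.size());
  return 0;
}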
/* @@ -3980,9 +4281,9 @@ void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplierAdditiona * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_paranoidFileChecks( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->paranoid_file_checks; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->paranoid_file_checks; } /* @@ -3991,10 +4292,9 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_paranoidFileChecks( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setParanoidFileChecks( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jparanoid_file_checks) { - reinterpret_cast<rocksdb::ColumnFamilyOptions*>( - jhandle)->paranoid_file_checks = - static_cast<bool>(jparanoid_file_checks); + JNIEnv*, jobject, jlong jhandle, jboolean jparanoid_file_checks) { + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle) + ->paranoid_file_checks = static_cast<bool>(jparanoid_file_checks); } /* @@ -4003,11 +4303,11 @@ void Java_org_rocksdb_ColumnFamilyOptions_setParanoidFileChecks( * Signature: (JB)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority( - JNIEnv* env, jobject jobj, jlong jhandle, - jbyte jcompaction_priority_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jcompaction_priority_value) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); cf_opts->compaction_pri = - rocksdb::CompactionPriorityJni::toCppCompactionPriority(jcompaction_priority_value); + rocksdb::CompactionPriorityJni::toCppCompactionPriority( + jcompaction_priority_value); } /* @@ -4016,7 +4316,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionPriority( * Signature: (J)B */ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return rocksdb::CompactionPriorityJni::toJavaCompactionPriority( cf_opts->compaction_pri); @@ -4028,7 +4328,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionPriority( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jreport_bg_io_stats) { + JNIEnv*, jobject, jlong jhandle, jboolean jreport_bg_io_stats) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); cf_opts->report_bg_io_stats = static_cast<bool>(jreport_bg_io_stats); } @@ -4039,23 +4339,44 @@ void Java_org_rocksdb_ColumnFamilyOptions_setReportBgIoStats( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_reportBgIoStats( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return static_cast<jboolean>(cf_opts->report_bg_io_stats); } +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setTtl + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTtl( + JNIEnv*, jobject, jlong jhandle, jlong jttl) { + auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); + cf_opts->ttl = static_cast<uint64_t>(jttl); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: ttl + * Signature: (J)J + */ +JNIEXPORT jlong JNICALL Java_org_rocksdb_ColumnFamilyOptions_ttl( + JNIEnv*, jobject, jlong jhandle) { + auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); + return static_cast<jlong>(cf_opts->ttl); +} +
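The new setTtl/ttl pair simply exposes the native ColumnFamilyOptions::ttl field to Java. A minimal sketch of what the binding does, written directly against the native API; it assumes the bundled RocksDB version, where ttl is a uint64_t measured in seconds and 0 disables TTL-driven compaction.

#include <rocksdb/options.h>
#include <cstdint>

int main() {
  rocksdb::ColumnFamilyOptions cf_opts;
  // Same assignment the setTtl binding performs after unwrapping the handle:
  cf_opts.ttl = static_cast<uint64_t>(60 * 60 * 24);  // expire after one day
  return cf_opts.ttl == 0 ? 1 : 0;  // 0: the TTL took effect
}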
/* * Class: org_rocksdb_ColumnFamilyOptions * Method: setCompactionOptionsUniversal * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsUniversal( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_universal_handle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); - auto* opts_uni = - reinterpret_cast<rocksdb::CompactionOptionsUniversal*>( - jcompaction_options_universal_handle); + auto* opts_uni = reinterpret_cast<rocksdb::CompactionOptionsUniversal*>( + jcompaction_options_universal_handle); cf_opts->compaction_options_universal = *opts_uni; } @@ -4065,11 +4386,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsUniversal( * Signature: (JJ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsFIFO( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jcompaction_options_fifo_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_options_fifo_handle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); - auto* opts_fifo = - reinterpret_cast<rocksdb::CompactionOptionsFIFO*>( - jcompaction_options_fifo_handle); + auto* opts_fifo = reinterpret_cast<rocksdb::CompactionOptionsFIFO*>( + jcompaction_options_fifo_handle); cf_opts->compaction_options_fifo = *opts_fifo; } @@ -4079,10 +4399,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setCompactionOptionsFIFO( * Signature: (JZ)V */ void Java_org_rocksdb_ColumnFamilyOptions_setForceConsistencyChecks( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jforce_consistency_checks) { + JNIEnv*, jobject, jlong jhandle, jboolean jforce_consistency_checks) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); - cf_opts->force_consistency_checks = static_cast<bool>(jforce_consistency_checks); + cf_opts->force_consistency_checks = + static_cast<bool>(jforce_consistency_checks); } /* @@ -4091,7 +4411,7 @@ void Java_org_rocksdb_ColumnFamilyOptions_setForceConsistencyChecks( * Signature: (J)Z */ jboolean Java_org_rocksdb_ColumnFamilyOptions_forceConsistencyChecks( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* cf_opts = reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jhandle); return static_cast<bool>(cf_opts->force_consistency_checks); } @@ -4104,21 +4424,45 @@ jboolean Java_org_rocksdb_ColumnFamilyOptions_forceConsistencyChecks( * Method: newDBOptions * Signature: ()J */ -jlong Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, - jclass jcls) { +jlong Java_org_rocksdb_DBOptions_newDBOptions( + JNIEnv*, jclass) { auto* dbop = new rocksdb::DBOptions(); return reinterpret_cast<jlong>(dbop); } +/* + * Class: org_rocksdb_DBOptions + * Method: copyDBOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_copyDBOptions( + JNIEnv*, jclass, jlong jhandle) { + auto new_opt = + new rocksdb::DBOptions(*(reinterpret_cast<rocksdb::DBOptions*>(jhandle))); + return reinterpret_cast<jlong>(new_opt); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: newDBOptionsFromOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_newDBOptionsFromOptions( + JNIEnv*, jclass, jlong joptions_handle) { + auto new_opt = + new rocksdb::DBOptions(*reinterpret_cast<rocksdb::Options*>(joptions_handle)); + return reinterpret_cast<jlong>(new_opt); +} + /* * Class: org_rocksdb_DBOptions * Method: getDBOptionsFromProps * Signature: (Ljava/util/String;)J */ jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( - JNIEnv* env, jclass jclazz, jstring jopt_string) { + JNIEnv* env, jclass, jstring jopt_string) { const char* opt_string = env->GetStringUTFChars(jopt_string, nullptr); - if(opt_string == nullptr) { + if (opt_string == nullptr) { // exception thrown: OutOfMemoryError return 0; }
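The new copyDBOptions and newDBOptionsFromOptions bindings above rely on rocksdb::Options inheriting from both DBOptions and ColumnFamilyOptions, so copy-constructing a DBOptions from an Options deliberately keeps only the DB-wide half. A sketch of that slicing, assuming the rocksdb headers and library are available:

#include <rocksdb/options.h>
#include <cassert>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;                   // lives in the DBOptions half
  rocksdb::DBOptions db_only(options);                // keeps only the DB-wide half
  assert(db_only.create_if_missing);
  auto* heap_copy = new rocksdb::DBOptions(db_only);  // what copyDBOptions hands to Java
  delete heap_copy;                                   // Java later calls disposeInternal
  return 0;
}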
@@ -4147,7 +4491,7 @@ * Signature: (J)V */ void Java_org_rocksdb_DBOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { + JNIEnv*, jobject, jlong handle) { auto* dbo = reinterpret_cast<rocksdb::DBOptions*>(handle); assert(dbo != nullptr); delete dbo; @@ -4159,7 +4503,7 @@ void Java_org_rocksdb_DBOptions_disposeInternal( * Signature: (J)V */ void Java_org_rocksdb_DBOptions_optimizeForSmallDb( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->OptimizeForSmallDb(); } @@ -4169,7 +4513,7 @@ void Java_org_rocksdb_DBOptions_optimizeForSmallDb( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setEnv( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jenv_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jenv_handle) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->env = reinterpret_cast<rocksdb::Env*>(jenv_handle); } @@ -4180,21 +4524,19 @@ void Java_org_rocksdb_DBOptions_setEnv( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setIncreaseParallelism( - JNIEnv * env, jobject jobj, jlong jhandle, jint totalThreads) { - reinterpret_cast<rocksdb::DBOptions*> - (jhandle)->IncreaseParallelism(static_cast<int>(totalThreads)); + JNIEnv*, jobject, jlong jhandle, jint totalThreads) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->IncreaseParallelism( + static_cast<int>(totalThreads)); } - /* * Class: org_rocksdb_DBOptions * Method: setCreateIfMissing * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setCreateIfMissing( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { - reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - create_if_missing = flag; + JNIEnv*, jobject, jlong jhandle, jboolean flag) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->create_if_missing = flag; } /* @@ -4203,7 +4545,7 @@ void Java_org_rocksdb_DBOptions_setCreateIfMissing( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_createIfMissing( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->create_if_missing; } @@ -4213,9 +4555,9 @@ jboolean Java_org_rocksdb_DBOptions_createIfMissing( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { - reinterpret_cast<rocksdb::DBOptions*> - (jhandle)->create_missing_column_families = flag; + JNIEnv*, jobject, jlong jhandle, jboolean flag) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle) + ->create_missing_column_families = flag; } /* @@ -4224,9 +4566,9 @@ void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*> - (jhandle)->create_missing_column_families; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle) + ->create_missing_column_families; } /* @@ -4235,7 +4577,7 @@ jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setErrorIfExists( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) { + JNIEnv*, jobject, jlong jhandle, jboolean error_if_exists) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->error_if_exists = static_cast<bool>(error_if_exists); } @@ -4246,7 +4588,7 @@ void Java_org_rocksdb_DBOptions_setErrorIfExists( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_errorIfExists( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->error_if_exists; } @@ -4256,7 +4598,7 @@ jboolean Java_org_rocksdb_DBOptions_errorIfExists( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setParanoidChecks( - JNIEnv* env, jobject jobj,
jlong jhandle, jboolean paranoid_checks) { + JNIEnv*, jobject, jlong jhandle, jboolean paranoid_checks) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->paranoid_checks = static_cast<bool>(paranoid_checks); } @@ -4267,7 +4609,7 @@ void Java_org_rocksdb_DBOptions_setParanoidChecks( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_paranoidChecks( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->paranoid_checks; } @@ -4277,22 +4619,36 @@ jboolean Java_org_rocksdb_DBOptions_paranoidChecks( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRateLimiter( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { - std::shared_ptr<rocksdb::RateLimiter> *pRateLimiter = - reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter> *>( + JNIEnv*, jobject, jlong jhandle, jlong jrate_limiter_handle) { + std::shared_ptr<rocksdb::RateLimiter>* pRateLimiter = + reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter>*>( jrate_limiter_handle); reinterpret_cast<rocksdb::DBOptions*>(jhandle)->rate_limiter = *pRateLimiter; } +/* + * Class: org_rocksdb_DBOptions + * Method: setSstFileManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setSstFileManager( + JNIEnv*, jobject, jlong jhandle, jlong jsst_file_manager_handle) { + auto* sptr_sst_file_manager = + reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>( + jsst_file_manager_handle); + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->sst_file_manager = + *sptr_sst_file_manager; +} + /* * Class: org_rocksdb_DBOptions * Method: setLogger * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setLogger( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jlogger_handle) { - std::shared_ptr<rocksdb::LoggerJniCallback> *pLogger = - reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback> *>( + JNIEnv*, jobject, jlong jhandle, jlong jlogger_handle) { + std::shared_ptr<rocksdb::LoggerJniCallback>* pLogger = + reinterpret_cast<std::shared_ptr<rocksdb::LoggerJniCallback>*>( jlogger_handle); reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log = *pLogger; } @@ -4303,9 +4659,9 @@ void Java_org_rocksdb_DBOptions_setLogger( * Signature: (JB)V */ void Java_org_rocksdb_DBOptions_setInfoLogLevel( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + JNIEnv*, jobject, jlong jhandle, jbyte jlog_level) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log_level = - static_cast<rocksdb::InfoLogLevel>(jlog_level); + static_cast<rocksdb::InfoLogLevel>(jlog_level); } /* @@ -4314,7 +4670,7 @@ void Java_org_rocksdb_DBOptions_setInfoLogLevel( * Signature: (J)B */ jbyte Java_org_rocksdb_DBOptions_infoLogLevel( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return static_cast<jbyte>( reinterpret_cast<rocksdb::DBOptions*>(jhandle)->info_log_level); } @@ -4325,8 +4681,7 @@ jbyte Java_org_rocksdb_DBOptions_infoLogLevel( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jmax_total_wal_size) { + JNIEnv*, jobject, jlong jhandle, jlong jmax_total_wal_size) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_total_wal_size = static_cast<jlong>(jmax_total_wal_size); } @@ -4337,9 +4692,8 @@ void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - max_total_wal_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_total_wal_size; } /* @@ -4348,7 +4702,7 @@ jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxOpenFiles( - JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) { + JNIEnv*, jobject, jlong jhandle, jint max_open_files) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_open_files = static_cast<int>(max_open_files); } @@ -4359,7 +4713,7 @@ void Java_org_rocksdb_DBOptions_setMaxOpenFiles( * Signature: (J)I */ jint
Java_org_rocksdb_DBOptions_maxOpenFiles( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_open_files; } @@ -4369,7 +4723,7 @@ jint Java_org_rocksdb_DBOptions_maxOpenFiles( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads( - JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_file_opening_threads) { + JNIEnv*, jobject, jlong jhandle, jint jmax_file_opening_threads) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_file_opening_threads = static_cast<int>(jmax_file_opening_threads); } @@ -4380,7 +4734,7 @@ void Java_org_rocksdb_DBOptions_setMaxFileOpeningThreads( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<int>(opt->max_file_opening_threads); } @@ -4391,11 +4745,10 @@ jint Java_org_rocksdb_DBOptions_maxFileOpeningThreads( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setStatistics( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jstatistics_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jstatistics_handle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); - auto* pSptr = - reinterpret_cast<std::shared_ptr<rocksdb::StatisticsJni>*>( - jstatistics_handle); + auto* pSptr = reinterpret_cast<std::shared_ptr<rocksdb::StatisticsJni>*>( + jstatistics_handle); opt->statistics = *pSptr; } @@ -4405,7 +4758,7 @@ void Java_org_rocksdb_DBOptions_setStatistics( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_statistics( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); std::shared_ptr<rocksdb::Statistics> sptr = opt->statistics; if (sptr == nullptr) { @@ -4423,7 +4776,7 @@ jlong Java_org_rocksdb_DBOptions_statistics( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setUseFsync( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) { + JNIEnv*, jobject, jlong jhandle, jboolean use_fsync) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_fsync = static_cast<bool>(use_fsync); } @@ -4434,7 +4787,7 @@ void Java_org_rocksdb_DBOptions_setUseFsync( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_useFsync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_fsync; } @@ -4444,34 +4797,32 @@ jboolean Java_org_rocksdb_DBOptions_useFsync( * Signature: (J[Ljava/lang/String;[J)V */ void Java_org_rocksdb_DBOptions_setDbPaths( - JNIEnv* env, jobject jobj, jlong jhandle, jobjectArray jpaths, + JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { std::vector<rocksdb::DbPath> db_paths; jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); - if(ptr_jtarget_size == nullptr) { - // exception thrown: OutOfMemoryError - return; + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; } jboolean has_exception = JNI_FALSE; const jsize len = env->GetArrayLength(jpaths); - for(jsize i = 0; i < len; i++) { - jobject jpath = reinterpret_cast<jstring>(env-> - GetObjectArrayElement(jpaths, i)); - if(env->ExceptionCheck()) { + for (jsize i = 0; i < len; i++) { + jobject jpath = + reinterpret_cast<jstring>(env->GetObjectArrayElement(jpaths, i)); + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } - std::string path = rocksdb::JniUtil::copyString( + std::string path = rocksdb::JniUtil::copyStdString(
env, static_cast<jstring>(jpath), &has_exception); env->DeleteLocalRef(jpath); - if(has_exception == JNI_TRUE) { - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); - return; + if (has_exception == JNI_TRUE) { + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + return; } jlong jtarget_size = ptr_jtarget_size[i]; @@ -4492,7 +4843,7 @@ void Java_org_rocksdb_DBOptions_setDbPaths( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_dbPathsLen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->db_paths.size()); } @@ -4503,32 +4854,30 @@ jlong Java_org_rocksdb_DBOptions_dbPathsLen( * Signature: (J[Ljava/lang/String;[J)V */ void Java_org_rocksdb_DBOptions_dbPaths( - JNIEnv* env, jobject jobj, jlong jhandle, jobjectArray jpaths, + JNIEnv* env, jobject, jlong jhandle, jobjectArray jpaths, jlongArray jtarget_sizes) { jlong* ptr_jtarget_size = env->GetLongArrayElements(jtarget_sizes, nullptr); - if(ptr_jtarget_size == nullptr) { - // exception thrown: OutOfMemoryError - return; + if (ptr_jtarget_size == nullptr) { + // exception thrown: OutOfMemoryError + return; } auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); const jsize len = env->GetArrayLength(jpaths); - for(jsize i = 0; i < len; i++) { + for (jsize i = 0; i < len; i++) { rocksdb::DbPath db_path = opt->db_paths[i]; jstring jpath = env->NewStringUTF(db_path.path.c_str()); - if(jpath == nullptr) { + if (jpath == nullptr) { // exception thrown: OutOfMemoryError - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } env->SetObjectArrayElement(jpaths, i, jpath); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jpath); - env->ReleaseLongArrayElements( - jtarget_sizes, ptr_jtarget_size, JNI_ABORT); + env->ReleaseLongArrayElements(jtarget_sizes, ptr_jtarget_size, JNI_ABORT); return; } @@ -4544,9 +4893,9 @@ void Java_org_rocksdb_DBOptions_dbPaths( * Signature: (JLjava/lang/String)V */ void Java_org_rocksdb_DBOptions_setDbLogDir( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) { + JNIEnv* env, jobject, jlong jhandle, jstring jdb_log_dir) { const char* log_dir = env->GetStringUTFChars(jdb_log_dir, nullptr); - if(log_dir == nullptr) { + if (log_dir == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -4561,7 +4910,7 @@ void Java_org_rocksdb_DBOptions_setDbLogDir( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_DBOptions_dbLogDir( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF( reinterpret_cast<rocksdb::DBOptions*>(jhandle)->db_log_dir.c_str()); } @@ -4572,7 +4921,7 @@ jstring Java_org_rocksdb_DBOptions_dbLogDir( * Signature: (JLjava/lang/String)V */ void Java_org_rocksdb_DBOptions_setWalDir( - JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) { + JNIEnv* env, jobject, jlong jhandle, jstring jwal_dir) { const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0); reinterpret_cast<rocksdb::DBOptions*>(jhandle)->wal_dir.assign(wal_dir); env->ReleaseStringUTFChars(jwal_dir, wal_dir); @@ -4584,7 +4933,7 @@ void Java_org_rocksdb_DBOptions_setWalDir( * Signature: (J)Ljava/lang/String */ jstring Java_org_rocksdb_DBOptions_walDir( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv* env, jobject, jlong jhandle) { return env->NewStringUTF(
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->wal_dir.c_str()); } @@ -4595,10 +4944,9 @@ jstring Java_org_rocksdb_DBOptions_walDir( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( - JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) { + JNIEnv*, jobject, jlong jhandle, jlong micros) { reinterpret_cast<rocksdb::DBOptions*>(jhandle) - ->delete_obsolete_files_period_micros = - static_cast<int64_t>(micros); + ->delete_obsolete_files_period_micros = static_cast<int64_t>(micros); } /* @@ -4607,7 +4955,7 @@ void Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle) ->delete_obsolete_files_period_micros; } @@ -4618,9 +4966,9 @@ jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setBaseBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast<rocksdb::DBOptions*>(jhandle) - ->base_background_compactions = static_cast<int>(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->base_background_compactions = + static_cast<int>(max); } /* @@ -4629,7 +4977,7 @@ void Java_org_rocksdb_DBOptions_setBaseBackgroundCompactions( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_baseBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle) ->base_background_compactions; } @@ -4640,9 +4988,9 @@ jint Java_org_rocksdb_DBOptions_baseBackgroundCompactions( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast<rocksdb::DBOptions*>(jhandle) - ->max_background_compactions = static_cast<int>(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_compactions = + static_cast<int>(max); } /* @@ -4651,9 +4999,9 @@ void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>( - jhandle)->max_background_compactions; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle) + ->max_background_compactions; } /* @@ -4662,9 +5010,9 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxSubcompactions( - JNIEnv* env, jobject jobj, jlong jhandle, jint max) { - reinterpret_cast<rocksdb::DBOptions*>(jhandle) - ->max_subcompactions = static_cast<int32_t>(max); + JNIEnv*, jobject, jlong jhandle, jint max) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_subcompactions = + static_cast<int32_t>(max); } /* @@ -4673,9 +5021,8 @@ void Java_org_rocksdb_DBOptions_setMaxSubcompactions( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_maxSubcompactions( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>(jhandle) - ->max_subcompactions; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_subcompactions; } /* @@ -4684,7 +5031,7 @@ jint Java_org_rocksdb_DBOptions_maxSubcompactions( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( - JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) { + JNIEnv*, jobject, jlong jhandle, jint max_background_flushes) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_flushes = static_cast<int>(max_background_flushes); } @@ -4695,9 +5042,29 @@ void
Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - max_background_flushes; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_flushes; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBackgroundJobs + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBackgroundJobs( + JNIEnv*, jobject, jlong jhandle, jint max_background_jobs) { + reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_jobs = + static_cast<int>(max_background_jobs); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBackgroundJobs + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBackgroundJobs( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_background_jobs; } /* @@ -4706,8 +5073,8 @@ jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setMaxLogFileSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + JNIEnv* env, jobject, jlong jhandle, jlong max_log_file_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(max_log_file_size); if (s.ok()) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_log_file_size = max_log_file_size; @@ -4722,7 +5089,7 @@ void Java_org_rocksdb_DBOptions_setMaxLogFileSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_maxLogFileSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_log_file_size; } @@ -4732,9 +5099,9 @@ jlong Java_org_rocksdb_DBOptions_maxLogFileSize( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( - JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( - log_file_time_to_roll); + JNIEnv* env, jobject, jlong jhandle, jlong log_file_time_to_roll) { + auto s = + rocksdb::JniUtil::check_if_jlong_fits_size_t(log_file_time_to_roll); if (s.ok()) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->log_file_time_to_roll = log_file_time_to_roll; @@ -4749,7 +5116,7 @@ void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->log_file_time_to_roll; } @@ -4759,8 +5126,8 @@ jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setKeepLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + JNIEnv* env, jobject, jlong jhandle, jlong keep_log_file_num) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(keep_log_file_num); if (s.ok()) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num = keep_log_file_num; @@ -4775,7 +5142,7 @@ void Java_org_rocksdb_DBOptions_setKeepLogFileNum( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_keepLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num; } @@ -4785,8 +5152,8 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( - JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) { -
rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + JNIEnv* env, jobject, jlong jhandle, jlong recycle_log_file_num) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(recycle_log_file_num); if (s.ok()) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num = recycle_log_file_num; @@ -4800,8 +5167,8 @@ void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( * Method: recycleLogFileNum * Signature: (J)J */ -jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj, - jlong jhandle) { +jlong Java_org_rocksdb_DBOptions_recycleLogFileNum( + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num; } @@ -4811,7 +5178,7 @@ jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj, * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max_manifest_file_size) { + JNIEnv*, jobject, jlong jhandle, jlong max_manifest_file_size) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_manifest_file_size = static_cast<int64_t>(max_manifest_file_size); } @@ -4822,9 +5189,8 @@ void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_maxManifestFileSize( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - max_manifest_file_size; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->max_manifest_file_size; } /* @@ -4833,7 +5199,7 @@ jlong Java_org_rocksdb_DBOptions_maxManifestFileSize( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( - JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) { + JNIEnv*, jobject, jlong jhandle, jint table_cache_numshardbits) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->table_cache_numshardbits = static_cast<int>(table_cache_numshardbits); } @@ -4844,9 +5210,9 @@ void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - table_cache_numshardbits; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast<rocksdb::DBOptions*>(jhandle) + ->table_cache_numshardbits; } /* @@ -4855,7 +5221,7 @@ jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWalTtlSeconds( - JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) { + JNIEnv*, jobject, jlong jhandle, jlong WAL_ttl_seconds) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_ttl_seconds = static_cast<int64_t>(WAL_ttl_seconds); } @@ -4866,7 +5232,7 @@ void Java_org_rocksdb_DBOptions_setWalTtlSeconds( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_walTtlSeconds( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_ttl_seconds; } @@ -4876,7 +5242,7 @@ jlong Java_org_rocksdb_DBOptions_walTtlSeconds( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( - JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) { + JNIEnv*, jobject, jlong jhandle, jlong WAL_size_limit_MB) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_size_limit_MB = static_cast<int64_t>(WAL_size_limit_MB); } @@ -4887,7 +5253,7 @@ void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_walSizeLimitMB( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->WAL_size_limit_MB; } @@ -4897,11 +5263,11 @@ jlong
Java_org_rocksdb_DBOptions_walSizeLimitMB( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { - rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + JNIEnv* env, jobject, jlong jhandle, jlong preallocation_size) { + auto s = rocksdb::JniUtil::check_if_jlong_fits_size_t(preallocation_size); if (s.ok()) { - reinterpret_cast<rocksdb::DBOptions*>(jhandle)-> - manifest_preallocation_size = preallocation_size; + reinterpret_cast<rocksdb::DBOptions*>(jhandle) + ->manifest_preallocation_size = preallocation_size; } else { rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); } @@ -4913,7 +5279,7 @@ void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle) ->manifest_preallocation_size; } @@ -4923,8 +5289,8 @@ jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( * Method: useDirectReads * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv* env, jobject jobj, - jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_useDirectReads( + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads; } @@ -4933,9 +5299,8 @@ jboolean Java_org_rocksdb_DBOptions_useDirectReads(JNIEnv* env, jobject jobj, * Method: setUseDirectReads * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv* env, jobject jobj, - jlong jhandle, - jboolean use_direct_reads) { +void Java_org_rocksdb_DBOptions_setUseDirectReads( + JNIEnv*, jobject, jlong jhandle, jboolean use_direct_reads) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_direct_reads = static_cast<bool>(use_direct_reads); } @@ -4946,7 +5311,7 @@ void Java_org_rocksdb_DBOptions_setUseDirectReads(JNIEnv* env, jobject jobj, * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_useDirectIoForFlushAndCompaction( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle) ->use_direct_io_for_flush_and_compaction; } @@ -4957,7 +5322,7 @@ jboolean Java_org_rocksdb_DBOptions_useDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean use_direct_io_for_flush_and_compaction) { reinterpret_cast<rocksdb::DBOptions*>(jhandle) ->use_direct_io_for_flush_and_compaction = @@ -4970,7 +5335,7 @@ void Java_org_rocksdb_DBOptions_setUseDirectIoForFlushAndCompaction( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowFAllocate( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_fallocate) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_fallocate) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_fallocate = static_cast<bool>(jallow_fallocate); } @@ -4981,7 +5346,7 @@ void Java_org_rocksdb_DBOptions_setAllowFAllocate( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_allowFAllocate( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jboolean>(opt->allow_fallocate); } @@ -4992,7 +5357,7 @@ jboolean Java_org_rocksdb_DBOptions_allowFAllocate( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowMmapReads( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) { + JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_reads) {
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_reads = static_cast<bool>(allow_mmap_reads); } @@ -5003,7 +5368,7 @@ void Java_org_rocksdb_DBOptions_setAllowMmapReads( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_allowMmapReads( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_reads; } @@ -5013,7 +5378,7 @@ jboolean Java_org_rocksdb_DBOptions_allowMmapReads( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowMmapWrites( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) { + JNIEnv*, jobject, jlong jhandle, jboolean allow_mmap_writes) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_writes = static_cast<bool>(allow_mmap_writes); } @@ -5024,7 +5389,7 @@ void Java_org_rocksdb_DBOptions_setAllowMmapWrites( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_allowMmapWrites( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->allow_mmap_writes; } @@ -5034,7 +5399,7 @@ jboolean Java_org_rocksdb_DBOptions_allowMmapWrites( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) { + JNIEnv*, jobject, jlong jhandle, jboolean is_fd_close_on_exec) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->is_fd_close_on_exec = static_cast<bool>(is_fd_close_on_exec); } @@ -5045,7 +5410,7 @@ void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->is_fd_close_on_exec; } @@ -5055,7 +5420,7 @@ jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec( * Signature: (JI)V */ void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( - JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) { + JNIEnv*, jobject, jlong jhandle, jint stats_dump_period_sec) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->stats_dump_period_sec = static_cast<int>(stats_dump_period_sec); } @@ -5066,7 +5431,7 @@ void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( * Signature: (J)I */ jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->stats_dump_period_sec; } @@ -5076,7 +5441,7 @@ jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) { + JNIEnv*, jobject, jlong jhandle, jboolean advise_random_on_open) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->advise_random_on_open = static_cast<bool>(advise_random_on_open); } @@ -5087,7 +5452,7 @@ void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->advise_random_on_open; } @@ -5097,18 +5462,32 @@ jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setDbWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jdb_write_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong jdb_write_buffer_size) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->db_write_buffer_size = static_cast<size_t>(jdb_write_buffer_size); }
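The setWriteBufferManager binding that follows uses the same convention as setRateLimiter and setLogger earlier: the jlong handle names a heap-allocated std::shared_ptr, so dereferencing and assigning it into the options copies ownership, and the native object outlives the Java wrapper that created it. A JVM-free sketch of that ownership scheme; WriteBufferManagerStub is a stand-in, not the RocksDB class.

#include <cassert>
#include <cstddef>
#include <memory>

struct WriteBufferManagerStub { std::size_t buffer_size = 0; };

int main() {
  // What the Java-side constructor would store as its jlong handle:
  auto* handle = new std::shared_ptr<WriteBufferManagerStub>(
      std::make_shared<WriteBufferManagerStub>());
  (*handle)->buffer_size = std::size_t{64} << 20;  // e.g. 64 MiB
  // The binding's assignment copies the shared_ptr into the options:
  std::shared_ptr<WriteBufferManagerStub> in_options = *handle;
  delete handle;  // Java wrapper disposed; only the shared_ptr box dies
  assert(in_options->buffer_size == (std::size_t{64} << 20));  // still alive
  return 0;
}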
+/* + * Class: org_rocksdb_DBOptions + * Method: setWriteBufferManager + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWriteBufferManager( + JNIEnv*, jobject, jlong jdb_options_handle, + jlong jwrite_buffer_manager_handle) { + auto* write_buffer_manager = + reinterpret_cast<std::shared_ptr<rocksdb::WriteBufferManager> *>(jwrite_buffer_manager_handle); + reinterpret_cast<rocksdb::DBOptions*>(jdb_options_handle)->write_buffer_manager = + *write_buffer_manager; +} + /* * Class: org_rocksdb_DBOptions * Method: dbWriteBufferSize * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->db_write_buffer_size); } @@ -5119,7 +5498,7 @@ jlong Java_org_rocksdb_DBOptions_dbWriteBufferSize( * Signature: (JB)V */ void Java_org_rocksdb_DBOptions_setAccessHintOnCompactionStart( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jaccess_hint_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jaccess_hint_value) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->access_hint_on_compaction_start = rocksdb::AccessHintJni::toCppAccessHint(jaccess_hint_value); @@ -5131,7 +5510,7 @@ void Java_org_rocksdb_DBOptions_setAccessHintOnCompactionStart( * Signature: (J)B */ jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return rocksdb::AccessHintJni::toJavaAccessHint( opt->access_hint_on_compaction_start); @@ -5143,7 +5522,7 @@ jbyte Java_org_rocksdb_DBOptions_accessHintOnCompactionStart( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setNewTableReaderForCompactionInputs( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean jnew_table_reader_for_compaction_inputs) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->new_table_reader_for_compaction_inputs = @@ -5156,7 +5535,7 @@ void Java_org_rocksdb_DBOptions_setNewTableReaderForCompactionInputs( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_newTableReaderForCompactionInputs( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jboolean>(opt->new_table_reader_for_compaction_inputs); } @@ -5167,7 +5546,7 @@ jboolean Java_org_rocksdb_DBOptions_newTableReaderForCompactionInputs( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jcompaction_readahead_size) { + JNIEnv*, jobject, jlong jhandle, jlong jcompaction_readahead_size) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->compaction_readahead_size = static_cast<size_t>(jcompaction_readahead_size); @@ -5179,7 +5558,7 @@ void Java_org_rocksdb_DBOptions_setCompactionReadaheadSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->compaction_readahead_size); } @@ -5190,8 +5569,7 @@ jlong Java_org_rocksdb_DBOptions_compactionReadaheadSize( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jrandom_access_max_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong jrandom_access_max_buffer_size) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->random_access_max_buffer_size = static_cast<size_t>(jrandom_access_max_buffer_size); @@ -5203,7 +5581,7 @@ void Java_org_rocksdb_DBOptions_setRandomAccessMaxBufferSize( * Signature: (J)J */ jlong
Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->random_access_max_buffer_size); } @@ -5214,8 +5592,7 @@ jlong Java_org_rocksdb_DBOptions_randomAccessMaxBufferSize( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle, - jlong jwritable_file_max_buffer_size) { + JNIEnv*, jobject, jlong jhandle, jlong jwritable_file_max_buffer_size) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); opt->writable_file_max_buffer_size = static_cast<size_t>(jwritable_file_max_buffer_size); @@ -5227,7 +5604,7 @@ void Java_org_rocksdb_DBOptions_setWritableFileMaxBufferSize( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->writable_file_max_buffer_size); } @@ -5238,7 +5615,7 @@ jlong Java_org_rocksdb_DBOptions_writableFileMaxBufferSize( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) { + JNIEnv*, jobject, jlong jhandle, jboolean use_adaptive_mutex) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_adaptive_mutex = static_cast<bool>(use_adaptive_mutex); } @@ -5249,7 +5626,7 @@ void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->use_adaptive_mutex; } @@ -5259,7 +5636,7 @@ jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) { + JNIEnv*, jobject, jlong jhandle, jlong bytes_per_sync) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->bytes_per_sync = static_cast<int64_t>(bytes_per_sync); } @@ -5270,7 +5647,7 @@ void Java_org_rocksdb_DBOptions_setBytesPerSync( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_bytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->bytes_per_sync; } @@ -5280,7 +5657,7 @@ jlong Java_org_rocksdb_DBOptions_bytesPerSync( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWalBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jwal_bytes_per_sync) { + JNIEnv*, jobject, jlong jhandle, jlong jwal_bytes_per_sync) { reinterpret_cast<rocksdb::DBOptions*>(jhandle)->wal_bytes_per_sync = static_cast<int64_t>(jwal_bytes_per_sync); } @@ -5291,54 +5668,75 @@ void Java_org_rocksdb_DBOptions_setWalBytesPerSync( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_walBytesPerSync( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); return static_cast<jlong>(opt->wal_bytes_per_sync); } /* * Class: org_rocksdb_DBOptions - * Method: setEnableThreadTracking + * Method: setDelayedWriteRate + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setDelayedWriteRate( + JNIEnv*, jobject, jlong jhandle, jlong jdelayed_write_rate) { + auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jhandle); + opt->delayed_write_rate = static_cast<uint64_t>(jdelayed_write_rate); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: delayedWriteRate + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_delayedWriteRate( + JNIEnv*, jobject, jlong jhandle) { + auto* opt =
reinterpret_cast(jhandle); + return static_cast(opt->delayed_write_rate); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setEnablePipelinedWrite * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setEnableThreadTracking( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jenable_thread_tracking) { +void Java_org_rocksdb_DBOptions_setEnablePipelinedWrite( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_pipelined_write) { auto* opt = reinterpret_cast(jhandle); - opt->enable_thread_tracking = static_cast(jenable_thread_tracking); + opt->enable_pipelined_write = jenable_pipelined_write == JNI_TRUE; } /* * Class: org_rocksdb_DBOptions - * Method: enableThreadTracking + * Method: enablePipelinedWrite * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_enableThreadTracking( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_enablePipelinedWrite( + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->enable_thread_tracking); + return static_cast(opt->enable_pipelined_write); } /* * Class: org_rocksdb_DBOptions - * Method: setDelayedWriteRate - * Signature: (JJ)V + * Method: setEnableThreadTracking + * Signature: (JZ)V */ -void Java_org_rocksdb_DBOptions_setDelayedWriteRate( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jdelayed_write_rate) { +void Java_org_rocksdb_DBOptions_setEnableThreadTracking( + JNIEnv*, jobject, jlong jhandle, jboolean jenable_thread_tracking) { auto* opt = reinterpret_cast(jhandle); - opt->delayed_write_rate = static_cast(jdelayed_write_rate); + opt->enable_thread_tracking = jenable_thread_tracking == JNI_TRUE; } /* * Class: org_rocksdb_DBOptions - * Method: delayedWriteRate - * Signature: (J)J + * Method: enableThreadTracking + * Signature: (J)Z */ -jlong Java_org_rocksdb_DBOptions_delayedWriteRate( - JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_enableThreadTracking( + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); - return static_cast(opt->delayed_write_rate); + return static_cast(opt->enable_thread_tracking); } /* @@ -5347,9 +5745,9 @@ jlong Java_org_rocksdb_DBOptions_delayedWriteRate( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllowConcurrentMemtableWrite( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow) { - reinterpret_cast(jhandle)-> - allow_concurrent_memtable_write = static_cast(allow); + JNIEnv*, jobject, jlong jhandle, jboolean allow) { + reinterpret_cast(jhandle) + ->allow_concurrent_memtable_write = static_cast(allow); } /* @@ -5358,9 +5756,9 @@ void Java_org_rocksdb_DBOptions_setAllowConcurrentMemtableWrite( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_allowConcurrentMemtableWrite( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - allow_concurrent_memtable_write; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->allow_concurrent_memtable_write; } /* @@ -5369,9 +5767,9 @@ jboolean Java_org_rocksdb_DBOptions_allowConcurrentMemtableWrite( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setEnableWriteThreadAdaptiveYield( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean yield) { - reinterpret_cast(jhandle)-> - enable_write_thread_adaptive_yield = static_cast(yield); + JNIEnv*, jobject, jlong jhandle, jboolean yield) { + reinterpret_cast(jhandle) + ->enable_write_thread_adaptive_yield = static_cast(yield); } /* @@ -5380,9 +5778,9 @@ void 
Java_org_rocksdb_DBOptions_setEnableWriteThreadAdaptiveYield( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - enable_write_thread_adaptive_yield; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->enable_write_thread_adaptive_yield; } /* @@ -5391,9 +5789,9 @@ jboolean Java_org_rocksdb_DBOptions_enableWriteThreadAdaptiveYield( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle, jlong max) { - reinterpret_cast(jhandle)-> - write_thread_max_yield_usec = static_cast(max); + JNIEnv*, jobject, jlong jhandle, jlong max) { + reinterpret_cast(jhandle)->write_thread_max_yield_usec = + static_cast(max); } /* @@ -5402,9 +5800,9 @@ void Java_org_rocksdb_DBOptions_setWriteThreadMaxYieldUsec( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - write_thread_max_yield_usec; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->write_thread_max_yield_usec; } /* @@ -5413,9 +5811,9 @@ jlong Java_org_rocksdb_DBOptions_writeThreadMaxYieldUsec( * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle, jlong slow) { - reinterpret_cast(jhandle)-> - write_thread_slow_yield_usec = static_cast(slow); + JNIEnv*, jobject, jlong jhandle, jlong slow) { + reinterpret_cast(jhandle)->write_thread_slow_yield_usec = + static_cast(slow); } /* @@ -5424,9 +5822,9 @@ void Java_org_rocksdb_DBOptions_setWriteThreadSlowYieldUsec( * Signature: (J)J */ jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)-> - write_thread_slow_yield_usec; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->write_thread_slow_yield_usec; } /* @@ -5435,8 +5833,7 @@ jlong Java_org_rocksdb_DBOptions_writeThreadSlowYieldUsec( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jskip_stats_update_on_db_open) { + JNIEnv*, jobject, jlong jhandle, jboolean jskip_stats_update_on_db_open) { auto* opt = reinterpret_cast(jhandle); opt->skip_stats_update_on_db_open = static_cast(jskip_stats_update_on_db_open); @@ -5448,7 +5845,7 @@ void Java_org_rocksdb_DBOptions_setSkipStatsUpdateOnDbOpen( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->skip_stats_update_on_db_open); } @@ -5459,11 +5856,10 @@ jboolean Java_org_rocksdb_DBOptions_skipStatsUpdateOnDbOpen( * Signature: (JB)V */ void Java_org_rocksdb_DBOptions_setWalRecoveryMode( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jwal_recovery_mode_value) { + JNIEnv*, jobject, jlong jhandle, jbyte jwal_recovery_mode_value) { auto* opt = reinterpret_cast(jhandle); - opt->wal_recovery_mode = - rocksdb::WALRecoveryModeJni::toCppWALRecoveryMode( - jwal_recovery_mode_value); + opt->wal_recovery_mode = rocksdb::WALRecoveryModeJni::toCppWALRecoveryMode( + jwal_recovery_mode_value); } /* @@ -5472,7 +5868,7 @@ void Java_org_rocksdb_DBOptions_setWalRecoveryMode( * Signature: (J)B */ jbyte Java_org_rocksdb_DBOptions_walRecoveryMode( - JNIEnv* 
env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return rocksdb::WALRecoveryModeJni::toJavaWALRecoveryMode( opt->wal_recovery_mode); @@ -5484,7 +5880,7 @@ jbyte Java_org_rocksdb_DBOptions_walRecoveryMode( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAllow2pc( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jallow_2pc) { + JNIEnv*, jobject, jlong jhandle, jboolean jallow_2pc) { auto* opt = reinterpret_cast(jhandle); opt->allow_2pc = static_cast(jallow_2pc); } @@ -5494,7 +5890,8 @@ void Java_org_rocksdb_DBOptions_setAllow2pc( * Method: allow2pc * Signature: (J)Z */ -jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv* env, jobject jobj, jlong jhandle) { +jboolean Java_org_rocksdb_DBOptions_allow2pc( + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->allow_2pc); } @@ -5505,20 +5902,33 @@ jboolean Java_org_rocksdb_DBOptions_allow2pc(JNIEnv* env, jobject jobj, jlong jh * Signature: (JJ)V */ void Java_org_rocksdb_DBOptions_setRowCache( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jrow_cache_handle) { + JNIEnv*, jobject, jlong jhandle, jlong jrow_cache_handle) { auto* opt = reinterpret_cast(jhandle); - auto* row_cache = reinterpret_cast*>(jrow_cache_handle); + auto* row_cache = + reinterpret_cast*>(jrow_cache_handle); opt->row_cache = *row_cache; } +/* + * Class: org_rocksdb_DBOptions + * Method: setWalFilter + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWalFilter( + JNIEnv*, jobject, jlong jhandle, jlong jwal_filter_handle) { + auto* opt = reinterpret_cast(jhandle); + auto* wal_filter = + reinterpret_cast(jwal_filter_handle); + opt->wal_filter = wal_filter; +} + /* * Class: org_rocksdb_DBOptions * Method: setFailIfOptionsFileError * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jfail_if_options_file_error) { + JNIEnv*, jobject, jlong jhandle, jboolean jfail_if_options_file_error) { auto* opt = reinterpret_cast(jhandle); opt->fail_if_options_file_error = static_cast(jfail_if_options_file_error); @@ -5530,7 +5940,7 @@ void Java_org_rocksdb_DBOptions_setFailIfOptionsFileError( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->fail_if_options_file_error); } @@ -5541,7 +5951,7 @@ jboolean Java_org_rocksdb_DBOptions_failIfOptionsFileError( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setDumpMallocStats( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jdump_malloc_stats) { + JNIEnv*, jobject, jlong jhandle, jboolean jdump_malloc_stats) { auto* opt = reinterpret_cast(jhandle); opt->dump_malloc_stats = static_cast(jdump_malloc_stats); } @@ -5552,7 +5962,7 @@ void Java_org_rocksdb_DBOptions_setDumpMallocStats( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_dumpMallocStats( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->dump_malloc_stats); } @@ -5563,10 +5973,10 @@ jboolean Java_org_rocksdb_DBOptions_dumpMallocStats( * Signature: (JZ)V */ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean javoid_flush_during_recovery) { + JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_recovery) { auto* opt = 
reinterpret_cast(jhandle); - opt->avoid_flush_during_recovery = static_cast(javoid_flush_during_recovery); + opt->avoid_flush_during_recovery = + static_cast(javoid_flush_during_recovery); } /* @@ -5575,21 +5985,131 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringRecovery( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringRecovery( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_recovery); } +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowIngestBehind + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowIngestBehind( + JNIEnv*, jobject, jlong jhandle, jboolean jallow_ingest_behind) { + auto* opt = reinterpret_cast(jhandle); + opt->allow_ingest_behind = jallow_ingest_behind == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowIngestBehind + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowIngestBehind( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->allow_ingest_behind); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setPreserveDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setPreserveDeletes( + JNIEnv*, jobject, jlong jhandle, jboolean jpreserve_deletes) { + auto* opt = reinterpret_cast(jhandle); + opt->preserve_deletes = jpreserve_deletes == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: preserveDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_preserveDeletes( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->preserve_deletes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setTwoWriteQueues + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setTwoWriteQueues( + JNIEnv*, jobject, jlong jhandle, jboolean jtwo_write_queues) { + auto* opt = reinterpret_cast(jhandle); + opt->two_write_queues = jtwo_write_queues == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: twoWriteQueues + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_twoWriteQueues( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->two_write_queues); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setManualWalFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setManualWalFlush( + JNIEnv*, jobject, jlong jhandle, jboolean jmanual_wal_flush) { + auto* opt = reinterpret_cast(jhandle); + opt->manual_wal_flush = jmanual_wal_flush == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: manualWalFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_manualWalFlush( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->manual_wal_flush); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAtomicFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAtomicFlush( + JNIEnv*, jobject, jlong jhandle, jboolean jatomic_flush) { + auto* opt = reinterpret_cast(jhandle); + opt->atomic_flush = jatomic_flush == JNI_TRUE; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: atomicFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_atomicFlush( + JNIEnv *, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->atomic_flush); +} + /* * Class: org_rocksdb_DBOptions * Method: setAvoidFlushDuringShutdown * Signature: (JZ)V */ 
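/*
 * All of the DBOptions accessors above follow one pattern: the Java object
 * keeps the address of a heap-allocated C++ options struct in a long field,
 * and every native call reinterpret_casts that handle back to a pointer.
 * Below is a minimal stand-alone sketch of that pattern; the names are
 * illustrative stand-ins, not anything from this patch.
 */
#include <cassert>
#include <cstdint>

struct FakeDBOptions {        // stand-in for rocksdb::DBOptions
  bool atomic_flush = false;
};

int64_t newOptionsHandle() {  // Java would store this value in a long field
  return reinterpret_cast<int64_t>(new FakeDBOptions());
}

void setAtomicFlush(int64_t jhandle, bool v) {
  reinterpret_cast<FakeDBOptions*>(jhandle)->atomic_flush = v;
}

bool atomicFlush(int64_t jhandle) {
  return reinterpret_cast<FakeDBOptions*>(jhandle)->atomic_flush;
}

void disposeInternal(int64_t jhandle) {
  delete reinterpret_cast<FakeDBOptions*>(jhandle);  // Java must call this exactly once
}

int main() {
  int64_t h = newOptionsHandle();
  setAtomicFlush(h, true);
  assert(atomicFlush(h));
  disposeInternal(h);
  return 0;
}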
void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean javoid_flush_during_shutdown) { + JNIEnv*, jobject, jlong jhandle, jboolean javoid_flush_during_shutdown) { auto* opt = reinterpret_cast(jhandle); - opt->avoid_flush_during_shutdown = static_cast(javoid_flush_during_shutdown); + opt->avoid_flush_during_shutdown = + static_cast(javoid_flush_during_shutdown); } /* @@ -5598,7 +6118,7 @@ void Java_org_rocksdb_DBOptions_setAvoidFlushDuringShutdown( * Signature: (J)Z */ jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->avoid_flush_during_shutdown); } @@ -5612,18 +6132,30 @@ jboolean Java_org_rocksdb_DBOptions_avoidFlushDuringShutdown( * Signature: ()J */ jlong Java_org_rocksdb_WriteOptions_newWriteOptions( - JNIEnv* env, jclass jcls) { + JNIEnv*, jclass) { auto* op = new rocksdb::WriteOptions(); return reinterpret_cast(op); } +/* + * Class: org_rocksdb_WriteOptions + * Method: copyWriteOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_WriteOptions_copyWriteOptions( + JNIEnv*, jclass, jlong jhandle) { + auto new_opt = new rocksdb::WriteOptions( + *(reinterpret_cast(jhandle))); + return reinterpret_cast(new_opt); +} + /* * Class: org_rocksdb_WriteOptions * Method: disposeInternal * Signature: ()V */ void Java_org_rocksdb_WriteOptions_disposeInternal( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* write_options = reinterpret_cast(jhandle); assert(write_options != nullptr); delete write_options; @@ -5635,7 +6167,7 @@ void Java_org_rocksdb_WriteOptions_disposeInternal( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setSync( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + JNIEnv*, jobject, jlong jhandle, jboolean jflag) { reinterpret_cast(jhandle)->sync = jflag; } @@ -5645,7 +6177,7 @@ void Java_org_rocksdb_WriteOptions_setSync( * Signature: (J)Z */ jboolean Java_org_rocksdb_WriteOptions_sync( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->sync; } @@ -5655,7 +6187,7 @@ jboolean Java_org_rocksdb_WriteOptions_sync( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setDisableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + JNIEnv*, jobject, jlong jhandle, jboolean jflag) { reinterpret_cast(jhandle)->disableWAL = jflag; } @@ -5665,7 +6197,7 @@ void Java_org_rocksdb_WriteOptions_setDisableWAL( * Signature: (J)Z */ jboolean Java_org_rocksdb_WriteOptions_disableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->disableWAL; } @@ -5675,11 +6207,11 @@ jboolean Java_org_rocksdb_WriteOptions_disableWAL( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies( - JNIEnv* env, jobject jwrite_options, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean jignore_missing_column_families) { - reinterpret_cast(jhandle)-> - ignore_missing_column_families = - static_cast(jignore_missing_column_families); + reinterpret_cast(jhandle) + ->ignore_missing_column_families = + static_cast(jignore_missing_column_families); } /* @@ -5688,9 +6220,9 @@ void Java_org_rocksdb_WriteOptions_setIgnoreMissingColumnFamilies( * Signature: (J)Z */ jboolean 
Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - return reinterpret_cast(jhandle)-> - ignore_missing_column_families; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle) + ->ignore_missing_column_families; } /* @@ -5699,7 +6231,7 @@ jboolean Java_org_rocksdb_WriteOptions_ignoreMissingColumnFamilies( * Signature: (JZ)V */ void Java_org_rocksdb_WriteOptions_setNoSlowdown( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jno_slowdown) { + JNIEnv*, jobject, jlong jhandle, jboolean jno_slowdown) { reinterpret_cast(jhandle)->no_slowdown = static_cast(jno_slowdown); } @@ -5710,10 +6242,31 @@ void Java_org_rocksdb_WriteOptions_setNoSlowdown( * Signature: (J)Z */ jboolean Java_org_rocksdb_WriteOptions_noSlowdown( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->no_slowdown; } +/* + * Class: org_rocksdb_WriteOptions + * Method: setLowPri + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setLowPri( + JNIEnv*, jobject, jlong jhandle, jboolean jlow_pri) { + reinterpret_cast(jhandle)->low_pri = + static_cast(jlow_pri); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: lowPri + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_lowPri( + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle)->low_pri; +} + ///////////////////////////////////////////////////////////////////// // rocksdb::ReadOptions @@ -5722,19 +6275,44 @@ jboolean Java_org_rocksdb_WriteOptions_noSlowdown( * Method: newReadOptions * Signature: ()J */ -jlong Java_org_rocksdb_ReadOptions_newReadOptions( - JNIEnv* env, jclass jcls) { +jlong Java_org_rocksdb_ReadOptions_newReadOptions__( + JNIEnv*, jclass) { auto* read_options = new rocksdb::ReadOptions(); return reinterpret_cast(read_options); } +/* + * Class: org_rocksdb_ReadOptions + * Method: newReadOptions + * Signature: (ZZ)J + */ +jlong Java_org_rocksdb_ReadOptions_newReadOptions__ZZ( + JNIEnv*, jclass, jboolean jverify_checksums, jboolean jfill_cache) { + auto* read_options = + new rocksdb::ReadOptions(static_cast(jverify_checksums), + static_cast(jfill_cache)); + return reinterpret_cast(read_options); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: copyReadOptions + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_copyReadOptions( + JNIEnv*, jclass, jlong jhandle) { + auto new_opt = new rocksdb::ReadOptions( + *(reinterpret_cast(jhandle))); + return reinterpret_cast(new_opt); +} + /* * Class: org_rocksdb_ReadOptions * Method: disposeInternal * Signature: (J)V */ void Java_org_rocksdb_ReadOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* read_options = reinterpret_cast(jhandle); assert(read_options != nullptr); delete read_options; @@ -5746,8 +6324,7 @@ void Java_org_rocksdb_ReadOptions_disposeInternal( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setVerifyChecksums( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jverify_checksums) { + JNIEnv*, jobject, jlong jhandle, jboolean jverify_checksums) { reinterpret_cast(jhandle)->verify_checksums = static_cast(jverify_checksums); } @@ -5758,9 +6335,8 @@ void Java_org_rocksdb_ReadOptions_setVerifyChecksums( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->verify_checksums; + JNIEnv*, jobject, jlong jhandle) { + 
return reinterpret_cast(jhandle)->verify_checksums; } /* @@ -5769,7 +6345,7 @@ jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setFillCache( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) { + JNIEnv*, jobject, jlong jhandle, jboolean jfill_cache) { reinterpret_cast(jhandle)->fill_cache = static_cast(jfill_cache); } @@ -5780,7 +6356,7 @@ void Java_org_rocksdb_ReadOptions_setFillCache( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_fillCache( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->fill_cache; } @@ -5790,7 +6366,7 @@ jboolean Java_org_rocksdb_ReadOptions_fillCache( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setTailing( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtailing) { + JNIEnv*, jobject, jlong jhandle, jboolean jtailing) { reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } @@ -5801,7 +6377,7 @@ void Java_org_rocksdb_ReadOptions_setTailing( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_tailing( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->tailing; } @@ -5811,7 +6387,7 @@ jboolean Java_org_rocksdb_ReadOptions_tailing( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_managed( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->managed; } @@ -5821,7 +6397,7 @@ jboolean Java_org_rocksdb_ReadOptions_managed( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setManaged( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jmanaged) { + JNIEnv*, jobject, jlong jhandle, jboolean jmanaged) { reinterpret_cast(jhandle)->managed = static_cast(jmanaged); } @@ -5832,7 +6408,7 @@ void Java_org_rocksdb_ReadOptions_setManaged( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->total_order_seek; } @@ -5842,7 +6418,7 @@ jboolean Java_org_rocksdb_ReadOptions_totalOrderSeek( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jtotal_order_seek) { + JNIEnv*, jobject, jlong jhandle, jboolean jtotal_order_seek) { reinterpret_cast(jhandle)->total_order_seek = static_cast(jtotal_order_seek); } @@ -5853,7 +6429,7 @@ void Java_org_rocksdb_ReadOptions_setTotalOrderSeek( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->prefix_same_as_start; } @@ -5863,7 +6439,7 @@ jboolean Java_org_rocksdb_ReadOptions_prefixSameAsStart( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jprefix_same_as_start) { + JNIEnv*, jobject, jlong jhandle, jboolean jprefix_same_as_start) { reinterpret_cast(jhandle)->prefix_same_as_start = static_cast(jprefix_same_as_start); } @@ -5874,7 +6450,7 @@ void Java_org_rocksdb_ReadOptions_setPrefixSameAsStart( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_pinData( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle)->pin_data; } @@ -5884,7 +6460,7 @@ jboolean Java_org_rocksdb_ReadOptions_pinData( * Signature: (JZ)V */ void 
Java_org_rocksdb_ReadOptions_setPinData( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jpin_data) { + JNIEnv*, jobject, jlong jhandle, jboolean jpin_data) { reinterpret_cast(jhandle)->pin_data = static_cast(jpin_data); } @@ -5895,7 +6471,7 @@ void Java_org_rocksdb_ReadOptions_setPinData( * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_backgroundPurgeOnIteratorCleanup( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->background_purge_on_iterator_cleanup); } @@ -5906,7 +6482,7 @@ jboolean Java_org_rocksdb_ReadOptions_backgroundPurgeOnIteratorCleanup( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup( - JNIEnv* env, jobject jobj, jlong jhandle, + JNIEnv*, jobject, jlong jhandle, jboolean jbackground_purge_on_iterator_cleanup) { auto* opt = reinterpret_cast(jhandle); opt->background_purge_on_iterator_cleanup = @@ -5919,7 +6495,7 @@ void Java_org_rocksdb_ReadOptions_setBackgroundPurgeOnIteratorCleanup( * Signature: (J)J */ jlong Java_org_rocksdb_ReadOptions_readaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->readahead_size); } @@ -5930,18 +6506,41 @@ jlong Java_org_rocksdb_ReadOptions_readaheadSize( * Signature: (JJ)V */ void Java_org_rocksdb_ReadOptions_setReadaheadSize( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jreadahead_size) { + JNIEnv*, jobject, jlong jhandle, jlong jreadahead_size) { auto* opt = reinterpret_cast(jhandle); opt->readahead_size = static_cast(jreadahead_size); } +/* + * Class: org_rocksdb_ReadOptions + * Method: maxSkippableInternalKeys + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_maxSkippableInternalKeys( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->max_skippable_internal_keys); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setMaxSkippableInternalKeys + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setMaxSkippableInternalKeys( + JNIEnv*, jobject, jlong jhandle, jlong jmax_skippable_internal_keys) { + auto* opt = reinterpret_cast(jhandle); + opt->max_skippable_internal_keys = + static_cast(jmax_skippable_internal_keys); +} + /* * Class: org_rocksdb_ReadOptions * Method: ignoreRangeDeletions * Signature: (J)Z */ jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* opt = reinterpret_cast(jhandle); return static_cast(opt->ignore_range_deletions); } @@ -5952,8 +6551,7 @@ jboolean Java_org_rocksdb_ReadOptions_ignoreRangeDeletions( * Signature: (JZ)V */ void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions( - JNIEnv* env, jobject jobj, jlong jhandle, - jboolean jignore_range_deletions) { + JNIEnv*, jobject, jlong jhandle, jboolean jignore_range_deletions) { auto* opt = reinterpret_cast(jhandle); opt->ignore_range_deletions = static_cast(jignore_range_deletions); } @@ -5964,7 +6562,7 @@ void Java_org_rocksdb_ReadOptions_setIgnoreRangeDeletions( * Signature: (JJ)V */ void Java_org_rocksdb_ReadOptions_setSnapshot( - JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) { + JNIEnv*, jobject, jlong jhandle, jlong jsnapshot) { reinterpret_cast(jhandle)->snapshot = reinterpret_cast(jsnapshot); } @@ -5975,9 +6573,8 @@ void Java_org_rocksdb_ReadOptions_setSnapshot( * Signature: (J)J */ jlong 
Java_org_rocksdb_ReadOptions_snapshot( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto& snapshot = - reinterpret_cast(jhandle)->snapshot; + JNIEnv*, jobject, jlong jhandle) { + auto& snapshot = reinterpret_cast(jhandle)->snapshot; return reinterpret_cast(snapshot); } @@ -5987,7 +6584,7 @@ jlong Java_org_rocksdb_ReadOptions_snapshot( * Signature: (J)B */ jbyte Java_org_rocksdb_ReadOptions_readTier( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return static_cast( reinterpret_cast(jhandle)->read_tier); } @@ -5998,11 +6595,92 @@ jbyte Java_org_rocksdb_ReadOptions_readTier( * Signature: (JB)V */ void Java_org_rocksdb_ReadOptions_setReadTier( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jread_tier) { + JNIEnv*, jobject, jlong jhandle, jbyte jread_tier) { reinterpret_cast(jhandle)->read_tier = static_cast(jread_tier); } +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterateUpperBound + * Signature: (JJ)I + */ +void Java_org_rocksdb_ReadOptions_setIterateUpperBound( + JNIEnv*, jobject, jlong jhandle, jlong jupper_bound_slice_handle) { + reinterpret_cast(jhandle)->iterate_upper_bound = + reinterpret_cast(jupper_bound_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterateUpperBound + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterateUpperBound( + JNIEnv*, jobject, jlong jhandle) { + auto& upper_bound_slice_handle = + reinterpret_cast(jhandle)->iterate_upper_bound; + return reinterpret_cast(upper_bound_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterateLowerBound + * Signature: (JJ)I + */ +void Java_org_rocksdb_ReadOptions_setIterateLowerBound( + JNIEnv*, jobject, jlong jhandle, jlong jlower_bound_slice_handle) { + reinterpret_cast(jhandle)->iterate_lower_bound = + reinterpret_cast(jlower_bound_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterateLowerBound + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterateLowerBound( + JNIEnv*, jobject, jlong jhandle) { + auto& lower_bound_slice_handle = + reinterpret_cast(jhandle)->iterate_lower_bound; + return reinterpret_cast(lower_bound_slice_handle); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setTableFilter + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setTableFilter( + JNIEnv*, jobject, jlong jhandle, jlong jjni_table_filter_handle) { + auto* opt = reinterpret_cast(jhandle); + auto* jni_table_filter = + reinterpret_cast(jjni_table_filter_handle); + opt->table_filter = jni_table_filter->GetTableFilterFunction(); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setIterStartSeqnum + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setIterStartSeqnum( + JNIEnv*, jobject, jlong jhandle, jlong jiter_start_seqnum) { + auto* opt = reinterpret_cast(jhandle); + opt->iter_start_seqnum = static_cast(jiter_start_seqnum); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: iterStartSeqnum + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_iterStartSeqnum( + JNIEnv*, jobject, jlong jhandle) { + auto* opt = reinterpret_cast(jhandle); + return static_cast(opt->iter_start_seqnum); +} + ///////////////////////////////////////////////////////////////////// // rocksdb::ComparatorOptions @@ -6012,7 +6690,7 @@ void Java_org_rocksdb_ReadOptions_setReadTier( * Signature: ()J */ jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions( - JNIEnv* env, jclass jcls) { + JNIEnv*, jclass) { auto* comparator_opt = new rocksdb::ComparatorJniCallbackOptions(); 
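/*
 * Note on setIterateUpperBound/setIterateLowerBound above: they store the raw
 * rocksdb::Slice* held by the Java Slice object, so nothing is copied and the
 * Java Slice must stay reachable for as long as the ReadOptions (and any
 * iterator created from it) is in use. A hedged sketch of that ownership rule
 * in plain C++, with stand-in types rather than the real rocksdb ones:
 *
 *   #include <string>
 *
 *   struct Slice { const char* data; size_t size; };
 *   struct ReadOptions { const Slice* iterate_upper_bound = nullptr; };
 *
 *   int main() {
 *     std::string key = "prefix";
 *     Slice upper{key.data(), key.size()};
 *     ReadOptions ro;
 *     ro.iterate_upper_bound = &upper;  // stored by pointer, as in the JNI setter
 *     // `key` and `upper` must outlive every use of `ro`.
 *     return 0;
 *   }
 */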
return reinterpret_cast(comparator_opt); } @@ -6023,9 +6701,9 @@ jlong Java_org_rocksdb_ComparatorOptions_newComparatorOptions( * Signature: (J)Z */ jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( - JNIEnv * env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { return reinterpret_cast(jhandle) - ->use_adaptive_mutex; + ->use_adaptive_mutex; } /* @@ -6034,9 +6712,9 @@ jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( * Signature: (JZ)V */ void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( - JNIEnv * env, jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) { + JNIEnv*, jobject, jlong jhandle, jboolean juse_adaptive_mutex) { reinterpret_cast(jhandle) - ->use_adaptive_mutex = static_cast(juse_adaptive_mutex); + ->use_adaptive_mutex = static_cast(juse_adaptive_mutex); } /* @@ -6045,7 +6723,7 @@ void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( * Signature: (J)V */ void Java_org_rocksdb_ComparatorOptions_disposeInternal( - JNIEnv * env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* comparator_opt = reinterpret_cast(jhandle); assert(comparator_opt != nullptr); @@ -6061,7 +6739,7 @@ void Java_org_rocksdb_ComparatorOptions_disposeInternal( * Signature: ()J */ jlong Java_org_rocksdb_FlushOptions_newFlushOptions( - JNIEnv* env, jclass jcls) { + JNIEnv*, jclass) { auto* flush_opt = new rocksdb::FlushOptions(); return reinterpret_cast(flush_opt); } @@ -6072,9 +6750,9 @@ jlong Java_org_rocksdb_FlushOptions_newFlushOptions( * Signature: (JZ)V */ void Java_org_rocksdb_FlushOptions_setWaitForFlush( - JNIEnv * env, jobject jobj, jlong jhandle, jboolean jwait) { - reinterpret_cast(jhandle) - ->wait = static_cast(jwait); + JNIEnv*, jobject, jlong jhandle, jboolean jwait) { + reinterpret_cast(jhandle)->wait = + static_cast(jwait); } /* @@ -6083,9 +6761,30 @@ void Java_org_rocksdb_FlushOptions_setWaitForFlush( * Signature: (J)Z */ jboolean Java_org_rocksdb_FlushOptions_waitForFlush( - JNIEnv * env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle) - ->wait; + JNIEnv*, jobject, jlong jhandle) { + return reinterpret_cast(jhandle)->wait; +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: setAllowWriteStall + * Signature: (JZ)V + */ +void Java_org_rocksdb_FlushOptions_setAllowWriteStall( + JNIEnv*, jobject, jlong jhandle, jboolean jallow_write_stall) { + auto* flush_options = reinterpret_cast(jhandle); + flush_options->allow_write_stall = jallow_write_stall == JNI_TRUE; +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: allowWriteStall + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_FlushOptions_allowWriteStall( + JNIEnv*, jobject, jlong jhandle) { + auto* flush_options = reinterpret_cast(jhandle); + return static_cast(flush_options->allow_write_stall); } /* @@ -6094,7 +6793,7 @@ jboolean Java_org_rocksdb_FlushOptions_waitForFlush( * Signature: (J)V */ void Java_org_rocksdb_FlushOptions_disposeInternal( - JNIEnv * env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* flush_opt = reinterpret_cast(jhandle); assert(flush_opt != nullptr); delete flush_opt; diff --git a/thirdparty/rocksdb/java/rocksjni/options_util.cc b/thirdparty/rocksdb/java/rocksjni/options_util.cc new file mode 100644 index 0000000000..7dd0078455 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/options_util.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
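/* For context: the file being added here only bridges rocksdb::OptionsUtil to
 * Java. A hedged sketch of the underlying C++ API it wraps; the "/tmp/testdb"
 * path is illustrative, and the call shape follows the LoadLatestOptions call
 * made further down in this file:
 *
 *   #include <vector>
 *   #include "rocksdb/db.h"
 *   #include "rocksdb/env.h"
 *   #include "rocksdb/utilities/options_util.h"
 *
 *   int main() {
 *     rocksdb::DBOptions db_opts;
 *     std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
 *     rocksdb::Status s = rocksdb::LoadLatestOptions(
 *         "/tmp/testdb", rocksdb::Env::Default(), &db_opts, &cf_descs,
 *         false);  // ignore_unknown_options
 *     // The JNI wrapper surfaces !s.ok() as a Java RocksDBException and
 *     // copies cf_descs into the caller-supplied java.util.List.
 *     return s.ok() ? 0 : 1;
 *   }
 */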
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ and enables
+// calling C++ rocksdb::OptionsUtil methods from Java side.
+
+#include <jni.h>
+#include <string>
+
+#include "include/org_rocksdb_OptionsUtil.h"
+
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksjni/portal.h"
+
+void build_column_family_descriptor_list(
+    JNIEnv* env, jobject jcfds,
+    std::vector<rocksdb::ColumnFamilyDescriptor>& cf_descs) {
+  jmethodID add_mid = rocksdb::ListJni::getListAddMethodId(env);
+  if (add_mid == nullptr) {
+    // exception occurred accessing method
+    return;
+  }
+
+  // Column family descriptor
+  for (rocksdb::ColumnFamilyDescriptor& cfd : cf_descs) {
+    // Construct a ColumnFamilyDescriptor java object
+    jobject jcfd = rocksdb::ColumnFamilyDescriptorJni::construct(env, &cfd);
+    if (env->ExceptionCheck()) {
+      // exception occurred constructing object
+      if (jcfd != nullptr) {
+        env->DeleteLocalRef(jcfd);
+      }
+      return;
+    }
+
+    // Add the object to java list.
+    jboolean rs = env->CallBooleanMethod(jcfds, add_mid, jcfd);
+    if (env->ExceptionCheck() || rs == JNI_FALSE) {
+      // exception occurred calling method, or could not add
+      if (jcfd != nullptr) {
+        env->DeleteLocalRef(jcfd);
+      }
+      return;
+    }
+  }
+}
+
+/*
+ * Class: org_rocksdb_OptionsUtil
+ * Method: loadLatestOptions
+ * Signature: (Ljava/lang/String;JLjava/util/List;Z)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadLatestOptions(
+    JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle,
+    jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) {
+  jboolean has_exception = JNI_FALSE;
+  auto db_path = rocksdb::JniUtil::copyStdString(env, jdbpath, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
+  rocksdb::Status s = rocksdb::LoadLatestOptions(
+      db_path, reinterpret_cast<rocksdb::Env*>(jenv_handle),
+      reinterpret_cast<rocksdb::DBOptions*>(jdb_opts_handle), &cf_descs,
+      ignore_unknown_options);
+  if (!s.ok()) {
+    // error, raise an exception
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class: org_rocksdb_OptionsUtil
+ * Method: loadOptionsFromFile
+ * Signature: (Ljava/lang/String;JJLjava/util/List;Z)V
+ */
+void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile(
+    JNIEnv* env, jclass /*jcls*/, jstring jopts_file_name, jlong jenv_handle,
+    jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) {
+  jboolean has_exception = JNI_FALSE;
+  auto opts_file_name = rocksdb::JniUtil::copyStdString(env, jopts_file_name, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
+  rocksdb::Status s = rocksdb::LoadOptionsFromFile(
+      opts_file_name, reinterpret_cast<rocksdb::Env*>(jenv_handle),
+      reinterpret_cast<rocksdb::DBOptions*>(jdb_opts_handle), &cf_descs,
+      ignore_unknown_options);
+  if (!s.ok()) {
+    // error, raise an exception
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  } else {
+    build_column_family_descriptor_list(env, jcfds, cf_descs);
+  }
+}
+
+/*
+ * Class: org_rocksdb_OptionsUtil
+ * Method: getLatestOptionsFileName
+ * Signature: (Ljava/lang/String;J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_OptionsUtil_getLatestOptionsFileName(
+    JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle) {
+  jboolean has_exception = JNI_FALSE;
+  auto
db_path = rocksdb::JniUtil::copyStdString(env, jdbpath, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return nullptr; + } + std::string options_file_name; + rocksdb::Status s = rocksdb::GetLatestOptionsFileName( + db_path, reinterpret_cast(jenv_handle), + &options_file_name); + if (!s.ok()) { + // error, raise an exception + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } else { + return env->NewStringUTF(options_file_name.c_str()); + } +} diff --git a/thirdparty/rocksdb/java/rocksjni/persistent_cache.cc b/thirdparty/rocksdb/java/rocksjni/persistent_cache.cc new file mode 100644 index 0000000000..2b6fc60ba2 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/persistent_cache.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::PersistentCache. + +#include +#include + +#include "include/org_rocksdb_PersistentCache.h" +#include "rocksdb/persistent_cache.h" +#include "loggerjnicallback.h" +#include "portal.h" + +/* + * Class: org_rocksdb_PersistentCache + * Method: newPersistentCache + * Signature: (JLjava/lang/String;JJZ)J + */ +jlong Java_org_rocksdb_PersistentCache_newPersistentCache( + JNIEnv* env, jclass, jlong jenv_handle, jstring jpath, + jlong jsz, jlong jlogger_handle, jboolean joptimized_for_nvm) { + auto* rocks_env = reinterpret_cast(jenv_handle); + jboolean has_exception = JNI_FALSE; + std::string path = rocksdb::JniUtil::copyStdString(env, jpath, &has_exception); + if (has_exception == JNI_TRUE) { + return 0; + } + auto* logger = + reinterpret_cast*>(jlogger_handle); + auto* cache = new std::shared_ptr(nullptr); + rocksdb::Status s = rocksdb::NewPersistentCache( + rocks_env, path, static_cast(jsz), *logger, + static_cast(joptimized_for_nvm), cache); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } + return reinterpret_cast(cache); +} + +/* + * Class: org_rocksdb_PersistentCache + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_PersistentCache_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* cache = + reinterpret_cast*>(jhandle); + delete cache; // delete std::shared_ptr +} diff --git a/thirdparty/rocksdb/java/rocksjni/portal.h b/thirdparty/rocksdb/java/rocksjni/portal.h index ed671ce6e9..70e67653ec 100644 --- a/thirdparty/rocksdb/java/rocksjni/portal.h +++ b/thirdparty/rocksdb/java/rocksjni/portal.h @@ -10,20 +10,34 @@ #ifndef JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_ -#include +#include +#include #include #include +#include +#include #include +#include #include +#include #include #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/rate_limiter.h" #include "rocksdb/status.h" +#include "rocksdb/table.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksdb/utilities/memory_util.h" +#include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksjni/compaction_filter_factory_jnicallback.h" #include "rocksjni/comparatorjnicallback.h" #include "rocksjni/loggerjnicallback.h" +#include "rocksjni/table_filter_jnicallback.h" +#include "rocksjni/trace_writer_jnicallback.h" +#include "rocksjni/transaction_notifier_jnicallback.h" +#include "rocksjni/wal_filter_jnicallback.h" 
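/* The PersistentCache bridge above shows the idiom RocksJava uses for
 * shared-ownership objects: the std::shared_ptr itself is heap-allocated and
 * its address is handed to Java, so disposeInternal() only drops Java's
 * reference while other C++ holders keep the object alive. A minimal
 * stand-alone sketch of that idiom (names illustrative):
 *
 *   #include <cstdint>
 *   #include <memory>
 *
 *   struct Cache { };
 *
 *   int64_t newHandle() {                            // returned to Java as jlong
 *     return reinterpret_cast<int64_t>(
 *         new std::shared_ptr<Cache>(std::make_shared<Cache>()));
 *   }
 *
 *   std::shared_ptr<Cache> fromHandle(int64_t h) {   // copies, bumping the refcount
 *     return *reinterpret_cast<std::shared_ptr<Cache>*>(h);
 *   }
 *
 *   void disposeInternal(int64_t h) {
 *     delete reinterpret_cast<std::shared_ptr<Cache>*>(h);  // Cache itself may live on
 *   }
 */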
#include "rocksjni/writebatchhandlerjnicallback.h" // Remove macro on windows @@ -33,15 +47,6 @@ namespace rocksdb { -// Detect if jlong overflows size_t -inline Status check_if_jlong_fits_size_t(const jlong& jvalue) { - Status s = Status::OK(); - if (static_cast(jvalue) > std::numeric_limits::max()) { - s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); - } - return s; -} - class JavaClass { public: /** @@ -150,11 +155,12 @@ template class JavaException : public JavaClass { } }; -// The portal class for org.rocksdb.RocksDB -class RocksDBJni : public RocksDBNativeClass { +// The portal class for java.lang.IllegalArgumentException +class IllegalArgumentExceptionJni : + public JavaException { public: /** - * Get the Java Class org.rocksdb.RocksDB + * Get the Java Class java.lang.IllegalArgumentException * * @param env A pointer to the Java environment * @@ -163,7 +169,135 @@ class RocksDBJni : public RocksDBNativeClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB"); + return JavaException::getJClass(env, "java/lang/IllegalArgumentException"); + } + + /** + * Create and throw a Java IllegalArgumentException with the provided status + * + * If s.ok() == true, then this function will not throw any exception. + * + * @param env A pointer to the Java environment + * @param s The status for the exception + * + * @return true if an exception was thrown, false otherwise + */ + static bool ThrowNew(JNIEnv* env, const Status& s) { + assert(!s.ok()); + if (s.ok()) { + return false; + } + + // get the IllegalArgumentException class + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: unexpected exception!" 
<< std::endl;
+      return env->ExceptionCheck();
+    }
+
+    return JavaException::ThrowNew(env, s.ToString());
+  }
+};
+
+// The portal class for org.rocksdb.Status.Code
+class CodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.Code
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$Code");
+  }
+
+  /**
+   * Get the Java Method: Status.Code#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.Status.SubCode
+class SubCodeJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Status.SubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/Status$SubCode");
+  }
+
+  /**
+   * Get the Java Method: Status.SubCode#getValue
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getValueMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getValue", "()B");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  static rocksdb::Status::SubCode toCppSubCode(const jbyte jsub_code) {
+    switch (jsub_code) {
+      case 0x0:
+        return rocksdb::Status::SubCode::kNone;
+      case 0x1:
+        return rocksdb::Status::SubCode::kMutexTimeout;
+      case 0x2:
+        return rocksdb::Status::SubCode::kLockTimeout;
+      case 0x3:
+        return rocksdb::Status::SubCode::kLockLimit;
+      case 0x4:
+        return rocksdb::Status::SubCode::kNoSpace;
+      case 0x5:
+        return rocksdb::Status::SubCode::kDeadlock;
+      case 0x6:
+        return rocksdb::Status::SubCode::kStaleFile;
+      case 0x7:
+        return rocksdb::Status::SubCode::kMemoryLimit;
+
+      case 0x7F:
+      default:
+        return rocksdb::Status::SubCode::kNone;
+    }
+  }
+};
@@ -183,6 +317,69 @@ class StatusJni : public RocksDBNativeClass {
     return RocksDBNativeClass::getJClass(env, "org/rocksdb/Status");
   }
 
+  /**
+   * Get the Java Method: Status#getCode
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCodeMethod(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "getCode", "()Lorg/rocksdb/Status$Code;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Status#getSubCode
+   *
+   * @param env A pointer to the Java environment
+   *
+ * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getSubCodeMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getSubCode", "()Lorg/rocksdb/Status$SubCode;"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: Status#getState + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getStateMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getState", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + /** * Create a new Java org.rocksdb.Status object with the same properties as * the provided C++ rocksdb::Status object @@ -287,12 +484,204 @@ class StatusJni : public RocksDBNativeClass { return 0x2; case rocksdb::Status::SubCode::kLockLimit: return 0x3; - case rocksdb::Status::SubCode::kMaxSubCode: - return 0x7E; + case rocksdb::Status::SubCode::kNoSpace: + return 0x4; + case rocksdb::Status::SubCode::kDeadlock: + return 0x5; + case rocksdb::Status::SubCode::kStaleFile: + return 0x6; + case rocksdb::Status::SubCode::kMemoryLimit: + return 0x7; default: return 0x7F; // undefined } } + + static std::unique_ptr toCppStatus( + const jbyte jcode_value, const jbyte jsub_code_value) { + std::unique_ptr status; + switch (jcode_value) { + case 0x0: + //Ok + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::OK())); + break; + case 0x1: + //NotFound + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::NotFound( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x2: + //Corruption + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::Corruption( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x3: + //NotSupported + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::NotSupported( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x4: + //InvalidArgument + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::InvalidArgument( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x5: + //IOError + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::IOError( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x6: + //MergeInProgress + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::MergeInProgress( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x7: + //Incomplete + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::Incomplete( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x8: + //ShutdownInProgress + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::ShutdownInProgress( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x9: + //TimedOut + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::TimedOut( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0xA: + //Aborted + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::Aborted( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0xB: + //Busy + status = 
std::unique_ptr( + new rocksdb::Status(rocksdb::Status::Busy( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0xC: + //Expired + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::Expired( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0xD: + //TryAgain + status = std::unique_ptr( + new rocksdb::Status(rocksdb::Status::TryAgain( + rocksdb::SubCodeJni::toCppSubCode(jsub_code_value)))); + break; + case 0x7F: + default: + return nullptr; + } + return status; + } + + // Returns the equivalent rocksdb::Status for the Java org.rocksdb.Status + static std::unique_ptr toCppStatus(JNIEnv* env, const jobject jstatus) { + jmethodID mid_code = getCodeMethod(env); + if (mid_code == nullptr) { + // exception occurred + return nullptr; + } + jobject jcode = env->CallObjectMethod(jstatus, mid_code); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + jmethodID mid_code_value = rocksdb::CodeJni::getValueMethod(env); + if (mid_code_value == nullptr) { + // exception occurred + return nullptr; + } + jbyte jcode_value = env->CallByteMethod(jcode, mid_code_value); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + jmethodID mid_subCode = getSubCodeMethod(env); + if (mid_subCode == nullptr) { + // exception occurred + return nullptr; + } + jobject jsubCode = env->CallObjectMethod(jstatus, mid_subCode); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + jbyte jsub_code_value = 0x0; // None + if (jsubCode != nullptr) { + jmethodID mid_subCode_value = rocksdb::SubCodeJni::getValueMethod(env); + if (mid_subCode_value == nullptr) { + // exception occurred + return nullptr; + } + jsub_code_value = env->CallByteMethod(jsubCode, mid_subCode_value); + if (env->ExceptionCheck()) { + // exception occurred + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + } + + jmethodID mid_state = getStateMethod(env); + if (mid_state == nullptr) { + // exception occurred + return nullptr; + } + jobject jstate = env->CallObjectMethod(jstatus, mid_state); + if (env->ExceptionCheck()) { + // exception occurred + if (jsubCode != nullptr) { + env->DeleteLocalRef(jsubCode); + } + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + return nullptr; + } + + std::unique_ptr status = + toCppStatus(jcode_value, jsub_code_value); + + // delete all local refs + if (jstate != nullptr) { + env->DeleteLocalRef(jstate); + } + if (jsubCode != nullptr) { + env->DeleteLocalRef(jsubCode); + } + if (jcode != nullptr) { + env->DeleteLocalRef(jcode); + } + + return status; + } }; // The portal class for org.rocksdb.RocksDBException @@ -324,6 +713,20 @@ class RocksDBExceptionJni : return JavaException::ThrowNew(env, msg); } + /** + * Create and throw a Java RocksDBException with the provided status + * + * If s->ok() == true, then this function will not throw any exception. 
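/* The toCppStatus() overloads added in this hunk undo the byte encoding that
 * the Java-facing toJavaCode()/toJavaSubCode() switches earlier in this file
 * produce, so a Status can round-trip Java -> C++ losslessly. A stand-alone
 * sketch of that round-trip for the sub-code byte; the values are taken from
 * the switch statements in this hunk, and the enum is abbreviated:
 *
 *   #include <cassert>
 *
 *   enum class SubCode : signed char {
 *     kNone = 0x0, kMutexTimeout = 0x1, kNoSpace = 0x4
 *   };
 *
 *   signed char toJava(SubCode s) { return static_cast<signed char>(s); }
 *
 *   SubCode toCpp(signed char j) {
 *     switch (j) {
 *       case 0x0: return SubCode::kNone;
 *       case 0x1: return SubCode::kMutexTimeout;
 *       case 0x4: return SubCode::kNoSpace;
 *       default:  return SubCode::kNone;  // 0x7F and unknown bytes map to kNone
 *     }
 *   }
 *
 *   int main() {
 *     for (SubCode s : {SubCode::kNone, SubCode::kMutexTimeout, SubCode::kNoSpace}) {
 *       assert(toCpp(toJava(s)) == s);  // the byte mapping round-trips
 *     }
 *   }
 */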
+ * + * @param env A pointer to the Java environment + * @param s The status for the exception + * + * @return true if an exception was thrown, false otherwise + */ + static bool ThrowNew(JNIEnv* env, std::unique_ptr& s) { + return rocksdb::RocksDBExceptionJni::ThrowNew(env, *(s.get())); + } + /** * Create and throw a Java RocksDBException with the provided status * @@ -335,7 +738,6 @@ class RocksDBExceptionJni : * @return true if an exception was thrown, false otherwise */ static bool ThrowNew(JNIEnv* env, const Status& s) { - assert(!s.ok()); if (s.ok()) { return false; } @@ -500,60 +902,61 @@ class RocksDBExceptionJni : return true; } -}; -// The portal class for java.lang.IllegalArgumentException -class IllegalArgumentExceptionJni : - public JavaException { - public: /** - * Get the Java Class java.lang.IllegalArgumentException + * Get the Java Method: RocksDBException#getStatus * * @param env A pointer to the Java environment * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved */ - static jclass getJClass(JNIEnv* env) { - return JavaException::getJClass(env, "java/lang/IllegalArgumentException"); + static jmethodID getStatusMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jclazz, "getStatus", "()Lorg/rocksdb/Status;"); + assert(mid != nullptr); + return mid; } - /** - * Create and throw a Java IllegalArgumentException with the provided status - * - * If s.ok() == true, then this function will not throw any exception. - * - * @param env A pointer to the Java environment - * @param s The status for the exception - * - * @return true if an exception was thrown, false otherwise - */ - static bool ThrowNew(JNIEnv* env, const Status& s) { - assert(!s.ok()); - if (s.ok()) { - return false; + static std::unique_ptr toCppStatus( + JNIEnv* env, jthrowable jrocksdb_exception) { + if(!env->IsInstanceOf(jrocksdb_exception, getJClass(env))) { + // not an instance of RocksDBException + return nullptr; } - // get the IllegalArgumentException class - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class - std::cerr << "IllegalArgumentExceptionJni::ThrowNew/class - Error: unexpected exception!" 
<< std::endl; - return env->ExceptionCheck(); + // get the java status object + jmethodID mid = getStatusMethod(env); + if(mid == nullptr) { + // exception occurred accessing class or method + return nullptr; } - return JavaException::ThrowNew(env, s.ToString()); + jobject jstatus = env->CallObjectMethod(jrocksdb_exception, mid); + if(env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + if(jstatus == nullptr) { + return nullptr; // no status available + } + + return rocksdb::StatusJni::toCppStatus(env, jstatus); } }; - -// The portal class for org.rocksdb.Options -class OptionsJni : public RocksDBNativeClass< - rocksdb::Options*, OptionsJni> { +// The portal class for java.util.List +class ListJni : public JavaClass { public: /** - * Get the Java Class org.rocksdb.Options + * Get the Java Class java.util.List * * @param env A pointer to the Java environment * @@ -561,17 +964,12 @@ class OptionsJni : public RocksDBNativeClass< * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options"); + static jclass getListClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/List"); } -}; -// The portal class for org.rocksdb.DBOptions -class DBOptionsJni : public RocksDBNativeClass< - rocksdb::DBOptions*, DBOptionsJni> { - public: /** - * Get the Java Class org.rocksdb.DBOptions + * Get the Java Class java.util.ArrayList * * @param env A pointer to the Java environment * @@ -579,15 +977,12 @@ class DBOptionsJni : public RocksDBNativeClass< * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions"); + static jclass getArrayListClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/ArrayList"); } -}; -class ColumnFamilyDescriptorJni : public JavaClass { - public: /** - * Get the Java Class org.rocksdb.ColumnFamilyDescriptor + * Get the Java Class java.util.Iterator * * @param env A pointer to the Java environment * @@ -595,115 +990,119 @@ class ColumnFamilyDescriptorJni : public JavaClass { * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyDescriptor"); + static jclass getIteratorClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/Iterator"); } /** - * Get the Java Method: ColumnFamilyDescriptor#columnFamilyName + * Get the Java Method: List#iterator * * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not * be retieved */ - static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + static jmethodID getIteratorMethod(JNIEnv* env) { + jclass jlist_clazz = getListClass(env); + if(jlist_clazz == nullptr) { // exception occurred accessing class return nullptr; } static jmethodID mid = - env->GetMethodID(jclazz, "columnFamilyName", "()[B"); + env->GetMethodID(jlist_clazz, "iterator", "()Ljava/util/Iterator;"); assert(mid != nullptr); return mid; } /** - * Get the Java Method: ColumnFamilyDescriptor#columnFamilyOptions + * Get the Java Method: Iterator#hasNext * * 
@param env A pointer to the Java environment
    *
    * @return The Java Method ID or nullptr if the class or method id could not
    *     be retrieved
    */
-  static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) {
-    jclass jclazz = getJClass(env);
-    if(jclazz == nullptr) {
+  static jmethodID getHasNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if(jiterator_clazz == nullptr) {
       // exception occurred accessing class
       return nullptr;
     }
 
-    static jmethodID mid =
-        env->GetMethodID(jclazz, "columnFamilyOptions",
-            "()Lorg/rocksdb/ColumnFamilyOptions;");
+    static jmethodID mid = env->GetMethodID(jiterator_clazz, "hasNext", "()Z");
     assert(mid != nullptr);
     return mid;
   }
-};
 
-// The portal class for org.rocksdb.ColumnFamilyOptions
-class ColumnFamilyOptionsJni : public RocksDBNativeClass<
-    rocksdb::ColumnFamilyOptions*, ColumnFamilyOptionsJni> {
- public:
   /**
-   * Get the Java Class org.rocksdb.ColumnFamilyOptions
+   * Get the Java Method: Iterator#next
    *
    * @param env A pointer to the Java environment
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
    */
-  static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env,
-        "org/rocksdb/ColumnFamilyOptions");
+  static jmethodID getNextMethod(JNIEnv* env) {
+    jclass jiterator_clazz = getIteratorClass(env);
+    if(jiterator_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jiterator_clazz, "next", "()Ljava/lang/Object;");
+    assert(mid != nullptr);
+    return mid;
   }
-};
 
-// The portal class for org.rocksdb.WriteOptions
-class WriteOptionsJni : public RocksDBNativeClass<
-    rocksdb::WriteOptions*, WriteOptionsJni> {
- public:
   /**
-   * Get the Java Class org.rocksdb.WriteOptions
+   * Get the Java Method: ArrayList constructor
    *
    * @param env A pointer to the Java environment
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
    */
-  static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteOptions");
+  static jmethodID getArrayListConstructorMethodId(JNIEnv* env) {
+    jclass jarray_list_clazz = getArrayListClass(env);
+    if(jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+    static jmethodID mid =
+        env->GetMethodID(jarray_list_clazz, "<init>", "(I)V");
+    assert(mid != nullptr);
+    return mid;
   }
-};
 
-// The portal class for org.rocksdb.ReadOptions
-class ReadOptionsJni : public RocksDBNativeClass<
-    rocksdb::ReadOptions*, ReadOptionsJni> {
- public:
   /**
-   * Get the Java Class org.rocksdb.ReadOptions
+   * Get the Java Method: List#add
    *
    * @param env A pointer to the Java environment
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
    */
-  static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env, "org/rocksdb/ReadOptions");
+  static jmethodID
getListAddMethodId(JNIEnv* env) { + jclass jlist_clazz = getListClass(env); + if(jlist_clazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = + env->GetMethodID(jlist_clazz, "add", "(Ljava/lang/Object;)Z"); + assert(mid != nullptr); + return mid; } }; -// The portal class for org.rocksdb.WriteBatch -class WriteBatchJni : public RocksDBNativeClass< - rocksdb::WriteBatch*, WriteBatchJni> { +// The portal class for java.lang.Byte +class ByteJni : public JavaClass { public: /** - * Get the Java Class org.rocksdb.WriteBatch + * Get the Java Class java.lang.Byte * * @param env A pointer to the Java environment * @@ -712,17 +1111,11 @@ class WriteBatchJni : public RocksDBNativeClass< * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch"); + return JavaClass::getJClass(env, "java/lang/Byte"); } -}; -// The portal class for org.rocksdb.WriteBatch.Handler -class WriteBatchHandlerJni : public RocksDBNativeClass< - const rocksdb::WriteBatchHandlerJniCallback*, - WriteBatchHandlerJni> { - public: /** - * Get the Java Class org.rocksdb.WriteBatch.Handler + * Get the Java Class byte[] * * @param env A pointer to the Java environment * @@ -730,156 +1123,128 @@ class WriteBatchHandlerJni : public RocksDBNativeClass< * ClassFormatError, ClassCircularityError, NoClassDefFoundError, * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/WriteBatch$Handler"); + static jclass getArrayJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "[B"); } /** - * Get the Java Method: WriteBatch.Handler#put + * Creates a new 2-dimensional Java Byte Array byte[][] * * @param env A pointer to the Java environment + * @param len The size of the first dimension * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * @return A reference to the Java byte[][] or nullptr if an exception occurs */ - static jmethodID getPutMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + static jobjectArray new2dByteArray(JNIEnv* env, const jsize len) { + jclass clazz = getArrayJClass(env); + if(clazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "put", "([B[B)V"); - assert(mid != nullptr); - return mid; + return env->NewObjectArray(len, clazz, nullptr); } /** - * Get the Java Method: WriteBatch.Handler#merge + * Get the Java Method: Byte#byteValue * * @param env A pointer to the Java environment * * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * be retrieved */ - static jmethodID getMergeMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + static jmethodID getByteValueMethod(JNIEnv* env) { + jclass clazz = getJClass(env); + if(clazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "merge", "([B[B)V"); + static jmethodID mid = env->GetMethodID(clazz, "byteValue", "()B"); assert(mid != nullptr); return mid; } /** - * Get the Java Method: WriteBatch.Handler#delete + * Calls the Java Method: Byte#valueOf, returning a constructed Byte jobject * * @param env A pointer to the Java environment * - * @return The Java Method ID or nullptr if the class or 
method id could not - * be retieved + * @return A constructing Byte object or nullptr if the class or method id could not + * be retrieved, or an exception occurred */ - static jmethodID getDeleteMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + static jobject valueOf(JNIEnv* env, jbyte jprimitive_byte) { + jclass clazz = getJClass(env); + if (clazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "delete", "([B)V"); - assert(mid != nullptr); - return mid; + static jmethodID mid = + env->GetStaticMethodID(clazz, "valueOf", "(B)Ljava/lang/Byte;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jbyte_obj = + env->CallStaticObjectMethod(clazz, mid, jprimitive_byte); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jbyte_obj; } +}; + +// The portal class for java.lang.Integer +class IntegerJni : public JavaClass { + public: /** - * Get the Java Method: WriteBatch.Handler#deleteRange + * Get the Java Class java.lang.Integer * * @param env A pointer to the Java environment * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jmethodID getDeleteRangeMethodId(JNIEnv* env) { + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/lang/Integer"); + } + + static jobject valueOf(JNIEnv* env, jint jprimitive_int) { jclass jclazz = getJClass(env); if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "([B[B)V"); - assert(mid != nullptr); - return mid; - } - - /** - * Get the Java Method: WriteBatch.Handler#logData - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getLogDataMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class + jmethodID mid = + env->GetStaticMethodID(jclazz, "valueOf", "(I)Ljava/lang/Integer;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "logData", "([B)V"); - assert(mid != nullptr); - return mid; - } - - /** - * Get the Java Method: WriteBatch.Handler#shouldContinue - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getContinueMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class + const jobject jinteger_obj = + env->CallStaticObjectMethod(jclazz, mid, jprimitive_int); + if (env->ExceptionCheck()) { + // exception occurred return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "shouldContinue", "()Z"); - assert(mid != nullptr); - return mid; - } -}; - -// The portal class for org.rocksdb.WriteBatchWithIndex -class WriteBatchWithIndexJni : public RocksDBNativeClass< - rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> { - public: - /** - * Get the Java Class org.rocksdb.WriteBatchWithIndex - * 
- * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/WriteBatchWithIndex"); + return jinteger_obj; } }; -// The portal class for org.rocksdb.HistogramData -class HistogramDataJni : public JavaClass { +// The portal class for java.lang.Long +class LongJni : public JavaClass { public: /** - * Get the Java Class org.rocksdb.HistogramData + * Get the Java Class java.lang.Long * * @param env A pointer to the Java environment * @@ -888,165 +1253,39 @@ class HistogramDataJni : public JavaClass { * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "org/rocksdb/HistogramData"); + return JavaClass::getJClass(env, "java/lang/Long"); } - /** - * Get the Java Method: HistogramData constructor - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getConstructorMethodId(JNIEnv* env) { + static jobject valueOf(JNIEnv* env, jlong jprimitive_long) { jclass jclazz = getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - static jmethodID mid = env->GetMethodID(jclazz, "", "(DDDDD)V"); - assert(mid != nullptr); - return mid; - } -}; - -// The portal class for org.rocksdb.BackupableDBOptions -class BackupableDBOptionsJni : public RocksDBNativeClass< - rocksdb::BackupableDBOptions*, BackupableDBOptionsJni> { - public: - /** - * Get the Java Class org.rocksdb.BackupableDBOptions - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/BackupableDBOptions"); - } -}; - -// The portal class for org.rocksdb.BackupEngine -class BackupEngineJni : public RocksDBNativeClass< - rocksdb::BackupEngine*, BackupEngineJni> { - public: - /** - * Get the Java Class org.rocksdb.BackupableEngine - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/BackupEngine"); - } -}; - -// The portal class for org.rocksdb.RocksIterator -class IteratorJni : public RocksDBNativeClass< - rocksdb::Iterator*, IteratorJni> { - public: - /** - * Get the Java Class org.rocksdb.RocksIterator - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksIterator"); - } -}; - -// The portal class for org.rocksdb.Filter -class FilterJni : public RocksDBNativeClass< - std::shared_ptr*, FilterJni> { - public: 
- /** - * Get the Java Class org.rocksdb.Filter - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/Filter"); - } -}; - -// The portal class for org.rocksdb.ColumnFamilyHandle -class ColumnFamilyHandleJni : public RocksDBNativeClass< - rocksdb::ColumnFamilyHandle*, ColumnFamilyHandleJni> { - public: - /** - * Get the Java Class org.rocksdb.ColumnFamilyHandle - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/ColumnFamilyHandle"); - } -}; + jmethodID mid = + env->GetStaticMethodID(jclazz, "valueOf", "(J)Ljava/lang/Long;"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } -// The portal class for org.rocksdb.FlushOptions -class FlushOptionsJni : public RocksDBNativeClass< - rocksdb::FlushOptions*, FlushOptionsJni> { - public: - /** - * Get the Java Class org.rocksdb.FlushOptions - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/FlushOptions"); - } -}; + const jobject jlong_obj = + env->CallStaticObjectMethod(jclazz, mid, jprimitive_long); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } -// The portal class for org.rocksdb.ComparatorOptions -class ComparatorOptionsJni : public RocksDBNativeClass< - rocksdb::ComparatorJniCallbackOptions*, ComparatorOptionsJni> { - public: - /** - * Get the Java Class org.rocksdb.ComparatorOptions - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/ComparatorOptions"); + return jlong_obj; } }; -// The portal class for org.rocksdb.AbstractComparator -class AbstractComparatorJni : public RocksDBNativeClass< - const rocksdb::BaseComparatorJniCallback*, - AbstractComparatorJni> { - public: +// The portal class for java.lang.StringBuilder +class StringBuilderJni : public JavaClass { + public: /** - * Get the Java Class org.rocksdb.AbstractComparator + * Get the Java Class java.lang.StringBuilder * * @param env A pointer to the Java environment * @@ -1055,19 +1294,18 @@ class AbstractComparatorJni : public RocksDBNativeClass< * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, - "org/rocksdb/AbstractComparator"); + return JavaClass::getJClass(env, "java/lang/StringBuilder"); } /** - * Get the Java Method: Comparator#name + * Get the Java Method: StringBuilder#append * * @param env A pointer to 
the Java environment * * @return The Java Method ID or nullptr if the class or method id could not * be retieved */ - static jmethodID getNameMethodId(JNIEnv* env) { + static jmethodID getListAddMethodId(JNIEnv* env) { jclass jclazz = getJClass(env); if(jclazz == nullptr) { // exception occurred accessing class @@ -1075,614 +1313,773 @@ class AbstractComparatorJni : public RocksDBNativeClass< } static jmethodID mid = - env->GetMethodID(jclazz, "name", "()Ljava/lang/String;"); + env->GetMethodID(jclazz, "append", + "(Ljava/lang/String;)Ljava/lang/StringBuilder;"); assert(mid != nullptr); return mid; } /** - * Get the Java Method: Comparator#compare + * Appends a C-style string to a StringBuilder * * @param env A pointer to the Java environment + * @param jstring_builder Reference to a java.lang.StringBuilder + * @param c_str A C-style string to append to the StringBuilder * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved + * @return A reference to the updated StringBuilder, or a nullptr if + * an exception occurs */ - static jmethodID getCompareMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class + static jobject append(JNIEnv* env, jobject jstring_builder, + const char* c_str) { + jmethodID mid = getListAddMethodId(env); + if(mid == nullptr) { + // exception occurred accessing class or method return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "compare", - "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); - assert(mid != nullptr); - return mid; - } - - /** - * Get the Java Method: Comparator#findShortestSeparator - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class + jstring new_value_str = env->NewStringUTF(c_str); + if(new_value_str == nullptr) { + // exception thrown: OutOfMemoryError return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "findShortestSeparator", - "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); - assert(mid != nullptr); - return mid; - } - - /** - * Get the Java Method: Comparator#findShortSuccessor - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class + jobject jresult_string_builder = + env->CallObjectMethod(jstring_builder, mid, new_value_str); + if(env->ExceptionCheck()) { + // exception occurred + env->DeleteLocalRef(new_value_str); return nullptr; } - static jmethodID mid = - env->GetMethodID(jclazz, "findShortSuccessor", - "(Ljava/lang/String;)Ljava/lang/String;"); - assert(mid != nullptr); - return mid; - } -}; - -// The portal class for org.rocksdb.AbstractSlice -class AbstractSliceJni : public NativeRocksMutableObject< - const rocksdb::Slice*, AbstractSliceJni> { - public: - /** - * Get the Java Class org.rocksdb.AbstractSlice - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError 
exceptions is thrown
-   */
-  static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractSlice");
+    return jresult_string_builder;
   }
 };
 
-// The portal class for org.rocksdb.Slice
-class SliceJni : public NativeRocksMutableObject<
-    const rocksdb::Slice*, AbstractSliceJni> {
+// various utility functions for working with RocksDB and JNI
+class JniUtil {
  public:
-  /**
-   * Get the Java Class org.rocksdb.Slice
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
-   */
-  static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Slice");
-  }
-
-  /**
-   * Constructs a Slice object
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to a Java Slice object, or a nullptr if an
-   *     exception occurs
-   */
-  static jobject construct0(JNIEnv* env) {
-    jclass jclazz = getJClass(env);
-    if(jclazz == nullptr) {
-      // exception occurred accessing class
-      return nullptr;
-    }
-
-    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
-    if(mid == nullptr) {
-      // exception occurred accessing method
-      return nullptr;
-    }
-
-    jobject jslice = env->NewObject(jclazz, mid);
-    if(env->ExceptionCheck()) {
-      return nullptr;
+  /**
+   * Detect if jlong overflows size_t
+   *
+   * @param jvalue the jlong value
+   *
+   * @return Status::OK() if the jlong value fits in a size_t, otherwise an
+   *     InvalidArgument status
+   */
+  inline static Status check_if_jlong_fits_size_t(const jlong& jvalue) {
+    Status s = Status::OK();
+    if (static_cast<uint64_t>(jvalue) > std::numeric_limits<size_t>::max()) {
+      s = Status::InvalidArgument(Slice("jlong overflows 32 bit value."));
+    }
+    return s;
   }
-    return jslice;
-  }
-};
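// --- Editorial sketch (not part of the patch): check_if_jlong_fits_size_t is
// meant to guard the narrowing of a Java long before it is stored in a size_t,
// e.g. in an option setter. `applyJlongAsSizeT` is an illustrative name; the
// IllegalArgumentExceptionJni portal class is assumed from this same header.
void applyJlongAsSizeT(JNIEnv* env, jlong jvalue, size_t* target) {
  const rocksdb::Status s = rocksdb::JniUtil::check_if_jlong_fits_size_t(jvalue);
  if (s.ok()) {
    *target = static_cast<size_t>(jvalue);
  } else {
    // out of range: report it to Java instead of silently truncating
    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
  }
}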
+  /**
+   * Obtains a reference to the JNIEnv from
+   * the JVM
+   *
+   * If the current thread is not attached to the JavaVM
+   * then it will be attached so as to retrieve the JNIEnv
+   *
+   * If a thread is attached, it must later be manually
+   * released by calling JavaVM::DetachCurrentThread.
+   * This can be handled by always matching calls to this
+   * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)}
+   *
+   * @param jvm (IN) A pointer to the JavaVM instance
+   * @param attached (OUT) A pointer to a boolean which
+   *     will be set to JNI_TRUE if we had to attach the thread
+   *
+   * @return A pointer to the JNIEnv or nullptr if a fatal error
+   *     occurs and the JNIEnv cannot be retrieved
+   */
+  static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) {
+    assert(jvm != nullptr);
+
+    JNIEnv *env;
+    const jint env_rs = jvm->GetEnv(reinterpret_cast<void**>(&env),
+        JNI_VERSION_1_2);
+
+    if(env_rs == JNI_OK) {
+      // current thread is already attached, return the JNIEnv
+      *attached = JNI_FALSE;
+      return env;
+    } else if(env_rs == JNI_EDETACHED) {
+      // current thread is not attached, attempt to attach
+      const jint rs_attach = jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL);
+      if(rs_attach == JNI_OK) {
+        *attached = JNI_TRUE;
+        return env;
+      } else {
+        // error, could not attach the thread
+        std::cerr << "JniUtil::getJniEnv - Fatal: could not attach current thread to JVM!" << std::endl;
+        return nullptr;
+      }
+    } else if(env_rs == JNI_EVERSION) {
+      // error, JDK does not support JNI_VERSION_1_2+
+      std::cerr << "JniUtil::getJniEnv - Fatal: JDK does not support JNI_VERSION_1_2" << std::endl;
+      return nullptr;
+    } else {
+      std::cerr << "JniUtil::getJniEnv - Fatal: Unknown error: env_rs=" << env_rs << std::endl;
+      return nullptr;
+    }
+  }
+
+  /**
+   * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)}
+   *
+   * Detaches the current thread from the JVM if it was previously
+   * attached
+   *
+   * @param jvm (IN) A pointer to the JavaVM instance
+   * @param attached (IN) JNI_TRUE if we previously had to attach the thread
+   *     to the JavaVM to get the JNIEnv
+   */
+  static void releaseJniEnv(JavaVM* jvm, jboolean& attached) {
+    assert(jvm != nullptr);
+    if(attached == JNI_TRUE) {
+      const jint rs_detach = jvm->DetachCurrentThread();
+      assert(rs_detach == JNI_OK);
+      if(rs_detach != JNI_OK) {
+        std::cerr << "JniUtil::getJniEnv - Warn: Unable to detach current thread from JVM!" << std::endl;
+      }
+    }
+  }
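// --- Editorial sketch (not part of the patch): the getJniEnv/releaseJniEnv
// pair as used from a native callback thread. `jvm` is assumed to have been
// cached at JNI_OnLoad time, and std::function comes from <functional>, which
// this header already uses. The wrapper name `withJniEnv` is illustrative.
void withJniEnv(JavaVM* jvm, const std::function<void(JNIEnv*)>& action) {
  jboolean attached = JNI_FALSE;
  JNIEnv* env = rocksdb::JniUtil::getJniEnv(jvm, &attached);
  if (env == nullptr) {
    return;  // fatal: no JNIEnv could be obtained
  }
  action(env);
  rocksdb::JniUtil::releaseJniEnv(jvm, attached);  // detaches only if we attached
}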
+  /**
+   * Copies a Java String[] to a C++ std::vector<std::string>
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jss (IN) The Java String array to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError or ArrayIndexOutOfBoundsException
+   *     exception occurs
+   *
+   * @return A std::vector<std::string> containing copies of the Java strings
+   */
+  static std::vector<std::string> copyStrings(JNIEnv* env,
+      jobjectArray jss, jboolean* has_exception) {
+    return rocksdb::JniUtil::copyStrings(env, jss,
+        env->GetArrayLength(jss), has_exception);
+  }
+
+  /**
+   * Copies a Java String[] to a C++ std::vector<std::string>
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param jss (IN) The Java String array to copy
+   * @param jss_len (IN) The length of the Java String array to copy
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an OutOfMemoryError or ArrayIndexOutOfBoundsException
+   *     exception occurs
+   *
+   * @return A std::vector<std::string> containing copies of the Java strings
+   */
+  static std::vector<std::string> copyStrings(JNIEnv* env,
+      jobjectArray jss, const jsize jss_len, jboolean* has_exception) {
+    std::vector<std::string> strs;
+    strs.reserve(jss_len);
+    for (jsize i = 0; i < jss_len; i++) {
+      jobject js = env->GetObjectArrayElement(jss, i);
+      if(env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        *has_exception = JNI_TRUE;
+        return strs;
+      }
+
+      jstring jstr = static_cast<jstring>(js);
+      const char* str = env->GetStringUTFChars(jstr, nullptr);
+      if(str == nullptr) {
+        // exception thrown: OutOfMemoryError
+        env->DeleteLocalRef(js);
+        *has_exception = JNI_TRUE;
+        return strs;
+      }
+
+      strs.push_back(std::string(str));
+
+      env->ReleaseStringUTFChars(jstr, str);
+      env->DeleteLocalRef(js);
+    }
+
-// The portal class for java.util.List
-class ListJni : public JavaClass {
- public:
-  /**
-   * Get the Java Class java.util.List
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
-   */
-  static jclass getListClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "java/util/List");
-  }
-
-  /**
-   * Get the Java Class java.util.ArrayList
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
-   */
-  static jclass getArrayListClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "java/util/ArrayList");
-  }
-
-  /**
-   * Get the Java Class java.util.Iterator
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
-   */
-  static jclass getIteratorClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "java/util/Iterator");
-  }
-
-  /**
-   * Get the Java Method: List#iterator
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return The Java Method ID or nullptr if the class or method id could not
-   *     be retrieved
-   */
-  static jmethodID getIteratorMethod(JNIEnv* env) {
-    jclass jlist_clazz = getListClass(env);
-    if(jlist_clazz == nullptr) {
-      // exception occurred accessing
class - return nullptr; + *has_exception = JNI_FALSE; + return strs; } - static jmethodID mid = - env->GetMethodID(jlist_clazz, "iterator", "()Ljava/util/Iterator;"); - assert(mid != nullptr); - return mid; - } + /** + * Copies a jstring to a C-style null-terminated byte string + * and releases the original jstring + * + * The jstring is copied as UTF-8 + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A pointer to the copied string, or a + * nullptr if has_exception == JNI_TRUE + */ + static std::unique_ptr copyString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char *utf = env->GetStringUTFChars(js, nullptr); + if(utf == nullptr) { + // exception thrown: OutOfMemoryError + env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return nullptr; + } else if(env->ExceptionCheck()) { + // exception thrown + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_TRUE; + return nullptr; + } - /** - * Get the Java Method: Iterator#hasNext - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getHasNextMethod(JNIEnv* env) { - jclass jiterator_clazz = getIteratorClass(env); - if(jiterator_clazz == nullptr) { - // exception occurred accessing class - return nullptr; + const jsize utf_len = env->GetStringUTFLength(js); + std::unique_ptr str(new char[utf_len + 1]); // Note: + 1 is needed for the c_str null terminator + std::strcpy(str.get(), utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return str; } - static jmethodID mid = env->GetMethodID(jiterator_clazz, "hasNext", "()Z"); - assert(mid != nullptr); - return mid; - } + /** + * Copies a jstring to a std::string + * and releases the original jstring + * + * If an exception occurs, then JNIEnv::ExceptionCheck() + * will have been called + * + * @param env (IN) A pointer to the java environment + * @param js (IN) The java string to copy + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + * + * @return A std:string copy of the jstring, or an + * empty std::string if has_exception == JNI_TRUE + */ + static std::string copyStdString(JNIEnv* env, jstring js, + jboolean* has_exception) { + const char *utf = env->GetStringUTFChars(js, nullptr); + if(utf == nullptr) { + // exception thrown: OutOfMemoryError + env->ExceptionCheck(); + *has_exception = JNI_TRUE; + return std::string(); + } else if(env->ExceptionCheck()) { + // exception thrown + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_TRUE; + return std::string(); + } - /** - * Get the Java Method: Iterator#next - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getNextMethod(JNIEnv* env) { - jclass jiterator_clazz = getIteratorClass(env); - if(jiterator_clazz == nullptr) { - // exception occurred accessing class - return nullptr; + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + *has_exception = JNI_FALSE; + return name; } - static jmethodID mid = - env->GetMethodID(jiterator_clazz, "next", "()Ljava/lang/Object;"); - assert(mid != nullptr); - return mid; - } - - /** - * Get the Java 
Method: ArrayList constructor - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getArrayListConstructorMethodId(JNIEnv* env) { - jclass jarray_list_clazz = getArrayListClass(env); - if(jarray_list_clazz == nullptr) { - // exception occurred accessing class - return nullptr; + /** + * Copies bytes from a std::string to a jByteArray + * + * @param env A pointer to the java environment + * @param bytes The bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java specific array size limitation. + */ + static jbyteArray copyBytes(JNIEnv* env, std::string bytes) { + return createJavaByteArrayWithSizeCheck(env, bytes.c_str(), bytes.size()); } - static jmethodID mid = - env->GetMethodID(jarray_list_clazz, "", "(I)V"); - assert(mid != nullptr); - return mid; - } - /** - * Get the Java Method: List#add - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getListAddMethodId(JNIEnv* env) { - jclass jlist_clazz = getListClass(env); - if(jlist_clazz == nullptr) { - // exception occurred accessing class - return nullptr; - } + /** + * Given a Java byte[][] which is an array of java.lang.Strings + * where each String is a byte[], the passed function `string_fn` + * will be called on each String, the result is the collected by + * calling the passed function `collector_fn` + * + * @param env (IN) A pointer to the java environment + * @param jbyte_strings (IN) A Java array of Strings expressed as bytes + * @param string_fn (IN) A transform function to call for each String + * @param collector_fn (IN) A collector which is called for the result + * of each `string_fn` + * @param has_exception (OUT) will be set to JNI_TRUE + * if an ArrayIndexOutOfBoundsException or OutOfMemoryError + * exception occurs + */ + template static void byteStrings(JNIEnv* env, + jobjectArray jbyte_strings, + std::function string_fn, + std::function collector_fn, + jboolean *has_exception) { + const jsize jlen = env->GetArrayLength(jbyte_strings); - static jmethodID mid = - env->GetMethodID(jlist_clazz, "add", "(Ljava/lang/Object;)Z"); - assert(mid != nullptr); - return mid; - } -}; + for(jsize i = 0; i < jlen; i++) { + jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i); + if(env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + *has_exception = JNI_TRUE; // signal error + return; + } -// The portal class for java.lang.Byte -class ByteJni : public JavaClass { - public: - /** - * Get the Java Class java.lang.Byte - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "java/lang/Byte"); - } + jbyteArray jbyte_string_ary = + reinterpret_cast(jbyte_string_obj); + T result = byteString(env, jbyte_string_ary, string_fn, has_exception); - /** - * Get the Java Class byte[] - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or 
ExceptionInInitializerError exceptions is thrown - */ - static jclass getArrayJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "[B"); - } + env->DeleteLocalRef(jbyte_string_obj); - /** - * Creates a new 2-dimensional Java Byte Array byte[][] - * - * @param env A pointer to the Java environment - * @param len The size of the first dimension - * - * @return A reference to the Java byte[][] or nullptr if an exception occurs - */ - static jobjectArray new2dByteArray(JNIEnv* env, const jsize len) { - jclass clazz = getArrayJClass(env); - if(clazz == nullptr) { - // exception occurred accessing class - return nullptr; - } + if(*has_exception == JNI_TRUE) { + // exception thrown: OutOfMemoryError + return; + } - return env->NewObjectArray(len, clazz, nullptr); - } + collector_fn(i, result); + } - /** - * Get the Java Method: Byte#byteValue - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getByteValueMethod(JNIEnv* env) { - jclass clazz = getJClass(env); - if(clazz == nullptr) { - // exception occurred accessing class - return nullptr; + *has_exception = JNI_FALSE; } - static jmethodID mid = env->GetMethodID(clazz, "byteValue", "()B"); - assert(mid != nullptr); - return mid; - } -}; + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template static T byteString(JNIEnv* env, + jbyteArray jbyte_string_ary, + std::function string_fn, + jboolean* has_exception) { + const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary); + return byteString(env, jbyte_string_ary, jbyte_string_len, string_fn, + has_exception); + } -// The portal class for java.lang.StringBuilder -class StringBuilderJni : public JavaClass { - public: - /** - * Get the Java Class java.lang.StringBuilder - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "java/lang/StringBuilder"); - } + /** + * Given a Java String which is expressed as a Java Byte Array byte[], + * the passed function `string_fn` will be called on the String + * and the result returned + * + * @param env (IN) A pointer to the java environment + * @param jbyte_string_ary (IN) A Java String expressed in bytes + * @param jbyte_string_len (IN) The length of the Java String + * expressed in bytes + * @param string_fn (IN) A transform function to call on the String + * @param has_exception (OUT) will be set to JNI_TRUE + * if an OutOfMemoryError exception occurs + */ + template static T byteString(JNIEnv* env, + jbyteArray jbyte_string_ary, const jsize jbyte_string_len, + std::function string_fn, + jboolean* has_exception) { + jbyte* jbyte_string = + env->GetByteArrayElements(jbyte_string_ary, nullptr); + if(jbyte_string == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return nullptr; // signal error + } - /** - 
* Get the Java Method: StringBuilder#append - * - * @param env A pointer to the Java environment - * - * @return The Java Method ID or nullptr if the class or method id could not - * be retieved - */ - static jmethodID getListAddMethodId(JNIEnv* env) { - jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class - return nullptr; - } + T result = + string_fn(reinterpret_cast(jbyte_string), jbyte_string_len); - static jmethodID mid = - env->GetMethodID(jclazz, "append", - "(Ljava/lang/String;)Ljava/lang/StringBuilder;"); - assert(mid != nullptr); - return mid; - } + env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT); - /** - * Appends a C-style string to a StringBuilder - * - * @param env A pointer to the Java environment - * @param jstring_builder Reference to a java.lang.StringBuilder - * @param c_str A C-style string to append to the StringBuilder - * - * @return A reference to the updated StringBuilder, or a nullptr if - * an exception occurs - */ - static jobject append(JNIEnv* env, jobject jstring_builder, - const char* c_str) { - jmethodID mid = getListAddMethodId(env); - if(mid == nullptr) { - // exception occurred accessing class or method - return nullptr; + *has_exception = JNI_FALSE; + return result; } - jstring new_value_str = env->NewStringUTF(c_str); - if(new_value_str == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + /** + * Converts a std::vector to a Java byte[][] where each Java String + * is expressed as a Java Byte Array byte[]. + * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings expressed as bytes, + * or nullptr if an exception is thrown + */ + static jobjectArray stringsBytes(JNIEnv* env, std::vector strings) { + jclass jcls_ba = ByteJni::getArrayJClass(env); + if(jcls_ba == nullptr) { + // exception occurred + return nullptr; + } - jobject jresult_string_builder = - env->CallObjectMethod(jstring_builder, mid, new_value_str); - if(env->ExceptionCheck()) { - // exception occurred - env->DeleteLocalRef(new_value_str); - return nullptr; - } + const jsize len = static_cast(strings.size()); - return jresult_string_builder; - } -}; + jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr); + if(jbyte_strings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } -// The portal class for org.rocksdb.BackupInfo -class BackupInfoJni : public JavaClass { - public: - /** - * Get the Java Class org.rocksdb.BackupInfo - * - * @param env A pointer to the Java environment - * - * @return The Java Class or nullptr if one of the - * ClassFormatError, ClassCircularityError, NoClassDefFoundError, - * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown - */ - static jclass getJClass(JNIEnv* env) { - return JavaClass::getJClass(env, "org/rocksdb/BackupInfo"); - } + for (jsize i = 0; i < len; i++) { + std::string *str = &strings[i]; + const jsize str_len = static_cast(str->size()); - /** - * Constructs a BackupInfo object - * - * @param env A pointer to the Java environment - * @param backup_id id of the backup - * @param timestamp timestamp of the backup - * @param size size of the backup - * @param number_files number of files related to the backup - * - * @return A reference to a Java BackupInfo object, or a nullptr if an - * exception occurs - */ - static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp, - uint64_t size, uint32_t number_files) { 
- jclass jclazz = getJClass(env); - if(jclazz == nullptr) { - // exception occurred accessing class - return nullptr; - } + jbyteArray jbyte_string_ary = env->NewByteArray(str_len); + if(jbyte_string_ary == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } - static jmethodID mid = env->GetMethodID(jclazz, "", "(IJJI)V"); - if(mid == nullptr) { - // exception occurred accessing method - return nullptr; - } + env->SetByteArrayRegion( + jbyte_string_ary, 0, str_len, + const_cast(reinterpret_cast(str->c_str()))); + if(env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } - jobject jbackup_info = - env->NewObject(jclazz, mid, backup_id, timestamp, size, number_files); - if(env->ExceptionCheck()) { - return nullptr; - } + env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary); + if(env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + // or ArrayStoreException + env->DeleteLocalRef(jbyte_string_ary); + env->DeleteLocalRef(jbyte_strings); + return nullptr; + } - return jbackup_info; - } -}; + env->DeleteLocalRef(jbyte_string_ary); + } -class BackupInfoListJni { - public: - /** - * Converts a C++ std::vector object to - * a Java ArrayList object - * - * @param env A pointer to the Java environment - * @param backup_infos A vector of BackupInfo - * - * @return Either a reference to a Java ArrayList object, or a nullptr - * if an exception occurs - */ - static jobject getBackupInfo(JNIEnv* env, - std::vector backup_infos) { - jclass jarray_list_clazz = rocksdb::ListJni::getArrayListClass(env); - if(jarray_list_clazz == nullptr) { - // exception occurred accessing class - return nullptr; + return jbyte_strings; } - jmethodID cstr_mid = rocksdb::ListJni::getArrayListConstructorMethodId(env); - if(cstr_mid == nullptr) { - // exception occurred accessing method - return nullptr; + /** + * Converts a std::vector to a Java String[]. 
+ * + * @param env A pointer to the java environment + * @param strings A vector of Strings + * + * @return A Java array of Strings, + * or nullptr if an exception is thrown + */ + static jobjectArray toJavaStrings(JNIEnv* env, + const std::vector* strings) { + jclass jcls_str = env->FindClass("java/lang/String"); + if(jcls_str == nullptr) { + // exception occurred + return nullptr; + } + + const jsize len = static_cast(strings->size()); + + jobjectArray jstrings = env->NewObjectArray(len, jcls_str, nullptr); + if(jstrings == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (jsize i = 0; i < len; i++) { + const std::string *str = &((*strings)[i]); + jstring js = rocksdb::JniUtil::toJavaString(env, str); + if (js == nullptr) { + env->DeleteLocalRef(jstrings); + return nullptr; + } + + env->SetObjectArrayElement(jstrings, i, js); + if(env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + // or ArrayStoreException + env->DeleteLocalRef(js); + env->DeleteLocalRef(jstrings); + return nullptr; + } + } + + return jstrings; } - jmethodID add_mid = rocksdb::ListJni::getListAddMethodId(env); - if(add_mid == nullptr) { - // exception occurred accessing method - return nullptr; + /** + * Creates a Java UTF String from a C++ std::string + * + * @param env A pointer to the java environment + * @param string the C++ std::string + * @param treat_empty_as_null true if empty strings should be treated as null + * + * @return the Java UTF string, or nullptr if the provided string + * is null (or empty and treat_empty_as_null is set), or if an + * exception occurs allocating the Java String. + */ + static jstring toJavaString(JNIEnv* env, const std::string* string, + const bool treat_empty_as_null = false) { + if (string == nullptr) { + return nullptr; + } + + if (treat_empty_as_null && string->empty()) { + return nullptr; + } + + return env->NewStringUTF(string->c_str()); + } + + /** + * Copies bytes to a new jByteArray with the check of java array size limitation. + * + * @param bytes pointer to memory to copy to a new jByteArray + * @param size number of bytes to copy + * + * @return the Java byte[], or nullptr if an exception occurs + * + * @throws RocksDBException thrown + * if memory size to copy exceeds general java array size limitation to avoid overflow. 
+   */
+  static jbyteArray createJavaByteArrayWithSizeCheck(JNIEnv* env, const char* bytes, const size_t size) {
+    // Limitation for java array size is vm specific
+    // In general it cannot exceed Integer.MAX_VALUE (2^31 - 1)
+    // Current HotSpot VM limitation for array size is Integer.MAX_VALUE - 5 (2^31 - 1 - 5)
+    // It means that the next call to env->NewByteArray can still end with
+    // OutOfMemoryError("Requested array size exceeds VM limit") coming from VM
+    static const size_t MAX_JARRAY_SIZE = (static_cast<size_t>(1)) << 31;
+    if(size > MAX_JARRAY_SIZE) {
+      rocksdb::RocksDBExceptionJni::ThrowNew(env, "Requested array size exceeds VM limit");
+      return nullptr;
+    }
+
+    const jsize jlen = static_cast<jsize>(size);
+    jbyteArray jbytes = env->NewByteArray(jlen);
+    if(jbytes == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    env->SetByteArrayRegion(jbytes, 0, jlen,
+        const_cast<jbyte*>(reinterpret_cast<const jbyte*>(bytes)));
+    if(env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jbytes);
+      return nullptr;
+    }
+
+    return jbytes;
+  }
+
+  /**
+   * Copies bytes from a rocksdb::Slice to a jByteArray
+   *
+   * @param env A pointer to the java environment
+   * @param bytes The bytes to copy
+   *
+   * @return the Java byte[] or nullptr if an exception occurs
+   *
+   * @throws RocksDBException thrown
+   *     if memory size to copy exceeds general java specific array size limitation.
+   */
+  static jbyteArray copyBytes(JNIEnv* env, const Slice& bytes) {
+    return createJavaByteArrayWithSizeCheck(env, bytes.data(), bytes.size());
+  }
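// --- Editorial sketch (not part of the patch): returning a C++ buffer to Java
// via the size-checked copy above. MAX_JARRAY_SIZE caps the copy at 2^31 bytes;
// on failure a Java exception (RocksDBException or OutOfMemoryError) is already
// pending, so the nullptr is simply handed back to the JVM. The function name
// is illustrative.
jbyteArray sliceToJavaBytes(JNIEnv* env, const rocksdb::Slice& value) {
  return rocksdb::JniUtil::copyBytes(env, value);  // nullptr => exception pending
}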
+  /*
+   * Helper for operations on a key and value
+   * for example WriteBatch->Put
+   *
+   * TODO(AR) could be used for RocksDB->Put etc.
+   */
+  static std::unique_ptr<rocksdb::Status> kv_op(
+      std::function<rocksdb::Status(rocksdb::Slice, rocksdb::Slice)> op,
+      JNIEnv* env, jobject /*jobj*/,
+      jbyteArray jkey, jint jkey_len,
+      jbyteArray jvalue, jint jvalue_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if(env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    jbyte* value = env->GetByteArrayElements(jvalue, nullptr);
+    if(env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      if(key != nullptr) {
+        env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+      }
+      return nullptr;
+    }
+
+    rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+    rocksdb::Slice value_slice(reinterpret_cast<char*>(value),
+        jvalue_len);
+
+    auto status = op(key_slice, value_slice);
+
+    if(value != nullptr) {
+      env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT);
+    }
+    if(key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    return std::unique_ptr<rocksdb::Status>(new rocksdb::Status(status));
+  }
+
+  /*
+   * Helper for operations on a key
+   * for example WriteBatch->Delete
+   *
+   * TODO(AR) could be used for RocksDB->Delete etc.
+   */
+  static std::unique_ptr<rocksdb::Status> k_op(
+      std::function<rocksdb::Status(rocksdb::Slice)> op,
+      JNIEnv* env, jobject /*jobj*/,
+      jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if(env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    auto status = op(key_slice);
+
+    if(key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    return std::unique_ptr<rocksdb::Status>(new rocksdb::Status(status));
+  }
+
+  /*
+   * Helper for operations on a value
+   * for example WriteBatchWithIndex->GetFromBatch
+   */
+  static jbyteArray v_op(
+      std::function<rocksdb::Status(rocksdb::Slice, std::string*)> op,
+      JNIEnv* env, jbyteArray jkey, jint jkey_len) {
+    jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+    if(env->ExceptionCheck()) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+
+    rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+
+    std::string value;
+    rocksdb::Status s = op(key_slice, &value);
+
+    if(key != nullptr) {
+      env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    }
+
+    if (s.IsNotFound()) {
+      return nullptr;
+    }
+
+    if (s.ok()) {
+      jbyteArray jret_value =
+          env->NewByteArray(static_cast<jsize>(value.size()));
+      if(jret_value == nullptr) {
+        // exception thrown: OutOfMemoryError
+        return nullptr;
+      }
+
+      env->SetByteArrayRegion(jret_value, 0, static_cast<jsize>(value.size()),
+          const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.c_str())));
+      if(env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException
+        if(jret_value != nullptr) {
+          env->DeleteLocalRef(jret_value);
+        }
+        return nullptr;
+      }
+
+      return jret_value;
+    }
+
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
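// --- Editorial sketch (not part of the patch): driving kv_op from a JNI entry
// point, here wrapping WriteBatch::Put, whose Status return feeds straight
// through the lambda. Parameter names mirror the helper's signature; the
// function name `writeBatchPut` is illustrative only.
void writeBatchPut(JNIEnv* env, jobject jobj, rocksdb::WriteBatch* wb,
                   jbyteArray jkey, jint jkey_len,
                   jbyteArray jvalue, jint jvalue_len) {
  auto put = [wb](rocksdb::Slice key, rocksdb::Slice value) {
    return wb->Put(key, value);
  };
  std::unique_ptr<rocksdb::Status> status = rocksdb::JniUtil::kv_op(
      put, env, jobj, jkey, jkey_len, jvalue, jvalue_len);
  if (status != nullptr && !status->ok()) {
    rocksdb::RocksDBExceptionJni::ThrowNew(env, status);
  }
}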
+
+  /**
+   * Creates a vector of C++ pointers from
+   *     a Java array of C++ pointer addresses.
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param pointers (IN) A Java array of C++ pointer addresses
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs.
+   *
+   * @return A vector of C++ pointers.
+   */
+  template<class T> static std::vector<T*> fromJPointers(
+      JNIEnv* env, jlongArray jptrs, jboolean *has_exception) {
+    const jsize jptrs_len = env->GetArrayLength(jptrs);
+    std::vector<T*> ptrs;
+    jlong* jptr = env->GetLongArrayElements(jptrs, nullptr);
+    if (jptr == nullptr) {
+      // exception thrown: OutOfMemoryError
+      *has_exception = JNI_TRUE;
+      return ptrs;
+    }
+    ptrs.reserve(jptrs_len);
+    for (jsize i = 0; i < jptrs_len; i++) {
+      ptrs.push_back(reinterpret_cast<T*>(jptr[i]));
+    }
+    env->ReleaseLongArrayElements(jptrs, jptr, JNI_ABORT);
+    return ptrs;
+  }
+
+  /**
+   * Creates a Java array of C++ pointer addresses
+   *     from a vector of C++ pointers.
+   *
+   * @param env (IN) A pointer to the java environment
+   * @param pointers (IN) A vector of C++ pointers
+   * @param has_exception (OUT) will be set to JNI_TRUE
+   *     if an ArrayIndexOutOfBoundsException or OutOfMemoryError
+   *     exception occurs
+   *
+   * @return Java array of C++ pointer addresses.
+   */
+  template<class T> static jlongArray toJPointers(JNIEnv* env,
+      const std::vector<T*> &pointers,
+      jboolean *has_exception) {
+    const jsize len = static_cast<jsize>(pointers.size());
+    std::unique_ptr<jlong[]> results(new jlong[len]);
+    std::transform(pointers.begin(), pointers.end(), results.get(), [](T* pointer) -> jlong {
+      return reinterpret_cast<jlong>(pointer);
+    });
+
+    jlongArray jpointers = env->NewLongArray(len);
+    if (jpointers == nullptr) {
+      // exception thrown: OutOfMemoryError
+      *has_exception = JNI_TRUE;
+      return nullptr;
+    }
+
+    env->SetLongArrayRegion(jpointers, 0, len, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      *has_exception = JNI_TRUE;
+      env->DeleteLocalRef(jpointers);
+      return nullptr;
+    }
+
+    *has_exception = JNI_FALSE;
+
+    return jpointers;
+  }
 };
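+
+// Round-trip sketch for the pointer-array helpers above (illustrative only,
+// not part of the original file; assumes the helpers live in the enclosing
+// rocksdb::JniUtil class, as the calls elsewhere in this file suggest, and
+// that `cf_handles` is a std::vector<rocksdb::ColumnFamilyHandle*>):
+//
+//   jboolean has_exception = JNI_FALSE;
+//   jlongArray jhandles =
+//       rocksdb::JniUtil::toJPointers(env, cf_handles, &has_exception);
+//   if (has_exception == JNI_TRUE) { /* a Java exception is pending */ }
+//   auto cf_handles_again =
+//       rocksdb::JniUtil::fromJPointers<rocksdb::ColumnFamilyHandle>(
+//           env, jhandles, &has_exception);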
"(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); + assert(mid != nullptr); + return mid; } +}; +class HashMapJni : public JavaClass { + public: /** - * Gets the value of the WBWIRocksIterator#entry + * Get the Java Class java.util.HashMap * - * @param env A pointer to the Java environment - * @param jwbwi_rocks_iterator A reference to a WBWIIterator + * @param env A pointer to the Java environment * - * @return A reference to a Java WBWIRocksIterator.WriteEntry object, or - * a nullptr if an exception occurs + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ - static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) { - assert(jwbwi_rocks_iterator != nullptr); + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "java/util/HashMap"); + } - jfieldID jwrite_entry_field = getWriteEntryField(env); - if(jwrite_entry_field == nullptr) { - // exception occurred accessing the field + /** + * Create a new Java java.util.HashMap object. + * + * @param env A pointer to the Java environment + * + * @return A reference to a Java java.util.HashMap object, or + * nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, const uint32_t initial_capacity = 16) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class return nullptr; } - jobject jwe = env->GetObjectField(jwbwi_rocks_iterator, jwrite_entry_field); - assert(jwe != nullptr); - return jwe; - } -}; - -// The portal class for org.rocksdb.WBWIRocksIterator.WriteType -class WriteTypeJni : public JavaClass { - public: - /** - * Get the PUT enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject PUT(JNIEnv* env) { - return getEnum(env, "PUT"); + jmethodID mid = env->GetMethodID(jclazz, "", "(I)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; } - /** - * Get the MERGE enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject MERGE(JNIEnv* env) { - return getEnum(env, "MERGE"); + jobject jhash_map = env->NewObject(jclazz, mid, static_cast(initial_capacity)); + if (env->ExceptionCheck()) { + return nullptr; } - /** - * Get the DELETE enum field value of WBWIRocksIterator.WriteType - * - * @param env A pointer to the Java environment - * - * @return A reference to the enum field value or a nullptr if - * the enum field value could not be retrieved - */ - static jobject DELETE(JNIEnv* env) { - return getEnum(env, "DELETE"); + return jhash_map; + } + + /** + * A function which maps a std::pair to a std::pair + * + * @return Either a pointer to a std::pair, or nullptr + * if an error occurs during the mapping + */ + template + using FnMapKV = std::function> (const std::pair&)>; + + // template ::value_type, std::pair>::value, int32_t>::type = 0> + // static void putAll(JNIEnv* env, const jobject jhash_map, I iterator, const FnMapKV &fn_map_kv) { + /** + * Returns true if it succeeds, false if an error occurs + */ + template + static bool putAll(JNIEnv* env, const jobject jhash_map, 
+
+  /**
+   * A function which maps a std::pair<K, V> to a std::pair<jobject, jobject>
+   *
+   * @return Either a pointer to a std::pair<jobject, jobject>, or nullptr
+   *     if an error occurs during the mapping
+   */
+  template <typename K, typename V>
+  using FnMapKV = std::function<std::unique_ptr<std::pair<jobject, jobject>> (const std::pair<K, V>&)>;
+
+  // template <class I, typename K, typename V, typename std::enable_if<std::is_same<typename std::iterator_traits<I>::value_type, std::pair<const K, V>>::value, int32_t>::type = 0>
+  // static void putAll(JNIEnv* env, const jobject jhash_map, I iterator, const FnMapKV<K, V> &fn_map_kv) {
+  /**
+   * Returns true if it succeeds, false if an error occurs
+   */
+  template <class iterator_type, typename K, typename V>
+  static bool putAll(JNIEnv* env, const jobject jhash_map,
+      iterator_type iterator, iterator_type end, const FnMapKV<K, V> &fn_map_kv) {
+    const jmethodID jmid_put = rocksdb::MapJni::getMapPutMethodId(env);
+    if (jmid_put == nullptr) {
+      return false;
     }
 
-  /**
-   * Get the LOG enum field value of WBWIRocksIterator.WriteType
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject LOG(JNIEnv* env) {
-    return getEnum(env, "LOG");
+    for (auto it = iterator; it != end; ++it) {
+      const std::unique_ptr<std::pair<jobject, jobject>> result = fn_map_kv(*it);
+      if (result == nullptr) {
+        // an error occurred during fn_map_kv
+        return false;
+      }
+      env->CallObjectMethod(jhash_map, jmid_put, result->first, result->second);
+      if (env->ExceptionCheck()) {
+        // exception occurred
+        env->DeleteLocalRef(result->second);
+        env->DeleteLocalRef(result->first);
+        return false;
+      }
+
+      // release local references
+      env->DeleteLocalRef(result->second);
+      env->DeleteLocalRef(result->first);
     }
 
-  private:
+    return true;
+  }
 
+  /**
-   * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteType
+   * Creates a java.util.Map<String, String> from a std::map<std::string, std::string>
    *
    * @param env A pointer to the Java environment
+   * @param map the Cpp map
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   * @return a reference to the Java java.util.Map object, or nullptr if an exception occurred
    */
-  static jclass getJClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteType");
+  static jobject fromCppMap(JNIEnv* env, const std::map<std::string, std::string>* map) {
+    if (map == nullptr) {
+      return nullptr;
+    }
+
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
+    }
+
+    const rocksdb::HashMapJni::FnMapKV<const std::string, const std::string> fn_map_kv =
+        [env](const std::pair<const std::string, const std::string>& kv) {
+      jstring jkey = rocksdb::JniUtil::toJavaString(env, &(kv.first), false);
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      jstring jvalue = rocksdb::JniUtil::toJavaString(env, &(kv.second), true);
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        env->DeleteLocalRef(jkey);
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      return std::unique_ptr<std::pair<jobject, jobject>>(new std::pair<jobject, jobject>(static_cast<jobject>(jkey), static_cast<jobject>(jvalue)));
+    };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
   }
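+
+  /*
+   * Usage sketch for the String/String overload above (illustrative only,
+   * not part of the original file; `props` stands for any
+   * std::map<std::string, std::string> to expose to Java):
+   *
+   *   jobject jmap = rocksdb::HashMapJni::fromCppMap(env, &props);
+   *   if (jmap == nullptr) {
+   *     return nullptr;  // a Java exception is pending
+   *   }
+   */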
 
   /**
-   * Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType
+   * Creates a java.util.Map<String, Integer> from a std::map<std::string, uint32_t>
    *
    * @param env A pointer to the Java environment
-   * @param name The name of the enum field
+   * @param map the Cpp map
    *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
+   * @return a reference to the Java java.util.Map object, or nullptr if an exception occurred
    */
-  static jobject getEnum(JNIEnv* env, const char name[]) {
-    jclass jclazz = getJClass(env);
-    if(jclazz == nullptr) {
-      // exception occurred accessing class
+  static jobject fromCppMap(JNIEnv* env, const std::map<std::string, uint32_t>* map) {
+    if (map == nullptr) {
      return nullptr;
    }
 
-    jfieldID jfid =
-        env->GetStaticFieldID(jclazz, name,
-            "Lorg/rocksdb/WBWIRocksIterator$WriteType;");
-    if(env->ExceptionCheck()) {
-      // exception occurred while getting field
-      return nullptr;
-    } else if(jfid == nullptr) {
-      return nullptr;
-    }
 
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
       return nullptr;
     }
 
-    jobject jwrite_type = env->GetStaticObjectField(jclazz, jfid);
-    assert(jwrite_type != nullptr);
-    return jwrite_type;
+    const rocksdb::HashMapJni::FnMapKV<const std::string, const uint32_t> fn_map_kv =
+        [env](const std::pair<const std::string, const uint32_t>& kv) {
+      jstring jkey = rocksdb::JniUtil::toJavaString(env, &(kv.first), false);
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      jobject jvalue = rocksdb::IntegerJni::valueOf(env, static_cast<jint>(kv.second));
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        env->DeleteLocalRef(jkey);
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      return std::unique_ptr<std::pair<jobject, jobject>>(new std::pair<jobject, jobject>(static_cast<jobject>(jkey), jvalue));
+    };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
+    }
+
+    return jhash_map;
   }
 
-// The portal class for org.rocksdb.WBWIRocksIterator.WriteEntry
-class WriteEntryJni : public JavaClass {
- public:
   /**
-   * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteEntry
+   * Creates a java.util.Map<String, Long> from a std::map<std::string, uint64_t>
    *
    * @param env A pointer to the Java environment
+   * @param map the Cpp map
    *
-   * @return The Java Class or nullptr if one of the
-   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
-   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   * @return a reference to the Java java.util.Map object, or nullptr if an exception occurred
    */
-  static jclass getJClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteEntry");
+  static jobject fromCppMap(JNIEnv* env, const std::map<std::string, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
     }
-};
 
-// The portal class for org.rocksdb.InfoLogLevel
-class InfoLogLevelJni : public JavaClass {
- public:
-  /**
-   * Get the DEBUG_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject DEBUG_LEVEL(JNIEnv* env) {
-    return getEnum(env, "DEBUG_LEVEL");
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
     }
 
-  /**
-   * Get the INFO_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject INFO_LEVEL(JNIEnv* env) {
-    return getEnum(env, "INFO_LEVEL");
-  }
+    const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t> fn_map_kv =
+        [env](const std::pair<const std::string, const uint64_t>& kv) {
+      jstring jkey = rocksdb::JniUtil::toJavaString(env, &(kv.first), false);
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
 
-  /**
-   * Get the WARN_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject WARN_LEVEL(JNIEnv* env) {
-    return getEnum(env, "WARN_LEVEL");
+      jobject jvalue = rocksdb::LongJni::valueOf(env, static_cast<jlong>(kv.second));
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        env->DeleteLocalRef(jkey);
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      return std::unique_ptr<std::pair<jobject, jobject>>(new std::pair<jobject, jobject>(static_cast<jobject>(jkey), jvalue));
+    };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
     }
 
+    return jhash_map;
+  }
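+
+  /*
+   * Signedness note (added commentary, not part of the original file): these
+   * overloads box C++ unsigned integers into Java's signed Integer/Long, so
+   * uint32_t values above INT32_MAX and uint64_t values above INT64_MAX show
+   * up as negative numbers on the Java side. For example:
+   *
+   *   rocksdb::IntegerJni::valueOf(env, static_cast<jint>(4294967295u));
+   *   // boxes Integer(-1)
+   */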
+
+  /**
-   * Get the ERROR_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject ERROR_LEVEL(JNIEnv* env) {
-    return getEnum(env, "ERROR_LEVEL");
+   * Creates a java.util.Map<Integer, Long> from a std::map<uint32_t, uint64_t>
+   *
+   * @param env A pointer to the Java environment
+   * @param map the Cpp map
+   *
+   * @return a reference to the Java java.util.Map object, or nullptr if an exception occurred
+   */
+  static jobject fromCppMap(JNIEnv* env, const std::map<uint32_t, uint64_t>* map) {
+    if (map == nullptr) {
+      return nullptr;
     }
 
-  /**
-   * Get the FATAL_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject FATAL_LEVEL(JNIEnv* env) {
-    return getEnum(env, "FATAL_LEVEL");
+    jobject jhash_map = construct(env, static_cast<uint32_t>(map->size()));
+    if (jhash_map == nullptr) {
+      // exception occurred
+      return nullptr;
     }
 
-  /**
-   * Get the HEADER_LEVEL enum field value of InfoLogLevel
-   *
-   * @param env A pointer to the Java environment
-   *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
-   */
-  static jobject HEADER_LEVEL(JNIEnv* env) {
-    return getEnum(env, "HEADER_LEVEL");
+    const rocksdb::HashMapJni::FnMapKV<const uint32_t, const uint64_t> fn_map_kv =
+        [env](const std::pair<const uint32_t, const uint64_t>& kv) {
+      jobject jkey = rocksdb::IntegerJni::valueOf(env, static_cast<jint>(kv.first));
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      jobject jvalue = rocksdb::LongJni::valueOf(env, static_cast<jlong>(kv.second));
+      if (env->ExceptionCheck()) {
+        // an error occurred
+        env->DeleteLocalRef(jkey);
+        return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+      }
+
+      return std::unique_ptr<std::pair<jobject, jobject>>(new std::pair<jobject, jobject>(static_cast<jobject>(jkey), jvalue));
+    };
+
+    if (!putAll(env, jhash_map, map->begin(), map->end(), fn_map_kv)) {
+      // exception occurred
+      return nullptr;
     }
 
-  private:
+    return jhash_map;
+  }
+};
+
+// The portal class for org.rocksdb.RocksDB
+class RocksDBJni : public RocksDBNativeClass<rocksdb::DB*, RocksDBJni> {
+ public:
   /**
-   * Get the Java Class org.rocksdb.InfoLogLevel
+   * Get the Java Class org.rocksdb.RocksDB
    *
    * @param env A pointer to the Java environment
    *
@@ -1940,46 +2398,104 @@ class InfoLogLevelJni : public JavaClass {
    * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
    */
   static jclass getJClass(JNIEnv* env) {
-    return JavaClass::getJClass(env, "org/rocksdb/InfoLogLevel");
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB");
   }
+};
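+
+// Usage sketch for the portal-class pattern (illustrative only, not part of
+// the original file): JNI bindings resolve Java classes through these portal
+// classes, treating a nullptr result as "a Java exception is already pending":
+//
+//   jclass jclazz = rocksdb::RocksDBJni::getJClass(env);
+//   if (jclazz == nullptr) {
+//     return;  // exception occurred accessing class
+//   }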
+
+// The portal class for org.rocksdb.Options
+class OptionsJni : public RocksDBNativeClass<
+    rocksdb::Options*, OptionsJni> {
+ public:
   /**
-   * Get an enum field of org.rocksdb.InfoLogLevel
+   * Get the Java Class org.rocksdb.Options
    *
    * @param env A pointer to the Java environment
-   * @param name The name of the enum field
    *
-   * @return A reference to the enum field value or a nullptr if
-   *     the enum field value could not be retrieved
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
    */
-  static jobject getEnum(JNIEnv* env, const char name[]) {
-    jclass jclazz = getJClass(env);
-    if(jclazz == nullptr) {
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options");
+  }
+};
+
+// The portal class for org.rocksdb.DBOptions
+class DBOptionsJni : public RocksDBNativeClass<
+    rocksdb::DBOptions*, DBOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.DBOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions");
+  }
+};
+
+// The portal class for org.rocksdb.ColumnFamilyOptions
+class ColumnFamilyOptionsJni
+    : public RocksDBNativeClass<rocksdb::ColumnFamilyOptions*,
+                                ColumnFamilyOptionsJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.ColumnFamilyOptions
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/ColumnFamilyOptions");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.ColumnFamilyOptions object with the same
+   * properties as the provided C++ rocksdb::ColumnFamilyOptions object
+   *
+   * @param env A pointer to the Java environment
+   * @param cfoptions A pointer to rocksdb::ColumnFamilyOptions object
+   *
+   * @return A reference to a Java org.rocksdb.ColumnFamilyOptions object, or
+   *     nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env, const ColumnFamilyOptions* cfoptions) {
+    auto* cfo = new rocksdb::ColumnFamilyOptions(*cfoptions);
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
       // exception occurred accessing class
       return nullptr;
     }
 
-    jfieldID jfid =
-        env->GetStaticFieldID(jclazz, name, "Lorg/rocksdb/InfoLogLevel;");
-    if(env->ExceptionCheck()) {
-      // exception occurred while getting field
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(J)V");
+    if (mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
      return nullptr;
-    } else if(jfid == nullptr) {
+    }
+
+    jobject jcfd = env->NewObject(jclazz, mid, reinterpret_cast<jlong>(cfo));
+    if (env->ExceptionCheck()) {
      return nullptr;
    }
 
-    jobject jinfo_log_level = env->GetStaticObjectField(jclazz, jfid);
-    assert(jinfo_log_level != nullptr);
-    return jinfo_log_level;
+    return jcfd;
   }
 };
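+
+// Ownership note with a sketch (added commentary, not part of the original
+// file): ColumnFamilyOptionsJni::construct above deep-copies the given
+// options, so the caller keeps ownership of `cfoptions` while the Java
+// object owns the heap copy through the jlong handle it receives:
+//
+//   rocksdb::ColumnFamilyOptions cf_options;
+//   jobject jcf_options =
+//       rocksdb::ColumnFamilyOptionsJni::construct(env, &cf_options);
+//   // cf_options itself remains valid and owned by this scope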
 
-// The portal class for org.rocksdb.Logger
-class LoggerJni : public RocksDBNativeClass<
-    std::shared_ptr<rocksdb::Logger>*, LoggerJni> {
+// The portal class for org.rocksdb.WriteOptions
+class WriteOptionsJni : public RocksDBNativeClass<
+    rocksdb::WriteOptions*, WriteOptionsJni> {
  public:
   /**
-   * Get the Java Class org/rocksdb/Logger
+   * Get the Java Class org.rocksdb.WriteOptions
    *
    * @param env A pointer to the Java environment
    *
@@ -1988,37 +2504,34 @@ class LoggerJni : public RocksDBNativeClass<
    * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
    */
   static jclass getJClass(JNIEnv* env) {
-    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Logger");
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteOptions");
   }
+};
 
+// The portal class for org.rocksdb.ReadOptions
+class ReadOptionsJni : public RocksDBNativeClass<
+    rocksdb::ReadOptions*, ReadOptionsJni> {
+ public:
   /**
-   * Get the Java Method: Logger#log
+   * Get the Java Class org.rocksdb.ReadOptions
    *
    * @param env A pointer to the Java environment
    *
-   * @return The Java Method ID or nullptr if the class or method id could not
-   *     be retieved
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
   */
-  static jmethodID getLogMethodId(JNIEnv* env) {
-    jclass jclazz = getJClass(env);
-    if(jclazz == nullptr) {
-      // exception occurred accessing class
-      return nullptr;
-    }
-
-    static jmethodID mid =
-        env->GetMethodID(jclazz, "log",
-            "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V");
-    assert(mid != nullptr);
-    return mid;
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/ReadOptions");
   }
 };
 
-// The portal class for org.rocksdb.TransactionLogIterator.BatchResult
-class BatchResultJni : public JavaClass {
-  public:
+// The portal class for org.rocksdb.WriteBatch
+class WriteBatchJni : public RocksDBNativeClass<
+    rocksdb::WriteBatch*, WriteBatchJni> {
+ public:
   /**
-   * Get the Java Class org.rocksdb.TransactionLogIterator.BatchResult
+   * Get the Java Class org.rocksdb.WriteBatch
    *
    * @param env A pointer to the Java environment
    *
@@ -2027,1315 +2540,4553 @@ class BatchResultJni : public JavaClass {
    * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
    */
   static jclass getJClass(JNIEnv* env) {
-    return JavaClass::getJClass(env,
-        "org/rocksdb/TransactionLogIterator$BatchResult");
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/WriteBatch");
   }
 
   /**
-   * Create a new Java org.rocksdb.TransactionLogIterator.BatchResult object
-   * with the same properties as the provided C++ rocksdb::BatchResult object
+   * Create a new Java org.rocksdb.WriteBatch object
    *
    * @param env A pointer to the Java environment
-   * @param batch_result The rocksdb::BatchResult object
+   * @param wb A pointer to rocksdb::WriteBatch object
    *
-   * @return A reference to a Java
-   *     org.rocksdb.TransactionLogIterator.BatchResult object,
-   *     or nullptr if an an exception occurs
+   * @return A reference to a Java org.rocksdb.WriteBatch object, or
+   *     nullptr if an exception occurs
   */
-  static jobject construct(JNIEnv* env,
-      rocksdb::BatchResult& batch_result) {
+  static jobject construct(JNIEnv* env, const WriteBatch* wb) {
     jclass jclazz = getJClass(env);
     if(jclazz == nullptr) {
       // exception occurred accessing class
       return nullptr;
     }
 
-    jmethodID mid = env->GetMethodID(
-        jclazz, "<init>", "(JJ)V");
-    if(mid == nullptr) {
+    jmethodID mid = env->GetMethodID(jclazz, "<init>", "(J)V");
+    if (mid == nullptr) {
       // exception thrown: NoSuchMethodException or OutOfMemoryError
       return nullptr;
     }
 
-    jobject jbatch_result = env->NewObject(jclazz, mid,
-        batch_result.sequence, batch_result.writeBatchPtr.get());
-    if(jbatch_result == nullptr) {
-      // exception thrown: InstantiationException or OutOfMemoryError
+    jobject jwb = env->NewObject(jclazz, mid, reinterpret_cast<jlong>(wb));
+    if (env->ExceptionCheck()) {
       return nullptr;
     }
 
-    batch_result.writeBatchPtr.release();
-    return jbatch_result;
+    return jwb;
   }
 };
 
-// The portal class for org.rocksdb.CompactionStopStyle
-class CompactionStopStyleJni {
+// The portal class for org.rocksdb.WriteBatch.Handler
+class WriteBatchHandlerJni : public RocksDBNativeClass<
+    const rocksdb::WriteBatchHandlerJniCallback*,
+    WriteBatchHandlerJni> {
  public:
-  // Returns the equivalent org.rocksdb.CompactionStopStyle for the provided
-  // C++ rocksdb::CompactionStopStyle enum
-  static jbyte toJavaCompactionStopStyle(
-      const rocksdb::CompactionStopStyle& compaction_stop_style) {
-
switch(compaction_stop_style) { - case rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize: - return 0x0; - case rocksdb::CompactionStopStyle::kCompactionStopStyleTotalSize: - return 0x1; - default: - return 0x7F; // undefined - } + /** + * Get the Java Class org.rocksdb.WriteBatch.Handler + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch$Handler"); } - // Returns the equivalent C++ rocksdb::CompactionStopStyle enum for the - // provided Java org.rocksdb.CompactionStopStyle - static rocksdb::CompactionStopStyle toCppCompactionStopStyle( - jbyte jcompaction_stop_style) { - switch(jcompaction_stop_style) { - case 0x0: - return rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize; - case 0x1: - return rocksdb::CompactionStopStyle::kCompactionStopStyleTotalSize; - default: - // undefined/default - return rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize; + /** + * Get the Java Method: WriteBatch.Handler#put + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getPutCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } - } -}; -// The portal class for org.rocksdb.CompressionType -class CompressionTypeJni { - public: - // Returns the equivalent org.rocksdb.CompressionType for the provided - // C++ rocksdb::CompressionType enum - static jbyte toJavaCompressionType( - const rocksdb::CompressionType& compression_type) { - switch(compression_type) { - case rocksdb::CompressionType::kNoCompression: - return 0x0; - case rocksdb::CompressionType::kSnappyCompression: - return 0x1; - case rocksdb::CompressionType::kZlibCompression: - return 0x2; - case rocksdb::CompressionType::kBZip2Compression: - return 0x3; - case rocksdb::CompressionType::kLZ4Compression: - return 0x4; - case rocksdb::CompressionType::kLZ4HCCompression: - return 0x5; - case rocksdb::CompressionType::kXpressCompression: - return 0x6; - case rocksdb::CompressionType::kZSTD: - return 0x7; - case rocksdb::CompressionType::kDisableCompressionOption: - default: - return 0x7F; - } + static jmethodID mid = env->GetMethodID(jclazz, "put", "(I[B[B)V"); + assert(mid != nullptr); + return mid; } - // Returns the equivalent C++ rocksdb::CompressionType enum for the - // provided Java org.rocksdb.CompressionType - static rocksdb::CompressionType toCppCompressionType( - jbyte jcompression_type) { - switch(jcompression_type) { - case 0x0: - return rocksdb::CompressionType::kNoCompression; - case 0x1: - return rocksdb::CompressionType::kSnappyCompression; - case 0x2: - return rocksdb::CompressionType::kZlibCompression; - case 0x3: - return rocksdb::CompressionType::kBZip2Compression; - case 0x4: - return rocksdb::CompressionType::kLZ4Compression; - case 0x5: - return rocksdb::CompressionType::kLZ4HCCompression; - case 0x6: - return rocksdb::CompressionType::kXpressCompression; - case 0x7: - return rocksdb::CompressionType::kZSTD; - case 0x7F: - default: - return rocksdb::CompressionType::kDisableCompressionOption; + /** + * Get the Java Method: WriteBatch.Handler#put + * + * @param env A pointer to 
the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getPutMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "put", "([B[B)V"); + assert(mid != nullptr); + return mid; } -}; -// The portal class for org.rocksdb.CompactionPriority -class CompactionPriorityJni { - public: - // Returns the equivalent org.rocksdb.CompactionPriority for the provided - // C++ rocksdb::CompactionPri enum - static jbyte toJavaCompactionPriority( - const rocksdb::CompactionPri& compaction_priority) { - switch(compaction_priority) { - case rocksdb::CompactionPri::kByCompensatedSize: - return 0x0; - case rocksdb::CompactionPri::kOldestLargestSeqFirst: - return 0x1; - case rocksdb::CompactionPri::kOldestSmallestSeqFirst: - return 0x2; - case rocksdb::CompactionPri::kMinOverlappingRatio: - return 0x3; - default: - return 0x0; // undefined + /** + * Get the Java Method: WriteBatch.Handler#merge + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMergeCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "merge", "(I[B[B)V"); + assert(mid != nullptr); + return mid; } - // Returns the equivalent C++ rocksdb::CompactionPri enum for the - // provided Java org.rocksdb.CompactionPriority - static rocksdb::CompactionPri toCppCompactionPriority( - jbyte jcompaction_priority) { - switch(jcompaction_priority) { - case 0x0: - return rocksdb::CompactionPri::kByCompensatedSize; - case 0x1: - return rocksdb::CompactionPri::kOldestLargestSeqFirst; - case 0x2: - return rocksdb::CompactionPri::kOldestSmallestSeqFirst; - case 0x3: - return rocksdb::CompactionPri::kMinOverlappingRatio; - default: - // undefined/default - return rocksdb::CompactionPri::kByCompensatedSize; + /** + * Get the Java Method: WriteBatch.Handler#merge + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMergeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "merge", "([B[B)V"); + assert(mid != nullptr); + return mid; } -}; -// The portal class for org.rocksdb.AccessHint -class AccessHintJni { - public: - // Returns the equivalent org.rocksdb.AccessHint for the provided - // C++ rocksdb::DBOptions::AccessHint enum - static jbyte toJavaAccessHint( - const rocksdb::DBOptions::AccessHint& access_hint) { - switch(access_hint) { - case rocksdb::DBOptions::AccessHint::NONE: - return 0x0; - case rocksdb::DBOptions::AccessHint::NORMAL: - return 0x1; - case rocksdb::DBOptions::AccessHint::SEQUENTIAL: - return 0x2; - case rocksdb::DBOptions::AccessHint::WILLNEED: - return 0x3; - default: - // undefined/default - return 0x1; + /** + * Get the Java Method: WriteBatch.Handler#delete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getDeleteCfMethodId(JNIEnv* env) { + jclass jclazz = 
getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "delete", "(I[B)V"); + assert(mid != nullptr); + return mid; } - // Returns the equivalent C++ rocksdb::DBOptions::AccessHint enum for the - // provided Java org.rocksdb.AccessHint - static rocksdb::DBOptions::AccessHint toCppAccessHint(jbyte jaccess_hint) { - switch(jaccess_hint) { - case 0x0: - return rocksdb::DBOptions::AccessHint::NONE; - case 0x1: - return rocksdb::DBOptions::AccessHint::NORMAL; - case 0x2: - return rocksdb::DBOptions::AccessHint::SEQUENTIAL; - case 0x3: - return rocksdb::DBOptions::AccessHint::WILLNEED; - default: - // undefined/default - return rocksdb::DBOptions::AccessHint::NORMAL; + /** + * Get the Java Method: WriteBatch.Handler#delete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getDeleteMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "delete", "([B)V"); + assert(mid != nullptr); + return mid; } -}; -// The portal class for org.rocksdb.WALRecoveryMode -class WALRecoveryModeJni { - public: - // Returns the equivalent org.rocksdb.WALRecoveryMode for the provided - // C++ rocksdb::WALRecoveryMode enum - static jbyte toJavaWALRecoveryMode( - const rocksdb::WALRecoveryMode& wal_recovery_mode) { - switch(wal_recovery_mode) { - case rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords: - return 0x0; - case rocksdb::WALRecoveryMode::kAbsoluteConsistency: - return 0x1; - case rocksdb::WALRecoveryMode::kPointInTimeRecovery: - return 0x2; - case rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords: - return 0x3; - default: - // undefined/default - return 0x2; + /** + * Get the Java Method: WriteBatch.Handler#singleDelete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getSingleDeleteCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "(I[B)V"); + assert(mid != nullptr); + return mid; } - // Returns the equivalent C++ rocksdb::WALRecoveryMode enum for the - // provided Java org.rocksdb.WALRecoveryMode - static rocksdb::WALRecoveryMode toCppWALRecoveryMode(jbyte jwal_recovery_mode) { - switch(jwal_recovery_mode) { - case 0x0: - return rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords; - case 0x1: - return rocksdb::WALRecoveryMode::kAbsoluteConsistency; - case 0x2: - return rocksdb::WALRecoveryMode::kPointInTimeRecovery; - case 0x3: - return rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords; - default: - // undefined/default - return rocksdb::WALRecoveryMode::kPointInTimeRecovery; + /** + * Get the Java Method: WriteBatch.Handler#singleDelete + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getSingleDeleteMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID(jclazz, "singleDelete", "([B)V"); + assert(mid 
!= nullptr); + return mid; } -}; -// The portal class for org.rocksdb.TickerType -class TickerTypeJni { - public: - // Returns the equivalent org.rocksdb.TickerType for the provided - // C++ rocksdb::Tickers enum - static jbyte toJavaTickerType( - const rocksdb::Tickers& tickers) { - switch(tickers) { - case rocksdb::Tickers::BLOCK_CACHE_MISS: - return 0x0; - case rocksdb::Tickers::BLOCK_CACHE_HIT: - return 0x1; - case rocksdb::Tickers::BLOCK_CACHE_ADD: - return 0x2; - case rocksdb::Tickers::BLOCK_CACHE_ADD_FAILURES: - return 0x3; - case rocksdb::Tickers::BLOCK_CACHE_INDEX_MISS: - return 0x4; - case rocksdb::Tickers::BLOCK_CACHE_INDEX_HIT: - return 0x5; - case rocksdb::Tickers::BLOCK_CACHE_INDEX_ADD: - return 0x6; - case rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT: - return 0x7; - case rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT: - return 0x8; - case rocksdb::Tickers::BLOCK_CACHE_FILTER_MISS: - return 0x9; - case rocksdb::Tickers::BLOCK_CACHE_FILTER_HIT: - return 0xA; - case rocksdb::Tickers::BLOCK_CACHE_FILTER_ADD: - return 0xB; - case rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT: - return 0xC; - case rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT: - return 0xD; - case rocksdb::Tickers::BLOCK_CACHE_DATA_MISS: - return 0xE; - case rocksdb::Tickers::BLOCK_CACHE_DATA_HIT: - return 0xF; - case rocksdb::Tickers::BLOCK_CACHE_DATA_ADD: - return 0x10; - case rocksdb::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT: - return 0x11; - case rocksdb::Tickers::BLOCK_CACHE_BYTES_READ: - return 0x12; - case rocksdb::Tickers::BLOCK_CACHE_BYTES_WRITE: - return 0x13; - case rocksdb::Tickers::BLOOM_FILTER_USEFUL: - return 0x14; - case rocksdb::Tickers::PERSISTENT_CACHE_HIT: - return 0x15; - case rocksdb::Tickers::PERSISTENT_CACHE_MISS: - return 0x16; - case rocksdb::Tickers::SIM_BLOCK_CACHE_HIT: - return 0x17; - case rocksdb::Tickers::SIM_BLOCK_CACHE_MISS: - return 0x18; - case rocksdb::Tickers::MEMTABLE_HIT: - return 0x19; - case rocksdb::Tickers::MEMTABLE_MISS: - return 0x1A; - case rocksdb::Tickers::GET_HIT_L0: - return 0x1B; - case rocksdb::Tickers::GET_HIT_L1: - return 0x1C; + /** + * Get the Java Method: WriteBatch.Handler#deleteRange + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getDeleteRangeCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#deleteRange + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getDeleteRangeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "deleteRange", "([B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#logData + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getLogDataMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static 
jmethodID mid = env->GetMethodID(jclazz, "logData", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#putBlobIndex + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getPutBlobIndexCfMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "putBlobIndex", "(I[B[B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markBeginPrepare + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMarkBeginPrepareMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markBeginPrepare", "()V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markEndPrepare + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMarkEndPrepareMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markEndPrepare", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markNoop + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMarkNoopMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markNoop", "(Z)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markRollback + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMarkRollbackMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markRollback", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#markCommit + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getMarkCommitMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "markCommit", "([B)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: WriteBatch.Handler#shouldContinue + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getContinueMethodId(JNIEnv* env) { + jclass jclazz = 
getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "shouldContinue", "()Z"); + assert(mid != nullptr); + return mid; + } +}; + +class WriteBatchSavePointJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.WriteBatch.SavePoint + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WriteBatch$SavePoint"); + } + + /** + * Get the Java Method: HistogramData constructor + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getConstructorMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "", "(JJJ)V"); + assert(mid != nullptr); + return mid; + } + + /** + * Create a new Java org.rocksdb.WriteBatch.SavePoint object + * + * @param env A pointer to the Java environment + * @param savePoint A pointer to rocksdb::WriteBatch::SavePoint object + * + * @return A reference to a Java org.rocksdb.WriteBatch.SavePoint object, or + * nullptr if an an exception occurs + */ + static jobject construct(JNIEnv* env, const SavePoint &save_point) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = getConstructorMethodId(env); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jobject jsave_point = env->NewObject(jclazz, mid, + static_cast(save_point.size), + static_cast(save_point.count), + static_cast(save_point.content_flags)); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jsave_point; + } +}; + +// The portal class for org.rocksdb.WriteBatchWithIndex +class WriteBatchWithIndexJni : public RocksDBNativeClass< + rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> { + public: + /** + * Get the Java Class org.rocksdb.WriteBatchWithIndex + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatchWithIndex"); + } +}; + +// The portal class for org.rocksdb.HistogramData +class HistogramDataJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.HistogramData + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/HistogramData"); + } + + /** + * Get the Java Method: HistogramData constructor + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID 
getConstructorMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, "", "(DDDDDDJJD)V"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.BackupableDBOptions +class BackupableDBOptionsJni : public RocksDBNativeClass< + rocksdb::BackupableDBOptions*, BackupableDBOptionsJni> { + public: + /** + * Get the Java Class org.rocksdb.BackupableDBOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/BackupableDBOptions"); + } +}; + +// The portal class for org.rocksdb.BackupEngine +class BackupEngineJni : public RocksDBNativeClass< + rocksdb::BackupEngine*, BackupEngineJni> { + public: + /** + * Get the Java Class org.rocksdb.BackupableEngine + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/BackupEngine"); + } +}; + +// The portal class for org.rocksdb.RocksIterator +class IteratorJni : public RocksDBNativeClass< + rocksdb::Iterator*, IteratorJni> { + public: + /** + * Get the Java Class org.rocksdb.RocksIterator + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksIterator"); + } +}; + +// The portal class for org.rocksdb.Filter +class FilterJni : public RocksDBNativeClass< + std::shared_ptr*, FilterJni> { + public: + /** + * Get the Java Class org.rocksdb.Filter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Filter"); + } +}; + +// The portal class for org.rocksdb.ColumnFamilyHandle +class ColumnFamilyHandleJni : public RocksDBNativeClass< + rocksdb::ColumnFamilyHandle*, ColumnFamilyHandleJni> { + public: + /** + * Get the Java Class org.rocksdb.ColumnFamilyHandle + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyHandle"); + } +}; + +// The portal class for org.rocksdb.FlushOptions +class FlushOptionsJni : public RocksDBNativeClass< + rocksdb::FlushOptions*, FlushOptionsJni> { + public: + /** + * Get the Java Class org.rocksdb.FlushOptions + * + * @param env A pointer to the Java environment + * + * @return The 
Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/FlushOptions"); + } +}; + +// The portal class for org.rocksdb.ComparatorOptions +class ComparatorOptionsJni : public RocksDBNativeClass< + rocksdb::ComparatorJniCallbackOptions*, ComparatorOptionsJni> { + public: + /** + * Get the Java Class org.rocksdb.ComparatorOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/ComparatorOptions"); + } +}; + +// The portal class for org.rocksdb.AbstractCompactionFilterFactory +class AbstractCompactionFilterFactoryJni : public RocksDBNativeClass< + const rocksdb::CompactionFilterFactoryJniCallback*, + AbstractCompactionFilterFactoryJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractCompactionFilterFactory + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractCompactionFilterFactory"); + } + + /** + * Get the Java Method: AbstractCompactionFilterFactory#name + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getNameMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID( + jclazz, "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractCompactionFilterFactory#createCompactionFilter + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retieved + */ + static jmethodID getCreateCompactionFilterMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID(jclazz, + "createCompactionFilter", + "(ZZ)J"); + assert(mid != nullptr); + return mid; + } +}; + +// The portal class for org.rocksdb.AbstractTransactionNotifier +class AbstractTransactionNotifierJni : public RocksDBNativeClass< + const rocksdb::TransactionNotifierJniCallback*, + AbstractTransactionNotifierJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractTransactionNotifier"); + } + + // Get the java method `snapshotCreated` + // of org.rocksdb.AbstractTransactionNotifier. 
+  static jmethodID getSnapshotCreatedMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "snapshotCreated", "(J)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractComparator
+class AbstractComparatorJni : public RocksDBNativeClass<
+    const rocksdb::BaseComparatorJniCallback*,
+    AbstractComparatorJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractComparator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env,
+        "org/rocksdb/AbstractComparator");
+  }
+
+  /**
+   * Get the Java Method: Comparator#name
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getNameMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "name", "()Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Comparator#compare
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getCompareMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "compare",
+            "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Comparator#findShortestSeparator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "findShortestSeparator",
+            "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+
+  /**
+   * Get the Java Method: Comparator#findShortSuccessor
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "findShortSuccessor",
+            "(Ljava/lang/String;)Ljava/lang/String;");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.AbstractSlice
+class AbstractSliceJni : public NativeRocksMutableObject<
+    const rocksdb::Slice*, AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.AbstractSlice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/AbstractSlice");
+  }
+};
+
+// The portal class for org.rocksdb.Slice
+class SliceJni : public NativeRocksMutableObject<
+    const rocksdb::Slice*, AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.Slice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Slice");
+  }
+
+  /**
+   * Constructs a Slice object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java Slice object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
+    if(mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jobject jslice = env->NewObject(jclazz, mid);
+    if(env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jslice;
+  }
+};
+
+// The portal class for org.rocksdb.DirectSlice
+class DirectSliceJni : public NativeRocksMutableObject<
+    const rocksdb::Slice*, AbstractSliceJni> {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.DirectSlice
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/DirectSlice");
+  }
+
+  /**
+   * Constructs a DirectSlice object
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to a Java DirectSlice object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid = env->GetMethodID(jclazz, "<init>", "()V");
+    if(mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jobject jdirect_slice = env->NewObject(jclazz, mid);
+    if(env->ExceptionCheck()) {
+      return nullptr;
+    }
+
+    return jdirect_slice;
+  }
+};
+
+// The portal class for org.rocksdb.BackupInfo
+class BackupInfoJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.BackupInfo
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/BackupInfo");
+  }
+
+  /**
+   * Constructs a BackupInfo object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_id id of the backup
+   * @param timestamp timestamp of the backup
+   * @param size size of the backup
+   * @param number_files number of files related to the backup
+   * @param app_metadata application specific metadata
+   *
+   * @return A reference to a Java BackupInfo object, or a nullptr if an
+   *     exception occurs
+   */
+  static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp,
+                            uint64_t size, uint32_t number_files,
+                            const std::string& app_metadata) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "<init>", "(IJJILjava/lang/String;)V");
+    if(mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jstring japp_metadata = nullptr;
+    if (!app_metadata.empty()) {
+      japp_metadata = env->NewStringUTF(app_metadata.c_str());
+      if (japp_metadata == nullptr) {
+        // exception occurred creating java string
+        return nullptr;
+      }
+    }
+
+    jobject jbackup_info = env->NewObject(jclazz, mid, backup_id, timestamp,
+        size, number_files, japp_metadata);
+    if(env->ExceptionCheck()) {
+      env->DeleteLocalRef(japp_metadata);
+      return nullptr;
+    }
+
+    return jbackup_info;
+  }
+};
+
+class BackupInfoListJni {
+ public:
+  /**
+   * Converts a C++ std::vector<rocksdb::BackupInfo> object to
+   * a Java ArrayList object
+   *
+   * @param env A pointer to the Java environment
+   * @param backup_infos A vector of BackupInfo
+   *
+   * @return Either a reference to a Java ArrayList object, or a nullptr
+   *     if an exception occurs
+   */
+  static jobject getBackupInfo(JNIEnv* env,
+      std::vector<BackupInfo> backup_infos) {
+    jclass jarray_list_clazz = rocksdb::ListJni::getArrayListClass(env);
+    if(jarray_list_clazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID cstr_mid = rocksdb::ListJni::getArrayListConstructorMethodId(env);
+    if(cstr_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    jmethodID add_mid = rocksdb::ListJni::getListAddMethodId(env);
+    if(add_mid == nullptr) {
+      // exception occurred accessing method
+      return nullptr;
+    }
+
+    // create java list
+    jobject jbackup_info_handle_list =
+        env->NewObject(jarray_list_clazz, cstr_mid, backup_infos.size());
+    if(env->ExceptionCheck()) {
+      // exception occurred constructing object
+      return nullptr;
+    }
+
+    // insert in java list
+    auto end = backup_infos.end();
+    for (auto it = backup_infos.begin(); it != end; ++it) {
+      auto backup_info = *it;
+
+      jobject obj = rocksdb::BackupInfoJni::construct0(
+          env, backup_info.backup_id, backup_info.timestamp, backup_info.size,
+          backup_info.number_files, backup_info.app_metadata);
+      if(env->ExceptionCheck()) {
+        // exception occurred constructing object
+        if(obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if(jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+
+      jboolean rs =
+          env->CallBooleanMethod(jbackup_info_handle_list, add_mid, obj);
+      if(env->ExceptionCheck() || rs == JNI_FALSE) {
+        // exception occurred calling method, or could not add
+        if(obj != nullptr) {
+          env->DeleteLocalRef(obj);
+        }
+        if(jbackup_info_handle_list != nullptr) {
+          env->DeleteLocalRef(jbackup_info_handle_list);
+        }
+        return nullptr;
+      }
+    }
+
+    return jbackup_info_handle_list;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator
+class WBWIRocksIteratorJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator");
+  }
+
+  /**
+   * Get the Java Field: WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Field ID or nullptr if the class or field id could not
+   *     be retrieved
+   */
+  static jfieldID getWriteEntryField(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jfieldID fid =
+        env->GetFieldID(jclazz, "entry",
+            "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;");
+    assert(fid != nullptr);
+    return fid;
+  }
+
+  /**
+   * Gets the value of the WBWIRocksIterator#entry
+   *
+   * @param env A pointer to the Java environment
+   * @param jwbwi_rocks_iterator A reference to a WBWIIterator
+   *
+   * @return A reference to a Java WBWIRocksIterator.WriteEntry object, or
+   *     a nullptr if an exception occurs
+   */
+  static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) {
+    assert(jwbwi_rocks_iterator != nullptr);
+
+    jfieldID jwrite_entry_field = getWriteEntryField(env);
+    if(jwrite_entry_field == nullptr) {
+      // exception occurred accessing the field
+      return nullptr;
+    }
+
+    jobject jwe = env->GetObjectField(jwbwi_rocks_iterator, jwrite_entry_field);
+    assert(jwe != nullptr);
+    return jwe;
+  }
+};
+
+// The portal class for org.rocksdb.WBWIRocksIterator.WriteType
+class WriteTypeJni : public JavaClass {
+ public:
+  /**
+   * Get the PUT enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject PUT(JNIEnv* env) {
+    return getEnum(env, "PUT");
+  }
+
+  /**
+   * Get the MERGE enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject MERGE(JNIEnv* env) {
+    return getEnum(env, "MERGE");
+  }
+
+  /**
+   * Get the DELETE enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject DELETE(JNIEnv* env) {
+    return getEnum(env, "DELETE");
+  }
+
+  /**
+   * Get the LOG enum field value of WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject LOG(JNIEnv* env) {
+    return getEnum(env, "LOG");
+  }
+
+  // Returns the equivalent org.rocksdb.WBWIRocksIterator.WriteType for the
+  // provided C++ rocksdb::WriteType enum
+  static jbyte toJavaWriteType(const rocksdb::WriteType& writeType) {
+    switch (writeType) {
+      case rocksdb::WriteType::kPutRecord:
+        return 0x0;
+      case rocksdb::WriteType::kMergeRecord:
+        return 0x1;
+      case rocksdb::WriteType::kDeleteRecord:
+        return 0x2;
+      case rocksdb::WriteType::kSingleDeleteRecord:
+        return 0x3;
+      case rocksdb::WriteType::kDeleteRangeRecord:
+        return 0x4;
+      case rocksdb::WriteType::kLogDataRecord:
+        return 0x5;
+      case rocksdb::WriteType::kXIDRecord:
+        return 0x6;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+ private:
+  /**
+   * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteType
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The
Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteType"); + } + + /** + * Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType + * + * @param env A pointer to the Java environment + * @param name The name of the enum field + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject getEnum(JNIEnv* env, const char name[]) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jfieldID jfid = + env->GetStaticFieldID(jclazz, name, + "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + if(env->ExceptionCheck()) { + // exception occurred while getting field + return nullptr; + } else if(jfid == nullptr) { + return nullptr; + } + + jobject jwrite_type = env->GetStaticObjectField(jclazz, jfid); + assert(jwrite_type != nullptr); + return jwrite_type; + } +}; + +// The portal class for org.rocksdb.WBWIRocksIterator.WriteEntry +class WriteEntryJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.WBWIRocksIterator.WriteEntry + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/WBWIRocksIterator$WriteEntry"); + } +}; + +// The portal class for org.rocksdb.InfoLogLevel +class InfoLogLevelJni : public JavaClass { + public: + /** + * Get the DEBUG_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject DEBUG_LEVEL(JNIEnv* env) { + return getEnum(env, "DEBUG_LEVEL"); + } + + /** + * Get the INFO_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject INFO_LEVEL(JNIEnv* env) { + return getEnum(env, "INFO_LEVEL"); + } + + /** + * Get the WARN_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject WARN_LEVEL(JNIEnv* env) { + return getEnum(env, "WARN_LEVEL"); + } + + /** + * Get the ERROR_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject ERROR_LEVEL(JNIEnv* env) { + return getEnum(env, "ERROR_LEVEL"); + } + + /** + * Get the FATAL_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to the Java environment + * + * @return A reference to the enum field value or a nullptr if + * the enum field value could not be retrieved + */ + static jobject FATAL_LEVEL(JNIEnv* env) { + return getEnum(env, "FATAL_LEVEL"); + } + + /** + * Get the HEADER_LEVEL enum field value of InfoLogLevel + * + * @param env A pointer to 
the Java environment
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject HEADER_LEVEL(JNIEnv* env) {
+    return getEnum(env, "HEADER_LEVEL");
+  }
+
+ private:
+  /**
+   * Get the Java Class org.rocksdb.InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env, "org/rocksdb/InfoLogLevel");
+  }
+
+  /**
+   * Get an enum field of org.rocksdb.InfoLogLevel
+   *
+   * @param env A pointer to the Java environment
+   * @param name The name of the enum field
+   *
+   * @return A reference to the enum field value or a nullptr if
+   *     the enum field value could not be retrieved
+   */
+  static jobject getEnum(JNIEnv* env, const char name[]) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jfieldID jfid =
+        env->GetStaticFieldID(jclazz, name, "Lorg/rocksdb/InfoLogLevel;");
+    if(env->ExceptionCheck()) {
+      // exception occurred while getting field
+      return nullptr;
+    } else if(jfid == nullptr) {
+      return nullptr;
+    }
+
+    jobject jinfo_log_level = env->GetStaticObjectField(jclazz, jfid);
+    assert(jinfo_log_level != nullptr);
+    return jinfo_log_level;
+  }
+};
+
+// The portal class for org.rocksdb.Logger
+class LoggerJni : public RocksDBNativeClass<
+    std::shared_ptr<rocksdb::LoggerJniCallback>*, LoggerJni> {
+ public:
+  /**
+   * Get the Java Class org/rocksdb/Logger
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return RocksDBNativeClass::getJClass(env, "org/rocksdb/Logger");
+  }
+
+  /**
+   * Get the Java Method: Logger#log
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Method ID or nullptr if the class or method id could not
+   *     be retrieved
+   */
+  static jmethodID getLogMethodId(JNIEnv* env) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    static jmethodID mid =
+        env->GetMethodID(jclazz, "log",
+            "(Lorg/rocksdb/InfoLogLevel;Ljava/lang/String;)V");
+    assert(mid != nullptr);
+    return mid;
+  }
+};
+
+// The portal class for org.rocksdb.TransactionLogIterator.BatchResult
+class BatchResultJni : public JavaClass {
+ public:
+  /**
+   * Get the Java Class org.rocksdb.TransactionLogIterator.BatchResult
+   *
+   * @param env A pointer to the Java environment
+   *
+   * @return The Java Class or nullptr if one of the
+   *     ClassFormatError, ClassCircularityError, NoClassDefFoundError,
+   *     OutOfMemoryError or ExceptionInInitializerError exceptions is thrown
+   */
+  static jclass getJClass(JNIEnv* env) {
+    return JavaClass::getJClass(env,
+        "org/rocksdb/TransactionLogIterator$BatchResult");
+  }
+
+  /**
+   * Create a new Java org.rocksdb.TransactionLogIterator.BatchResult object
+   * with the same properties as the provided C++ rocksdb::BatchResult object
+   *
+   * @param env A pointer to the Java environment
+   * @param batch_result The rocksdb::BatchResult object
+   *
+   * @return A reference to a Java
+   *     org.rocksdb.TransactionLogIterator.BatchResult object,
+   *     or nullptr if an exception occurs
+   */
+  static jobject construct(JNIEnv* env,
+      rocksdb::BatchResult& batch_result) {
+    jclass jclazz = getJClass(env);
+    if(jclazz == nullptr) {
+      // exception occurred accessing class
+      return nullptr;
+    }
+
+    jmethodID mid = env->GetMethodID(
+        jclazz, "<init>", "(JJ)V");
+    if(mid == nullptr) {
+      // exception thrown: NoSuchMethodException or OutOfMemoryError
+      return nullptr;
+    }
+
+    jobject jbatch_result = env->NewObject(jclazz, mid,
+        batch_result.sequence, batch_result.writeBatchPtr.get());
+    if(jbatch_result == nullptr) {
+      // exception thrown: InstantiationException or OutOfMemoryError
+      return nullptr;
+    }
+
+    batch_result.writeBatchPtr.release();
+    return jbatch_result;
+  }
+};
+
+// The portal class for org.rocksdb.BottommostLevelCompaction
+class BottommostLevelCompactionJni {
+ public:
+  // Returns the equivalent org.rocksdb.BottommostLevelCompaction for the provided
+  // C++ rocksdb::BottommostLevelCompaction enum
+  static jint toJavaBottommostLevelCompaction(
+      const rocksdb::BottommostLevelCompaction& bottommost_level_compaction) {
+    switch(bottommost_level_compaction) {
+      case rocksdb::BottommostLevelCompaction::kSkip:
+        return 0x0;
+      case rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter:
+        return 0x1;
+      case rocksdb::BottommostLevelCompaction::kForce:
+        return 0x2;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ rocksdb::BottommostLevelCompaction enum for the
+  // provided Java org.rocksdb.BottommostLevelCompaction
+  static rocksdb::BottommostLevelCompaction toCppBottommostLevelCompaction(
+      jint bottommost_level_compaction) {
+    switch(bottommost_level_compaction) {
+      case 0x0:
+        return rocksdb::BottommostLevelCompaction::kSkip;
+      case 0x1:
+        return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
+      case 0x2:
+        return rocksdb::BottommostLevelCompaction::kForce;
+      default:
+        // undefined/default
+        return rocksdb::BottommostLevelCompaction::kIfHaveCompactionFilter;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompactionStopStyle
+class CompactionStopStyleJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompactionStopStyle for the provided
+  // C++ rocksdb::CompactionStopStyle enum
+  static jbyte toJavaCompactionStopStyle(
+      const rocksdb::CompactionStopStyle& compaction_stop_style) {
+    switch(compaction_stop_style) {
+      case rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize:
+        return 0x0;
+      case rocksdb::CompactionStopStyle::kCompactionStopStyleTotalSize:
+        return 0x1;
+      default:
+        return 0x7F;  // undefined
+    }
+  }
+
+  // Returns the equivalent C++ rocksdb::CompactionStopStyle enum for the
+  // provided Java org.rocksdb.CompactionStopStyle
+  static rocksdb::CompactionStopStyle toCppCompactionStopStyle(
+      jbyte jcompaction_stop_style) {
+    switch(jcompaction_stop_style) {
+      case 0x0:
+        return rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize;
+      case 0x1:
+        return rocksdb::CompactionStopStyle::kCompactionStopStyleTotalSize;
+      default:
+        // undefined/default
+        return rocksdb::CompactionStopStyle::kCompactionStopStyleSimilarSize;
+    }
+  }
+};
+
+// The portal class for org.rocksdb.CompressionType
+class CompressionTypeJni {
+ public:
+  // Returns the equivalent org.rocksdb.CompressionType for the provided
+  // C++ rocksdb::CompressionType enum
+  static jbyte toJavaCompressionType(
+      const rocksdb::CompressionType& compression_type) {
+    switch(compression_type) {
+      case rocksdb::CompressionType::kNoCompression:
+        return 0x0;
+      case rocksdb::CompressionType::kSnappyCompression:
+ return 0x1; + case rocksdb::CompressionType::kZlibCompression: + return 0x2; + case rocksdb::CompressionType::kBZip2Compression: + return 0x3; + case rocksdb::CompressionType::kLZ4Compression: + return 0x4; + case rocksdb::CompressionType::kLZ4HCCompression: + return 0x5; + case rocksdb::CompressionType::kXpressCompression: + return 0x6; + case rocksdb::CompressionType::kZSTD: + return 0x7; + case rocksdb::CompressionType::kDisableCompressionOption: + default: + return 0x7F; + } + } + + // Returns the equivalent C++ rocksdb::CompressionType enum for the + // provided Java org.rocksdb.CompressionType + static rocksdb::CompressionType toCppCompressionType( + jbyte jcompression_type) { + switch(jcompression_type) { + case 0x0: + return rocksdb::CompressionType::kNoCompression; + case 0x1: + return rocksdb::CompressionType::kSnappyCompression; + case 0x2: + return rocksdb::CompressionType::kZlibCompression; + case 0x3: + return rocksdb::CompressionType::kBZip2Compression; + case 0x4: + return rocksdb::CompressionType::kLZ4Compression; + case 0x5: + return rocksdb::CompressionType::kLZ4HCCompression; + case 0x6: + return rocksdb::CompressionType::kXpressCompression; + case 0x7: + return rocksdb::CompressionType::kZSTD; + case 0x7F: + default: + return rocksdb::CompressionType::kDisableCompressionOption; + } + } +}; + +// The portal class for org.rocksdb.CompactionPriority +class CompactionPriorityJni { + public: + // Returns the equivalent org.rocksdb.CompactionPriority for the provided + // C++ rocksdb::CompactionPri enum + static jbyte toJavaCompactionPriority( + const rocksdb::CompactionPri& compaction_priority) { + switch(compaction_priority) { + case rocksdb::CompactionPri::kByCompensatedSize: + return 0x0; + case rocksdb::CompactionPri::kOldestLargestSeqFirst: + return 0x1; + case rocksdb::CompactionPri::kOldestSmallestSeqFirst: + return 0x2; + case rocksdb::CompactionPri::kMinOverlappingRatio: + return 0x3; + default: + return 0x0; // undefined + } + } + + // Returns the equivalent C++ rocksdb::CompactionPri enum for the + // provided Java org.rocksdb.CompactionPriority + static rocksdb::CompactionPri toCppCompactionPriority( + jbyte jcompaction_priority) { + switch(jcompaction_priority) { + case 0x0: + return rocksdb::CompactionPri::kByCompensatedSize; + case 0x1: + return rocksdb::CompactionPri::kOldestLargestSeqFirst; + case 0x2: + return rocksdb::CompactionPri::kOldestSmallestSeqFirst; + case 0x3: + return rocksdb::CompactionPri::kMinOverlappingRatio; + default: + // undefined/default + return rocksdb::CompactionPri::kByCompensatedSize; + } + } +}; + +// The portal class for org.rocksdb.AccessHint +class AccessHintJni { + public: + // Returns the equivalent org.rocksdb.AccessHint for the provided + // C++ rocksdb::DBOptions::AccessHint enum + static jbyte toJavaAccessHint( + const rocksdb::DBOptions::AccessHint& access_hint) { + switch(access_hint) { + case rocksdb::DBOptions::AccessHint::NONE: + return 0x0; + case rocksdb::DBOptions::AccessHint::NORMAL: + return 0x1; + case rocksdb::DBOptions::AccessHint::SEQUENTIAL: + return 0x2; + case rocksdb::DBOptions::AccessHint::WILLNEED: + return 0x3; + default: + // undefined/default + return 0x1; + } + } + + // Returns the equivalent C++ rocksdb::DBOptions::AccessHint enum for the + // provided Java org.rocksdb.AccessHint + static rocksdb::DBOptions::AccessHint toCppAccessHint(jbyte jaccess_hint) { + switch(jaccess_hint) { + case 0x0: + return rocksdb::DBOptions::AccessHint::NONE; + case 0x1: + return 
rocksdb::DBOptions::AccessHint::NORMAL; + case 0x2: + return rocksdb::DBOptions::AccessHint::SEQUENTIAL; + case 0x3: + return rocksdb::DBOptions::AccessHint::WILLNEED; + default: + // undefined/default + return rocksdb::DBOptions::AccessHint::NORMAL; + } + } +}; + +// The portal class for org.rocksdb.WALRecoveryMode +class WALRecoveryModeJni { + public: + // Returns the equivalent org.rocksdb.WALRecoveryMode for the provided + // C++ rocksdb::WALRecoveryMode enum + static jbyte toJavaWALRecoveryMode( + const rocksdb::WALRecoveryMode& wal_recovery_mode) { + switch(wal_recovery_mode) { + case rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords: + return 0x0; + case rocksdb::WALRecoveryMode::kAbsoluteConsistency: + return 0x1; + case rocksdb::WALRecoveryMode::kPointInTimeRecovery: + return 0x2; + case rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords: + return 0x3; + default: + // undefined/default + return 0x2; + } + } + + // Returns the equivalent C++ rocksdb::WALRecoveryMode enum for the + // provided Java org.rocksdb.WALRecoveryMode + static rocksdb::WALRecoveryMode toCppWALRecoveryMode(jbyte jwal_recovery_mode) { + switch(jwal_recovery_mode) { + case 0x0: + return rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords; + case 0x1: + return rocksdb::WALRecoveryMode::kAbsoluteConsistency; + case 0x2: + return rocksdb::WALRecoveryMode::kPointInTimeRecovery; + case 0x3: + return rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords; + default: + // undefined/default + return rocksdb::WALRecoveryMode::kPointInTimeRecovery; + } + } +}; + +// The portal class for org.rocksdb.TickerType +class TickerTypeJni { + public: + // Returns the equivalent org.rocksdb.TickerType for the provided + // C++ rocksdb::Tickers enum + static jbyte toJavaTickerType( + const rocksdb::Tickers& tickers) { + switch(tickers) { + case rocksdb::Tickers::BLOCK_CACHE_MISS: + return 0x0; + case rocksdb::Tickers::BLOCK_CACHE_HIT: + return 0x1; + case rocksdb::Tickers::BLOCK_CACHE_ADD: + return 0x2; + case rocksdb::Tickers::BLOCK_CACHE_ADD_FAILURES: + return 0x3; + case rocksdb::Tickers::BLOCK_CACHE_INDEX_MISS: + return 0x4; + case rocksdb::Tickers::BLOCK_CACHE_INDEX_HIT: + return 0x5; + case rocksdb::Tickers::BLOCK_CACHE_INDEX_ADD: + return 0x6; + case rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT: + return 0x7; + case rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT: + return 0x8; + case rocksdb::Tickers::BLOCK_CACHE_FILTER_MISS: + return 0x9; + case rocksdb::Tickers::BLOCK_CACHE_FILTER_HIT: + return 0xA; + case rocksdb::Tickers::BLOCK_CACHE_FILTER_ADD: + return 0xB; + case rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT: + return 0xC; + case rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT: + return 0xD; + case rocksdb::Tickers::BLOCK_CACHE_DATA_MISS: + return 0xE; + case rocksdb::Tickers::BLOCK_CACHE_DATA_HIT: + return 0xF; + case rocksdb::Tickers::BLOCK_CACHE_DATA_ADD: + return 0x10; + case rocksdb::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT: + return 0x11; + case rocksdb::Tickers::BLOCK_CACHE_BYTES_READ: + return 0x12; + case rocksdb::Tickers::BLOCK_CACHE_BYTES_WRITE: + return 0x13; + case rocksdb::Tickers::BLOOM_FILTER_USEFUL: + return 0x14; + case rocksdb::Tickers::PERSISTENT_CACHE_HIT: + return 0x15; + case rocksdb::Tickers::PERSISTENT_CACHE_MISS: + return 0x16; + case rocksdb::Tickers::SIM_BLOCK_CACHE_HIT: + return 0x17; + case rocksdb::Tickers::SIM_BLOCK_CACHE_MISS: + return 0x18; + case rocksdb::Tickers::MEMTABLE_HIT: + return 0x19; + case rocksdb::Tickers::MEMTABLE_MISS: + return 0x1A; + case 
rocksdb::Tickers::GET_HIT_L0: + return 0x1B; + case rocksdb::Tickers::GET_HIT_L1: + return 0x1C; case rocksdb::Tickers::GET_HIT_L2_AND_UP: return 0x1D; - case rocksdb::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY: + case rocksdb::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY: + return 0x1E; + case rocksdb::Tickers::COMPACTION_KEY_DROP_OBSOLETE: + return 0x1F; + case rocksdb::Tickers::COMPACTION_KEY_DROP_RANGE_DEL: + return 0x20; + case rocksdb::Tickers::COMPACTION_KEY_DROP_USER: + return 0x21; + case rocksdb::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE: + return 0x22; + case rocksdb::Tickers::NUMBER_KEYS_WRITTEN: + return 0x23; + case rocksdb::Tickers::NUMBER_KEYS_READ: + return 0x24; + case rocksdb::Tickers::NUMBER_KEYS_UPDATED: + return 0x25; + case rocksdb::Tickers::BYTES_WRITTEN: + return 0x26; + case rocksdb::Tickers::BYTES_READ: + return 0x27; + case rocksdb::Tickers::NUMBER_DB_SEEK: + return 0x28; + case rocksdb::Tickers::NUMBER_DB_NEXT: + return 0x29; + case rocksdb::Tickers::NUMBER_DB_PREV: + return 0x2A; + case rocksdb::Tickers::NUMBER_DB_SEEK_FOUND: + return 0x2B; + case rocksdb::Tickers::NUMBER_DB_NEXT_FOUND: + return 0x2C; + case rocksdb::Tickers::NUMBER_DB_PREV_FOUND: + return 0x2D; + case rocksdb::Tickers::ITER_BYTES_READ: + return 0x2E; + case rocksdb::Tickers::NO_FILE_CLOSES: + return 0x2F; + case rocksdb::Tickers::NO_FILE_OPENS: + return 0x30; + case rocksdb::Tickers::NO_FILE_ERRORS: + return 0x31; + case rocksdb::Tickers::STALL_L0_SLOWDOWN_MICROS: + return 0x32; + case rocksdb::Tickers::STALL_MEMTABLE_COMPACTION_MICROS: + return 0x33; + case rocksdb::Tickers::STALL_L0_NUM_FILES_MICROS: + return 0x34; + case rocksdb::Tickers::STALL_MICROS: + return 0x35; + case rocksdb::Tickers::DB_MUTEX_WAIT_MICROS: + return 0x36; + case rocksdb::Tickers::RATE_LIMIT_DELAY_MILLIS: + return 0x37; + case rocksdb::Tickers::NO_ITERATORS: + return 0x38; + case rocksdb::Tickers::NUMBER_MULTIGET_CALLS: + return 0x39; + case rocksdb::Tickers::NUMBER_MULTIGET_KEYS_READ: + return 0x3A; + case rocksdb::Tickers::NUMBER_MULTIGET_BYTES_READ: + return 0x3B; + case rocksdb::Tickers::NUMBER_FILTERED_DELETES: + return 0x3C; + case rocksdb::Tickers::NUMBER_MERGE_FAILURES: + return 0x3D; + case rocksdb::Tickers::BLOOM_FILTER_PREFIX_CHECKED: + return 0x3E; + case rocksdb::Tickers::BLOOM_FILTER_PREFIX_USEFUL: + return 0x3F; + case rocksdb::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION: + return 0x40; + case rocksdb::Tickers::GET_UPDATES_SINCE_CALLS: + return 0x41; + case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_MISS: + return 0x42; + case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_HIT: + return 0x43; + case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD: + return 0x44; + case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES: + return 0x45; + case rocksdb::Tickers::WAL_FILE_SYNCED: + return 0x46; + case rocksdb::Tickers::WAL_FILE_BYTES: + return 0x47; + case rocksdb::Tickers::WRITE_DONE_BY_SELF: + return 0x48; + case rocksdb::Tickers::WRITE_DONE_BY_OTHER: + return 0x49; + case rocksdb::Tickers::WRITE_TIMEDOUT: + return 0x4A; + case rocksdb::Tickers::WRITE_WITH_WAL: + return 0x4B; + case rocksdb::Tickers::COMPACT_READ_BYTES: + return 0x4C; + case rocksdb::Tickers::COMPACT_WRITE_BYTES: + return 0x4D; + case rocksdb::Tickers::FLUSH_WRITE_BYTES: + return 0x4E; + case rocksdb::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES: + return 0x4F; + case rocksdb::Tickers::NUMBER_SUPERVERSION_ACQUIRES: + return 0x50; + case rocksdb::Tickers::NUMBER_SUPERVERSION_RELEASES: + return 0x51; + case rocksdb::Tickers::NUMBER_SUPERVERSION_CLEANUPS: + 
return 0x52; + case rocksdb::Tickers::NUMBER_BLOCK_COMPRESSED: + return 0x53; + case rocksdb::Tickers::NUMBER_BLOCK_DECOMPRESSED: + return 0x54; + case rocksdb::Tickers::NUMBER_BLOCK_NOT_COMPRESSED: + return 0x55; + case rocksdb::Tickers::MERGE_OPERATION_TOTAL_TIME: + return 0x56; + case rocksdb::Tickers::FILTER_OPERATION_TOTAL_TIME: + return 0x57; + case rocksdb::Tickers::ROW_CACHE_HIT: + return 0x58; + case rocksdb::Tickers::ROW_CACHE_MISS: + return 0x59; + case rocksdb::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES: + return 0x5A; + case rocksdb::Tickers::READ_AMP_TOTAL_READ_BYTES: + return 0x5B; + case rocksdb::Tickers::NUMBER_RATE_LIMITER_DRAINS: + return 0x5C; + case rocksdb::Tickers::NUMBER_ITER_SKIP: + return 0x5D; + case rocksdb::Tickers::NUMBER_MULTIGET_KEYS_FOUND: + return 0x5E; + case rocksdb::Tickers::NO_ITERATOR_CREATED: + // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. + return -0x01; + case rocksdb::Tickers::NO_ITERATOR_DELETED: + return 0x60; + case rocksdb::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE: + return 0x61; + case rocksdb::Tickers::COMPACTION_CANCELLED: + return 0x62; + case rocksdb::Tickers::BLOOM_FILTER_FULL_POSITIVE: + return 0x63; + case rocksdb::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE: + return 0x64; + case rocksdb::Tickers::BLOB_DB_NUM_PUT: + return 0x65; + case rocksdb::Tickers::BLOB_DB_NUM_WRITE: + return 0x66; + case rocksdb::Tickers::BLOB_DB_NUM_GET: + return 0x67; + case rocksdb::Tickers::BLOB_DB_NUM_MULTIGET: + return 0x68; + case rocksdb::Tickers::BLOB_DB_NUM_SEEK: + return 0x69; + case rocksdb::Tickers::BLOB_DB_NUM_NEXT: + return 0x6A; + case rocksdb::Tickers::BLOB_DB_NUM_PREV: + return 0x6B; + case rocksdb::Tickers::BLOB_DB_NUM_KEYS_WRITTEN: + return 0x6C; + case rocksdb::Tickers::BLOB_DB_NUM_KEYS_READ: + return 0x6D; + case rocksdb::Tickers::BLOB_DB_BYTES_WRITTEN: + return 0x6E; + case rocksdb::Tickers::BLOB_DB_BYTES_READ: + return 0x6F; + case rocksdb::Tickers::BLOB_DB_WRITE_INLINED: + return 0x70; + case rocksdb::Tickers::BLOB_DB_WRITE_INLINED_TTL: + return 0x71; + case rocksdb::Tickers::BLOB_DB_WRITE_BLOB: + return 0x72; + case rocksdb::Tickers::BLOB_DB_WRITE_BLOB_TTL: + return 0x73; + case rocksdb::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN: + return 0x74; + case rocksdb::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ: + return 0x75; + case rocksdb::Tickers::BLOB_DB_BLOB_FILE_SYNCED: + return 0x76; + case rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT: + return 0x77; + case rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE: + return 0x78; + case rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT: + return 0x79; + case rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE: + return 0x7A; + case rocksdb::Tickers::BLOB_DB_GC_NUM_FILES: + return 0x7B; + case rocksdb::Tickers::BLOB_DB_GC_NUM_NEW_FILES: + return 0x7C; + case rocksdb::Tickers::BLOB_DB_GC_FAILURES: + return 0x7D; + case rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN: + return 0x7E; + case rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED: + return 0x7F; + case rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED: + return -0x02; + case rocksdb::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN: + return -0x03; + case rocksdb::Tickers::BLOB_DB_GC_BYTES_EXPIRED: + return -0x04; + case rocksdb::Tickers::BLOB_DB_GC_BYTES_RELOCATED: + return -0x05; + case rocksdb::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED: + return -0x06; + case rocksdb::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED: + return -0x07; + case rocksdb::Tickers::BLOB_DB_FIFO_BYTES_EVICTED: + return -0x08; + case 
rocksdb::Tickers::TXN_PREPARE_MUTEX_OVERHEAD: + return -0x09; + case rocksdb::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD: + return -0x0A; + case rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD: + return -0x0B; + case rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD: + return -0x0C; + case rocksdb::Tickers::TICKER_ENUM_MAX: + // 0x5F for backwards compatibility on current minor version. + return 0x5F; + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ rocksdb::Tickers enum for the + // provided Java org.rocksdb.TickerType + static rocksdb::Tickers toCppTickers(jbyte jticker_type) { + switch(jticker_type) { + case 0x0: + return rocksdb::Tickers::BLOCK_CACHE_MISS; + case 0x1: + return rocksdb::Tickers::BLOCK_CACHE_HIT; + case 0x2: + return rocksdb::Tickers::BLOCK_CACHE_ADD; + case 0x3: + return rocksdb::Tickers::BLOCK_CACHE_ADD_FAILURES; + case 0x4: + return rocksdb::Tickers::BLOCK_CACHE_INDEX_MISS; + case 0x5: + return rocksdb::Tickers::BLOCK_CACHE_INDEX_HIT; + case 0x6: + return rocksdb::Tickers::BLOCK_CACHE_INDEX_ADD; + case 0x7: + return rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT; + case 0x8: + return rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT; + case 0x9: + return rocksdb::Tickers::BLOCK_CACHE_FILTER_MISS; + case 0xA: + return rocksdb::Tickers::BLOCK_CACHE_FILTER_HIT; + case 0xB: + return rocksdb::Tickers::BLOCK_CACHE_FILTER_ADD; + case 0xC: + return rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT; + case 0xD: + return rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT; + case 0xE: + return rocksdb::Tickers::BLOCK_CACHE_DATA_MISS; + case 0xF: + return rocksdb::Tickers::BLOCK_CACHE_DATA_HIT; + case 0x10: + return rocksdb::Tickers::BLOCK_CACHE_DATA_ADD; + case 0x11: + return rocksdb::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT; + case 0x12: + return rocksdb::Tickers::BLOCK_CACHE_BYTES_READ; + case 0x13: + return rocksdb::Tickers::BLOCK_CACHE_BYTES_WRITE; + case 0x14: + return rocksdb::Tickers::BLOOM_FILTER_USEFUL; + case 0x15: + return rocksdb::Tickers::PERSISTENT_CACHE_HIT; + case 0x16: + return rocksdb::Tickers::PERSISTENT_CACHE_MISS; + case 0x17: + return rocksdb::Tickers::SIM_BLOCK_CACHE_HIT; + case 0x18: + return rocksdb::Tickers::SIM_BLOCK_CACHE_MISS; + case 0x19: + return rocksdb::Tickers::MEMTABLE_HIT; + case 0x1A: + return rocksdb::Tickers::MEMTABLE_MISS; + case 0x1B: + return rocksdb::Tickers::GET_HIT_L0; + case 0x1C: + return rocksdb::Tickers::GET_HIT_L1; + case 0x1D: + return rocksdb::Tickers::GET_HIT_L2_AND_UP; + case 0x1E: + return rocksdb::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY; + case 0x1F: + return rocksdb::Tickers::COMPACTION_KEY_DROP_OBSOLETE; + case 0x20: + return rocksdb::Tickers::COMPACTION_KEY_DROP_RANGE_DEL; + case 0x21: + return rocksdb::Tickers::COMPACTION_KEY_DROP_USER; + case 0x22: + return rocksdb::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE; + case 0x23: + return rocksdb::Tickers::NUMBER_KEYS_WRITTEN; + case 0x24: + return rocksdb::Tickers::NUMBER_KEYS_READ; + case 0x25: + return rocksdb::Tickers::NUMBER_KEYS_UPDATED; + case 0x26: + return rocksdb::Tickers::BYTES_WRITTEN; + case 0x27: + return rocksdb::Tickers::BYTES_READ; + case 0x28: + return rocksdb::Tickers::NUMBER_DB_SEEK; + case 0x29: + return rocksdb::Tickers::NUMBER_DB_NEXT; + case 0x2A: + return rocksdb::Tickers::NUMBER_DB_PREV; + case 0x2B: + return rocksdb::Tickers::NUMBER_DB_SEEK_FOUND; + case 0x2C: + return rocksdb::Tickers::NUMBER_DB_NEXT_FOUND; + case 0x2D: + return rocksdb::Tickers::NUMBER_DB_PREV_FOUND; + case 0x2E: + return 
rocksdb::Tickers::ITER_BYTES_READ; + case 0x2F: + return rocksdb::Tickers::NO_FILE_CLOSES; + case 0x30: + return rocksdb::Tickers::NO_FILE_OPENS; + case 0x31: + return rocksdb::Tickers::NO_FILE_ERRORS; + case 0x32: + return rocksdb::Tickers::STALL_L0_SLOWDOWN_MICROS; + case 0x33: + return rocksdb::Tickers::STALL_MEMTABLE_COMPACTION_MICROS; + case 0x34: + return rocksdb::Tickers::STALL_L0_NUM_FILES_MICROS; + case 0x35: + return rocksdb::Tickers::STALL_MICROS; + case 0x36: + return rocksdb::Tickers::DB_MUTEX_WAIT_MICROS; + case 0x37: + return rocksdb::Tickers::RATE_LIMIT_DELAY_MILLIS; + case 0x38: + return rocksdb::Tickers::NO_ITERATORS; + case 0x39: + return rocksdb::Tickers::NUMBER_MULTIGET_CALLS; + case 0x3A: + return rocksdb::Tickers::NUMBER_MULTIGET_KEYS_READ; + case 0x3B: + return rocksdb::Tickers::NUMBER_MULTIGET_BYTES_READ; + case 0x3C: + return rocksdb::Tickers::NUMBER_FILTERED_DELETES; + case 0x3D: + return rocksdb::Tickers::NUMBER_MERGE_FAILURES; + case 0x3E: + return rocksdb::Tickers::BLOOM_FILTER_PREFIX_CHECKED; + case 0x3F: + return rocksdb::Tickers::BLOOM_FILTER_PREFIX_USEFUL; + case 0x40: + return rocksdb::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; + case 0x41: + return rocksdb::Tickers::GET_UPDATES_SINCE_CALLS; + case 0x42: + return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_MISS; + case 0x43: + return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_HIT; + case 0x44: + return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD; + case 0x45: + return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES; + case 0x46: + return rocksdb::Tickers::WAL_FILE_SYNCED; + case 0x47: + return rocksdb::Tickers::WAL_FILE_BYTES; + case 0x48: + return rocksdb::Tickers::WRITE_DONE_BY_SELF; + case 0x49: + return rocksdb::Tickers::WRITE_DONE_BY_OTHER; + case 0x4A: + return rocksdb::Tickers::WRITE_TIMEDOUT; + case 0x4B: + return rocksdb::Tickers::WRITE_WITH_WAL; + case 0x4C: + return rocksdb::Tickers::COMPACT_READ_BYTES; + case 0x4D: + return rocksdb::Tickers::COMPACT_WRITE_BYTES; + case 0x4E: + return rocksdb::Tickers::FLUSH_WRITE_BYTES; + case 0x4F: + return rocksdb::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES; + case 0x50: + return rocksdb::Tickers::NUMBER_SUPERVERSION_ACQUIRES; + case 0x51: + return rocksdb::Tickers::NUMBER_SUPERVERSION_RELEASES; + case 0x52: + return rocksdb::Tickers::NUMBER_SUPERVERSION_CLEANUPS; + case 0x53: + return rocksdb::Tickers::NUMBER_BLOCK_COMPRESSED; + case 0x54: + return rocksdb::Tickers::NUMBER_BLOCK_DECOMPRESSED; + case 0x55: + return rocksdb::Tickers::NUMBER_BLOCK_NOT_COMPRESSED; + case 0x56: + return rocksdb::Tickers::MERGE_OPERATION_TOTAL_TIME; + case 0x57: + return rocksdb::Tickers::FILTER_OPERATION_TOTAL_TIME; + case 0x58: + return rocksdb::Tickers::ROW_CACHE_HIT; + case 0x59: + return rocksdb::Tickers::ROW_CACHE_MISS; + case 0x5A: + return rocksdb::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES; + case 0x5B: + return rocksdb::Tickers::READ_AMP_TOTAL_READ_BYTES; + case 0x5C: + return rocksdb::Tickers::NUMBER_RATE_LIMITER_DRAINS; + case 0x5D: + return rocksdb::Tickers::NUMBER_ITER_SKIP; + case 0x5E: + return rocksdb::Tickers::NUMBER_MULTIGET_KEYS_FOUND; + case -0x01: + // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX. 
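An aside on the code ranges used in these tables: jbyte is a signed 8-bit type, so only 0x00 through 0x7F are non-negative, and once 0x7F was handed out the mapping had to continue into the negative range, which is why the newer tickers carry codes like -0x01, -0x02, and so on. A minimal sketch of that constraint, assuming only <jni.h> and <limits>:

    #include <jni.h>
    #include <limits>

    // jbyte tops out at 0x7F; any ticker added after the positive range
    // filled up must reuse one of the 128 negative values, exactly as the
    // table above does starting at -0x01.
    static_assert(std::numeric_limits<jbyte>::max() == 0x7F,
                  "no positive jbyte codes remain past 0x7F");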
+ return rocksdb::Tickers::NO_ITERATOR_CREATED; + case 0x60: + return rocksdb::Tickers::NO_ITERATOR_DELETED; + case 0x61: + return rocksdb::Tickers::COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE; + case 0x62: + return rocksdb::Tickers::COMPACTION_CANCELLED; + case 0x63: + return rocksdb::Tickers::BLOOM_FILTER_FULL_POSITIVE; + case 0x64: + return rocksdb::Tickers::BLOOM_FILTER_FULL_TRUE_POSITIVE; + case 0x65: + return rocksdb::Tickers::BLOB_DB_NUM_PUT; + case 0x66: + return rocksdb::Tickers::BLOB_DB_NUM_WRITE; + case 0x67: + return rocksdb::Tickers::BLOB_DB_NUM_GET; + case 0x68: + return rocksdb::Tickers::BLOB_DB_NUM_MULTIGET; + case 0x69: + return rocksdb::Tickers::BLOB_DB_NUM_SEEK; + case 0x6A: + return rocksdb::Tickers::BLOB_DB_NUM_NEXT; + case 0x6B: + return rocksdb::Tickers::BLOB_DB_NUM_PREV; + case 0x6C: + return rocksdb::Tickers::BLOB_DB_NUM_KEYS_WRITTEN; + case 0x6D: + return rocksdb::Tickers::BLOB_DB_NUM_KEYS_READ; + case 0x6E: + return rocksdb::Tickers::BLOB_DB_BYTES_WRITTEN; + case 0x6F: + return rocksdb::Tickers::BLOB_DB_BYTES_READ; + case 0x70: + return rocksdb::Tickers::BLOB_DB_WRITE_INLINED; + case 0x71: + return rocksdb::Tickers::BLOB_DB_WRITE_INLINED_TTL; + case 0x72: + return rocksdb::Tickers::BLOB_DB_WRITE_BLOB; + case 0x73: + return rocksdb::Tickers::BLOB_DB_WRITE_BLOB_TTL; + case 0x74: + return rocksdb::Tickers::BLOB_DB_BLOB_FILE_BYTES_WRITTEN; + case 0x75: + return rocksdb::Tickers::BLOB_DB_BLOB_FILE_BYTES_READ; + case 0x76: + return rocksdb::Tickers::BLOB_DB_BLOB_FILE_SYNCED; + case 0x77: + return rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_COUNT; + case 0x78: + return rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EXPIRED_SIZE; + case 0x79: + return rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_COUNT; + case 0x7A: + return rocksdb::Tickers::BLOB_DB_BLOB_INDEX_EVICTED_SIZE; + case 0x7B: + return rocksdb::Tickers::BLOB_DB_GC_NUM_FILES; + case 0x7C: + return rocksdb::Tickers::BLOB_DB_GC_NUM_NEW_FILES; + case 0x7D: + return rocksdb::Tickers::BLOB_DB_GC_FAILURES; + case 0x7E: + return rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN; + case 0x7F: + return rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED; + case -0x02: + return rocksdb::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED; + case -0x03: + return rocksdb::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN; + case -0x04: + return rocksdb::Tickers::BLOB_DB_GC_BYTES_EXPIRED; + case -0x05: + return rocksdb::Tickers::BLOB_DB_GC_BYTES_RELOCATED; + case -0x06: + return rocksdb::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED; + case -0x07: + return rocksdb::Tickers::BLOB_DB_FIFO_NUM_KEYS_EVICTED; + case -0x08: + return rocksdb::Tickers::BLOB_DB_FIFO_BYTES_EVICTED; + case -0x09: + return rocksdb::Tickers::TXN_PREPARE_MUTEX_OVERHEAD; + case -0x0A: + return rocksdb::Tickers::TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD; + case -0x0B: + return rocksdb::Tickers::TXN_DUPLICATE_KEY_OVERHEAD; + case -0x0C: + return rocksdb::Tickers::TXN_SNAPSHOT_MUTEX_OVERHEAD; + case 0x5F: + // 0x5F for backwards compatibility on current minor version. 
+ return rocksdb::Tickers::TICKER_ENUM_MAX; + + default: + // undefined/default + return rocksdb::Tickers::BLOCK_CACHE_MISS; + } + } +}; + +// The portal class for org.rocksdb.HistogramType +class HistogramTypeJni { + public: + // Returns the equivalent org.rocksdb.HistogramType for the provided + // C++ rocksdb::Histograms enum + static jbyte toJavaHistogramsType( + const rocksdb::Histograms& histograms) { + switch(histograms) { + case rocksdb::Histograms::DB_GET: + return 0x0; + case rocksdb::Histograms::DB_WRITE: + return 0x1; + case rocksdb::Histograms::COMPACTION_TIME: + return 0x2; + case rocksdb::Histograms::SUBCOMPACTION_SETUP_TIME: + return 0x3; + case rocksdb::Histograms::TABLE_SYNC_MICROS: + return 0x4; + case rocksdb::Histograms::COMPACTION_OUTFILE_SYNC_MICROS: + return 0x5; + case rocksdb::Histograms::WAL_FILE_SYNC_MICROS: + return 0x6; + case rocksdb::Histograms::MANIFEST_FILE_SYNC_MICROS: + return 0x7; + case rocksdb::Histograms::TABLE_OPEN_IO_MICROS: + return 0x8; + case rocksdb::Histograms::DB_MULTIGET: + return 0x9; + case rocksdb::Histograms::READ_BLOCK_COMPACTION_MICROS: + return 0xA; + case rocksdb::Histograms::READ_BLOCK_GET_MICROS: + return 0xB; + case rocksdb::Histograms::WRITE_RAW_BLOCK_MICROS: + return 0xC; + case rocksdb::Histograms::STALL_L0_SLOWDOWN_COUNT: + return 0xD; + case rocksdb::Histograms::STALL_MEMTABLE_COMPACTION_COUNT: + return 0xE; + case rocksdb::Histograms::STALL_L0_NUM_FILES_COUNT: + return 0xF; + case rocksdb::Histograms::HARD_RATE_LIMIT_DELAY_COUNT: + return 0x10; + case rocksdb::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT: + return 0x11; + case rocksdb::Histograms::NUM_FILES_IN_SINGLE_COMPACTION: + return 0x12; + case rocksdb::Histograms::DB_SEEK: + return 0x13; + case rocksdb::Histograms::WRITE_STALL: + return 0x14; + case rocksdb::Histograms::SST_READ_MICROS: + return 0x15; + case rocksdb::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED: + return 0x16; + case rocksdb::Histograms::BYTES_PER_READ: + return 0x17; + case rocksdb::Histograms::BYTES_PER_WRITE: + return 0x18; + case rocksdb::Histograms::BYTES_PER_MULTIGET: + return 0x19; + case rocksdb::Histograms::BYTES_COMPRESSED: + return 0x1A; + case rocksdb::Histograms::BYTES_DECOMPRESSED: + return 0x1B; + case rocksdb::Histograms::COMPRESSION_TIMES_NANOS: + return 0x1C; + case rocksdb::Histograms::DECOMPRESSION_TIMES_NANOS: + return 0x1D; + case rocksdb::Histograms::READ_NUM_MERGE_OPERANDS: return 0x1E; - case rocksdb::Tickers::COMPACTION_KEY_DROP_OBSOLETE: - return 0x1F; - case rocksdb::Tickers::COMPACTION_KEY_DROP_RANGE_DEL: + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor version compatibility. 
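These byte tables are hand-maintained in pairs, so the property worth checking is that every mapped value survives a round trip between the Java code and the C++ enum. A hedged sketch of such a check (it assumes the rocksdb headers and this portal.h are on the include path; TickersNameMap is the ticker name table exported by rocksdb/statistics.h):

    #include <cassert>
    #include "rocksdb/statistics.h"
    #include "rocksjni/portal.h"  // the file this patch vendors

    void CheckTickerRoundTrip() {
      for (const auto& entry : rocksdb::TickersNameMap) {
        const rocksdb::Tickers ticker = entry.first;
        const jbyte code = rocksdb::TickerTypeJni::toJavaTickerType(ticker);
        // 0x0 doubles as the "undefined" fallback above, so only
        // BLOCK_CACHE_MISS may legitimately map to it.
        if (code != 0x0 || ticker == rocksdb::Tickers::BLOCK_CACHE_MISS) {
          assert(rocksdb::TickerTypeJni::toCppTickers(code) == ticker);
        }
      }
    }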
+ case rocksdb::Histograms::FLUSH_TIME: return 0x20; - case rocksdb::Tickers::COMPACTION_KEY_DROP_USER: + case rocksdb::Histograms::BLOB_DB_KEY_SIZE: return 0x21; - case rocksdb::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE: + case rocksdb::Histograms::BLOB_DB_VALUE_SIZE: return 0x22; - case rocksdb::Tickers::NUMBER_KEYS_WRITTEN: + case rocksdb::Histograms::BLOB_DB_WRITE_MICROS: return 0x23; - case rocksdb::Tickers::NUMBER_KEYS_READ: + case rocksdb::Histograms::BLOB_DB_GET_MICROS: return 0x24; - case rocksdb::Tickers::NUMBER_KEYS_UPDATED: + case rocksdb::Histograms::BLOB_DB_MULTIGET_MICROS: return 0x25; - case rocksdb::Tickers::BYTES_WRITTEN: + case rocksdb::Histograms::BLOB_DB_SEEK_MICROS: return 0x26; - case rocksdb::Tickers::BYTES_READ: + case rocksdb::Histograms::BLOB_DB_NEXT_MICROS: return 0x27; - case rocksdb::Tickers::NUMBER_DB_SEEK: + case rocksdb::Histograms::BLOB_DB_PREV_MICROS: return 0x28; - case rocksdb::Tickers::NUMBER_DB_NEXT: + case rocksdb::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS: return 0x29; - case rocksdb::Tickers::NUMBER_DB_PREV: + case rocksdb::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS: return 0x2A; - case rocksdb::Tickers::NUMBER_DB_SEEK_FOUND: + case rocksdb::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS: return 0x2B; - case rocksdb::Tickers::NUMBER_DB_NEXT_FOUND: + case rocksdb::Histograms::BLOB_DB_GC_MICROS: return 0x2C; - case rocksdb::Tickers::NUMBER_DB_PREV_FOUND: + case rocksdb::Histograms::BLOB_DB_COMPRESSION_MICROS: return 0x2D; - case rocksdb::Tickers::ITER_BYTES_READ: + case rocksdb::Histograms::BLOB_DB_DECOMPRESSION_MICROS: return 0x2E; - case rocksdb::Tickers::NO_FILE_CLOSES: - return 0x2F; - case rocksdb::Tickers::NO_FILE_OPENS: - return 0x30; - case rocksdb::Tickers::NO_FILE_ERRORS: - return 0x31; - case rocksdb::Tickers::STALL_L0_SLOWDOWN_MICROS: - return 0x32; - case rocksdb::Tickers::STALL_MEMTABLE_COMPACTION_MICROS: - return 0x33; - case rocksdb::Tickers::STALL_L0_NUM_FILES_MICROS: - return 0x34; - case rocksdb::Tickers::STALL_MICROS: - return 0x35; - case rocksdb::Tickers::DB_MUTEX_WAIT_MICROS: - return 0x36; - case rocksdb::Tickers::RATE_LIMIT_DELAY_MILLIS: - return 0x37; - case rocksdb::Tickers::NO_ITERATORS: - return 0x38; - case rocksdb::Tickers::NUMBER_MULTIGET_CALLS: - return 0x39; - case rocksdb::Tickers::NUMBER_MULTIGET_KEYS_READ: - return 0x3A; - case rocksdb::Tickers::NUMBER_MULTIGET_BYTES_READ: - return 0x3B; - case rocksdb::Tickers::NUMBER_FILTERED_DELETES: - return 0x3C; - case rocksdb::Tickers::NUMBER_MERGE_FAILURES: - return 0x3D; - case rocksdb::Tickers::BLOOM_FILTER_PREFIX_CHECKED: - return 0x3E; - case rocksdb::Tickers::BLOOM_FILTER_PREFIX_USEFUL: - return 0x3F; - case rocksdb::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION: - return 0x40; - case rocksdb::Tickers::GET_UPDATES_SINCE_CALLS: - return 0x41; - case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_MISS: - return 0x42; - case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_HIT: - return 0x43; - case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD: - return 0x44; - case rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES: - return 0x45; - case rocksdb::Tickers::WAL_FILE_SYNCED: - return 0x46; - case rocksdb::Tickers::WAL_FILE_BYTES: - return 0x47; - case rocksdb::Tickers::WRITE_DONE_BY_SELF: - return 0x48; - case rocksdb::Tickers::WRITE_DONE_BY_OTHER: - return 0x49; - case rocksdb::Tickers::WRITE_TIMEDOUT: - return 0x4A; - case rocksdb::Tickers::WRITE_WITH_WAL: - return 0x4B; - case rocksdb::Tickers::COMPACT_READ_BYTES: - return 0x4C; - case rocksdb::Tickers::COMPACT_WRITE_BYTES: - return 0x4D; 
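The two reserved codes called out in the comments are load-bearing: TICKER_ENUM_MAX must keep 0x5F and HISTOGRAM_ENUM_MAX must keep 0x1F so that Java clients built against the previous minor version still resolve their *_ENUM_MAX sentinel correctly, even though newer entries were added around those values. Stated as assertions (a sketch under the same include assumptions as the round-trip check above):

    assert(rocksdb::TickerTypeJni::toJavaTickerType(
               rocksdb::Tickers::TICKER_ENUM_MAX) == 0x5F);
    assert(rocksdb::TickerTypeJni::toCppTickers(0x5F) ==
           rocksdb::Tickers::TICKER_ENUM_MAX);
    assert(rocksdb::HistogramTypeJni::toJavaHistogramsType(
               rocksdb::Histograms::HISTOGRAM_ENUM_MAX) == 0x1F);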
- case rocksdb::Tickers::FLUSH_WRITE_BYTES: - return 0x4E; - case rocksdb::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES: - return 0x4F; - case rocksdb::Tickers::NUMBER_SUPERVERSION_ACQUIRES: - return 0x50; - case rocksdb::Tickers::NUMBER_SUPERVERSION_RELEASES: - return 0x51; - case rocksdb::Tickers::NUMBER_SUPERVERSION_CLEANUPS: - return 0x52; - case rocksdb::Tickers::NUMBER_BLOCK_COMPRESSED: - return 0x53; - case rocksdb::Tickers::NUMBER_BLOCK_DECOMPRESSED: - return 0x54; - case rocksdb::Tickers::NUMBER_BLOCK_NOT_COMPRESSED: - return 0x55; - case rocksdb::Tickers::MERGE_OPERATION_TOTAL_TIME: - return 0x56; - case rocksdb::Tickers::FILTER_OPERATION_TOTAL_TIME: - return 0x57; - case rocksdb::Tickers::ROW_CACHE_HIT: - return 0x58; - case rocksdb::Tickers::ROW_CACHE_MISS: - return 0x59; - case rocksdb::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES: - return 0x5A; - case rocksdb::Tickers::READ_AMP_TOTAL_READ_BYTES: - return 0x5B; - case rocksdb::Tickers::NUMBER_RATE_LIMITER_DRAINS: - return 0x5C; - case rocksdb::Tickers::TICKER_ENUM_MAX: - return 0x5D; - + case rocksdb::Histograms::HISTOGRAM_ENUM_MAX: + // 0x1F for backwards compatibility on current minor version. + return 0x1F; + + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ rocksdb::Histograms enum for the + // provided Java org.rocksdb.HistogramsType + static rocksdb::Histograms toCppHistograms(jbyte jhistograms_type) { + switch(jhistograms_type) { + case 0x0: + return rocksdb::Histograms::DB_GET; + case 0x1: + return rocksdb::Histograms::DB_WRITE; + case 0x2: + return rocksdb::Histograms::COMPACTION_TIME; + case 0x3: + return rocksdb::Histograms::SUBCOMPACTION_SETUP_TIME; + case 0x4: + return rocksdb::Histograms::TABLE_SYNC_MICROS; + case 0x5: + return rocksdb::Histograms::COMPACTION_OUTFILE_SYNC_MICROS; + case 0x6: + return rocksdb::Histograms::WAL_FILE_SYNC_MICROS; + case 0x7: + return rocksdb::Histograms::MANIFEST_FILE_SYNC_MICROS; + case 0x8: + return rocksdb::Histograms::TABLE_OPEN_IO_MICROS; + case 0x9: + return rocksdb::Histograms::DB_MULTIGET; + case 0xA: + return rocksdb::Histograms::READ_BLOCK_COMPACTION_MICROS; + case 0xB: + return rocksdb::Histograms::READ_BLOCK_GET_MICROS; + case 0xC: + return rocksdb::Histograms::WRITE_RAW_BLOCK_MICROS; + case 0xD: + return rocksdb::Histograms::STALL_L0_SLOWDOWN_COUNT; + case 0xE: + return rocksdb::Histograms::STALL_MEMTABLE_COMPACTION_COUNT; + case 0xF: + return rocksdb::Histograms::STALL_L0_NUM_FILES_COUNT; + case 0x10: + return rocksdb::Histograms::HARD_RATE_LIMIT_DELAY_COUNT; + case 0x11: + return rocksdb::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT; + case 0x12: + return rocksdb::Histograms::NUM_FILES_IN_SINGLE_COMPACTION; + case 0x13: + return rocksdb::Histograms::DB_SEEK; + case 0x14: + return rocksdb::Histograms::WRITE_STALL; + case 0x15: + return rocksdb::Histograms::SST_READ_MICROS; + case 0x16: + return rocksdb::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED; + case 0x17: + return rocksdb::Histograms::BYTES_PER_READ; + case 0x18: + return rocksdb::Histograms::BYTES_PER_WRITE; + case 0x19: + return rocksdb::Histograms::BYTES_PER_MULTIGET; + case 0x1A: + return rocksdb::Histograms::BYTES_COMPRESSED; + case 0x1B: + return rocksdb::Histograms::BYTES_DECOMPRESSED; + case 0x1C: + return rocksdb::Histograms::COMPRESSION_TIMES_NANOS; + case 0x1D: + return rocksdb::Histograms::DECOMPRESSION_TIMES_NANOS; + case 0x1E: + return rocksdb::Histograms::READ_NUM_MERGE_OPERANDS; + // 0x20 to skip 0x1F so TICKER_ENUM_MAX remains unchanged for minor version compatibility. 
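For context on how these converters are consumed: a JNI entry point receives the Java-side byte, converts it to the native enum, and only then touches the statistics object. A hedged sketch follows (the function name and the raw-handle convention are illustrative, not part of this patch; Statistics::histogramData is the rocksdb API being called):

    #include <jni.h>
    #include "rocksdb/statistics.h"
    #include "rocksjni/portal.h"

    // Illustrative only: median of a histogram selected from Java.
    jdouble HistogramMedian(jlong jstats_handle, jbyte jhistogram_code) {
      auto* stats = reinterpret_cast<rocksdb::Statistics*>(jstats_handle);
      rocksdb::HistogramData data;
      stats->histogramData(
          rocksdb::HistogramTypeJni::toCppHistograms(jhistogram_code), &data);
      return static_cast<jdouble>(data.median);
    }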
+ case 0x20: + return rocksdb::Histograms::FLUSH_TIME; + case 0x21: + return rocksdb::Histograms::BLOB_DB_KEY_SIZE; + case 0x22: + return rocksdb::Histograms::BLOB_DB_VALUE_SIZE; + case 0x23: + return rocksdb::Histograms::BLOB_DB_WRITE_MICROS; + case 0x24: + return rocksdb::Histograms::BLOB_DB_GET_MICROS; + case 0x25: + return rocksdb::Histograms::BLOB_DB_MULTIGET_MICROS; + case 0x26: + return rocksdb::Histograms::BLOB_DB_SEEK_MICROS; + case 0x27: + return rocksdb::Histograms::BLOB_DB_NEXT_MICROS; + case 0x28: + return rocksdb::Histograms::BLOB_DB_PREV_MICROS; + case 0x29: + return rocksdb::Histograms::BLOB_DB_BLOB_FILE_WRITE_MICROS; + case 0x2A: + return rocksdb::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS; + case 0x2B: + return rocksdb::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS; + case 0x2C: + return rocksdb::Histograms::BLOB_DB_GC_MICROS; + case 0x2D: + return rocksdb::Histograms::BLOB_DB_COMPRESSION_MICROS; + case 0x2E: + return rocksdb::Histograms::BLOB_DB_DECOMPRESSION_MICROS; + case 0x1F: + // 0x1F for backwards compatibility on current minor version. + return rocksdb::Histograms::HISTOGRAM_ENUM_MAX; + + default: + // undefined/default + return rocksdb::Histograms::DB_GET; + } + } +}; + +// The portal class for org.rocksdb.StatsLevel +class StatsLevelJni { + public: + // Returns the equivalent org.rocksdb.StatsLevel for the provided + // C++ rocksdb::StatsLevel enum + static jbyte toJavaStatsLevel( + const rocksdb::StatsLevel& stats_level) { + switch(stats_level) { + case rocksdb::StatsLevel::kExceptDetailedTimers: + return 0x0; + case rocksdb::StatsLevel::kExceptTimeForMutex: + return 0x1; + case rocksdb::StatsLevel::kAll: + return 0x2; + + default: + // undefined/default + return 0x0; + } + } + + // Returns the equivalent C++ rocksdb::StatsLevel enum for the + // provided Java org.rocksdb.StatsLevel + static rocksdb::StatsLevel toCppStatsLevel(jbyte jstats_level) { + switch(jstats_level) { + case 0x0: + return rocksdb::StatsLevel::kExceptDetailedTimers; + case 0x1: + return rocksdb::StatsLevel::kExceptTimeForMutex; + case 0x2: + return rocksdb::StatsLevel::kAll; + + default: + // undefined/default + return rocksdb::StatsLevel::kExceptDetailedTimers; + } + } +}; + +// The portal class for org.rocksdb.RateLimiterMode +class RateLimiterModeJni { + public: + // Returns the equivalent org.rocksdb.RateLimiterMode for the provided + // C++ rocksdb::RateLimiter::Mode enum + static jbyte toJavaRateLimiterMode( + const rocksdb::RateLimiter::Mode& rate_limiter_mode) { + switch(rate_limiter_mode) { + case rocksdb::RateLimiter::Mode::kReadsOnly: + return 0x0; + case rocksdb::RateLimiter::Mode::kWritesOnly: + return 0x1; + case rocksdb::RateLimiter::Mode::kAllIo: + return 0x2; + + default: + // undefined/default + return 0x1; + } + } + + // Returns the equivalent C++ rocksdb::RateLimiter::Mode enum for the + // provided Java org.rocksdb.RateLimiterMode + static rocksdb::RateLimiter::Mode toCppRateLimiterMode(jbyte jrate_limiter_mode) { + switch(jrate_limiter_mode) { + case 0x0: + return rocksdb::RateLimiter::Mode::kReadsOnly; + case 0x1: + return rocksdb::RateLimiter::Mode::kWritesOnly; + case 0x2: + return rocksdb::RateLimiter::Mode::kAllIo; + default: // undefined/default + return rocksdb::RateLimiter::Mode::kWritesOnly; + } + } +}; + +// The portal class for org.rocksdb.MemoryUsageType +class MemoryUsageTypeJni { +public: + // Returns the equivalent org.rocksdb.MemoryUsageType for the provided + // C++ rocksdb::MemoryUtil::UsageType enum + static jbyte toJavaMemoryUsageType( + const 
rocksdb::MemoryUtil::UsageType& usage_type) { + switch(usage_type) { + case rocksdb::MemoryUtil::UsageType::kMemTableTotal: return 0x0; + case rocksdb::MemoryUtil::UsageType::kMemTableUnFlushed: + return 0x1; + case rocksdb::MemoryUtil::UsageType::kTableReadersTotal: + return 0x2; + case rocksdb::MemoryUtil::UsageType::kCacheTotal: + return 0x3; + default: + // undefined: use kNumUsageTypes + return 0x4; + } + } + + // Returns the equivalent C++ rocksdb::MemoryUtil::UsageType enum for the + // provided Java org.rocksdb.MemoryUsageType + static rocksdb::MemoryUtil::UsageType toCppMemoryUsageType( + jbyte usage_type) { + switch(usage_type) { + case 0x0: + return rocksdb::MemoryUtil::UsageType::kMemTableTotal; + case 0x1: + return rocksdb::MemoryUtil::UsageType::kMemTableUnFlushed; + case 0x2: + return rocksdb::MemoryUtil::UsageType::kTableReadersTotal; + case 0x3: + return rocksdb::MemoryUtil::UsageType::kCacheTotal; + default: + // undefined/default: use kNumUsageTypes + return rocksdb::MemoryUtil::UsageType::kNumUsageTypes; + } + } +}; + +// The portal class for org.rocksdb.Transaction +class TransactionJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.Transaction + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/Transaction"); + } + + /** + * Create a new Java org.rocksdb.Transaction.WaitingTransactions object + * + * @param env A pointer to the Java environment + * @param jtransaction A Java org.rocksdb.Transaction object + * @param column_family_id The id of the column family + * @param key The key + * @param transaction_ids The transaction ids + * + * @return A reference to a Java + * org.rocksdb.Transaction.WaitingTransactions object, + * or nullptr if an an exception occurs + */ + static jobject newWaitingTransactions(JNIEnv* env, jobject jtransaction, + const uint32_t column_family_id, const std::string &key, + const std::vector &transaction_ids) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "newWaitingTransactions", "(JLjava/lang/String;[J)Lorg/rocksdb/Transaction$WaitingTransactions;"); + if(mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jkey = env->NewStringUTF(key.c_str()); + if(jkey == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + const size_t len = transaction_ids.size(); + jlongArray jtransaction_ids = env->NewLongArray(static_cast(len)); + if(jtransaction_ids == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + return nullptr; + } + + jlong *body = env->GetLongArrayElements(jtransaction_ids, nullptr); + if(body == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + env->DeleteLocalRef(jtransaction_ids); + return nullptr; + } + for(size_t i = 0; i < len; ++i) { + body[i] = static_cast(transaction_ids[i]); + } + env->ReleaseLongArrayElements(jtransaction_ids, body, 0); + + jobject jwaiting_transactions = env->CallObjectMethod(jtransaction, + mid, static_cast(column_family_id), jkey, jtransaction_ids); + if(env->ExceptionCheck()) { + // exception thrown: InstantiationException or 
+ +// The portal class for org.rocksdb.TransactionDB +class TransactionDBJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/TransactionDB"); + } + + /** + * Create a new Java org.rocksdb.TransactionDB.DeadlockInfo object + * + * @param env A pointer to the Java environment + * @param jtransaction_db A Java org.rocksdb.TransactionDB object + * @param transaction_id The transaction id + * @param column_family_id The id of the column family + * @param waiting_key The key being waited on + * @param exclusive True if the lock is exclusive + * + * @return A reference to a Java + * org.rocksdb.TransactionDB.DeadlockInfo object, + * or nullptr if an exception occurs + */ + static jobject newDeadlockInfo(JNIEnv* env, jobject jtransaction_db, + const rocksdb::TransactionID transaction_id, + const uint32_t column_family_id, const std::string &waiting_key, + const bool exclusive) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "newDeadlockInfo", "(JJLjava/lang/String;Z)Lorg/rocksdb/TransactionDB$DeadlockInfo;"); + if(mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jwaiting_key = env->NewStringUTF(waiting_key.c_str()); + if(jwaiting_key == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + // resolve the column family id to a ColumnFamilyHandle + jobject jdeadlock_info = env->CallObjectMethod(jtransaction_db, + mid, transaction_id, static_cast<jlong>(column_family_id), + jwaiting_key, exclusive); + if(env->ExceptionCheck()) { + // exception thrown: InstantiationException or OutOfMemoryError + env->DeleteLocalRef(jwaiting_key); + return nullptr; + } + + return jdeadlock_info; + } +}; + +// The portal class for org.rocksdb.TxnDBWritePolicy +class TxnDBWritePolicyJni { + public: + // Returns the equivalent org.rocksdb.TxnDBWritePolicy for the provided + // C++ rocksdb::TxnDBWritePolicy enum + static jbyte toJavaTxnDBWritePolicy( + const rocksdb::TxnDBWritePolicy& txndb_write_policy) { + switch(txndb_write_policy) { + case rocksdb::TxnDBWritePolicy::WRITE_COMMITTED: + return 0x0; + case rocksdb::TxnDBWritePolicy::WRITE_PREPARED: + return 0x1; + case rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::TxnDBWritePolicy enum for the + // provided Java org.rocksdb.TxnDBWritePolicy + static rocksdb::TxnDBWritePolicy toCppTxnDBWritePolicy( + jbyte jtxndb_write_policy) { + switch(jtxndb_write_policy) { + case 0x0: + return rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; + case 0x1: + return rocksdb::TxnDBWritePolicy::WRITE_PREPARED; + case 0x2: + return rocksdb::TxnDBWritePolicy::WRITE_UNPREPARED; + default: + // undefined/default + return rocksdb::TxnDBWritePolicy::WRITE_COMMITTED; + } + } +};
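Each enum portal above pairs a toJava*/toCpp* mapping, with unmappable values collapsing to a documented fallback. A hedged sketch of how a native setter binding might consume one; the exported JNI function and the write_policy assignment are illustrative of the pattern, not lines from this patch:

#include <jni.h>
// Assumes the RocksDB transaction headers and portal.h are on the include path.

// Sketch: translate the jbyte received from Java back into the C++ enum and
// store it on the options object addressed by the Java-held handle.
extern "C" JNIEXPORT void JNICALL
Java_org_rocksdb_TransactionDBOptions_setWritePolicy(
    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jbyte jwrite_policy) {
  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
  opts->write_policy =
      rocksdb::TxnDBWritePolicyJni::toCppTxnDBWritePolicy(jwrite_policy);
}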
+ +// The portal class for org.rocksdb.TransactionDB.KeyLockInfo +class KeyLockInfoJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB.KeyLockInfo + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/TransactionDB$KeyLockInfo"); + } + + /** + * Create a new Java org.rocksdb.TransactionDB.KeyLockInfo object + * with the same properties as the provided C++ rocksdb::KeyLockInfo object + * + * @param env A pointer to the Java environment + * @param key_lock_info The rocksdb::KeyLockInfo object + * + * @return A reference to a Java + * org.rocksdb.TransactionDB.KeyLockInfo object, + * or nullptr if an exception occurs + */ + static jobject construct(JNIEnv* env, + const rocksdb::KeyLockInfo& key_lock_info) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID( + jclazz, "<init>", "(Ljava/lang/String;[JZ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jkey = env->NewStringUTF(key_lock_info.key.c_str()); + if (jkey == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + const jsize jtransaction_ids_len = static_cast<jsize>(key_lock_info.ids.size()); + jlongArray jtransactions_ids = env->NewLongArray(jtransaction_ids_len); + if (jtransactions_ids == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jkey); + return nullptr; + } + + const jobject jkey_lock_info = env->NewObject(jclazz, mid, + jkey, jtransactions_ids, key_lock_info.exclusive); + if(jkey_lock_info == nullptr) { + // exception thrown: InstantiationException or OutOfMemoryError + env->DeleteLocalRef(jtransactions_ids); + env->DeleteLocalRef(jkey); + return nullptr; } + + return jkey_lock_info; } +}; - // Returns the equivalent C++ rocksdb::Tickers enum for the - // provided Java org.rocksdb.TickerType - static rocksdb::Tickers toCppTickers(jbyte jticker_type) { - switch(jticker_type) { - case 0x0: - return rocksdb::Tickers::BLOCK_CACHE_MISS; - case 0x1: - return rocksdb::Tickers::BLOCK_CACHE_HIT; - case 0x2: - return rocksdb::Tickers::BLOCK_CACHE_ADD; - case 0x3: - return rocksdb::Tickers::BLOCK_CACHE_ADD_FAILURES; - case 0x4: - return rocksdb::Tickers::BLOCK_CACHE_INDEX_MISS; - case 0x5: - return rocksdb::Tickers::BLOCK_CACHE_INDEX_HIT; - case 0x6: - return rocksdb::Tickers::BLOCK_CACHE_INDEX_ADD; - case 0x7: - return rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT; - case 0x8: - return rocksdb::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT; - case 0x9: - return rocksdb::Tickers::BLOCK_CACHE_FILTER_MISS; - case 0xA: - return rocksdb::Tickers::BLOCK_CACHE_FILTER_HIT; - case 0xB: - return rocksdb::Tickers::BLOCK_CACHE_FILTER_ADD; - case 0xC: - return rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT; - case 0xD: - return rocksdb::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT; - case 0xE: - return rocksdb::Tickers::BLOCK_CACHE_DATA_MISS; - case 0xF: - return rocksdb::Tickers::BLOCK_CACHE_DATA_HIT; - case 0x10: - return rocksdb::Tickers::BLOCK_CACHE_DATA_ADD; - case 0x11: - return rocksdb::Tickers::BLOCK_CACHE_DATA_BYTES_INSERT; - case 0x12: - return rocksdb::Tickers::BLOCK_CACHE_BYTES_READ; - case 0x13: - return rocksdb::Tickers::BLOCK_CACHE_BYTES_WRITE; - case 0x14: - return rocksdb::Tickers::BLOOM_FILTER_USEFUL; - case 0x15: - return
rocksdb::Tickers::PERSISTENT_CACHE_HIT; - case 0x16: - return rocksdb::Tickers::PERSISTENT_CACHE_MISS; - case 0x17: - return rocksdb::Tickers::SIM_BLOCK_CACHE_HIT; - case 0x18: - return rocksdb::Tickers::SIM_BLOCK_CACHE_MISS; - case 0x19: - return rocksdb::Tickers::MEMTABLE_HIT; - case 0x1A: - return rocksdb::Tickers::MEMTABLE_MISS; - case 0x1B: - return rocksdb::Tickers::GET_HIT_L0; - case 0x1C: - return rocksdb::Tickers::GET_HIT_L1; - case 0x1D: - return rocksdb::Tickers::GET_HIT_L2_AND_UP; - case 0x1E: - return rocksdb::Tickers::COMPACTION_KEY_DROP_NEWER_ENTRY; - case 0x1F: - return rocksdb::Tickers::COMPACTION_KEY_DROP_OBSOLETE; - case 0x20: - return rocksdb::Tickers::COMPACTION_KEY_DROP_RANGE_DEL; - case 0x21: - return rocksdb::Tickers::COMPACTION_KEY_DROP_USER; - case 0x22: - return rocksdb::Tickers::COMPACTION_RANGE_DEL_DROP_OBSOLETE; - case 0x23: - return rocksdb::Tickers::NUMBER_KEYS_WRITTEN; - case 0x24: - return rocksdb::Tickers::NUMBER_KEYS_READ; - case 0x25: - return rocksdb::Tickers::NUMBER_KEYS_UPDATED; - case 0x26: - return rocksdb::Tickers::BYTES_WRITTEN; - case 0x27: - return rocksdb::Tickers::BYTES_READ; - case 0x28: - return rocksdb::Tickers::NUMBER_DB_SEEK; - case 0x29: - return rocksdb::Tickers::NUMBER_DB_NEXT; - case 0x2A: - return rocksdb::Tickers::NUMBER_DB_PREV; - case 0x2B: - return rocksdb::Tickers::NUMBER_DB_SEEK_FOUND; - case 0x2C: - return rocksdb::Tickers::NUMBER_DB_NEXT_FOUND; - case 0x2D: - return rocksdb::Tickers::NUMBER_DB_PREV_FOUND; - case 0x2E: - return rocksdb::Tickers::ITER_BYTES_READ; - case 0x2F: - return rocksdb::Tickers::NO_FILE_CLOSES; - case 0x30: - return rocksdb::Tickers::NO_FILE_OPENS; - case 0x31: - return rocksdb::Tickers::NO_FILE_ERRORS; - case 0x32: - return rocksdb::Tickers::STALL_L0_SLOWDOWN_MICROS; - case 0x33: - return rocksdb::Tickers::STALL_MEMTABLE_COMPACTION_MICROS; - case 0x34: - return rocksdb::Tickers::STALL_L0_NUM_FILES_MICROS; - case 0x35: - return rocksdb::Tickers::STALL_MICROS; - case 0x36: - return rocksdb::Tickers::DB_MUTEX_WAIT_MICROS; - case 0x37: - return rocksdb::Tickers::RATE_LIMIT_DELAY_MILLIS; - case 0x38: - return rocksdb::Tickers::NO_ITERATORS; - case 0x39: - return rocksdb::Tickers::NUMBER_MULTIGET_CALLS; - case 0x3A: - return rocksdb::Tickers::NUMBER_MULTIGET_KEYS_READ; - case 0x3B: - return rocksdb::Tickers::NUMBER_MULTIGET_BYTES_READ; - case 0x3C: - return rocksdb::Tickers::NUMBER_FILTERED_DELETES; - case 0x3D: - return rocksdb::Tickers::NUMBER_MERGE_FAILURES; - case 0x3E: - return rocksdb::Tickers::BLOOM_FILTER_PREFIX_CHECKED; - case 0x3F: - return rocksdb::Tickers::BLOOM_FILTER_PREFIX_USEFUL; - case 0x40: - return rocksdb::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; - case 0x41: - return rocksdb::Tickers::GET_UPDATES_SINCE_CALLS; - case 0x42: - return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_MISS; - case 0x43: - return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_HIT; - case 0x44: - return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD; - case 0x45: - return rocksdb::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES; - case 0x46: - return rocksdb::Tickers::WAL_FILE_SYNCED; - case 0x47: - return rocksdb::Tickers::WAL_FILE_BYTES; - case 0x48: - return rocksdb::Tickers::WRITE_DONE_BY_SELF; - case 0x49: - return rocksdb::Tickers::WRITE_DONE_BY_OTHER; - case 0x4A: - return rocksdb::Tickers::WRITE_TIMEDOUT; - case 0x4B: - return rocksdb::Tickers::WRITE_WITH_WAL; - case 0x4C: - return rocksdb::Tickers::COMPACT_READ_BYTES; - case 0x4D: - return rocksdb::Tickers::COMPACT_WRITE_BYTES; - case 0x4E: - return 
rocksdb::Tickers::FLUSH_WRITE_BYTES; - case 0x4F: - return rocksdb::Tickers::NUMBER_DIRECT_LOAD_TABLE_PROPERTIES; - case 0x50: - return rocksdb::Tickers::NUMBER_SUPERVERSION_ACQUIRES; - case 0x51: - return rocksdb::Tickers::NUMBER_SUPERVERSION_RELEASES; - case 0x52: - return rocksdb::Tickers::NUMBER_SUPERVERSION_CLEANUPS; - case 0x53: - return rocksdb::Tickers::NUMBER_BLOCK_COMPRESSED; - case 0x54: - return rocksdb::Tickers::NUMBER_BLOCK_DECOMPRESSED; - case 0x55: - return rocksdb::Tickers::NUMBER_BLOCK_NOT_COMPRESSED; - case 0x56: - return rocksdb::Tickers::MERGE_OPERATION_TOTAL_TIME; - case 0x57: - return rocksdb::Tickers::FILTER_OPERATION_TOTAL_TIME; - case 0x58: - return rocksdb::Tickers::ROW_CACHE_HIT; - case 0x59: - return rocksdb::Tickers::ROW_CACHE_MISS; - case 0x5A: - return rocksdb::Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES; - case 0x5B: - return rocksdb::Tickers::READ_AMP_TOTAL_READ_BYTES; - case 0x5C: - return rocksdb::Tickers::NUMBER_RATE_LIMITER_DRAINS; - case 0x5D: - return rocksdb::Tickers::TICKER_ENUM_MAX; +// The portal class for org.rocksdb.TransactionDB.DeadlockInfo +class DeadlockInfoJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB.DeadlockInfo + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env,"org/rocksdb/TransactionDB$DeadlockInfo"); + } +}; + +// The portal class for org.rocksdb.TransactionDB.DeadlockPath +class DeadlockPathJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.TransactionDB.DeadlockPath + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/TransactionDB$DeadlockPath"); + } + + /** + * Create a new Java org.rocksdb.TransactionDB.DeadlockPath object + * + * @param env A pointer to the Java environment + * + * @return A reference to a Java + * org.rocksdb.TransactionDB.DeadlockPath object, + * or nullptr if an exception occurs + */ + static jobject construct(JNIEnv* env, + const jobjectArray jdeadlock_infos, const bool limit_exceeded) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - default: - // undefined/default - return rocksdb::Tickers::BLOCK_CACHE_MISS; + jmethodID mid = env->GetMethodID( + jclazz, "<init>", "([LDeadlockInfo;Z)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + const jobject jdeadlock_path = env->NewObject(jclazz, mid, + jdeadlock_infos, limit_exceeded); + if(jdeadlock_path == nullptr) { + // exception thrown: InstantiationException or OutOfMemoryError + return nullptr; } + + return jdeadlock_path; } };
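A discipline worth noting across all of these construct()/new*() helpers: every JNI call that can fail is checked, the local references created so far are deleted in reverse order, and the helper returns nullptr while leaving the pending Java exception for the caller to surface. Condensed into a standalone sketch; the class, constructor, and helper names here are hypothetical:

#include <jni.h>
#include <string>

// Sketch: build a two-string Java object, unwinding local refs on failure.
static jobject buildPair(JNIEnv* env, jclass jclazz, jmethodID ctor,
                         const std::string& a, const std::string& b) {
  jstring ja = env->NewStringUTF(a.c_str());
  if (ja == nullptr) {
    return nullptr;  // OutOfMemoryError pending
  }
  jstring jb = env->NewStringUTF(b.c_str());
  if (jb == nullptr) {
    env->DeleteLocalRef(ja);
    return nullptr;
  }
  jobject jpair = env->NewObject(jclazz, ctor, ja, jb);
  if (env->ExceptionCheck()) {
    env->DeleteLocalRef(jb);
    env->DeleteLocalRef(ja);
    return nullptr;  // constructor threw; refs released, exception pending
  }
  return jpair;
}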
-// The portal class for org.rocksdb.HistogramType -class HistogramTypeJni { +class AbstractTableFilterJni : public RocksDBNativeClass { public: - // Returns the equivalent org.rocksdb.HistogramType for the provided - // C++ rocksdb::Histograms enum - static jbyte toJavaHistogramsType( - const rocksdb::Histograms& histograms) { - switch(histograms) { - case rocksdb::Histograms::DB_GET: - return 0x0; - case rocksdb::Histograms::DB_WRITE: - return 0x1; - case rocksdb::Histograms::COMPACTION_TIME: - return 0x2; - case rocksdb::Histograms::SUBCOMPACTION_SETUP_TIME: - return 0x3; - case rocksdb::Histograms::TABLE_SYNC_MICROS: - return 0x4; - case rocksdb::Histograms::COMPACTION_OUTFILE_SYNC_MICROS: - return 0x5; - case rocksdb::Histograms::WAL_FILE_SYNC_MICROS: - return 0x6; - case rocksdb::Histograms::MANIFEST_FILE_SYNC_MICROS: - return 0x7; - case rocksdb::Histograms::TABLE_OPEN_IO_MICROS: - return 0x8; - case rocksdb::Histograms::DB_MULTIGET: - return 0x9; - case rocksdb::Histograms::READ_BLOCK_COMPACTION_MICROS: - return 0xA; - case rocksdb::Histograms::READ_BLOCK_GET_MICROS: - return 0xB; - case rocksdb::Histograms::WRITE_RAW_BLOCK_MICROS: - return 0xC; - case rocksdb::Histograms::STALL_L0_SLOWDOWN_COUNT: - return 0xD; - case rocksdb::Histograms::STALL_MEMTABLE_COMPACTION_COUNT: - return 0xE; - case rocksdb::Histograms::STALL_L0_NUM_FILES_COUNT: - return 0xF; - case rocksdb::Histograms::HARD_RATE_LIMIT_DELAY_COUNT: - return 0x10; - case rocksdb::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT: - return 0x11; - case rocksdb::Histograms::NUM_FILES_IN_SINGLE_COMPACTION: - return 0x12; - case rocksdb::Histograms::DB_SEEK: - return 0x13; - case rocksdb::Histograms::WRITE_STALL: - return 0x14; - case rocksdb::Histograms::SST_READ_MICROS: - return 0x15; - case rocksdb::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED: - return 0x16; - case rocksdb::Histograms::BYTES_PER_READ: - return 0x17; - case rocksdb::Histograms::BYTES_PER_WRITE: - return 0x18; - case rocksdb::Histograms::BYTES_PER_MULTIGET: - return 0x19; - case rocksdb::Histograms::BYTES_COMPRESSED: - return 0x1A; - case rocksdb::Histograms::BYTES_DECOMPRESSED: - return 0x1B; - case rocksdb::Histograms::COMPRESSION_TIMES_NANOS: - return 0x1C; - case rocksdb::Histograms::DECOMPRESSION_TIMES_NANOS: - return 0x1D; - case rocksdb::Histograms::READ_NUM_MERGE_OPERANDS: - return 0x1E; - case rocksdb::Histograms::HISTOGRAM_ENUM_MAX: - return 0x1F; + /** + * Get the Java Method: TableFilter#filter(TableProperties) + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getFilterMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - default: - // undefined/default - return 0x0; + static jmethodID mid = + env->GetMethodID(jclazz, "filter", "(Lorg/rocksdb/TableProperties;)Z"); + assert(mid != nullptr); + return mid; + } + + private: + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableFilter"); + } +}; + +class TablePropertiesJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.TableProperties object.
+ * + * @param env A pointer to the Java environment + * @param table_properties A Cpp table properties object + * + * @return A reference to a Java org.rocksdb.TableProperties object, or + * nullptr if an exception occurs + */ + static jobject fromCppTableProperties(JNIEnv* env, const rocksdb::TableProperties& table_properties) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJJJJJJJJJJJJJJJJJJ[BLjava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/lang/String;Ljava/util/Map;Ljava/util/Map;Ljava/util/Map;)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; } + + jbyteArray jcolumn_family_name = rocksdb::JniUtil::copyBytes(env, table_properties.column_family_name); + if (jcolumn_family_name == nullptr) { + // exception occurred creating java string + return nullptr; + } + + jstring jfilter_policy_name = rocksdb::JniUtil::toJavaString(env, &table_properties.filter_policy_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + return nullptr; + } + + jstring jcomparator_name = rocksdb::JniUtil::toJavaString(env, &table_properties.comparator_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + return nullptr; + } + + jstring jmerge_operator_name = rocksdb::JniUtil::toJavaString(env, &table_properties.merge_operator_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + return nullptr; + } + + jstring jprefix_extractor_name = rocksdb::JniUtil::toJavaString(env, &table_properties.prefix_extractor_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + return nullptr; + } + + jstring jproperty_collectors_names = rocksdb::JniUtil::toJavaString(env, &table_properties.property_collectors_names, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + env->DeleteLocalRef(jprefix_extractor_name); + return nullptr; + } + + jstring jcompression_name = rocksdb::JniUtil::toJavaString(env, &table_properties.compression_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + env->DeleteLocalRef(jprefix_extractor_name); + env->DeleteLocalRef(jproperty_collectors_names); + return nullptr; + } + + // Map<String, String> + jobject juser_collected_properties = rocksdb::HashMapJni::fromCppMap(env, &table_properties.user_collected_properties); + if (env->ExceptionCheck()) { + // exception occurred creating java map + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); +
env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + env->DeleteLocalRef(jprefix_extractor_name); + env->DeleteLocalRef(jproperty_collectors_names); + env->DeleteLocalRef(jcompression_name); + return nullptr; + } + + // Map<String, String> + jobject jreadable_properties = rocksdb::HashMapJni::fromCppMap(env, &table_properties.readable_properties); + if (env->ExceptionCheck()) { + // exception occurred creating java map + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + env->DeleteLocalRef(jprefix_extractor_name); + env->DeleteLocalRef(jproperty_collectors_names); + env->DeleteLocalRef(jcompression_name); + env->DeleteLocalRef(juser_collected_properties); + return nullptr; + } + + // Map<String, Long> + jobject jproperties_offsets = rocksdb::HashMapJni::fromCppMap(env, &table_properties.properties_offsets); + if (env->ExceptionCheck()) { + // exception occurred creating java map + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfilter_policy_name); + env->DeleteLocalRef(jcomparator_name); + env->DeleteLocalRef(jmerge_operator_name); + env->DeleteLocalRef(jprefix_extractor_name); + env->DeleteLocalRef(jproperty_collectors_names); + env->DeleteLocalRef(jcompression_name); + env->DeleteLocalRef(juser_collected_properties); + env->DeleteLocalRef(jreadable_properties); + return nullptr; + } + + jobject jtable_properties = env->NewObject(jclazz, mid, + static_cast<jlong>(table_properties.data_size), + static_cast<jlong>(table_properties.index_size), + static_cast<jlong>(table_properties.index_partitions), + static_cast<jlong>(table_properties.top_level_index_size), + static_cast<jlong>(table_properties.index_key_is_user_key), + static_cast<jlong>(table_properties.index_value_is_delta_encoded), + static_cast<jlong>(table_properties.filter_size), + static_cast<jlong>(table_properties.raw_key_size), + static_cast<jlong>(table_properties.raw_value_size), + static_cast<jlong>(table_properties.num_data_blocks), + static_cast<jlong>(table_properties.num_entries), + static_cast<jlong>(table_properties.num_deletions), + static_cast<jlong>(table_properties.num_merge_operands), + static_cast<jlong>(table_properties.num_range_deletions), + static_cast<jlong>(table_properties.format_version), + static_cast<jlong>(table_properties.fixed_key_len), + static_cast<jlong>(table_properties.column_family_id), + static_cast<jlong>(table_properties.creation_time), + static_cast<jlong>(table_properties.oldest_key_time), + jcolumn_family_name, + jfilter_policy_name, + jcomparator_name, + jmerge_operator_name, + jprefix_extractor_name, + jproperty_collectors_names, + jcompression_name, + juser_collected_properties, + jreadable_properties, + jproperties_offsets + ); + + if (env->ExceptionCheck()) { + return nullptr; + } + + return jtable_properties; } - // Returns the equivalent C++ rocksdb::Histograms enum for the - // provided Java org.rocksdb.HistogramsType - static rocksdb::Histograms toCppHistograms(jbyte jhistograms_type) { - switch(jhistograms_type) { - case 0x0: - return rocksdb::Histograms::DB_GET; - case 0x1: - return rocksdb::Histograms::DB_WRITE; - case 0x2: - return rocksdb::Histograms::COMPACTION_TIME; - case 0x3: - return rocksdb::Histograms::SUBCOMPACTION_SETUP_TIME; - case 0x4: - return rocksdb::Histograms::TABLE_SYNC_MICROS; - case 0x5: - return rocksdb::Histograms::COMPACTION_OUTFILE_SYNC_MICROS; - case 0x6: - return rocksdb::Histograms::WAL_FILE_SYNC_MICROS; - case 0x7: - return rocksdb::Histograms::MANIFEST_FILE_SYNC_MICROS; - case 0x8: - return rocksdb::Histograms::TABLE_OPEN_IO_MICROS; - case 0x9: - return rocksdb::Histograms::DB_MULTIGET; - case 0xA: - return rocksdb::Histograms::READ_BLOCK_COMPACTION_MICROS; - case 0xB: - return rocksdb::Histograms::READ_BLOCK_GET_MICROS; - case 0xC: - return rocksdb::Histograms::WRITE_RAW_BLOCK_MICROS; - case 0xD: - return rocksdb::Histograms::STALL_L0_SLOWDOWN_COUNT; - case 0xE: - return rocksdb::Histograms::STALL_MEMTABLE_COMPACTION_COUNT; - case 0xF: - return rocksdb::Histograms::STALL_L0_NUM_FILES_COUNT; - case 0x10: - return rocksdb::Histograms::HARD_RATE_LIMIT_DELAY_COUNT; - case 0x11: - return rocksdb::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT; - case 0x12: - return rocksdb::Histograms::NUM_FILES_IN_SINGLE_COMPACTION; - case 0x13: - return rocksdb::Histograms::DB_SEEK; - case 0x14: - return rocksdb::Histograms::WRITE_STALL; - case 0x15: - return rocksdb::Histograms::SST_READ_MICROS; - case 0x16: - return rocksdb::Histograms::NUM_SUBCOMPACTIONS_SCHEDULED; - case 0x17: - return rocksdb::Histograms::BYTES_PER_READ; - case 0x18: - return rocksdb::Histograms::BYTES_PER_WRITE; - case 0x19: - return rocksdb::Histograms::BYTES_PER_MULTIGET; - case 0x1A: - return rocksdb::Histograms::BYTES_COMPRESSED; - case 0x1B: - return rocksdb::Histograms::BYTES_DECOMPRESSED; - case 0x1C: - return rocksdb::Histograms::COMPRESSION_TIMES_NANOS; - case 0x1D: - return rocksdb::Histograms::DECOMPRESSION_TIMES_NANOS; - case 0x1E: - return rocksdb::Histograms::READ_NUM_MERGE_OPERANDS; - case 0x1F: - return rocksdb::Histograms::HISTOGRAM_ENUM_MAX; + private: + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/TableProperties"); + } +}; - default: - // undefined/default - return rocksdb::Histograms::DB_GET;
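The long <init> descriptor in fromCppTableProperties is matched to the NewObject arguments purely by position; JNI offers no compile-time check, so a drifted descriptor surfaces only at runtime as a failed GetMethodID or mis-assigned fields. A minimal sketch of the correspondence, using a hypothetical MyInfo class; running javap -s on the compiled Java class is a quick way to confirm the exact descriptor:

#include <jni.h>

// Sketch: for a hypothetical Java constructor MyInfo(long size, String name),
// the descriptor is "(JLjava/lang/String;)V" and arguments line up by position.
static jobject makeMyInfo(JNIEnv* env, jclass jclazz, jlong size, jstring jname) {
  jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JLjava/lang/String;)V");
  if (mid == nullptr) {
    return nullptr;  // NoSuchMethodError pending
  }
  return env->NewObject(jclazz, mid,
                        size,    // J
                        jname);  // Ljava/lang/String;
}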
+class ColumnFamilyDescriptorJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.ColumnFamilyDescriptor + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyDescriptor"); + } + + /** + * Create a new Java org.rocksdb.ColumnFamilyDescriptor object with the same + * properties as the provided C++ rocksdb::ColumnFamilyDescriptor object + * + * @param env A pointer to the Java environment + * @param cfd A pointer to rocksdb::ColumnFamilyDescriptor object + * + * @return A reference to a Java org.rocksdb.ColumnFamilyDescriptor object, or + * nullptr if an exception occurs + */ + static jobject construct(JNIEnv* env, ColumnFamilyDescriptor* cfd) { + jbyteArray jcf_name = JniUtil::copyBytes(env, cfd->name); + jobject cfopts = ColumnFamilyOptionsJni::construct(env, &(cfd->options)); + + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } -// The portal class for org.rocksdb.StatsLevel -class StatsLevelJni { - public: - // Returns the equivalent org.rocksdb.StatsLevel for the provided - // C++ rocksdb::StatsLevel enum - static jbyte toJavaStatsLevel( - const rocksdb::StatsLevel& stats_level) { - switch(stats_level) { - case rocksdb::StatsLevel::kExceptDetailedTimers: - return 0x0; - case rocksdb::StatsLevel::kExceptTimeForMutex: - return 0x1; - case rocksdb::StatsLevel::kAll: - return 0x2; + jmethodID mid = env->GetMethodID(jclazz, "<init>", + "([BLorg/rocksdb/ColumnFamilyOptions;)V"); + if (mid == nullptr) { + // exception thrown:
NoSuchMethodException or OutOfMemoryError + env->DeleteLocalRef(jcf_name); + return nullptr; + } - default: - // undefined/default - return 0x0; + jobject jcfd = env->NewObject(jclazz, mid, jcf_name, cfopts); + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcf_name); + return nullptr; } + + return jcfd; } - // Returns the equivalent C++ rocksdb::StatsLevel enum for the - // provided Java org.rocksdb.StatsLevel - static rocksdb::StatsLevel toCppStatsLevel(jbyte jstats_level) { - switch(jstats_level) { - case 0x0: - return rocksdb::StatsLevel::kExceptDetailedTimers; - case 0x1: - return rocksdb::StatsLevel::kExceptTimeForMutex; - case 0x2: - return rocksdb::StatsLevel::kAll; + /** + * Get the Java Method: ColumnFamilyDescriptor#columnFamilyName + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - default: - // undefined/default - return rocksdb::StatsLevel::kExceptDetailedTimers; + static jmethodID mid = env->GetMethodID(jclazz, "columnFamilyName", "()[B"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: ColumnFamilyDescriptor#columnFamilyOptions + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } + + static jmethodID mid = env->GetMethodID( + jclazz, "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;"); + assert(mid != nullptr); + return mid; } }; -// various utility functions for working with RocksDB and JNI -class JniUtil { +// The portal class for org.rocksdb.IndexType +class IndexTypeJni { public: - /** - * Obtains a reference to the JNIEnv from - * the JVM - * - * If the current thread is not attached to the JavaVM - * then it will be attached so as to retrieve the JNIEnv - * - * If a thread is attached, it must later be manually - * released by calling JavaVM::DetachCurrentThread.
- * This can be handled by always matching calls to this - * function with calls to {@link JniUtil::releaseJniEnv(JavaVM*, jboolean)} - * - * @param jvm (IN) A pointer to the JavaVM instance - * @param attached (OUT) A pointer to a boolean which - * will be set to JNI_TRUE if we had to attach the thread - * - * @return A pointer to the JNIEnv or nullptr if a fatal error - * occurs and the JNIEnv cannot be retrieved - */ - static JNIEnv* getJniEnv(JavaVM* jvm, jboolean* attached) { - assert(jvm != nullptr); + // Returns the equivalent org.rocksdb.IndexType for the provided + // C++ rocksdb::IndexType enum + static jbyte toJavaIndexType( + const rocksdb::BlockBasedTableOptions::IndexType& index_type) { + switch(index_type) { + case rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch: + return 0x0; + case rocksdb::BlockBasedTableOptions::IndexType::kHashSearch: + return 0x1; + case rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::IndexType enum for the + // provided Java org.rocksdb.IndexType + static rocksdb::BlockBasedTableOptions::IndexType toCppIndexType( + jbyte jindex_type) { + switch(jindex_type) { + case 0x0: + return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; + case 0x1: + return rocksdb::BlockBasedTableOptions::IndexType::kHashSearch; + case 0x2: + return rocksdb::BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + default: + // undefined/default + return rocksdb::BlockBasedTableOptions::IndexType::kBinarySearch; + } + } +}; - JNIEnv *env; - const jint env_rs = jvm->GetEnv(reinterpret_cast<void**>(&env), - JNI_VERSION_1_2); +// The portal class for org.rocksdb.DataBlockIndexType +class DataBlockIndexTypeJni { + public: + // Returns the equivalent org.rocksdb.DataBlockIndexType for the provided + // C++ rocksdb::DataBlockIndexType enum + static jbyte toJavaDataBlockIndexType( + const rocksdb::BlockBasedTableOptions::DataBlockIndexType& index_type) { + switch(index_type) { + case rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch: + return 0x0; + case rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::DataBlockIndexType enum for the + // provided Java org.rocksdb.DataBlockIndexType + static rocksdb::BlockBasedTableOptions::DataBlockIndexType toCppDataBlockIndexType( + jbyte jindex_type) { + switch(jindex_type) { + case 0x0: + return rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch; + case 0x1: + return rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash; + default: + // undefined/default + return rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch; + } + } +}; - if(env_rs == JNI_OK) { - // current thread is already attached, return the JNIEnv - *attached = JNI_FALSE; - return env; - } else if(env_rs == JNI_EDETACHED) { - // current thread is not attached, attempt to attach - const jint rs_attach = jvm->AttachCurrentThread(reinterpret_cast<void**>(&env), NULL); - if(rs_attach == JNI_OK) { - *attached = JNI_TRUE; - return env; - } else { - // error, could not attach the thread - std::cerr << "JniUtil::getJinEnv - Fatal: could not attach current thread to JVM!"
<< std::endl; - return nullptr; - } - } else if(env_rs == JNI_EVERSION) { - // error, JDK does not support JNI_VERSION_1_2+ - std::cerr << "JniUtil::getJinEnv - Fatal: JDK does not support JNI_VERSION_1_2" << std::endl; - return nullptr; - } else { - std::cerr << "JniUtil::getJinEnv - Fatal: Unknown error: env_rs=" << env_rs << std::endl; +// The portal class for org.rocksdb.ChecksumType +class ChecksumTypeJni { + public: + // Returns the equivalent org.rocksdb.ChecksumType for the provided + // C++ rocksdb::ChecksumType enum + static jbyte toJavaChecksumType( + const rocksdb::ChecksumType& checksum_type) { + switch(checksum_type) { + case rocksdb::ChecksumType::kNoChecksum: + return 0x0; + case rocksdb::ChecksumType::kCRC32c: + return 0x1; + case rocksdb::ChecksumType::kxxHash: + return 0x2; + case rocksdb::ChecksumType::kxxHash64: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::ChecksumType enum for the + // provided Java org.rocksdb.ChecksumType + static rocksdb::ChecksumType toCppChecksumType( + jbyte jchecksum_type) { + switch(jchecksum_type) { + case 0x0: + return rocksdb::ChecksumType::kNoChecksum; + case 0x1: + return rocksdb::ChecksumType::kCRC32c; + case 0x2: + return rocksdb::ChecksumType::kxxHash; + case 0x3: + return rocksdb::ChecksumType::kxxHash64; + default: + // undefined/default + return rocksdb::ChecksumType::kCRC32c; + } + } +}; + +// The portal class for org.rocksdb.Priority +class PriorityJni { + public: + // Returns the equivalent org.rocksdb.Priority for the provided + // C++ rocksdb::Env::Priority enum + static jbyte toJavaPriority( + const rocksdb::Env::Priority& priority) { + switch(priority) { + case rocksdb::Env::Priority::BOTTOM: + return 0x0; + case rocksdb::Env::Priority::LOW: + return 0x1; + case rocksdb::Env::Priority::HIGH: + return 0x2; + case rocksdb::Env::Priority::TOTAL: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::env::Priority enum for the + // provided Java org.rocksdb.Priority + static rocksdb::Env::Priority toCppPriority( + jbyte jpriority) { + switch(jpriority) { + case 0x0: + return rocksdb::Env::Priority::BOTTOM; + case 0x1: + return rocksdb::Env::Priority::LOW; + case 0x2: + return rocksdb::Env::Priority::HIGH; + case 0x3: + return rocksdb::Env::Priority::TOTAL; + default: + // undefined/default + return rocksdb::Env::Priority::LOW; + } + } +}; + +// The portal class for org.rocksdb.ThreadType +class ThreadTypeJni { + public: + // Returns the equivalent org.rocksdb.ThreadType for the provided + // C++ rocksdb::ThreadStatus::ThreadType enum + static jbyte toJavaThreadType( + const rocksdb::ThreadStatus::ThreadType& thread_type) { + switch(thread_type) { + case rocksdb::ThreadStatus::ThreadType::HIGH_PRIORITY: + return 0x0; + case rocksdb::ThreadStatus::ThreadType::LOW_PRIORITY: + return 0x1; + case rocksdb::ThreadStatus::ThreadType::USER: + return 0x2; + case rocksdb::ThreadStatus::ThreadType::BOTTOM_PRIORITY: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::ThreadStatus::ThreadType enum for the + // provided Java org.rocksdb.ThreadType + static rocksdb::ThreadStatus::ThreadType toCppThreadType( + jbyte jthread_type) { + switch(jthread_type) { + case 0x0: + return rocksdb::ThreadStatus::ThreadType::HIGH_PRIORITY; + case 0x1: + return rocksdb::ThreadStatus::ThreadType::LOW_PRIORITY; + case 0x2: + return ThreadStatus::ThreadType::USER; + case 0x3: + return 
rocksdb::ThreadStatus::ThreadType::BOTTOM_PRIORITY; + default: + // undefined/default + return rocksdb::ThreadStatus::ThreadType::LOW_PRIORITY; + } + } +}; + +// The portal class for org.rocksdb.OperationType +class OperationTypeJni { + public: + // Returns the equivalent org.rocksdb.OperationType for the provided + // C++ rocksdb::ThreadStatus::OperationType enum + static jbyte toJavaOperationType( + const rocksdb::ThreadStatus::OperationType& operation_type) { + switch(operation_type) { + case rocksdb::ThreadStatus::OperationType::OP_UNKNOWN: + return 0x0; + case rocksdb::ThreadStatus::OperationType::OP_COMPACTION: + return 0x1; + case rocksdb::ThreadStatus::OperationType::OP_FLUSH: + return 0x2; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::ThreadStatus::OperationType enum for the + // provided Java org.rocksdb.OperationType + static rocksdb::ThreadStatus::OperationType toCppOperationType( + jbyte joperation_type) { + switch(joperation_type) { + case 0x0: + return rocksdb::ThreadStatus::OperationType::OP_UNKNOWN; + case 0x1: + return rocksdb::ThreadStatus::OperationType::OP_COMPACTION; + case 0x2: + return rocksdb::ThreadStatus::OperationType::OP_FLUSH; + default: + // undefined/default + return rocksdb::ThreadStatus::OperationType::OP_UNKNOWN; + } + } +}; + +// The portal class for org.rocksdb.OperationStage +class OperationStageJni { + public: + // Returns the equivalent org.rocksdb.OperationStage for the provided + // C++ rocksdb::ThreadStatus::OperationStage enum + static jbyte toJavaOperationStage( + const rocksdb::ThreadStatus::OperationStage& operation_stage) { + switch(operation_stage) { + case rocksdb::ThreadStatus::OperationStage::STAGE_UNKNOWN: + return 0x0; + case rocksdb::ThreadStatus::OperationStage::STAGE_FLUSH_RUN: + return 0x1; + case rocksdb::ThreadStatus::OperationStage::STAGE_FLUSH_WRITE_L0: + return 0x2; + case rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_PREPARE: + return 0x3; + case rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_RUN: + return 0x4; + case rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_PROCESS_KV: + return 0x5; + case rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_INSTALL: + return 0x6; + case rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_SYNC_FILE: + return 0x7; + case rocksdb::ThreadStatus::OperationStage::STAGE_PICK_MEMTABLES_TO_FLUSH: + return 0x8; + case rocksdb::ThreadStatus::OperationStage::STAGE_MEMTABLE_ROLLBACK: + return 0x9; + case rocksdb::ThreadStatus::OperationStage::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS: + return 0xA; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::ThreadStatus::OperationStage enum for the + // provided Java org.rocksdb.OperationStage + static rocksdb::ThreadStatus::OperationStage toCppOperationStage( + jbyte joperation_stage) { + switch(joperation_stage) { + case 0x0: + return rocksdb::ThreadStatus::OperationStage::STAGE_UNKNOWN; + case 0x1: + return rocksdb::ThreadStatus::OperationStage::STAGE_FLUSH_RUN; + case 0x2: + return rocksdb::ThreadStatus::OperationStage::STAGE_FLUSH_WRITE_L0; + case 0x3: + return rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_PREPARE; + case 0x4: + return rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_RUN; + case 0x5: + return rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_PROCESS_KV; + case 0x6: + return rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_INSTALL; + case 0x7: + return 
rocksdb::ThreadStatus::OperationStage::STAGE_COMPACTION_SYNC_FILE; + case 0x8: + return rocksdb::ThreadStatus::OperationStage::STAGE_PICK_MEMTABLES_TO_FLUSH; + case 0x9: + return rocksdb::ThreadStatus::OperationStage::STAGE_MEMTABLE_ROLLBACK; + case 0xA: + return rocksdb::ThreadStatus::OperationStage::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS; + default: + // undefined/default + return rocksdb::ThreadStatus::OperationStage::STAGE_UNKNOWN; + } + } +}; + +// The portal class for org.rocksdb.StateType +class StateTypeJni { + public: + // Returns the equivalent org.rocksdb.StateType for the provided + // C++ rocksdb::ThreadStatus::StateType enum + static jbyte toJavaStateType( + const rocksdb::ThreadStatus::StateType& state_type) { + switch(state_type) { + case rocksdb::ThreadStatus::StateType::STATE_UNKNOWN: + return 0x0; + case rocksdb::ThreadStatus::StateType::STATE_MUTEX_WAIT: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::ThreadStatus::StateType enum for the + // provided Java org.rocksdb.StateType + static rocksdb::ThreadStatus::StateType toCppStateType( + jbyte jstate_type) { + switch(jstate_type) { + case 0x0: + return rocksdb::ThreadStatus::StateType::STATE_UNKNOWN; + case 0x1: + return rocksdb::ThreadStatus::StateType::STATE_MUTEX_WAIT; + default: + // undefined/default + return rocksdb::ThreadStatus::StateType::STATE_UNKNOWN; + } + } +}; + +// The portal class for org.rocksdb.ThreadStatus +class ThreadStatusJni : public JavaClass { + public: + /** + * Get the Java Class org.rocksdb.ThreadStatus + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/ThreadStatus"); + } + + /** + * Create a new Java org.rocksdb.ThreadStatus object with the same + * properties as the provided C++ rocksdb::ThreadStatus object + * + * @param env A pointer to the Java environment + * @param thread_status A pointer to rocksdb::ThreadStatus object + * + * @return A reference to a Java org.rocksdb.ThreadStatus object, or + * nullptr if an exception occurs + */ + static jobject construct(JNIEnv* env, + const rocksdb::ThreadStatus* thread_status) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JBLjava/lang/String;Ljava/lang/String;BJB[JB)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + jstring jdb_name = + JniUtil::toJavaString(env, &(thread_status->db_name), true); + if (env->ExceptionCheck()) { + // an error occurred + return nullptr; + } - /** - * Counterpart to {@link JniUtil::getJniEnv(JavaVM*, jboolean*)} - * - * Detachess the current thread from the JVM if it was previously - * attached - * - * @param jvm (IN) A pointer to the JavaVM instance - * @param attached (IN) JNI_TRUE if we previously had to attach the thread - * to the JavaVM to get the JNIEnv - */ - static void releaseJniEnv(JavaVM* jvm, jboolean& attached) { - assert(jvm != nullptr); - if(attached == JNI_TRUE) { - const jint rs_detach = jvm->DetachCurrentThread(); - assert(rs_detach == JNI_OK); - if(rs_detach != JNI_OK) { - std::cerr << "JniUtil::getJinEnv - Warn: Unable to detach current thread from
JVM!" << std::endl; - } - } + jstring jcf_name = + JniUtil::toJavaString(env, &(thread_status->cf_name), true); + if (env->ExceptionCheck()) { + // an error occurred + env->DeleteLocalRef(jdb_name); + return nullptr; } - /** - * Copies a Java String[] to a C++ std::vector<std::string> - * - * @param env (IN) A pointer to the java environment - * @param jss (IN) The Java String array to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError or ArrayIndexOutOfBoundsException - * exception occurs - * - * @return A std::vector<std::string> containing copies of the Java strings - */ - static std::vector<std::string> copyStrings(JNIEnv* env, - jobjectArray jss, jboolean* has_exception) { - return rocksdb::JniUtil::copyStrings(env, jss, - env->GetArrayLength(jss), has_exception); } + // long[] + const jsize len = static_cast<jsize>(rocksdb::ThreadStatus::kNumOperationProperties); + jlongArray joperation_properties = + env->NewLongArray(len); + if (joperation_properties == nullptr) { + // an exception occurred + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + return nullptr; + } + jlong *body = env->GetLongArrayElements(joperation_properties, nullptr); + if (body == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(joperation_properties); + return nullptr; + } + for (size_t i = 0; i < len; ++i) { + body[i] = static_cast<jlong>(thread_status->op_properties[i]); + } + env->ReleaseLongArrayElements(joperation_properties, body, 0); + + jobject jcfd = env->NewObject(jclazz, mid, + static_cast<jlong>(thread_status->thread_id), + ThreadTypeJni::toJavaThreadType(thread_status->thread_type), + jdb_name, + jcf_name, + OperationTypeJni::toJavaOperationType(thread_status->operation_type), + static_cast<jlong>(thread_status->op_elapsed_micros), + OperationStageJni::toJavaOperationStage(thread_status->operation_stage), + joperation_properties, + StateTypeJni::toJavaStateType(thread_status->state_type)); + if (env->ExceptionCheck()) { + // exception occurred + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(joperation_properties); + return nullptr; } - /** - * Copies a Java String[] to a C++ std::vector<std::string> - * - * @param env (IN) A pointer to the java environment - * @param jss (IN) The Java String array to copy - * @param jss_len (IN) The length of the Java String array to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError or ArrayIndexOutOfBoundsException - * exception occurs - * - * @return A std::vector<std::string> containing copies of the Java strings - */ - static std::vector<std::string> copyStrings(JNIEnv* env, - jobjectArray jss, const jsize jss_len, jboolean* has_exception) { - std::vector<std::string> strs; - for (jsize i = 0; i < jss_len; i++) { - jobject js = env->GetObjectArrayElement(jss, i); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - *has_exception = JNI_TRUE; - return strs; - } + // cleanup + env->DeleteLocalRef(jdb_name); + env->DeleteLocalRef(jcf_name); + env->DeleteLocalRef(joperation_properties); + + return jcfd; + } +}; + +// The portal class for org.rocksdb.CompactionStyle +class CompactionStyleJni { + public: + // Returns the equivalent org.rocksdb.CompactionStyle for the provided + // C++ rocksdb::CompactionStyle enum + static jbyte toJavaCompactionStyle( + const rocksdb::CompactionStyle& compaction_style) { + switch(compaction_style) { + case rocksdb::CompactionStyle::kCompactionStyleLevel: + return 0x0; + case
rocksdb::CompactionStyle::kCompactionStyleUniversal: + return 0x1; + case rocksdb::CompactionStyle::kCompactionStyleFIFO: + return 0x2; + case rocksdb::CompactionStyle::kCompactionStyleNone: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::CompactionStyle enum for the + // provided Java org.rocksdb.CompactionStyle + static rocksdb::CompactionStyle toCppCompactionStyle( + jbyte jcompaction_style) { + switch(jcompaction_style) { + case 0x0: + return rocksdb::CompactionStyle::kCompactionStyleLevel; + case 0x1: + return rocksdb::CompactionStyle::kCompactionStyleUniversal; + case 0x2: + return rocksdb::CompactionStyle::kCompactionStyleFIFO; + case 0x3: + return rocksdb::CompactionStyle::kCompactionStyleNone; + default: + // undefined/default + return rocksdb::CompactionStyle::kCompactionStyleLevel; + } + } +}; + +// The portal class for org.rocksdb.CompactionReason +class CompactionReasonJni { + public: + // Returns the equivalent org.rocksdb.CompactionReason for the provided + // C++ rocksdb::CompactionReason enum + static jbyte toJavaCompactionReason( + const rocksdb::CompactionReason& compaction_reason) { + switch(compaction_reason) { + case rocksdb::CompactionReason::kUnknown: + return 0x0; + case rocksdb::CompactionReason::kLevelL0FilesNum: + return 0x1; + case rocksdb::CompactionReason::kLevelMaxLevelSize: + return 0x2; + case rocksdb::CompactionReason::kUniversalSizeAmplification: + return 0x3; + case rocksdb::CompactionReason::kUniversalSizeRatio: + return 0x4; + case rocksdb::CompactionReason::kUniversalSortedRunNum: + return 0x5; + case rocksdb::CompactionReason::kFIFOMaxSize: + return 0x6; + case rocksdb::CompactionReason::kFIFOReduceNumFiles: + return 0x7; + case rocksdb::CompactionReason::kFIFOTtl: + return 0x8; + case rocksdb::CompactionReason::kManualCompaction: + return 0x9; + case rocksdb::CompactionReason::kFilesMarkedForCompaction: + return 0x10; + case rocksdb::CompactionReason::kBottommostFiles: + return 0x0A; + case rocksdb::CompactionReason::kTtl: + return 0x0B; + case rocksdb::CompactionReason::kFlush: + return 0x0C; + case rocksdb::CompactionReason::kExternalSstIngestion: + return 0x0D; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::CompactionReason enum for the + // provided Java org.rocksdb.CompactionReason + static rocksdb::CompactionReason toCppCompactionReason( + jbyte jcompaction_reason) { + switch(jcompaction_reason) { + case 0x0: + return rocksdb::CompactionReason::kUnknown; + case 0x1: + return rocksdb::CompactionReason::kLevelL0FilesNum; + case 0x2: + return rocksdb::CompactionReason::kLevelMaxLevelSize; + case 0x3: + return rocksdb::CompactionReason::kUniversalSizeAmplification; + case 0x4: + return rocksdb::CompactionReason::kUniversalSizeRatio; + case 0x5: + return rocksdb::CompactionReason::kUniversalSortedRunNum; + case 0x6: + return rocksdb::CompactionReason::kFIFOMaxSize; + case 0x7: + return rocksdb::CompactionReason::kFIFOReduceNumFiles; + case 0x8: + return rocksdb::CompactionReason::kFIFOTtl; + case 0x9: + return rocksdb::CompactionReason::kManualCompaction; + case 0x10: + return rocksdb::CompactionReason::kFilesMarkedForCompaction; + case 0x0A: + return rocksdb::CompactionReason::kBottommostFiles; + case 0x0B: + return rocksdb::CompactionReason::kTtl; + case 0x0C: + return rocksdb::CompactionReason::kFlush; + case 0x0D: + return rocksdb::CompactionReason::kExternalSstIngestion; + default: + // undefined/default + return 
rocksdb::CompactionReason::kUnknown; + } + } +}; + +// The portal class for org.rocksdb.WalFileType +class WalFileTypeJni { + public: + // Returns the equivalent org.rocksdb.WalFileType for the provided + // C++ rocksdb::WalFileType enum + static jbyte toJavaWalFileType( + const rocksdb::WalFileType& wal_file_type) { + switch(wal_file_type) { + case rocksdb::WalFileType::kArchivedLogFile: + return 0x0; + case rocksdb::WalFileType::kAliveLogFile: + return 0x1; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::WalFileType enum for the + // provided Java org.rocksdb.WalFileType + static rocksdb::WalFileType toCppWalFileType( + jbyte jwal_file_type) { + switch(jwal_file_type) { + case 0x0: + return rocksdb::WalFileType::kArchivedLogFile; + case 0x1: + return rocksdb::WalFileType::kAliveLogFile; + default: + // undefined/default + return rocksdb::WalFileType::kAliveLogFile; + } + } +}; + +class LogFileJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.LogFile object. + * + * @param env A pointer to the Java environment + * @param log_file A Cpp log file object + * + * @return A reference to a Java org.rocksdb.LogFile object, or + * nullptr if an exception occurs + */ + static jobject fromCppLogFile(JNIEnv* env, rocksdb::LogFile* log_file) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - jstring jstr = static_cast<jstring>(js); - const char* str = env->GetStringUTFChars(jstr, nullptr); - if(str == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(js); - *has_exception = JNI_TRUE; - return strs; - } + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;JBJJ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } - strs.push_back(std::string(str)); + std::string path_name = log_file->PathName(); + jstring jpath_name = rocksdb::JniUtil::toJavaString(env, &path_name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + return nullptr; + } - env->ReleaseStringUTFChars(jstr, str); - env->DeleteLocalRef(js); - } + jobject jlog_file = env->NewObject(jclazz, mid, + jpath_name, + static_cast<jlong>(log_file->LogNumber()), + rocksdb::WalFileTypeJni::toJavaWalFileType(log_file->Type()), + static_cast<jlong>(log_file->StartSequence()), + static_cast<jlong>(log_file->SizeFileBytes()) + ); - *has_exception = JNI_FALSE; - return strs; + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jpath_name); + return nullptr; } - /** - * Copies a jstring to a std::string - * and releases the original jstring - * - * If an exception occurs, then JNIEnv::ExceptionCheck() - * will have been called - * - * @param env (IN) A pointer to the java environment - * @param js (IN) The java string to copy - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - * - * @return A std:string copy of the jstring, or an - * empty std::string if has_exception == JNI_TRUE - */ - static std::string copyString(JNIEnv* env, jstring js, - jboolean* has_exception) { - const char *utf = env->GetStringUTFChars(js, nullptr); - if(utf == nullptr) { - // exception thrown: OutOfMemoryError - env->ExceptionCheck(); - *has_exception = JNI_TRUE; - return std::string(); - } else if(env->ExceptionCheck()) { - // exception thrown - env->ReleaseStringUTFChars(js, utf); - *has_exception = JNI_TRUE; - return std::string(); - } + // cleanup + env->DeleteLocalRef(jpath_name); - std::string name(utf); - env->ReleaseStringUTFChars(js, utf); - *has_exception = JNI_FALSE; - return name; + return jlog_file; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/LogFile"); + } +};
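Helpers like fromCppLogFile are typically invoked in a loop when surfacing a whole WAL listing to Java. A hedged sketch of such a caller; the wrapper function is illustrative, though rocksdb::DB::GetSortedWalFiles does hand back a vector of unique_ptr<LogFile>:

#include <jni.h>
#include <memory>
#include <vector>

// Sketch: convert a vector of C++ LogFile handles into a Java LogFile[].
// On any failure, the pending Java exception is left for the caller.
static jobjectArray toJavaLogFiles(
    JNIEnv* env, const std::vector<std::unique_ptr<rocksdb::LogFile>>& files) {
  jclass jclazz = rocksdb::LogFileJni::getJClass(env);
  if (jclazz == nullptr) {
    return nullptr;
  }
  jobjectArray jfiles =
      env->NewObjectArray(static_cast<jsize>(files.size()), jclazz, nullptr);
  if (jfiles == nullptr) {
    return nullptr;
  }
  for (jsize i = 0; i < static_cast<jsize>(files.size()); ++i) {
    jobject jlog_file = rocksdb::LogFileJni::fromCppLogFile(env, files[i].get());
    if (jlog_file == nullptr) {
      return nullptr;
    }
    env->SetObjectArrayElement(jfiles, i, jlog_file);
    env->DeleteLocalRef(jlog_file);  // the array now holds its own reference
  }
  return jfiles;
}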
+ +class LiveFileMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.LiveFileMetaData object. + * + * @param env A pointer to the Java environment + * @param live_file_meta_data A Cpp live file meta data object + * + * @return A reference to a Java org.rocksdb.LiveFileMetaData object, or + * nullptr if an exception occurs + */ + static jobject fromCppLiveFileMetaData(JNIEnv* env, + rocksdb::LiveFileMetaData* live_file_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } - /** - * Copies bytes from a std::string to a jByteArray - * - * @param env A pointer to the java environment - * @param bytes The bytes to copy - * - * @return the Java byte[] or nullptr if an exception occurs - */ - static jbyteArray copyBytes(JNIEnv* env, std::string bytes) { - const jsize jlen = static_cast<jsize>(bytes.size()); + jmethodID mid = env->GetMethodID(jclazz, "<init>", "([BILjava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } - jbyteArray jbytes = env->NewByteArray(jlen); - if(jbytes == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + jbyteArray jcolumn_family_name = rocksdb::JniUtil::copyBytes( + env, live_file_meta_data->column_family_name); + if (jcolumn_family_name == nullptr) { + // exception occurred creating java byte array + return nullptr; + } - env->SetByteArrayRegion(jbytes, 0, jlen, - const_cast<jbyte*>(reinterpret_cast<const jbyte*>(bytes.c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jbytes); - return nullptr; - } + jstring jfile_name = rocksdb::JniUtil::toJavaString( + env, &live_file_meta_data->name, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + return nullptr; + } - return jbytes; + jstring jpath = rocksdb::JniUtil::toJavaString( + env, &live_file_meta_data->db_path, true); + if (env->ExceptionCheck()) { + // exception occurred creating java string + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + return nullptr; } - /** - * Given a Java byte[][] which is an array of java.lang.Strings - * where each String is a byte[], the passed function `string_fn` - * will be called on each String, the result is the collected by - * calling the passed function `collector_fn` - * - * @param env (IN) A pointer to the java environment - * @param jbyte_strings (IN) A Java array of Strings expressed as bytes - * @param string_fn (IN) A transform function to call for each String - * @param collector_fn (IN) A collector which is called for the result - * of each `string_fn` - * @param has_exception (OUT) will be set to JNI_TRUE - * if an ArrayIndexOutOfBoundsException or OutOfMemoryError - * exception occurs - */ - template <class T> static void byteStrings(JNIEnv* env, - jobjectArray jbyte_strings, - std::function<T(const char*, const size_t)> string_fn, - std::function<void(size_t, T)> collector_fn, - jboolean *has_exception) { - const jsize jlen = env->GetArrayLength(jbyte_strings); + jbyteArray jsmallest_key = rocksdb::JniUtil::copyBytes( + env, live_file_meta_data->smallestkey); + if (jsmallest_key == nullptr) { + // exception
occurred creating java byte array + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + return nullptr; + } - for(jsize i = 0; i < jlen; i++) { - jobject jbyte_string_obj = env->GetObjectArrayElement(jbyte_strings, i); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - *has_exception = JNI_TRUE; // signal error - return; - } + jbyteArray jlargest_key = rocksdb::JniUtil::copyBytes( + env, live_file_meta_data->largestkey); + if (jlargest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + return nullptr; + } - jbyteArray jbyte_string_ary = - reinterpret_cast<jbyteArray>(jbyte_string_obj); - T result = byteString(env, jbyte_string_ary, string_fn, has_exception); + jobject jlive_file_meta_data = env->NewObject(jclazz, mid, + jcolumn_family_name, + static_cast<jint>(live_file_meta_data->level), + jfile_name, + jpath, + static_cast<jlong>(live_file_meta_data->size), + static_cast<jlong>(live_file_meta_data->smallest_seqno), + static_cast<jlong>(live_file_meta_data->largest_seqno), + jsmallest_key, + jlargest_key, + static_cast<jlong>(live_file_meta_data->num_reads_sampled), + static_cast<jboolean>(live_file_meta_data->being_compacted), + static_cast<jlong>(live_file_meta_data->num_entries), + static_cast<jlong>(live_file_meta_data->num_deletions) + ); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } - env->DeleteLocalRef(jbyte_string_obj); + // cleanup + env->DeleteLocalRef(jcolumn_family_name); + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); - if(*has_exception == JNI_TRUE) { - // exception thrown: OutOfMemoryError - return; - } + return jlive_file_meta_data; + } - collector_fn(i, result); - } + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/LiveFileMetaData"); + } +}; - *has_exception = JNI_FALSE; +class SstFileMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.SstFileMetaData object.
+ * + * @param env A pointer to the Java environment + * @param sst_file_meta_data A Cpp sst file meta data object + * + * @return A reference to a Java org.rocksdb.SstFileMetaData object, or + * nullptr if an exception occurs + */ + static jobject fromCppSstFileMetaData(JNIEnv* env, + const rocksdb::SstFileMetaData* sst_file_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } - /** - * Given a Java String which is expressed as a Java Byte Array byte[], - * the passed function `string_fn` will be called on the String - * and the result returned - * - * @param env (IN) A pointer to the java environment - * @param jbyte_string_ary (IN) A Java String expressed in bytes - * @param string_fn (IN) A transform function to call on the String - * @param has_exception (OUT) will be set to JNI_TRUE - * if an OutOfMemoryError exception occurs - */ - template <class T> static T byteString(JNIEnv* env, - jbyteArray jbyte_string_ary, - std::function<T(const char*, const size_t)> string_fn, - jboolean* has_exception) { - const jsize jbyte_string_len = env->GetArrayLength(jbyte_string_ary); - jbyte* jbyte_string = - env->GetByteArrayElements(jbyte_string_ary, nullptr); - if(jbyte_string == nullptr) { - // exception thrown: OutOfMemoryError - *has_exception = JNI_TRUE; - return nullptr; // signal error - } + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(Ljava/lang/String;Ljava/lang/String;JJJ[B[BJZJJ)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } - T result = - string_fn(reinterpret_cast<char*>(jbyte_string), jbyte_string_len); + jstring jfile_name = rocksdb::JniUtil::toJavaString( + env, &sst_file_meta_data->name, true); + if (jfile_name == nullptr) { + // exception occurred creating java byte array + return nullptr; + } - env->ReleaseByteArrayElements(jbyte_string_ary, jbyte_string, JNI_ABORT); + jstring jpath = rocksdb::JniUtil::toJavaString( + env, &sst_file_meta_data->db_path, true); + if (jpath == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + return nullptr; + } - *has_exception = JNI_FALSE; - return result; + jbyteArray jsmallest_key = rocksdb::JniUtil::copyBytes( + env, sst_file_meta_data->smallestkey); + if (jsmallest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + return nullptr; } - /** - * Converts a std::vector<std::string> to a Java byte[][] where each Java String - * is expressed as a Java Byte Array byte[].
- * - * @param env A pointer to the java environment - * @param strings A vector of Strings - * - * @return A Java array of Strings expressed as bytes - */ - static jobjectArray stringsBytes(JNIEnv* env, std::vector<std::string> strings) { - jclass jcls_ba = ByteJni::getArrayJClass(env); - if(jcls_ba == nullptr) { - // exception occurred - return nullptr; - } + jbyteArray jlargest_key = rocksdb::JniUtil::copyBytes( + env, sst_file_meta_data->largestkey); + if (jlargest_key == nullptr) { + // exception occurred creating java byte array + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + return nullptr; + } - const jsize len = static_cast<jsize>(strings.size()); + jobject jsst_file_meta_data = env->NewObject(jclazz, mid, + jfile_name, + jpath, + static_cast<jlong>(sst_file_meta_data->size), + static_cast<jlong>(sst_file_meta_data->smallest_seqno), + static_cast<jlong>(sst_file_meta_data->largest_seqno), + jsmallest_key, + jlargest_key, + static_cast<jlong>(sst_file_meta_data->num_reads_sampled), + static_cast<jboolean>(sst_file_meta_data->being_compacted), + static_cast<jlong>(sst_file_meta_data->num_entries), + static_cast<jlong>(sst_file_meta_data->num_deletions) + ); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); + return nullptr; + } - jobjectArray jbyte_strings = env->NewObjectArray(len, jcls_ba, nullptr); - if(jbyte_strings == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + // cleanup + env->DeleteLocalRef(jfile_name); + env->DeleteLocalRef(jpath); + env->DeleteLocalRef(jsmallest_key); + env->DeleteLocalRef(jlargest_key); - for (jsize i = 0; i < len; i++) { - std::string *str = &strings[i]; - const jsize str_len = static_cast<jsize>(str->size()); + return jsst_file_meta_data; + } - jbyteArray jbyte_string_ary = env->NewByteArray(str_len); - if(jbyte_string_ary == nullptr) { - // exception thrown: OutOfMemoryError - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/SstFileMetaData"); + } +}; - env->SetByteArrayRegion( - jbyte_string_ary, 0, str_len, - const_cast<jbyte*>(reinterpret_cast<const jbyte*>(str->c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jbyte_string_ary); - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } +class LevelMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.LevelMetaData object.
+ * + * @param env A pointer to the Java environment + * @param level_meta_data A Cpp level meta data object + * + * @return A reference to a Java org.rocksdb.LevelMetaData object, or + * nullptr if an exception occurs + */ + static jobject fromCppLevelMetaData(JNIEnv* env, + const rocksdb::LevelMetaData* level_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(IJ[Lorg/rocksdb/SstFileMetaData;)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } - env->SetObjectArrayElement(jbyte_strings, i, jbyte_string_ary); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - // or ArrayStoreException - env->DeleteLocalRef(jbyte_string_ary); - env->DeleteLocalRef(jbyte_strings); - return nullptr; - } + const jsize jlen = + static_cast<jsize>(level_meta_data->files.size()); + jobjectArray jfiles = env->NewObjectArray(jlen, SstFileMetaDataJni::getJClass(env), nullptr); + if (jfiles == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } - env->DeleteLocalRef(jbyte_string_ary); + jsize i = 0; + for (auto it = level_meta_data->files.begin(); + it != level_meta_data->files.end(); ++it) { + jobject jfile = SstFileMetaDataJni::fromCppSstFileMetaData(env, &(*it)); + if (jfile == nullptr) { + // exception occurred + env->DeleteLocalRef(jfiles); + return nullptr; } + env->SetObjectArrayElement(jfiles, i++, jfile); + } - return jbyte_strings; + jobject jlevel_meta_data = env->NewObject(jclazz, mid, + static_cast<jint>(level_meta_data->level), + static_cast<jlong>(level_meta_data->size), + jfiles + ); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jfiles); + return nullptr; } - /* - * Helper for operations on a key and value - * for example WriteBatch->Put - * - * TODO(AR) could be extended to cover returning rocksdb::Status - * from `op` and used for RocksDB->Put etc. - */ - static void kv_op( - std::function<void(rocksdb::Slice, rocksdb::Slice)> op, - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return; - } + // cleanup + env->DeleteLocalRef(jfiles); - jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } - return; - } + return jlevel_meta_data; + } - rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast<char*>(value), - jentry_value_len); + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/LevelMetaData"); + } +}; - op(key_slice, value_slice); +class ColumnFamilyMetaDataJni : public JavaClass { + public: + /** + * Create a new Java org.rocksdb.ColumnFamilyMetaData object.
+ * + * @param env A pointer to the Java environment + * @param column_famly_meta_data A Cpp column family meta data object + * + * @return A reference to a Java org.rocksdb.ColumnFamilyMetaData object, or + * nullptr if an exception occurs + */ + static jobject fromCppColumnFamilyMetaData(JNIEnv* env, + const rocksdb::ColumnFamilyMetaData* column_famly_meta_data) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - if(value != nullptr) { - env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); - } - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } + jmethodID mid = env->GetMethodID(jclazz, "<init>", "(JJ[B[Lorg/rocksdb/LevelMetaData;)V"); + if (mid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; } - /* - * Helper for operations on a key - * for example WriteBatch->Delete - * - * TODO(AR) could be extended to cover returning rocksdb::Status - * from `op` and used for RocksDB->Delete etc. - */ - static void k_op( - std::function<void(rocksdb::Slice)> op, - JNIEnv* env, jobject jobj, - jbyteArray jkey, jint jkey_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return; + jbyteArray jname = rocksdb::JniUtil::copyBytes( + env, column_famly_meta_data->name); + if (jname == nullptr) { + // exception occurred creating java byte array + return nullptr; + } + + const jsize jlen = + static_cast<jsize>(column_famly_meta_data->levels.size()); + jobjectArray jlevels = env->NewObjectArray(jlen, LevelMetaDataJni::getJClass(env), nullptr); + if(jlevels == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jname); + return nullptr; + } + + jsize i = 0; + for (auto it = column_famly_meta_data->levels.begin(); + it != column_famly_meta_data->levels.end(); ++it) { + jobject jlevel = LevelMetaDataJni::fromCppLevelMetaData(env, &(*it)); + if (jlevel == nullptr) { + // exception occurred + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); + return nullptr; } + env->SetObjectArrayElement(jlevels, i++, jlevel); + } - rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len); + jobject jcolumn_family_meta_data = env->NewObject(jclazz, mid, + static_cast<jlong>(column_famly_meta_data->size), + static_cast<jlong>(column_famly_meta_data->file_count), + jname, + jlevels + ); + + if (env->ExceptionCheck()) { + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); + return nullptr; + } - op(key_slice); + // cleanup + env->DeleteLocalRef(jname); + env->DeleteLocalRef(jlevels); - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } + return jcolumn_family_meta_data; + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, "org/rocksdb/ColumnFamilyMetaData"); + } +}; + +// The portal class for org.rocksdb.AbstractTraceWriter +class AbstractTraceWriterJni : public RocksDBNativeClass< + const rocksdb::TraceWriterJniCallback*, + AbstractTraceWriterJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractTraceWriter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractTraceWriter"); + } + + /** + * Get the Java Method:
AbstractTraceWriter#write + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getWriteProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; } - /* - * Helper for operations on a value - * for example WriteBatchWithIndex->GetFromBatch - */ - static jbyteArray v_op( - std::function<rocksdb::Status(rocksdb::Slice, std::string*)> op, - JNIEnv* env, jbyteArray jkey, jint jkey_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - return nullptr; - } + static jmethodID mid = env->GetMethodID( + jclazz, "writeProxy", "(J)S"); + assert(mid != nullptr); + return mid; + } - rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len); + /** + * Get the Java Method: AbstractTraceWriter#closeWriter + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getCloseWriterProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - std::string value; - rocksdb::Status s = op(key_slice, &value); + static jmethodID mid = env->GetMethodID( + jclazz, "closeWriterProxy", "()S"); + assert(mid != nullptr); + return mid; + } - if(key != nullptr) { - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - } + /** + * Get the Java Method: AbstractTraceWriter#getFileSize + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getGetFileSizeMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - if (s.IsNotFound()) { - return nullptr; - } + static jmethodID mid = env->GetMethodID( + jclazz, "getFileSize", "()J"); + assert(mid != nullptr); + return mid; + } +}; - if (s.ok()) { - jbyteArray jret_value = - env->NewByteArray(static_cast<jsize>(value.size())); - if(jret_value == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } +// The portal class for org.rocksdb.AbstractWalFilter +class AbstractWalFilterJni : public RocksDBNativeClass< + const rocksdb::WalFilterJniCallback*, + AbstractWalFilterJni> { + public: + /** + * Get the Java Class org.rocksdb.AbstractWalFilter + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractWalFilter"); + } - env->SetByteArrayRegion(jret_value, 0, static_cast<jsize>(value.size()), - const_cast<jbyte*>(reinterpret_cast<const jbyte*>(value.c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - if(jret_value != nullptr) { - env->DeleteLocalRef(jret_value); - } - return nullptr; - } + /** + * Get the Java Method: AbstractWalFilter#columnFamilyLogNumberMap + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getColumnFamilyLogNumberMapMethodId(JNIEnv* env) { + jclass jclazz =
getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } - return jret_value; - } + static jmethodID mid = env->GetMethodID( + jclazz, "columnFamilyLogNumberMap", + "(Ljava/util/Map;Ljava/util/Map;)V"); + assert(mid != nullptr); + return mid; + } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + /** + * Get the Java Method: AbstractTraceWriter#logRecordFoundProxy + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getLogRecordFoundProxyMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + static jmethodID mid = env->GetMethodID( + jclazz, "logRecordFoundProxy", "(JLjava/lang/String;JJ)S"); + assert(mid != nullptr); + return mid; + } + + /** + * Get the Java Method: AbstractTraceWriter#name + * + * @param env A pointer to the Java environment + * + * @return The Java Method ID or nullptr if the class or method id could not + * be retrieved + */ + static jmethodID getNameMethodId(JNIEnv* env) { + jclass jclazz = getJClass(env); + if(jclazz == nullptr) { + // exception occurred accessing class return nullptr; } + + static jmethodID mid = env->GetMethodID( + jclazz, "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } }; +// The portal class for org.rocksdb.WalProcessingOption +class WalProcessingOptionJni { + public: + // Returns the equivalent org.rocksdb.WalProcessingOption for the provided + // C++ rocksdb::WalFilter::WalProcessingOption enum + static jbyte toJavaWalProcessingOption( + const rocksdb::WalFilter::WalProcessingOption& wal_processing_option) { + switch(wal_processing_option) { + case rocksdb::WalFilter::WalProcessingOption::kContinueProcessing: + return 0x0; + case rocksdb::WalFilter::WalProcessingOption::kIgnoreCurrentRecord: + return 0x1; + case rocksdb::WalFilter::WalProcessingOption::kStopReplay: + return 0x2; + case rocksdb::WalFilter::WalProcessingOption::kCorruptedRecord: + return 0x3; + default: + return 0x7F; // undefined + } + } + + // Returns the equivalent C++ rocksdb::WalFilter::WalProcessingOption enum for + // the provided Java org.rocksdb.WalProcessingOption + static rocksdb::WalFilter::WalProcessingOption toCppWalProcessingOption( + jbyte jwal_processing_option) { + switch(jwal_processing_option) { + case 0x0: + return rocksdb::WalFilter::WalProcessingOption::kContinueProcessing; + case 0x1: + return rocksdb::WalFilter::WalProcessingOption::kIgnoreCurrentRecord; + case 0x2: + return rocksdb::WalFilter::WalProcessingOption::kStopReplay; + case 0x3: + return rocksdb::WalFilter::WalProcessingOption::kCorruptedRecord; + default: + // undefined/default + return rocksdb::WalFilter::WalProcessingOption::kCorruptedRecord; + } + } +}; } // namespace rocksdb #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/thirdparty/rocksdb/java/rocksjni/ratelimiterjni.cc b/thirdparty/rocksdb/java/rocksjni/ratelimiterjni.cc index b4174ff102..0804c2fbca 100644 --- a/thirdparty/rocksdb/java/rocksjni/ratelimiterjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/ratelimiterjni.cc @@ -5,23 +5,26 @@ // // This file implements the "bridge" between Java and C++ for RateLimiter.
-#include "rocksjni/portal.h" #include "include/org_rocksdb_RateLimiter.h" #include "rocksdb/rate_limiter.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_RateLimiter * Method: newRateLimiterHandle - * Signature: (JJI)J + * Signature: (JJIBZ)J */ jlong Java_org_rocksdb_RateLimiter_newRateLimiterHandle( - JNIEnv* env, jclass jclazz, jlong jrate_bytes_per_second, - jlong jrefill_period_micros, jint jfairness) { - auto * sptr_rate_limiter = + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jrate_bytes_per_second, + jlong jrefill_period_micros, jint jfairness, jbyte jrate_limiter_mode, + jboolean jauto_tune) { + auto rate_limiter_mode = + rocksdb::RateLimiterModeJni::toCppRateLimiterMode(jrate_limiter_mode); + auto* sptr_rate_limiter = new std::shared_ptr(rocksdb::NewGenericRateLimiter( static_cast(jrate_bytes_per_second), static_cast(jrefill_period_micros), - static_cast(jfairness))); + static_cast(jfairness), rate_limiter_mode, jauto_tune)); return reinterpret_cast(sptr_rate_limiter); } @@ -31,10 +34,11 @@ jlong Java_org_rocksdb_RateLimiter_newRateLimiterHandle( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RateLimiter_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { +void Java_org_rocksdb_RateLimiter_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* handle = - reinterpret_cast *>(jhandle); + reinterpret_cast*>(jhandle); delete handle; // delete std::shared_ptr } @@ -43,11 +47,26 @@ void Java_org_rocksdb_RateLimiter_disposeInternal( * Method: setBytesPerSecond * Signature: (JJ)V */ -void Java_org_rocksdb_RateLimiter_setBytesPerSecond( - JNIEnv* env, jobject jobj, jlong handle, - jlong jbytes_per_second) { - reinterpret_cast *>(handle)->get()-> - SetBytesPerSecond(jbytes_per_second); +void Java_org_rocksdb_RateLimiter_setBytesPerSecond(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle, + jlong jbytes_per_second) { + reinterpret_cast*>(handle) + ->get() + ->SetBytesPerSecond(jbytes_per_second); +} + +/* + * Class: org_rocksdb_RateLimiter + * Method: getBytesPerSecond + * Signature: (J)J + */ +jlong Java_org_rocksdb_RateLimiter_getBytesPerSecond(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast*>(handle) + ->get() + ->GetBytesPerSecond(); } /* @@ -55,11 +74,11 @@ void Java_org_rocksdb_RateLimiter_setBytesPerSecond( * Method: request * Signature: (JJ)V */ -void Java_org_rocksdb_RateLimiter_request( - JNIEnv* env, jobject jobj, jlong handle, - jlong jbytes) { - reinterpret_cast *>(handle)->get()-> - Request(jbytes, rocksdb::Env::IO_TOTAL); +void Java_org_rocksdb_RateLimiter_request(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle, jlong jbytes) { + reinterpret_cast*>(handle) + ->get() + ->Request(jbytes, rocksdb::Env::IO_TOTAL); } /* @@ -67,10 +86,12 @@ void Java_org_rocksdb_RateLimiter_request( * Method: getSingleBurstBytes * Signature: (J)J */ -jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes( - JNIEnv* env, jobject jobj, jlong handle) { - return reinterpret_cast *>(handle)-> - get()->GetSingleBurstBytes(); +jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast*>(handle) + ->get() + ->GetSingleBurstBytes(); } /* @@ -78,10 +99,12 @@ jlong Java_org_rocksdb_RateLimiter_getSingleBurstBytes( * Method: getTotalBytesThrough * Signature: (J)J */ -jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough( - JNIEnv* env, jobject jobj, jlong handle) { - return reinterpret_cast *>(handle)-> - 
get()->GetTotalBytesThrough(); +jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter>*>(handle) + ->get() + ->GetTotalBytesThrough(); } /* @@ -89,8 +112,10 @@ jlong Java_org_rocksdb_RateLimiter_getTotalBytesThrough( * Method: getTotalRequests * Signature: (J)J */ -jlong Java_org_rocksdb_RateLimiter_getTotalRequests( - JNIEnv* env, jobject jobj, jlong handle) { - return reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter> *>(handle)-> - get()->GetTotalRequests(); +jlong Java_org_rocksdb_RateLimiter_getTotalRequests(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + return reinterpret_cast<std::shared_ptr<rocksdb::RateLimiter>*>(handle) + ->get() + ->GetTotalRequests(); } diff --git a/thirdparty/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc b/thirdparty/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc index 8c54a46b86..ede150fa62 100644 --- a/thirdparty/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc @@ -8,16 +8,14 @@ #include "include/org_rocksdb_RemoveEmptyValueCompactionFilter.h" #include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" - /* * Class: org_rocksdb_RemoveEmptyValueCompactionFilter * Method: createNewRemoveEmptyValueCompactionFilter0 * Signature: ()J */ jlong Java_org_rocksdb_RemoveEmptyValueCompactionFilter_createNewRemoveEmptyValueCompactionFilter0( - JNIEnv* env, jclass jcls) { - auto* compaction_filter = - new rocksdb::RemoveEmptyValueCompactionFilter(); + JNIEnv* /*env*/, jclass /*jcls*/) { + auto* compaction_filter = new rocksdb::RemoveEmptyValueCompactionFilter(); // set the native handle to our native compaction filter return reinterpret_cast<jlong>(compaction_filter); diff --git a/thirdparty/rocksdb/java/rocksjni/restorejni.cc b/thirdparty/rocksdb/java/rocksjni/restorejni.cc index eb8e65b4a1..beca74fb56 100644 --- a/thirdparty/rocksdb/java/rocksjni/restorejni.cc +++ b/thirdparty/rocksdb/java/rocksjni/restorejni.cc @@ -7,21 +7,21 @@ // calling C++ rocksdb::RestoreOptions methods // from Java side. +#include <jni.h> #include <stdio.h> #include <stdlib.h> -#include <jni.h> #include <string> #include "include/org_rocksdb_RestoreOptions.h" -#include "rocksjni/portal.h" #include "rocksdb/utilities/backupable_db.h" +#include "rocksjni/portal.h" /* * Class: org_rocksdb_RestoreOptions * Method: newRestoreOptions * Signature: (Z)J */ -jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions(JNIEnv* env, - jclass jcls, jboolean keep_log_files) { +jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions( + JNIEnv* /*env*/, jclass /*jcls*/, jboolean keep_log_files) { auto* ropt = new rocksdb::RestoreOptions(keep_log_files); return reinterpret_cast<jlong>(ropt); } @@ -31,8 +31,9 @@ jlong Java_org_rocksdb_RestoreOptions_newRestoreOptions(JNIEnv* env, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_RestoreOptions_disposeInternal(JNIEnv* env, jobject jobj, - jlong jhandle) { +void Java_org_rocksdb_RestoreOptions_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { auto* ropt = reinterpret_cast<rocksdb::RestoreOptions*>(jhandle); assert(ropt); delete ropt; diff --git a/thirdparty/rocksdb/java/rocksjni/rocks_callback_object.cc b/thirdparty/rocksdb/java/rocksjni/rocks_callback_object.cc new file mode 100644 index 0000000000..874ef3375a --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/rocks_callback_object.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// JNI Callbacks from C++ to sub-classes of org.rocksdb.RocksCallbackObject + +#include <jni.h> + +#include "include/org_rocksdb_RocksCallbackObject.h" +#include "jnicallback.h" + +/* + * Class: org_rocksdb_RocksCallbackObject + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_RocksCallbackObject_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { + // TODO(AR) is deleting from the super class JniCallback OK, or must we delete + // the subclass? Example hierarchies: + // 1) Comparator -> BaseComparatorJniCallback + JniCallback -> + // DirectComparatorJniCallback 2) Comparator -> BaseComparatorJniCallback + + // JniCallback -> ComparatorJniCallback + // I think this is okay, as Comparator and JniCallback both have virtual + // destructors... + delete reinterpret_cast<rocksdb::JniCallback*>(handle); + // @lint-ignore TXT4 T25377293 Grandfathered in +} \ No newline at end of file diff --git a/thirdparty/rocksdb/java/rocksjni/rocksdb_exception_test.cc b/thirdparty/rocksdb/java/rocksjni/rocksdb_exception_test.cc index 339d4c5eda..6e5978121b 100644 --- a/thirdparty/rocksdb/java/rocksjni/rocksdb_exception_test.cc +++ b/thirdparty/rocksdb/java/rocksjni/rocksdb_exception_test.cc @@ -17,7 +17,7 @@ * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseException(JNIEnv* env, - jobject jobj) { + jobject /*jobj*/) { rocksdb::RocksDBExceptionJni::ThrowNew(env, std::string("test message")); } @@ -27,7 +27,7 @@ void Java_org_rocksdb_RocksDBExceptionTest_raiseException(JNIEnv* env, * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCode( - JNIEnv* env, jobject jobj) { + JNIEnv* env, jobject /*jobj*/) { rocksdb::RocksDBExceptionJni::ThrowNew(env, "test message", rocksdb::Status::NotSupported()); } @@ -38,7 +38,7 @@ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCode( * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCode( - JNIEnv* env, jobject jobj) { + JNIEnv* env, jobject /*jobj*/) { rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotSupported()); } @@ -48,7 +48,7 @@ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCode( * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCodeSubCode( - JNIEnv* env, jobject jobj) { + JNIEnv* env, jobject /*jobj*/) { rocksdb::RocksDBExceptionJni::ThrowNew( env, "test message", rocksdb::Status::TimedOut(rocksdb::Status::SubCode::kLockTimeout)); @@ -60,7 +60,7 @@ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCodeSubCode( * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCodeSubCode( - JNIEnv* env, jobject jobj) { + JNIEnv* env, jobject /*jobj*/) { rocksdb::RocksDBExceptionJni::ThrowNew( env, rocksdb::Status::TimedOut(rocksdb::Status::SubCode::kLockTimeout)); } @@ -71,7 +71,7 @@ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionNoMsgWithStatusCodeSubC * Signature: ()V */ void Java_org_rocksdb_RocksDBExceptionTest_raiseExceptionWithStatusCodeState( - JNIEnv* env, jobject jobj) { + JNIEnv* env, jobject /*jobj*/) { rocksdb::Slice state("test state"); rocksdb::RocksDBExceptionJni::ThrowNew(env, "test message", rocksdb::Status::NotSupported(state)); diff --git
a/thirdparty/rocksdb/java/rocksjni/rocksjni.cc b/thirdparty/rocksdb/java/rocksjni/rocksjni.cc index a08a459714..53224232c8 100644 --- a/thirdparty/rocksdb/java/rocksjni/rocksjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/rocksjni.cc @@ -27,14 +27,13 @@ #undef min #endif -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::Open -jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, - std::function<rocksdb::Status(const rocksdb::Options&, const std::string&, rocksdb::DB**)> open_fn - ) { +jlong rocksdb_open_helper( + JNIEnv* env, jlong jopt_handle, jstring jdb_path, + std::function<rocksdb::Status(const rocksdb::Options&, const std::string&, rocksdb::DB**)> + open_fn) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); - if(db_path == nullptr) { + if (db_path == nullptr) { // exception thrown: OutOfMemoryError return 0; } @@ -59,12 +58,12 @@ jlong rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, jstring jdb_path, * Signature: (JLjava/lang/String;)J */ jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( - JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path) { - return rocksdb_open_helper(env, jopt_handle, jdb_path, - (rocksdb::Status(*) - (const rocksdb::Options&, const std::string&, rocksdb::DB**) - )&rocksdb::DB::Open - ); + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { + return rocksdb_open_helper( + env, jopt_handle, jdb_path, + (rocksdb::Status(*)(const rocksdb::Options&, const std::string&, + rocksdb::DB**)) & + rocksdb::DB::Open); } /* @@ -73,31 +72,32 @@ jlong Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( * Signature: (JLjava/lang/String;)J */ jlong Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( - JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path) { - return rocksdb_open_helper(env, jopt_handle, jdb_path, []( - const rocksdb::Options& options, - const std::string& db_path, rocksdb::DB** db) { - return rocksdb::DB::OpenForReadOnly(options, db_path, db); - }); + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { + return rocksdb_open_helper(env, jopt_handle, jdb_path, + [](const rocksdb::Options& options, + const std::string& db_path, rocksdb::DB** db) { + return rocksdb::DB::OpenForReadOnly(options, + db_path, db); + }); } -jlongArray rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, - jstring jdb_path, jobjectArray jcolumn_names, jlongArray jcolumn_options, +jlongArray rocksdb_open_helper( + JNIEnv* env, jlong jopt_handle, jstring jdb_path, + jobjectArray jcolumn_names, jlongArray jcolumn_options, std::function<rocksdb::Status( - const rocksdb::DBOptions&, const std::string&, const std::vector<rocksdb::ColumnFamilyDescriptor>&, - std::vector<rocksdb::ColumnFamilyHandle*>*, - rocksdb::DB**)> open_fn - ) { + const rocksdb::DBOptions&, const std::string&, + const std::vector<rocksdb::ColumnFamilyDescriptor>&, + std::vector<rocksdb::ColumnFamilyHandle*>*, rocksdb::DB**)> + open_fn) { const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); - if(db_path == nullptr) { + if (db_path == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } const jsize len_cols = env->GetArrayLength(jcolumn_names); jlong* jco = env->GetLongArrayElements(jcolumn_options, nullptr); - if(jco == nullptr) { + if (jco == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseStringUTFChars(jdb_path, db_path); return nullptr; @@ -106,64 +106,62 @@ jlongArray rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, std::vector<rocksdb::ColumnFamilyDescriptor> column_families; jboolean has_exception = JNI_FALSE; rocksdb::JniUtil::byteStrings<std::string>( - env, - jcolumn_names, - [](const char* str_data, const size_t str_len) { - return std::string(str_data, str_len); - }, - [&jco, &column_families](size_t idx, std::string cf_name) { - rocksdb::ColumnFamilyOptions* cf_options = - reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[idx]); - column_families.push_back( -
rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); - }, - &has_exception); + env, jcolumn_names, + [](const char* str_data, const size_t str_len) { + return std::string(str_data, str_len); + }, + [&jco, &column_families](size_t idx, std::string cf_name) { + rocksdb::ColumnFamilyOptions* cf_options = + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[idx]); + column_families.push_back( + rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + }, + &has_exception); env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT); - if(has_exception == JNI_TRUE) { + if (has_exception == JNI_TRUE) { // exception occurred env->ReleaseStringUTFChars(jdb_path, db_path); return nullptr; } auto* opt = reinterpret_cast<rocksdb::DBOptions*>(jopt_handle); - std::vector<rocksdb::ColumnFamilyHandle*> handles; + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; rocksdb::DB* db = nullptr; - rocksdb::Status s = open_fn(*opt, db_path, column_families, - &handles, &db); + rocksdb::Status s = open_fn(*opt, db_path, column_families, &cf_handles, &db); // we have now finished with db_path env->ReleaseStringUTFChars(jdb_path, db_path); // check if open operation was successful - if (s.ok()) { - const jsize resultsLen = 1 + len_cols; //db handle + column family handles - std::unique_ptr<jlong[]> results = - std::unique_ptr<jlong[]>(new jlong[resultsLen]); - results[0] = reinterpret_cast<jlong>(db); - for(int i = 1; i <= len_cols; i++) { - results[i] = reinterpret_cast<jlong>(handles[i - 1]); - } + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } - jlongArray jresults = env->NewLongArray(resultsLen); - if(jresults == nullptr) { - // exception thrown: OutOfMemoryError - return nullptr; - } + const jsize resultsLen = 1 + len_cols; // db handle + column family handles + std::unique_ptr<jlong[]> results = + std::unique_ptr<jlong[]>(new jlong[resultsLen]); + results[0] = reinterpret_cast<jlong>(db); + for (int i = 1; i <= len_cols; i++) { + results[i] = reinterpret_cast<jlong>(cf_handles[i - 1]); + } - env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - env->DeleteLocalRef(jresults); - return nullptr; - } + jlongArray jresults = env->NewLongArray(resultsLen); + if (jresults == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } - return jresults; - } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + env->SetLongArrayRegion(jresults, 0, resultsLen, results.get()); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jresults); return nullptr; } + + return jresults; } /* @@ -172,16 +170,16 @@ jlongArray rocksdb_open_helper(JNIEnv* env, jlong jopt_handle, * Signature: (JLjava/lang/String;[[B[J)[J */ jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3J( - JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path, + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, jobjectArray jcolumn_names, jlongArray jcolumn_options) { - return rocksdb_open_helper(env, jopt_handle, jdb_path, jcolumn_names, - jcolumn_options, []( - const rocksdb::DBOptions& options, const std::string& db_path, - const std::vector<rocksdb::ColumnFamilyDescriptor>& column_families, - std::vector<rocksdb::ColumnFamilyHandle*>* handles, rocksdb::DB** db) { - return rocksdb::DB::OpenForReadOnly(options, db_path, column_families, - handles, db); - }); + return rocksdb_open_helper( + env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, + [](const rocksdb::DBOptions& options, const std::string& db_path, + const std::vector<rocksdb::ColumnFamilyDescriptor>& column_families, + std::vector<rocksdb::ColumnFamilyHandle*>* handles, rocksdb::DB** db) { + return
rocksdb::DB::OpenForReadOnly(options, db_path, column_families, + handles, db); + }); } /* @@ -190,19 +188,41 @@ jlongArray Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2_3_3B_3J( * Signature: (JLjava/lang/String;[[B[J)[J */ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J( - JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path, + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path, jobjectArray jcolumn_names, jlongArray jcolumn_options) { - return rocksdb_open_helper(env, jopt_handle, jdb_path, jcolumn_names, - jcolumn_options, (rocksdb::Status(*) - (const rocksdb::DBOptions&, const std::string&, - const std::vector<rocksdb::ColumnFamilyDescriptor>&, - std::vector<rocksdb::ColumnFamilyHandle*>*, rocksdb::DB**) - )&rocksdb::DB::Open - ); + return rocksdb_open_helper( + env, jopt_handle, jdb_path, jcolumn_names, jcolumn_options, + (rocksdb::Status(*)(const rocksdb::DBOptions&, const std::string&, + const std::vector<rocksdb::ColumnFamilyDescriptor>&, + std::vector<rocksdb::ColumnFamilyHandle*>*, + rocksdb::DB**)) & + rocksdb::DB::Open); } -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::ListColumnFamilies +/* + * Class: org_rocksdb_RocksDB + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_disposeInternal( + JNIEnv*, jobject, jlong jhandle) { + auto* db = reinterpret_cast<rocksdb::DB*>(jhandle); + assert(db != nullptr); + delete db; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: closeDatabase + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_closeDatabase( + JNIEnv* env, jclass, jlong jhandle) { + auto* db = reinterpret_cast<rocksdb::DB*>(jhandle); + assert(db != nullptr); + rocksdb::Status s = db->Close(); + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} /* * Class: org_rocksdb_RocksDB @@ -210,17 +230,17 @@ jlongArray Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2_3_3B_3J( * Signature: (JLjava/lang/String;)[[B */ jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies( - JNIEnv* env, jclass jclazz, jlong jopt_handle, jstring jdb_path) { + JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path) { std::vector<std::string> column_family_names; const char* db_path = env->GetStringUTFChars(jdb_path, nullptr); - if(db_path == nullptr) { + if (db_path == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle); - rocksdb::Status s = rocksdb::DB::ListColumnFamilies(*opt, db_path, - &column_family_names); + rocksdb::Status s = + rocksdb::DB::ListColumnFamilies(*opt, db_path, &column_family_names); env->ReleaseStringUTFChars(jdb_path, db_path); @@ -230,31 +250,225 @@ jobjectArray Java_org_rocksdb_RocksDB_listColumnFamilies( return jcolumn_family_names; } +/* + * Class: org_rocksdb_RocksDB + * Method: createColumnFamily + * Signature: (J[BIJ)J + */ +jlong Java_org_rocksdb_RocksDB_createColumnFamily( + JNIEnv* env, jobject, jlong jhandle, jbyteArray jcf_name, + jint jcf_name_len, jlong jcf_options_handle) { + auto* db = reinterpret_cast<rocksdb::DB*>(jhandle); + jboolean has_exception = JNI_FALSE; + const std::string cf_name = + rocksdb::JniUtil::byteString<std::string>(env, jcf_name, jcf_name_len, + [](const char* str, const size_t len) { + return std::string(str, len); + }, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return 0; + } + auto* cf_options = + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jcf_options_handle); + rocksdb::ColumnFamilyHandle *cf_handle; + rocksdb::Status s = db->CreateColumnFamily(*cf_options, cf_name, &cf_handle); + if (!s.ok()) { + // error occurred + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; + } + return reinterpret_cast<jlong>(cf_handle); +} + +/*
+ * Class: org_rocksdb_RocksDB + * Method: createColumnFamilies + * Signature: (JJ[[B)[J + */ +jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__JJ_3_3B( + JNIEnv* env, jobject, jlong jhandle, jlong jcf_options_handle, + jobjectArray jcf_names) { + auto* db = reinterpret_cast<rocksdb::DB*>(jhandle); + auto* cf_options = + reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jcf_options_handle); + jboolean has_exception = JNI_FALSE; + std::vector<std::string> cf_names; + rocksdb::JniUtil::byteStrings<std::string>(env, jcf_names, + [](const char* str, const size_t len) { + return std::string(str, len); + }, + [&cf_names](const size_t, std::string str) { + cf_names.push_back(str); + }, + &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return nullptr; + } + + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; + rocksdb::Status s = db->CreateColumnFamilies(*cf_options, cf_names, &cf_handles); + if (!s.ok()) { + // error occurred + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + + jlongArray jcf_handles = rocksdb::JniUtil::toJPointers<rocksdb::ColumnFamilyHandle>( + env, cf_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return nullptr; + } + return jcf_handles; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: createColumnFamilies + * Signature: (J[J[[B)[J + */ +jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( + JNIEnv* env, jobject, jlong jhandle, jlongArray jcf_options_handles, + jobjectArray jcf_names) { + auto* db = reinterpret_cast<rocksdb::DB*>(jhandle); + const jsize jlen = env->GetArrayLength(jcf_options_handles); + std::vector<rocksdb::ColumnFamilyDescriptor> cf_descriptors; + cf_descriptors.reserve(jlen); + + jboolean jcf_options_handles_is_copy = JNI_FALSE; + jlong *jcf_options_handles_elems = env->GetLongArrayElements(jcf_options_handles, &jcf_options_handles_is_copy); + if(jcf_options_handles_elems == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + // extract the column family descriptors + jboolean has_exception = JNI_FALSE; + for (jsize i = 0; i < jlen; i++) { + auto* cf_options = reinterpret_cast<rocksdb::ColumnFamilyOptions*>( + jcf_options_handles_elems[i]); + jbyteArray jcf_name = static_cast<jbyteArray>( + env->GetObjectArrayElement(jcf_names, i)); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + return nullptr; + } + const std::string cf_name = + rocksdb::JniUtil::byteString<std::string>(env, jcf_name, + [](const char* str, const size_t len) { + return std::string(str, len); + }, + &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + env->DeleteLocalRef(jcf_name); + env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + return nullptr; + } + + cf_descriptors.push_back(rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options)); + + env->DeleteLocalRef(jcf_name); + } + + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; + rocksdb::Status s = db->CreateColumnFamilies(cf_descriptors, &cf_handles); + + env->ReleaseLongArrayElements(jcf_options_handles, jcf_options_handles_elems, JNI_ABORT); + + if (!s.ok()) { + // error occurred + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; + } + + jlongArray jcf_handles = rocksdb::JniUtil::toJPointers<rocksdb::ColumnFamilyHandle>( + env, cf_handles, &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return nullptr; + } + return jcf_handles; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: dropColumnFamily + * Signature: (JJ)V; + */ +void Java_org_rocksdb_RocksDB_dropColumnFamily( + JNIEnv* env, jobject, jlong jdb_handle, + jlong jcf_handle) { +
auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle); + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); + rocksdb::Status s = db_handle->DropColumnFamily(cf_handle); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: dropColumnFamilies + * Signature: (J[J)V + */ +void Java_org_rocksdb_RocksDB_dropColumnFamilies( + JNIEnv* env, jobject, jlong jdb_handle, + jlongArray jcolumn_family_handles) { + auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle); + + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; + if (jcolumn_family_handles != nullptr) { + const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); + if (jcfh == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + for (jsize i = 0; i < len_cols; i++) { + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcfh[i]); + cf_handles.push_back(cf_handle); + } + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); + } + + rocksdb::Status s = db_handle->DropColumnFamilies(cf_handles); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Put /** * @return true if the put succeeded, false if a Java Exception was thrown */ -bool rocksdb_put_helper(JNIEnv* env, rocksdb::DB* db, - const rocksdb::WriteOptions& write_options, - rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, - jint jkey_off, jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len) { +bool rocksdb_put_helper( + JNIEnv* env, rocksdb::DB* db, + const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len, jbyteArray jval, + jint jval_off, jint jval_len) { jbyte* key = new jbyte[jkey_len]; env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - delete [] key; + delete[] key; return false; } jbyte* value = new jbyte[jval_len]; env->GetByteArrayRegion(jval, jval_off, jval_len, value); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - delete [] value; - delete [] key; + delete[] value; + delete[] key; return false; } @@ -270,8 +484,8 @@ bool rocksdb_put_helper(JNIEnv* env, rocksdb::DB* db, } // cleanup - delete [] value; - delete [] key; + delete[] value; + delete[] key; if (s.ok()) { return true; @@ -286,17 +500,15 @@ bool rocksdb_put_helper(JNIEnv* env, rocksdb::DB* db, * Method: put * Signature: (J[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject jdb, - jlong jdb_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len) { +void Java_org_rocksdb_RocksDB_put__J_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); - rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } /* @@ -304,22 +516,21 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BII(JNIEnv* env, jobject jdb, * Method: put * Signature: (J[BII[BIIJ)V */ -void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env,
jobject jdb, - jlong jdb_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len, - jlong jcf_handle) { +void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, default_write_options, cf_handle, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len); + jkey_off, jkey_len, jval, jval_off, jval_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } @@ -328,18 +539,16 @@ void Java_org_rocksdb_RocksDB_put__J_3BII_3BIIJ(JNIEnv* env, jobject jdb, * Method: put * Signature: (JJ[BII[BII)V */ -void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject jdb, - jlong jdb_handle, - jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len) { +void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>( - jwrite_options_handle); - + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_off, jkey_len, - jval, jval_off, jval_len); + jval, jval_off, jval_len); } /* @@ -348,403 +557,858 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BII(JNIEnv* env, jobject jdb, * Signature: (JJ[BII[BIIJ)V */ void Java_org_rocksdb_RocksDB_put__JJ_3BII_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len, jlong jcf_handle) { + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>( - jwrite_options_handle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle); auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); if (cf_handle != nullptr) { rocksdb_put_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len); + jkey_len, jval, jval_off, jval_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } ////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::Write -/* - * Class: org_rocksdb_RocksDB - * Method: write0 - * Signature: (JJJ)V - */ -void Java_org_rocksdb_RocksDB_write0( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options_handle, jlong jwb_handle) { - auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>( - jwrite_options_handle); - auto* wb =
reinterpret_cast<rocksdb::WriteBatch*>(jwb_handle); - - rocksdb::Status s = db->Write(*write_options, wb); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} +// rocksdb::DB::Delete() -/* - * Class: org_rocksdb_RocksDB - * Method: write1 - * Signature: (JJJ)V +/** + * @return true if the delete succeeded, false if a Java Exception was thrown */ -void Java_org_rocksdb_RocksDB_write1( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options_handle, jlong jwbwi_handle) { - auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto* write_options = reinterpret_cast<rocksdb::WriteOptions*>( - jwrite_options_handle); - auto* wbwi = reinterpret_cast<rocksdb::WriteBatchWithIndex*>(jwbwi_handle); - auto* wb = wbwi->GetWriteBatch(); - - rocksdb::Status s = db->Write(*write_options, wb); - - if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - } -} - -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::KeyMayExist -jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, - const rocksdb::ReadOptions& read_opt, - rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_off, - jint jkey_len, jobject jstring_builder, bool* has_exception) { - +bool rocksdb_delete_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len) { jbyte* key = new jbyte[jkey_len]; env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - delete [] key; - *has_exception = true; + delete[] key; return false; } - rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len); - std::string value; - bool value_found = false; - bool keyMayExist; + rocksdb::Status s; if (cf_handle != nullptr) { - keyMayExist = db->KeyMayExist(read_opt, cf_handle, key_slice, - &value, &value_found); + s = db->Delete(write_options, cf_handle, key_slice); } else { - keyMayExist = db->KeyMayExist(read_opt, key_slice, - &value, &value_found); + // backwards compatibility + s = db->Delete(write_options, key_slice); } // cleanup - delete [] key; + delete[] key; - // extract the value - if (value_found && !value.empty()) { - jobject jresult_string_builder = - rocksdb::StringBuilderJni::append(env, jstring_builder, - value.c_str()); - if(jresult_string_builder == nullptr) { - *has_exception = true; - return false; - } + if (s.ok()) { + return true; } - *has_exception = false; - return static_cast<jboolean>(keyMayExist); + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return false; } /* * Class: org_rocksdb_RocksDB - * Method: keyMayExist - * Signature: (J[BIILjava/lang/StringBuilder;)Z + * Method: delete + * Signature: (J[BII)V */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIILjava_lang_StringBuilder_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_off, - jint jkey_len, jobject jstring_builder) { +void Java_org_rocksdb_RocksDB_delete__J_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - bool has_exception = false; - return key_may_exist_helper(env, db, rocksdb::ReadOptions(), - nullptr, jkey, jkey_off, jkey_len, jstring_builder, &has_exception); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + rocksdb_delete_helper(env, db, default_write_options, nullptr, jkey, jkey_off, + jkey_len); } /* * Class: org_rocksdb_RocksDB - * Method: keyMayExist - * Signature:
(J[BIIJLjava/lang/StringBuilder;)Z + * Method: delete + * Signature: (J[BIIJ)V */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIIJLjava_lang_StringBuilder_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_off, - jint jkey_len, jlong jcf_handle, jobject jstring_builder) { +void Java_org_rocksdb_RocksDB_delete__J_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jlong jcf_handle) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>( - jcf_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); if (cf_handle != nullptr) { - bool has_exception = false; - return key_may_exist_helper(env, db, rocksdb::ReadOptions(), - cf_handle, jkey, jkey_off, jkey_len, jstring_builder, &has_exception); + rocksdb_delete_helper(env, db, default_write_options, cf_handle, jkey, + jkey_off, jkey_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - return true; + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } /* * Class: org_rocksdb_RocksDB - * Method: keyMayExist - * Signature: (JJ[BIILjava/lang/StringBuilder;)Z + * Method: delete + * Signature: (JJ[BII)V */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BIILjava_lang_StringBuilder_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jobject jstring_builder) { +void Java_org_rocksdb_RocksDB_delete__JJ_3BII( + JNIEnv* env, jobject, + jlong jdb_handle, + jlong jwrite_options, + jbyteArray jkey, jint jkey_off, jint jkey_len) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>( - jread_options_handle); - bool has_exception = false; - return key_may_exist_helper(env, db, read_options, - nullptr, jkey, jkey_off, jkey_len, jstring_builder, &has_exception); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options); + rocksdb_delete_helper(env, db, *write_options, nullptr, jkey, jkey_off, + jkey_len); } /* * Class: org_rocksdb_RocksDB - * Method: keyMayExist - * Signature: (JJ[BIIJLjava/lang/StringBuilder;)Z + * Method: delete + * Signature: (JJ[BIIJ)V */ -jboolean Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BIIJLjava_lang_StringBuilder_2( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jread_options_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle, - jobject jstring_builder) { +void Java_org_rocksdb_RocksDB_delete__JJ_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>( - jread_options_handle); - auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>( - jcf_handle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options); + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); if (cf_handle != nullptr) { - bool has_exception = false; - return key_may_exist_helper(env, db, read_options, cf_handle, - jkey, jkey_off, jkey_len, jstring_builder, &has_exception); + rocksdb_delete_helper(env, db, *write_options, cf_handle, jkey, jkey_off, + jkey_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - return true; + rocksdb::RocksDBExceptionJni::ThrowNew( + env,
rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } ////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::Get - -jbyteArray rocksdb_get_helper( - JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_opt, - rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, - jint jkey_off, jint jkey_len) { - - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if(env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete [] key; - return nullptr; +// rocksdb::DB::SingleDelete() +/** + * @return true if the single delete succeeded, false if a Java Exception + * was thrown + */ +bool rocksdb_single_delete_helper( + JNIEnv* env, rocksdb::DB* db, + const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return false; } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice key_slice( - reinterpret_cast(key), jkey_len); - - std::string value; rocksdb::Status s; - if (column_family_handle != nullptr) { - s = db->Get(read_opt, column_family_handle, key_slice, &value); + if (cf_handle != nullptr) { + s = db->SingleDelete(write_options, cf_handle, key_slice); } else { // backwards compatibility - s = db->Get(read_opt, key_slice, &value); + s = db->SingleDelete(write_options, key_slice); } - // cleanup - delete [] key; - - if (s.IsNotFound()) { - return nullptr; - } + // trigger java unref on key and value. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
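// Illustration of the pin/release pattern this helper relies on, using a
// hypothetical read-only jbyteArray `jarr` (not part of the patch itself):
//
//   jbyte* p = env->GetByteArrayElements(jarr, nullptr);
//   if (p == nullptr) {
//     return;  // OutOfMemoryError is already pending
//   }
//   rocksdb::Slice slice(reinterpret_cast<char*>(p), env->GetArrayLength(jarr));
//   // ... read-only use of `slice` ...
//   env->ReleaseByteArrayElements(jarr, p, JNI_ABORT);  // release, no copy-back
//
// Passing 0 instead of JNI_ABORT would copy any modifications back into the
// Java array; JNI_ABORT suffices here because the key bytes are never written.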
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); if (s.ok()) { - jbyteArray jret_value = rocksdb::JniUtil::copyBytes(env, value); - if(jret_value == nullptr) { - // exception occurred - return nullptr; - } - return jret_value; + return true; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return nullptr; + return false; } /* * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (J[BII)[B + * Method: singleDelete + * Signature: (J[BI)V */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { - return rocksdb_get_helper(env, - reinterpret_cast<rocksdb::DB*>(jdb_handle), - rocksdb::ReadOptions(), nullptr, - jkey, jkey_off, jkey_len); +void Java_org_rocksdb_RocksDB_singleDelete__J_3BI( + JNIEnv* env, jobject, + jlong jdb_handle, + jbyteArray jkey, + jint jkey_len) { + auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + rocksdb_single_delete_helper(env, db, default_write_options, nullptr, + jkey, jkey_len); } /* * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (J[BIIJ)[B + * Method: singleDelete + * Signature: (J[BIJ)V */ -jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { - auto db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); +void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle); if (cf_handle != nullptr) { - return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), - cf_handle, jkey, jkey_off, jkey_len); + rocksdb_single_delete_helper(env, db, default_write_options, cf_handle, + jkey, jkey_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - return nullptr; + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } /* * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (JJ[BII)[B + * Method: singleDelete + * Signature: (JJ[BIJ)V */ -jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { - return rocksdb_get_helper(env, - reinterpret_cast<rocksdb::DB*>(jdb_handle), - *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle), nullptr, - jkey, jkey_off, jkey_len); +void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI( + JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options, + jbyteArray jkey, + jint jkey_len) { + auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle); + auto* write_options = + reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options); + rocksdb_single_delete_helper(env, db, *write_options, nullptr, jkey, + jkey_len); } /* * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (JJ[BIIJ)[B + * Method: singleDelete + * Signature: (JJ[BIJ)V */ -jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle); - auto& ro_opt = *reinterpret_cast<rocksdb::ReadOptions*>(jropt_handle); +void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BIJ( + 
JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options); auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, - jkey, jkey_off, jkey_len); + rocksdb_single_delete_helper(env, db, *write_options, cf_handle, jkey, + jkey_len); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - return nullptr; + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); } } -jint rocksdb_get_helper(JNIEnv* env, rocksdb::DB* db, - const rocksdb::ReadOptions& read_options, - rocksdb::ColumnFamilyHandle* column_family_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, jint jval_off, jint jval_len, - bool* has_exception) { - static const int kNotFound = -1; - static const int kStatusError = -2; - - jbyte* key = new jbyte[jkey_len]; - env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - delete [] key; - *has_exception = true; - return kStatusError; +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::DeleteRange() +/** + * @return true if the delete range succeeded, false if a Java Exception + * was thrown + */ +bool rocksdb_delete_range_helper( + JNIEnv* env, rocksdb::DB* db, + const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { + jbyte* begin_key = new jbyte[jbegin_key_len]; + env->GetByteArrayRegion(jbegin_key, jbegin_key_off, jbegin_key_len, + begin_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] begin_key; + return false; } - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice begin_key_slice(reinterpret_cast(begin_key), + jbegin_key_len); - // TODO(yhchiang): we might save one memory allocation here by adding - // a DB::Get() function which takes preallocated jbyte* as input. - std::string cvalue; - rocksdb::Status s; - if (column_family_handle != nullptr) { - s = db->Get(read_options, column_family_handle, key_slice, &cvalue); - } else { - // backwards compatibility - s = db->Get(read_options, key_slice, &cvalue); + jbyte* end_key = new jbyte[jend_key_len]; + env->GetByteArrayRegion(jend_key, jend_key_off, jend_key_len, end_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] begin_key; + delete[] end_key; + return false; } + rocksdb::Slice end_key_slice(reinterpret_cast(end_key), jend_key_len); - // cleanup - delete [] key; - - if (s.IsNotFound()) { - *has_exception = false; - return kNotFound; - } else if (!s.ok()) { - *has_exception = true; - // Here since we are throwing a Java exception from c++ side. - // As a result, c++ does not know calling this function will in fact - // throwing an exception. As a result, the execution flow will - // not stop here, and codes after this throw will still be - // executed. 
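// The comment above captures a general JNI rule worth spelling out:
// ThrowNew() only marks the exception as pending; it does not unwind the
// C++ stack. A sketch of the required bail-out pattern:
//
//   if (!s.ok()) {
//     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);  // exception now pending
//     return kStatusError;  // without this return, execution would continue
//   }
//
// The Java caller only observes the exception once the native frame returns.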
- rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - - // Return a dummy const value to avoid compilation error, although - // java side might not have a chance to get the return value :) - return kStatusError; - } + rocksdb::Status s = + db->DeleteRange(write_options, cf_handle, begin_key_slice, end_key_slice); - const jint cvalue_len = static_cast(cvalue.size()); - const jint length = std::min(jval_len, cvalue_len); + // cleanup + delete[] begin_key; + delete[] end_key; - env->SetByteArrayRegion(jval, jval_off, length, - const_cast(reinterpret_cast(cvalue.c_str()))); - if(env->ExceptionCheck()) { - // exception thrown: OutOfMemoryError - *has_exception = true; - return kStatusError; + if (s.ok()) { + return true; } - *has_exception = false; - return cvalue_len; + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return false; } -inline void multi_get_helper_release_keys(JNIEnv* env, - std::vector> &keys_to_free) { - auto end = keys_to_free.end(); - for (auto it = keys_to_free.begin(); it != end; ++it) { - delete [] it->first; - env->DeleteLocalRef(it->second); - } - keys_to_free.clear(); +/* + * Class: org_rocksdb_RocksDB + * Method: deleteRange + * Signature: (J[BII[BII)V + */ +void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { + auto* db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + rocksdb_delete_range_helper(env, db, default_write_options, nullptr, + jbegin_key, jbegin_key_off, jbegin_key_len, + jend_key, jend_key_off, jend_key_len); } -/** - * cf multi get - * - * @return byte[][] of values or nullptr if an exception occurs +/* + * Class: org_rocksdb_RocksDB + * Method: deleteRange + * Signature: (J[BII[BIIJ)V */ -jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, - const rocksdb::ReadOptions& rOpt, jobjectArray jkeys, - jintArray jkey_offs, jintArray jkey_lens, - jlongArray jcolumn_family_handles) { - std::vector cf_handles; +void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len, + jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_delete_range_helper(env, db, default_write_options, cf_handle, + jbegin_key, jbegin_key_off, jbegin_key_len, + jend_key, jend_key_off, jend_key_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: deleteRange + * Signature: (JJ[BII[BII)V + */ +void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options); + rocksdb_delete_range_helper(env, db, *write_options, nullptr, jbegin_key, + jbegin_key_off, jbegin_key_len, jend_key, + jend_key_off, jend_key_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: 
deleteRange + * Signature: (JJ[BII[BIIJ)V + */ +void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len, + jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_delete_range_helper(env, db, *write_options, cf_handle, + jbegin_key, jbegin_key_off, jbegin_key_len, + jend_key, jend_key_off, jend_key_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Merge + +/** + * @return true if the merge succeeded, false if a Java Exception was thrown + */ +bool rocksdb_merge_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] key; + return false; + } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + jbyte* value = new jbyte[jval_len]; + env->GetByteArrayRegion(jval, jval_off, jval_len, value); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] value; + delete[] key; + return false; + } + rocksdb::Slice value_slice(reinterpret_cast(value), jval_len); + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Merge(write_options, cf_handle, key_slice, value_slice); + } else { + s = db->Merge(write_options, key_slice, value_slice); + } + + // cleanup + delete[] value; + delete[] key; + + if (s.ok()) { + return true; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return false; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BII[BII)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { + auto* db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + rocksdb_merge_helper(env, db, default_write_options, nullptr, jkey, jkey_off, + jkey_len, jval, jval_off, jval_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BII[BIIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, default_write_options, cf_handle, jkey, + jkey_off, jkey_len, jval, jval_off, jval_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BII[BII)V + */ +void 
Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + rocksdb_merge_helper(env, db, *write_options, nullptr, jkey, jkey_off, + jkey_len, jval, jval_off, jval_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BII[BIIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, jlong jcf_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, *write_options, cf_handle, jkey, jkey_off, + jkey_len, jval, jval_off, jval_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +jlong rocksdb_iterator_helper(rocksdb::DB* db, + rocksdb::ReadOptions read_options, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::Iterator* iterator = nullptr; + if (cf_handle != nullptr) { + iterator = db->NewIterator(read_options, cf_handle); + } else { + iterator = db->NewIterator(read_options); + } + return reinterpret_cast(iterator); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Write +/* + * Class: org_rocksdb_RocksDB + * Method: write0 + * Signature: (JJJ)V + */ +void Java_org_rocksdb_RocksDB_write0( + JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options_handle, jlong jwb_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* wb = reinterpret_cast(jwb_handle); + + rocksdb::Status s = db->Write(*write_options, wb); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: write1 + * Signature: (JJJ)V + */ +void Java_org_rocksdb_RocksDB_write1( + JNIEnv* env, jobject, jlong jdb_handle, + jlong jwrite_options_handle, jlong jwbwi_handle) { + auto* db = reinterpret_cast(jdb_handle); + auto* write_options = + reinterpret_cast(jwrite_options_handle); + auto* wbwi = reinterpret_cast(jwbwi_handle); + auto* wb = wbwi->GetWriteBatch(); + + rocksdb::Status s = db->Write(*write_options, wb); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Get + +jbyteArray rocksdb_get_helper( + JNIEnv* env, rocksdb::DB* db, + const rocksdb::ReadOptions& read_opt, + rocksdb::ColumnFamilyHandle* column_family_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len) { + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] key; + return nullptr; + } + + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + std::string value; + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_opt, column_family_handle, key_slice, &value); + } else { + // backwards compatibility + s = db->Get(read_opt, key_slice, &value); + } + + // 
cleanup + delete[] key; + + if (s.IsNotFound()) { + return nullptr; + } + + if (s.ok()) { + jbyteArray jret_value = rocksdb::JniUtil::copyBytes(env, value); + if (jret_value == nullptr) { + // exception occurred + return nullptr; + } + return jret_value; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BII)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len) { + return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), + rocksdb::ReadOptions(), nullptr, jkey, jkey_off, jkey_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BIIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, + jkey, jkey_off, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return nullptr; + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BII)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BII( + JNIEnv* env, jobject, + jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len) { + return rocksdb_get_helper( + env, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), nullptr, jkey, + jkey_off, jkey_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BIIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { + auto* db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper( + env, db_handle, ro_opt, cf_handle, jkey, jkey_off, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return nullptr; + } +} + +jint rocksdb_get_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options, + rocksdb::ColumnFamilyHandle* column_family_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + bool* has_exception) { + static const int kNotFound = -1; + static const int kStatusError = -2; + + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + delete[] key; + *has_exception = true; + return kStatusError; + } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + // TODO(yhchiang): we might save one memory allocation here by adding + // a DB::Get() function which takes preallocated jbyte* as input. 
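// A sketch of how that extra allocation could be avoided on newer RocksDB
// releases (illustration only; PinnableSlice is not used by this patch):
//
//   rocksdb::PinnableSlice pinned;
//   rocksdb::Status st = db->Get(read_options, db->DefaultColumnFamily(),
//                                key_slice, &pinned);
//   if (st.ok()) {
//     const jint len = std::min(jval_len, static_cast<jint>(pinned.size()));
//     env->SetByteArrayRegion(jval, jval_off, len,
//                             reinterpret_cast<const jbyte*>(pinned.data()));
//   }
//
// PinnableSlice lets Get() expose the value without first copying it into a
// std::string, so the bytes can be written straight into the caller's array.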
+ std::string cvalue; + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_options, column_family_handle, key_slice, &cvalue); + } else { + // backwards compatibility + s = db->Get(read_options, key_slice, &cvalue); + } + + // cleanup + delete[] key; + + if (s.IsNotFound()) { + *has_exception = false; + return kNotFound; + } else if (!s.ok()) { + *has_exception = true; + // Here since we are throwing a Java exception from c++ side. + // As a result, c++ does not know calling this function will in fact + // throwing an exception. As a result, the execution flow will + // not stop here, and codes after this throw will still be + // executed. + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + + // Return a dummy const value to avoid compilation error, although + // java side might not have a chance to get the return value :) + return kStatusError; + } + + const jint cvalue_len = static_cast(cvalue.size()); + const jint length = std::min(jval_len, cvalue_len); + + env->SetByteArrayRegion( + jval, jval_off, length, + const_cast(reinterpret_cast(cvalue.c_str()))); + if (env->ExceptionCheck()) { + // exception thrown: OutOfMemoryError + *has_exception = true; + return kStatusError; + } + + *has_exception = false; + return cvalue_len; +} + + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BII[BII)I + */ +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { + bool has_exception = false; + return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), + rocksdb::ReadOptions(), nullptr, jkey, jkey_off, + jkey_len, jval, jval_off, jval_len, &has_exception); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BII[BIIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) { + auto* db_handle = reinterpret_cast(jdb_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + bool has_exception = false; + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, + jkey, jkey_off, jkey_len, jval, jval_off, + jval_len, &has_exception); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BII[BII)I + */ +jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII( + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len) { + bool has_exception = false; + return rocksdb_get_helper( + env, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), nullptr, jkey, + jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BII[BIIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) { + auto* db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != 
nullptr) { + bool has_exception = false; + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, + jkey, jkey_off, jkey_len, + jval, jval_off, jval_len, + &has_exception); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } +} + +inline void multi_get_helper_release_keys( + JNIEnv* env, std::vector<std::pair<jbyte*, jbyteArray>>& keys_to_free) { + auto end = keys_to_free.end(); + for (auto it = keys_to_free.begin(); it != end; ++it) { + delete[] it->first; + env->DeleteLocalRef(it->second); + } + keys_to_free.clear(); +} + +/** + * cf multi get + * + * @return byte[][] of values or nullptr if an exception occurs + */ +jobjectArray multi_get_helper( + JNIEnv* env, jobject, rocksdb::DB* db, const rocksdb::ReadOptions& rOpt, + jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens, + jlongArray jcolumn_family_handles) { + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; if (jcolumn_family_handles != nullptr) { const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); - if(jcfh == nullptr) { + if (jcfh == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } for (jsize i = 0; i < len_cols; i++) { - auto* cf_handle = - reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcfh[i]); + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcfh[i]); cf_handles.push_back(cf_handle); } env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); @@ -757,13 +1421,13 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, } jint* jkey_off = env->GetIntArrayElements(jkey_offs, nullptr); - if(jkey_off == nullptr) { + if (jkey_off == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } jint* jkey_len = env->GetIntArrayElements(jkey_lens, nullptr); - if(jkey_len == nullptr) { + if (jkey_len == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); return nullptr; @@ -773,7 +1437,7 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, std::vector<std::pair<jbyte*, jbyteArray>> keys_to_free; for (jsize i = 0; i < len_keys; i++) { jobject jkey = env->GetObjectArrayElement(jkeys, i); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); @@ -786,9 +1450,9 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, const jint len_key = jkey_len[i]; jbyte* key = new jbyte[len_key]; env->GetByteArrayRegion(jkey_ba, jkey_off[i], len_key, key); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - delete [] key; + delete[] key; env->DeleteLocalRef(jkey); env->ReleaseIntArrayElements(jkey_lens, jkey_len, JNI_ABORT); env->ReleaseIntArrayElements(jkey_offs, jkey_off, JNI_ABORT); @@ -820,7 +1484,7 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, // prepare the results jobjectArray jresults = rocksdb::ByteJni::new2dByteArray(env, static_cast<jsize>(s.size())); - if(jresults == nullptr) { + if (jresults == nullptr) { // exception occurred return nullptr; } @@ -837,21 +1501,22 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, std::string* value = &values[i]; const jsize jvalue_len = static_cast<jsize>(value->size()); jbyteArray jentry_value = env->NewByteArray(jvalue_len); - if(jentry_value == nullptr) { + if 
(jentry_value == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetByteArrayRegion(jentry_value, 0, static_cast(jvalue_len), + env->SetByteArrayRegion( + jentry_value, 0, static_cast(jvalue_len), const_cast(reinterpret_cast(value->c_str()))); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; } env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jentry_value); return nullptr; @@ -870,10 +1535,11 @@ jobjectArray multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, * Signature: (J[[B[I[I)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I( - JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, - jintArray jkey_offs, jintArray jkey_lens) { + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), jkeys, jkey_offs, jkey_lens, nullptr); + rocksdb::ReadOptions(), jkeys, jkey_offs, jkey_lens, + nullptr); } /* @@ -882,8 +1548,8 @@ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I( * Signature: (J[[B[I[I[J)[[B */ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J( - JNIEnv* env, jobject jdb, jlong jdb_handle, jobjectArray jkeys, - jintArray jkey_offs, jintArray jkey_lens, + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), rocksdb::ReadOptions(), jkeys, jkey_offs, jkey_lens, @@ -898,7 +1564,8 @@ jobjectArray Java_org_rocksdb_RocksDB_multiGet__J_3_3B_3I_3I_3J( jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens) { - return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + return multi_get_helper( + env, jdb, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), jkeys, jkey_offs, jkey_lens, nullptr); } @@ -912,1286 +1579,1467 @@ jobjectArray Java_org_rocksdb_RocksDB_multiGet__JJ_3_3B_3I_3I_3J( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, jobjectArray jkeys, jintArray jkey_offs, jintArray jkey_lens, jlongArray jcolumn_family_handles) { - return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + return multi_get_helper( + env, jdb, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), jkeys, jkey_offs, jkey_lens, jcolumn_family_handles); } -/* - * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (J[BII[BII)I - */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BII(JNIEnv* env, jobject jdb, - jlong jdb_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len) { - bool has_exception = false; - return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), nullptr, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len, - &has_exception); -} - -/* - * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (J[BII[BIIJ)I - */ -jint Java_org_rocksdb_RocksDB_get__J_3BII_3BIIJ(JNIEnv* env, jobject jdb, - jlong jdb_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, 
jint jval_len, - jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); - auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - bool has_exception = false; - return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, - jkey, jkey_off, jkey_len, jval, jval_off, - jval_len, &has_exception); - } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - // will never be evaluated - return 0; - } -} - -/* - * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (JJ[BII[BII)I - */ -jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BII(JNIEnv* env, jobject jdb, - jlong jdb_handle, - jlong jropt_handle, - jbyteArray jkey, jint jkey_off, - jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len) { - bool has_exception = false; - return rocksdb_get_helper( - env, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), nullptr, jkey, - jkey_off, jkey_len, jval, jval_off, jval_len, &has_exception); -} - -/* - * Class: org_rocksdb_RocksDB - * Method: get - * Signature: (JJ[BII[BIIJ)I - */ -jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval, - jint jval_off, jint jval_len, jlong jcf_handle) { - auto* db_handle = reinterpret_cast(jdb_handle); - auto& ro_opt = *reinterpret_cast(jropt_handle); - auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - bool has_exception = false; - return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, jkey_off, - jkey_len, jval, jval_off, jval_len, - &has_exception); - } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - // will never be evaluated - return 0; - } -} - ////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::Delete() - -/** - * @return true if the delete succeeded, false if a Java Exception was thrown - */ -bool rocksdb_delete_helper( - JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_off, - jint jkey_len) { +// rocksdb::DB::KeyMayExist +jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, + const rocksdb::ReadOptions& read_opt, + rocksdb::ColumnFamilyHandle* cf_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jobject jstring_builder, bool* has_exception) { jbyte* key = new jbyte[jkey_len]; env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException - delete [] key; + delete[] key; + *has_exception = true; return false; } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Status s; + std::string value; + bool value_found = false; + bool keyMayExist; if (cf_handle != nullptr) { - s = db->Delete(write_options, cf_handle, key_slice); + keyMayExist = + db->KeyMayExist(read_opt, cf_handle, key_slice, &value, &value_found); } else { - // backwards compatibility - s = db->Delete(write_options, key_slice); + keyMayExist = db->KeyMayExist(read_opt, key_slice, &value, &value_found); } // cleanup - delete [] key; + delete[] key; - if (s.ok()) { - return true; + // extract the value + if (value_found && !value.empty()) { + jobject jresult_string_builder = + rocksdb::StringBuilderJni::append(env, jstring_builder, 
value.c_str()); + if (jresult_string_builder == nullptr) { + *has_exception = true; + return false; + } } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return false; + *has_exception = false; + return static_cast(keyMayExist); } /* * Class: org_rocksdb_RocksDB - * Method: delete - * Signature: (J[BII)V + * Method: keyMayExist + * Signature: (J[BIILjava/lang/StringBuilder;)Z */ -void Java_org_rocksdb_RocksDB_delete__J_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len) { +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIILjava_lang_StringBuilder_2( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, jobject jstring_builder) { auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); - rocksdb_delete_helper(env, db, default_write_options, nullptr, - jkey, jkey_off, jkey_len); + bool has_exception = false; + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), nullptr, jkey, + jkey_off, jkey_len, jstring_builder, &has_exception); } /* * Class: org_rocksdb_RocksDB - * Method: delete - * Signature: (J[BIIJ)V + * Method: keyMayExist + * Signature: (J[BIIJLjava/lang/StringBuilder;)Z */ -void Java_org_rocksdb_RocksDB_delete__J_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle) { +jboolean +Java_org_rocksdb_RocksDB_keyMayExist__J_3BIIJLjava_lang_StringBuilder_2( + JNIEnv* env, jobject, jlong jdb_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, + jlong jcf_handle, jobject jstring_builder) { auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - rocksdb_delete_helper(env, db, default_write_options, cf_handle, - jkey, jkey_off, jkey_len); + bool has_exception = false; + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), cf_handle, + jkey, jkey_off, jkey_len, jstring_builder, + &has_exception); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return true; } } /* * Class: org_rocksdb_RocksDB - * Method: delete - * Signature: (JJ[BII)V + * Method: keyMayExist + * Signature: (JJ[BIILjava/lang/StringBuilder;)Z */ -void Java_org_rocksdb_RocksDB_delete__JJ_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options, jbyteArray jkey, jint jkey_off, jint jkey_len) { +jboolean +Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BIILjava_lang_StringBuilder_2( + JNIEnv* env, jobject, jlong jdb_handle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, jobject jstring_builder) { auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); - rocksdb_delete_helper(env, db, *write_options, nullptr, jkey, jkey_off, - jkey_len); + auto& read_options = + *reinterpret_cast(jread_options_handle); + bool has_exception = false; + return key_may_exist_helper(env, db, read_options, nullptr, jkey, jkey_off, + jkey_len, jstring_builder, &has_exception); } /* * Class: org_rocksdb_RocksDB - * Method: delete - * Signature: (JJ[BIIJ)V + * Method: keyMayExist + * Signature: (JJ[BIIJLjava/lang/StringBuilder;)Z */ -void Java_org_rocksdb_RocksDB_delete__JJ_3BIIJ( - 
JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options, jbyteArray jkey, jint jkey_off, jint jkey_len, - jlong jcf_handle) { +jboolean +Java_org_rocksdb_RocksDB_keyMayExist__JJ_3BIIJLjava_lang_StringBuilder_2( + JNIEnv* env, jobject, jlong jdb_handle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_off, jint jkey_len, jlong jcf_handle, + jobject jstring_builder) { auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); + auto& read_options = + *reinterpret_cast(jread_options_handle); auto* cf_handle = reinterpret_cast(jcf_handle); if (cf_handle != nullptr) { - rocksdb_delete_helper(env, db, *write_options, cf_handle, jkey, jkey_off, - jkey_len); + bool has_exception = false; + return key_may_exist_helper(env, db, read_options, cf_handle, jkey, + jkey_off, jkey_len, jstring_builder, &has_exception); } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + rocksdb::RocksDBExceptionJni::ThrowNew( + env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + return true; } } -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::SingleDelete() -/** - * @return true if the single delete succeeded, false if a Java Exception - * was thrown +/* + * Class: org_rocksdb_RocksDB + * Method: iterator + * Signature: (J)J */ -bool rocksdb_single_delete_helper( - JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(key == nullptr) { - // exception thrown: OutOfMemoryError - return false; - } - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - - rocksdb::Status s; - if (cf_handle != nullptr) { - s = db->SingleDelete(write_options, cf_handle, key_slice); - } else { - // backwards compatibility - s = db->SingleDelete(write_options, key_slice); - } - - // trigger java unref on key and value. - // by passing JNI_ABORT, it will simply release the reference without - // copying the result back to the java byte array. 
- env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - - if (s.ok()) { - return true; - } - - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return false; +jlong Java_org_rocksdb_RocksDB_iterator__J( + JNIEnv*, jobject, jlong db_handle) { + auto* db = reinterpret_cast(db_handle); + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), nullptr); } /* * Class: org_rocksdb_RocksDB - * Method: singleDelete - * Signature: (J[BI)V + * Method: iterator + * Signature: (JJ)J */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BI( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_len) { - auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); - rocksdb_single_delete_helper(env, db, default_write_options, nullptr, - jkey, jkey_len); +jlong Java_org_rocksdb_RocksDB_iterator__JJ( + JNIEnv*, jobject, jlong db_handle, jlong jread_options_handle) { + auto* db = reinterpret_cast(db_handle); + auto& read_options = + *reinterpret_cast(jread_options_handle); + return rocksdb_iterator_helper(db, read_options, nullptr); } /* * Class: org_rocksdb_RocksDB - * Method: singleDelete - * Signature: (J[BIJ)V + * Method: iteratorCF + * Signature: (JJ)J */ -void Java_org_rocksdb_RocksDB_singleDelete__J_3BIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jbyteArray jkey, jint jkey_len, jlong jcf_handle) { - auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ( + JNIEnv*, jobject, jlong db_handle, jlong jcf_handle) { + auto* db = reinterpret_cast(db_handle); auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - rocksdb_single_delete_helper(env, db, default_write_options, cf_handle, - jkey, jkey_len); - } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); - } + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), cf_handle); } /* * Class: org_rocksdb_RocksDB - * Method: singleDelete - * Signature: (JJ[BIJ)V + * Method: iteratorCF + * Signature: (JJJ)J */ -void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BI( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options, jbyteArray jkey, jint jkey_len) { - auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); - rocksdb_single_delete_helper(env, db, *write_options, nullptr, jkey, - jkey_len); +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( + JNIEnv*, jobject, + jlong db_handle, jlong jcf_handle, jlong jread_options_handle) { + auto* db = reinterpret_cast(db_handle); + auto* cf_handle = reinterpret_cast(jcf_handle); + auto& read_options = + *reinterpret_cast(jread_options_handle); + return rocksdb_iterator_helper(db, read_options, cf_handle); } /* * Class: org_rocksdb_RocksDB - * Method: singleDelete - * Signature: (JJ[BIJ)V + * Method: iterators + * Signature: (J[JJ)[J */ -void Java_org_rocksdb_RocksDB_singleDelete__JJ_3BIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, - jlong jwrite_options, jbyteArray jkey, jint jkey_len, - jlong jcf_handle) { - auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); - auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - rocksdb_single_delete_helper(env, db, *write_options, cf_handle, jkey, - jkey_len); +jlongArray Java_org_rocksdb_RocksDB_iterators( + JNIEnv* env, jobject, jlong 
db_handle, + jlongArray jcolumn_family_handles, + jlong jread_options_handle) { + auto* db = reinterpret_cast<rocksdb::DB*>(db_handle); + auto& read_options = + *reinterpret_cast<rocksdb::ReadOptions*>(jread_options_handle); + + std::vector<rocksdb::ColumnFamilyHandle*> cf_handles; + if (jcolumn_family_handles != nullptr) { + const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); + if (jcfh == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (jsize i = 0; i < len_cols; i++) { + auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcfh[i]); + cf_handles.push_back(cf_handle); + } + + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); + } + + std::vector<rocksdb::Iterator*> iterators; + rocksdb::Status s = db->NewIterators(read_options, cf_handles, &iterators); + if (s.ok()) { + jlongArray jLongArray = + env->NewLongArray(static_cast<jsize>(iterators.size())); + if (jLongArray == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + for (std::vector<rocksdb::Iterator*>::size_type i = 0; i < iterators.size(); + i++) { + env->SetLongArrayRegion( + jLongArray, static_cast<jsize>(i), 1, + const_cast<jlong*>(reinterpret_cast<const jlong*>(&iterators[i]))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jLongArray); + return nullptr; + } + } + + return jLongArray; } else { - rocksdb::RocksDBExceptionJni::ThrowNew(env, - rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; } } -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::DeleteRange() -/** - * @return true if the delete range succeeded, false if a Java Exception - * was thrown +/* + * Method: getSnapshot + * Signature: (J)J */ -bool rocksdb_delete_range_helper(JNIEnv* env, rocksdb::DB* db, - const rocksdb::WriteOptions& write_options, - rocksdb::ColumnFamilyHandle* cf_handle, - jbyteArray jbegin_key, jint jbegin_key_off, - jint jbegin_key_len, jbyteArray jend_key, - jint jend_key_off, jint jend_key_len) { - jbyte* begin_key = new jbyte[jbegin_key_len]; - env->GetByteArrayRegion(jbegin_key, jbegin_key_off, jbegin_key_len, - begin_key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] begin_key; - return false; - } - rocksdb::Slice begin_key_slice(reinterpret_cast<char*>(begin_key), - jbegin_key_len); +jlong Java_org_rocksdb_RocksDB_getSnapshot( + JNIEnv*, jobject, jlong db_handle) { + auto* db = reinterpret_cast<rocksdb::DB*>(db_handle); + const rocksdb::Snapshot* snapshot = db->GetSnapshot(); + return reinterpret_cast<jlong>(snapshot); +} - jbyte* end_key = new jbyte[jend_key_len]; - env->GetByteArrayRegion(jend_key, jend_key_off, jend_key_len, end_key); - if (env->ExceptionCheck()) { - // exception thrown: ArrayIndexOutOfBoundsException - delete[] begin_key; - delete[] end_key; - return false; +/* + * Method: releaseSnapshot + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_releaseSnapshot( + JNIEnv*, jobject, jlong db_handle, + jlong snapshot_handle) { + auto* db = reinterpret_cast<rocksdb::DB*>(db_handle); + auto* snapshot = reinterpret_cast<const rocksdb::Snapshot*>(snapshot_handle); + db->ReleaseSnapshot(snapshot); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getProperty + * Signature: (JJLjava/lang/String;I)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getProperty( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if 
(property == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; } - rocksdb::Slice end_key_slice(reinterpret_cast(end_key), jend_key_len); + rocksdb::Slice property_name(property, jproperty_len); - rocksdb::Status s = - db->DeleteRange(write_options, cf_handle, begin_key_slice, end_key_slice); + auto* db = reinterpret_cast(jdb_handle); + rocksdb::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } - // cleanup - delete[] begin_key; - delete[] end_key; + std::string property_value; + bool retCode = db->GetProperty(cf_handle, property_name, &property_value); + env->ReleaseStringUTFChars(jproperty, property); - if (s.ok()) { - return true; + if (retCode) { + return env->NewStringUTF(property_value.c_str()); } - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); - return false; + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return nullptr; } /* * Class: org_rocksdb_RocksDB - * Method: deleteRange - * Signature: (J[BII[BII)V + * Method: getMapProperty + * Signature: (JJLjava/lang/String;I)Ljava/util/Map; */ -void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin_key, - jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, - jint jend_key_off, jint jend_key_len) { +jobject Java_org_rocksdb_RocksDB_getMapProperty( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if (property == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + rocksdb::Slice property_name(property, jproperty_len); + auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); - rocksdb_delete_range_helper(env, db, default_write_options, nullptr, - jbegin_key, jbegin_key_off, jbegin_key_len, - jend_key, jend_key_off, jend_key_len); + rocksdb::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + + std::map property_value; + bool retCode = db->GetMapProperty(cf_handle, property_name, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (retCode) { + return rocksdb::HashMapJni::fromCppMap(env, &property_value); + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return nullptr; } /* * Class: org_rocksdb_RocksDB - * Method: deleteRange - * Signature: (J[BII[BIIJ)V + * Method: getLongProperty + * Signature: (JJLjava/lang/String;I)J */ -void Java_org_rocksdb_RocksDB_deleteRange__J_3BII_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin_key, - jint jbegin_key_off, jint jbegin_key_len, jbyteArray jend_key, - jint jend_key_off, jint jend_key_len, jlong jcf_handle) { +jlong Java_org_rocksdb_RocksDB_getLongProperty( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if (property == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + rocksdb::Slice property_name(property, jproperty_len); + auto* db = reinterpret_cast(jdb_handle); - static const rocksdb::WriteOptions default_write_options = - rocksdb::WriteOptions(); - auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != 
nullptr) { - rocksdb_delete_range_helper(env, db, default_write_options, cf_handle, - jbegin_key, jbegin_key_off, jbegin_key_len, - jend_key, jend_key_off, jend_key_len); + rocksdb::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); } else { - rocksdb::RocksDBExceptionJni::ThrowNew( - env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + cf_handle = + reinterpret_cast(jcf_handle); + } + + uint64_t property_value; + bool retCode = db->GetIntProperty(cf_handle, property_name, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (retCode) { + return property_value; } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return 0; } /* * Class: org_rocksdb_RocksDB - * Method: deleteRange - * Signature: (JJ[BII[BII)V + * Method: resetStats + * Signature: (J)V */ -void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BII( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options, - jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, - jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { +void Java_org_rocksdb_RocksDB_resetStats( + JNIEnv *, jobject, jlong jdb_handle) { auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); - rocksdb_delete_range_helper(env, db, *write_options, nullptr, jbegin_key, - jbegin_key_off, jbegin_key_len, jend_key, - jend_key_off, jend_key_len); + db->ResetStats(); } /* * Class: org_rocksdb_RocksDB - * Method: deleteRange - * Signature: (JJ[BII[BIIJ)V + * Method: getAggregatedLongProperty + * Signature: (JLjava/lang/String;I)J */ -void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( - JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options, - jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, - jbyteArray jend_key, jint jend_key_off, jint jend_key_len, - jlong jcf_handle) { - auto* db = reinterpret_cast(jdb_handle); - auto* write_options = - reinterpret_cast(jwrite_options); - auto* cf_handle = reinterpret_cast(jcf_handle); - if (cf_handle != nullptr) { - rocksdb_delete_range_helper(env, db, *write_options, cf_handle, jbegin_key, - jbegin_key_off, jbegin_key_len, jend_key, - jend_key_off, jend_key_len); - } else { - rocksdb::RocksDBExceptionJni::ThrowNew( - env, rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); +jlong Java_org_rocksdb_RocksDB_getAggregatedLongProperty( + JNIEnv* env, jobject, jlong db_handle, + jstring jproperty, jint jproperty_len) { + const char* property = env->GetStringUTFChars(jproperty, nullptr); + if (property == nullptr) { + return 0; } -} + rocksdb::Slice property_name(property, jproperty_len); + auto* db = reinterpret_cast(db_handle); + uint64_t property_value = 0; + bool retCode = db->GetAggregatedIntProperty(property_name, &property_value); + env->ReleaseStringUTFChars(jproperty, property); -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::Merge + if (retCode) { + return property_value; + } -/** - * @return true if the merge succeeded, false if a Java Exception was thrown + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + return 0; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getApproximateSizes + * Signature: (JJ[JB)[J */ -bool rocksdb_merge_helper(JNIEnv* env, rocksdb::DB* db, - const rocksdb::WriteOptions& write_options, - rocksdb::ColumnFamilyHandle* cf_handle, - jbyteArray jkey, jint jkey_off, jint jkey_len, - jbyteArray jval, 
jint jval_off, jint jval_len) {
-  jbyte* key = new jbyte[jkey_len];
-  env->GetByteArrayRegion(jkey, jkey_off, jkey_len, key);
-  if(env->ExceptionCheck()) {
-    // exception thrown: ArrayIndexOutOfBoundsException
-    delete [] key;
-    return false;
-  }
-  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_len);
+jlongArray Java_org_rocksdb_RocksDB_getApproximateSizes(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlongArray jrange_slice_handles, jbyte jinclude_flags) {
+  const jsize jlen = env->GetArrayLength(jrange_slice_handles);
+  const size_t range_count = jlen / 2;
 
-  jbyte* value = new jbyte[jval_len];
-  env->GetByteArrayRegion(jval, jval_off, jval_len, value);
-  if(env->ExceptionCheck()) {
-    // exception thrown: ArrayIndexOutOfBoundsException
-    delete [] value;
-    delete [] key;
-    return false;
+  jboolean jranges_is_copy = JNI_FALSE;
+  jlong* jranges = env->GetLongArrayElements(jrange_slice_handles,
+      &jranges_is_copy);
+  if (jranges == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
   }
-  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jval_len);
 
-  rocksdb::Status s;
-  if (cf_handle != nullptr) {
-    s = db->Merge(write_options, cf_handle, key_slice, value_slice);
+  auto ranges = std::unique_ptr<rocksdb::Range[]>(
+      new rocksdb::Range[range_count]);
+  for (jsize i = 0; i < jlen; ++i) {
+    auto* start = reinterpret_cast<const rocksdb::Slice*>(jranges[i]);
+    auto* limit = reinterpret_cast<const rocksdb::Slice*>(jranges[++i]);
+    // index by pair (i / 2) so the write stays within the range_count entries
+    ranges.get()[i / 2] = rocksdb::Range(*start, *limit);
+  }
+
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
   } else {
-    s = db->Merge(write_options, key_slice, value_slice);
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
 
-  // cleanup
-  delete [] value;
-  delete [] key;
+  auto sizes = std::unique_ptr<uint64_t[]>(new uint64_t[range_count]);
+  db->GetApproximateSizes(cf_handle, ranges.get(),
+      static_cast<int>(range_count), sizes.get(),
+      static_cast<uint8_t>(jinclude_flags));
 
-  if (s.ok()) {
-    return true;
+  // release LongArrayElements
+  env->ReleaseLongArrayElements(jrange_slice_handles, jranges, JNI_ABORT);
+
+  // prepare results
+  auto results = std::unique_ptr<jlong[]>(new jlong[range_count]);
+  for (size_t i = 0; i < range_count; ++i) {
+    results.get()[i] = static_cast<jlong>(sizes.get()[i]);
   }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  return false;
-}
 
+  const jsize jrange_count = jlen / 2;
+  jlongArray jresults = env->NewLongArray(jrange_count);
+  if (jresults == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    merge
- * Signature: (J[BII[BII)V
- */
-void Java_org_rocksdb_RocksDB_merge__J_3BII_3BII(JNIEnv* env, jobject jdb,
-                                                 jlong jdb_handle,
-                                                 jbyteArray jkey, jint jkey_off,
-                                                 jint jkey_len, jbyteArray jval,
-                                                 jint jval_off, jint jval_len) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  static const rocksdb::WriteOptions default_write_options =
-      rocksdb::WriteOptions();
+  env->SetLongArrayRegion(jresults, 0, jrange_count, results.get());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jresults);
+    return nullptr;
+  }
 
-  rocksdb_merge_helper(env, db, default_write_options, nullptr, jkey, jkey_off,
-                       jkey_len, jval, jval_off, jval_len);
+  return jresults;
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    merge
- * Signature: (J[BII[BIIJ)V
+ * Method:    getApproximateMemTableStats
+ * Signature: (JJJJ)[J
  */
-void Java_org_rocksdb_RocksDB_merge__J_3BII_3BIIJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_off,
-    jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len,
-    jlong jcf_handle) {
+jlongArray Java_org_rocksdb_RocksDB_getApproximateMemTableStats(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlong jstartHandle, jlong jlimitHandle) {
+  auto* start = reinterpret_cast<rocksdb::Slice*>(jstartHandle);
+  auto* limit = reinterpret_cast<rocksdb::Slice*>(jlimitHandle);
+  const rocksdb::Range range(*start, *limit);
+
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  static const rocksdb::WriteOptions default_write_options =
-      rocksdb::WriteOptions();
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  if (cf_handle != nullptr) {
-    rocksdb_merge_helper(env, db, default_write_options, cf_handle, jkey,
-        jkey_off, jkey_len, jval, jval_off, jval_len);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
   } else {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env,
-        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
-}
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    merge
- * Signature: (JJ[BII[BII)V
- */
-void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BII(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle,
-    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
-    jint jval_off, jint jval_len) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* write_options =
-      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  uint64_t count = 0;
+  uint64_t sizes = 0;
+  db->GetApproximateMemTableStats(cf_handle, range, &count, &sizes);
 
-  rocksdb_merge_helper(env, db, *write_options, nullptr, jkey, jkey_off,
-                       jkey_len, jval, jval_off, jval_len);
+  // prepare results
+  jlong results[2] = {
+      static_cast<jlong>(count),
+      static_cast<jlong>(sizes)};
+
+  // results always holds exactly the two entries above
+  const jsize jcount = 2;
+  jlongArray jsizes = env->NewLongArray(jcount);
+  if (jsizes == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  env->SetLongArrayRegion(jsizes, 0, jcount, results);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jsizes);
+    return nullptr;
+  }
+
+  return jsizes;
 }
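The jrange_slice_handles contract above is easy to miss in the diff: Java passes one flat long[] of native Slice handles, alternating range start and limit, and gets one size back per pair. A rough Java-side sketch, assuming the RocksJava getApproximateSizes/SizeApproximationFlag API that this bridge targets (path and keys are illustrative):

    import java.util.Arrays;
    import org.rocksdb.Range;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.SizeApproximationFlag;
    import org.rocksdb.Slice;

    static void approximateSizesExample() throws RocksDBException {
      try (final RocksDB db = RocksDB.open("/tmp/example-db")) {
        // two (start, limit) ranges; each Range contributes two slice handles
        final long[] sizes = db.getApproximateSizes(
            Arrays.asList(
                new Range(new Slice("a"), new Slice("m")),
                new Range(new Slice("m"), new Slice("z"))),
            SizeApproximationFlag.INCLUDE_FILES,
            SizeApproximationFlag.INCLUDE_MEMTABLES);
        // sizes[0] and sizes[1] line up with the two ranges above
      }
    }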
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    merge
- * Signature: (JJ[BII[BIIJ)V
+ * Method:    compactRange
+ * Signature: (J[BI[BIJJ)V
  */
-void Java_org_rocksdb_RocksDB_merge__JJ_3BII_3BIIJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle,
-    jbyteArray jkey, jint jkey_off, jint jkey_len, jbyteArray jval,
-    jint jval_off, jint jval_len, jlong jcf_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* write_options =
-      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  if (cf_handle != nullptr) {
-    rocksdb_merge_helper(env, db, *write_options, cf_handle, jkey, jkey_off,
-                         jkey_len, jval, jval_off, jval_len);
+void Java_org_rocksdb_RocksDB_compactRange(
+    JNIEnv* env, jobject, jlong jdb_handle,
+    jbyteArray jbegin, jint jbegin_len,
+    jbyteArray jend, jint jend_len,
+    jlong jcompact_range_opts_handle,
+    jlong jcf_handle) {
+  jboolean has_exception = JNI_FALSE;
+
+  std::string str_begin;
+  if (jbegin_len > 0) {
+    str_begin = rocksdb::JniUtil::byteString<std::string>(env, jbegin, jbegin_len,
+        [](const char* str, const size_t len) {
+          return std::string(str, len);
+        },
+        &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      return;
+    }
+  }
+
+  std::string str_end;
+  if (jend_len > 0) {
+    str_end = rocksdb::JniUtil::byteString<std::string>(env, jend, jend_len,
+        [](const char* str, const size_t len) {
+          return std::string(str, len);
+        },
+        &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      return;
+    }
+  }
+
+  rocksdb::CompactRangeOptions* compact_range_opts = nullptr;
+  if (jcompact_range_opts_handle == 0) {
+    // NOTE: we DO own the pointer!
+    compact_range_opts = new rocksdb::CompactRangeOptions();
   } else {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env,
-        rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle."));
+    // NOTE: we do NOT own the pointer!
+    compact_range_opts =
+        reinterpret_cast<rocksdb::CompactRangeOptions*>(jcompact_range_opts_handle);
   }
-}
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::~DB()
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    disposeInternal
- * Signature: (J)V
- */
-void Java_org_rocksdb_RocksDB_disposeInternal(
-    JNIEnv* env, jobject java_db, jlong jhandle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(jhandle);
-  assert(db != nullptr);
-  delete db;
-}
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
 
-jlong rocksdb_iterator_helper(
-    rocksdb::DB* db, rocksdb::ReadOptions read_options,
-    rocksdb::ColumnFamilyHandle* cf_handle) {
-  rocksdb::Iterator* iterator = nullptr;
-  if (cf_handle != nullptr) {
-    iterator = db->NewIterator(read_options, cf_handle);
+  rocksdb::Status s;
+  if (jbegin_len > 0 || jend_len > 0) {
+    const rocksdb::Slice begin(str_begin);
+    const rocksdb::Slice end(str_end);
+    s = db->CompactRange(*compact_range_opts, cf_handle, &begin, &end);
   } else {
-    iterator = db->NewIterator(read_options);
+    s = db->CompactRange(*compact_range_opts, cf_handle, nullptr, nullptr);
   }
-  return reinterpret_cast<jlong>(iterator);
-}
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    iterator
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_RocksDB_iterator__J(
-    JNIEnv* env, jobject jdb, jlong db_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  return rocksdb_iterator_helper(db, rocksdb::ReadOptions(),
-      nullptr);
+  if (jcompact_range_opts_handle == 0) {
+    delete compact_range_opts;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    iterator
- * Signature: (JJ)J
+ * Method:    setOptions
+ * Signature: (JJ[Ljava/lang/String;[Ljava/lang/String;)V
  */
-jlong Java_org_rocksdb_RocksDB_iterator__JJ(
-    JNIEnv* env, jobject jdb, jlong db_handle,
-    jlong jread_options_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
-      jread_options_handle);
-  return rocksdb_iterator_helper(db, read_options,
-      nullptr);
-}
+void Java_org_rocksdb_RocksDB_setOptions(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jobjectArray jkeys, jobjectArray jvalues) {
+  const jsize len = env->GetArrayLength(jkeys);
+  assert(len == env->GetArrayLength(jvalues));
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    iteratorCF
- * Signature: (JJ)J
- */
-jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ(
-    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  return rocksdb_iterator_helper(db, rocksdb::ReadOptions(),
-      cf_handle);
-}
+  std::unordered_map<std::string, std::string> options_map;
+  for (jsize i = 0; i < len; i++) {
+    jobject jobj_key = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return;
+    }
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    iteratorCF
- * Signature: (JJJ)J
- */
-jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ(
-    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
-    jlong jread_options_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
+    jobject jobj_value = env->GetObjectArrayElement(jvalues, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    jboolean has_exception = JNI_FALSE;
+    std::string s_key =
+        rocksdb::JniUtil::copyStdString(
+            env, reinterpret_cast<jstring>(jobj_key), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    std::string s_value =
+        rocksdb::JniUtil::copyStdString(
+            env, reinterpret_cast<jstring>(jobj_value), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
+    }
+
+    options_map[s_key] = s_value;
+
+    env->DeleteLocalRef(jobj_key);
+    env->DeleteLocalRef(jobj_value);
+  }
+
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
   auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
-      jread_options_handle);
-  return rocksdb_iterator_helper(db, read_options,
-      cf_handle);
+  auto s = db->SetOptions(cf_handle, options_map);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    iterators
- * Signature: (J[JJ)[J
+ * Method:    setDBOptions
+ * Signature: (J[Ljava/lang/String;[Ljava/lang/String;)V
  */
-jlongArray Java_org_rocksdb_RocksDB_iterators(
-    JNIEnv* env, jobject jdb, jlong db_handle,
-    jlongArray jcolumn_family_handles, jlong jread_options_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto& read_options = *reinterpret_cast<rocksdb::ReadOptions*>(
-      jread_options_handle);
+void Java_org_rocksdb_RocksDB_setDBOptions(
+    JNIEnv* env, jobject, jlong jdb_handle,
+    jobjectArray jkeys, jobjectArray jvalues) {
+  const jsize len = env->GetArrayLength(jkeys);
+  assert(len == env->GetArrayLength(jvalues));
 
-  std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
-  if (jcolumn_family_handles != nullptr) {
-    const jsize len_cols = env->GetArrayLength(jcolumn_family_handles);
-    jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr);
-    if(jcfh == nullptr) {
-      // exception thrown: OutOfMemoryError
-      return nullptr;
+  std::unordered_map<std::string, std::string> options_map;
+  for (jsize i = 0; i < len; i++) {
+    jobject jobj_key = env->GetObjectArrayElement(jkeys, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      return;
     }
 
-    for (jsize i = 0; i < len_cols; i++) {
-      auto* cf_handle =
-          reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcfh[i]);
-      cf_handles.push_back(cf_handle);
+    jobject jobj_value = env->GetObjectArrayElement(jvalues, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key);
+      return;
     }
-    env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT);
-  }
-
-  std::vector<rocksdb::Iterator*> iterators;
-  rocksdb::Status s = db->NewIterators(read_options,
-      cf_handles, &iterators);
-  if (s.ok()) {
-    jlongArray jLongArray =
-        env->NewLongArray(static_cast<jsize>(iterators.size()));
-    if(jLongArray == nullptr) {
-      // exception thrown: OutOfMemoryError
-      return nullptr;
+    jboolean has_exception = JNI_FALSE;
+    std::string s_key =
+        rocksdb::JniUtil::copyStdString(
+            env, reinterpret_cast<jstring>(jobj_key), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
     }
-    for (std::vector<rocksdb::Iterator*>::size_type i = 0;
-        i < iterators.size(); i++) {
-      env->SetLongArrayRegion(jLongArray, static_cast<jsize>(i), 1,
-          const_cast<jlong*>(reinterpret_cast<const jlong*>(&iterators[i])));
-      if(env->ExceptionCheck()) {
-        // exception thrown: ArrayIndexOutOfBoundsException
-        env->DeleteLocalRef(jLongArray);
-        return nullptr;
-      }
+    std::string s_value =
+        rocksdb::JniUtil::copyStdString(
+            env, reinterpret_cast<jstring>(jobj_value), &has_exception);
+    if (has_exception == JNI_TRUE) {
+      // exception occurred
+      env->DeleteLocalRef(jobj_value);
+      env->DeleteLocalRef(jobj_key);
+      return;
     }
-    return jLongArray;
-  } else {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-    return nullptr;
+    options_map[s_key] = s_value;
+
+    env->DeleteLocalRef(jobj_key);
+    env->DeleteLocalRef(jobj_value);
   }
-}
 
-/*
- * Class:     org_rocksdb_RocksDB
- * Method:    getDefaultColumnFamily
- * Signature: (J)J
- */
-jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(
-    JNIEnv* env, jobject jobj, jlong jdb_handle) {
-  auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* cf_handle = db_handle->DefaultColumnFamily();
-  return reinterpret_cast<jlong>(cf_handle);
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto s = db->SetDBOptions(options_map);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    createColumnFamily
- * Signature: (J[BJ)J
+ * Method:    compactFiles
+ * Signature: (JJJ[Ljava/lang/String;IIJ)[Ljava/lang/String;
  */
-jlong Java_org_rocksdb_RocksDB_createColumnFamily(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jbyteArray jcolumn_name, jlong jcolumn_options) {
-  rocksdb::ColumnFamilyHandle* handle;
+jobjectArray Java_org_rocksdb_RocksDB_compactFiles(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcompaction_opts_handle,
+    jlong jcf_handle, jobjectArray jinput_file_names, jint joutput_level,
+    jint joutput_path_id, jlong jcompaction_job_info_handle) {
   jboolean has_exception = JNI_FALSE;
-  std::string column_name = rocksdb::JniUtil::byteString<std::string>(env,
-      jcolumn_name,
-      [](const char* str, const size_t len) { return std::string(str, len); },
-      &has_exception);
-  if(has_exception == JNI_TRUE) {
+  const std::vector<std::string> input_file_names =
+      rocksdb::JniUtil::copyStrings(env, jinput_file_names, &has_exception);
+  if (has_exception == JNI_TRUE) {
     // exception occurred
-    return 0;
+    return nullptr;
   }
 
-  auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* cfOptions =
-      reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jcolumn_options);
+  auto* compaction_opts =
+      reinterpret_cast<rocksdb::CompactionOptions*>(jcompaction_opts_handle);
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
 
-  rocksdb::Status s = db_handle->CreateColumnFamily(
-      *cfOptions, column_name, &handle);
+  rocksdb::CompactionJobInfo* compaction_job_info = nullptr;
+  if (jcompaction_job_info_handle != 0) {
+    compaction_job_info =
+        reinterpret_cast<rocksdb::CompactionJobInfo*>(jcompaction_job_info_handle);
+  }
 
-  if (s.ok()) {
-    return reinterpret_cast<jlong>(handle);
+  std::vector<std::string> output_file_names;
+  auto s = db->CompactFiles(*compaction_opts, cf_handle, input_file_names,
+      static_cast<int>(joutput_level), static_cast<int>(joutput_path_id),
+      &output_file_names, compaction_job_info);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
   }
 
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  return 0;
+  return rocksdb::JniUtil::toJavaStrings(env, &output_file_names);
 }
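The setOptions/setDBOptions bridges above receive parallel String[] key/value arrays; on the Java side those arrays come from the mutable options builders introduced alongside these natives. A sketch, assuming the RocksJava MutableColumnFamilyOptions/MutableDBOptions API (handles are illustrative):

    // cfHandle was obtained when the database was opened
    db.setOptions(cfHandle, MutableColumnFamilyOptions.builder()
        .setWriteBufferSize(32 * 1024 * 1024)
        .setDisableAutoCompactions(false)
        .build());
    db.setDBOptions(MutableDBOptions.builder()
        .setMaxBackgroundJobs(4)
        .build());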
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    dropColumnFamily
- * Signature: (JJ)V;
+ * Method:    pauseBackgroundWork
+ * Signature: (J)V
  */
-void Java_org_rocksdb_RocksDB_dropColumnFamily(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle) {
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb::Status s = db_handle->DropColumnFamily(cf_handle);
+void Java_org_rocksdb_RocksDB_pauseBackgroundWork(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto s = db->PauseBackgroundWork();
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
 }
 
 /*
- * Method:    getSnapshot
- * Signature: (J)J
+ * Class:     org_rocksdb_RocksDB
+ * Method:    continueBackgroundWork
+ * Signature: (J)V
  */
-jlong Java_org_rocksdb_RocksDB_getSnapshot(
-    JNIEnv* env, jobject jdb, jlong db_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  const rocksdb::Snapshot* snapshot = db->GetSnapshot();
-  return reinterpret_cast<jlong>(snapshot);
+void Java_org_rocksdb_RocksDB_continueBackgroundWork(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto s = db->ContinueBackgroundWork();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
 /*
- * Method:    releaseSnapshot
- * Signature: (JJ)V
+ * Class:     org_rocksdb_RocksDB
+ * Method:    enableAutoCompaction
+ * Signature: (J[J)V
  */
-void Java_org_rocksdb_RocksDB_releaseSnapshot(
-    JNIEnv* env, jobject jdb, jlong db_handle, jlong snapshot_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto* snapshot = reinterpret_cast<rocksdb::Snapshot*>(snapshot_handle);
-  db->ReleaseSnapshot(snapshot);
+void Java_org_rocksdb_RocksDB_enableAutoCompaction(
+    JNIEnv* env, jobject, jlong jdb_handle, jlongArray jcf_handles) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  jboolean has_exception = JNI_FALSE;
+  const std::vector<rocksdb::ColumnFamilyHandle*> cf_handles =
+      rocksdb::JniUtil::fromJPointers<rocksdb::ColumnFamilyHandle>(env, jcf_handles, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  db->EnableAutoCompaction(cf_handles);
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getProperty0
- * Signature: (JLjava/lang/String;I)Ljava/lang/String;
+ * Method:    numberLevels
+ * Signature: (JJ)I
  */
-jstring Java_org_rocksdb_RocksDB_getProperty0__JLjava_lang_String_2I(
-    JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty,
-    jint jproperty_len) {
-  const char* property = env->GetStringUTFChars(jproperty, nullptr);
-  if(property == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return nullptr;
-  }
-  rocksdb::Slice property_slice(property, jproperty_len);
-
-  auto *db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  std::string property_value;
-  bool retCode = db->GetProperty(property_slice, &property_value);
-  env->ReleaseStringUTFChars(jproperty, property);
-
-  if (retCode) {
-    return env->NewStringUTF(property_value.c_str());
+jint Java_org_rocksdb_RocksDB_numberLevels(
+    JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
-
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
-  return nullptr;
+  return static_cast<jint>(db->NumberLevels(cf_handle));
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getProperty0
- * Signature: (JJLjava/lang/String;I)Ljava/lang/String;
+ * Method:    maxMemCompactionLevel
+ * Signature: (JJ)I
  */
-jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I(
-    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
-    jstring jproperty, jint jproperty_len) {
-  const char* property = env->GetStringUTFChars(jproperty, nullptr);
-  if(property == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return nullptr;
-  }
-  rocksdb::Slice property_slice(property, jproperty_len);
-
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  std::string property_value;
-  bool retCode = db->GetProperty(cf_handle, property_slice, &property_value);
-  env->ReleaseStringUTFChars(jproperty, property);
-
-  if (retCode) {
-    return env->NewStringUTF(property_value.c_str());
+jint Java_org_rocksdb_RocksDB_maxMemCompactionLevel(
+    JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
-
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
-  return nullptr;
+  return static_cast<jint>(db->MaxMemCompactionLevel(cf_handle));
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getLongProperty
- * Signature: (JLjava/lang/String;I)L;
+ * Method:    level0StopWriteTrigger
+ * Signature: (JJ)I
  */
-jlong Java_org_rocksdb_RocksDB_getLongProperty__JLjava_lang_String_2I(
-    JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty,
-    jint jproperty_len) {
-  const char* property = env->GetStringUTFChars(jproperty, nullptr);
-  if(property == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return 0;
-  }
-  rocksdb::Slice property_slice(property, jproperty_len);
-
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  uint64_t property_value = 0;
-  bool retCode = db->GetIntProperty(property_slice, &property_value);
-  env->ReleaseStringUTFChars(jproperty, property);
-
-  if (retCode) {
-    return property_value;
+jint Java_org_rocksdb_RocksDB_level0StopWriteTrigger(
+    JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
-
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
-  return 0;
+  return static_cast<jint>(db->Level0StopWriteTrigger(cf_handle));
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getLongProperty
- * Signature: (JJLjava/lang/String;I)L;
+ * Method:    getName
+ * Signature: (J)Ljava/lang/String;
  */
-jlong Java_org_rocksdb_RocksDB_getLongProperty__JJLjava_lang_String_2I(
-    JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle,
-    jstring jproperty, jint jproperty_len) {
-  const char* property = env->GetStringUTFChars(jproperty, nullptr);
-  if(property == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return 0;
-  }
-  rocksdb::Slice property_slice(property, jproperty_len);
-
-  auto* db = reinterpret_cast<rocksdb::DB*>(db_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  uint64_t property_value;
-  bool retCode = db->GetIntProperty(cf_handle, property_slice, &property_value);
-  env->ReleaseStringUTFChars(jproperty, property);
-
-  if (retCode) {
-    return property_value;
-  }
-
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound());
-  return 0;
+jstring Java_org_rocksdb_RocksDB_getName(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  std::string name = db->GetName();
+  return rocksdb::JniUtil::toJavaString(env, &name, false);
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::Flush
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getEnv
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_RocksDB_getEnv(
+    JNIEnv*, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return reinterpret_cast<jlong>(db->GetEnv());
+}
 
-void rocksdb_flush_helper(
-    JNIEnv* env, rocksdb::DB* db, const rocksdb::FlushOptions& flush_options,
-    rocksdb::ColumnFamilyHandle* column_family_handle) {
-  rocksdb::Status s;
-  if (column_family_handle != nullptr) {
-    s = db->Flush(flush_options, column_family_handle);
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    flush
+ * Signature: (JJ[J)V
+ */
+void Java_org_rocksdb_RocksDB_flush(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jflush_opts_handle,
+    jlongArray jcf_handles) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto* flush_opts =
+      reinterpret_cast<rocksdb::FlushOptions*>(jflush_opts_handle);
+  std::vector<rocksdb::ColumnFamilyHandle*> cf_handles;
+  if (jcf_handles == nullptr) {
+    cf_handles.push_back(db->DefaultColumnFamily());
   } else {
-    s = db->Flush(flush_options);
+    jboolean has_exception = JNI_FALSE;
+    cf_handles =
+        rocksdb::JniUtil::fromJPointers<rocksdb::ColumnFamilyHandle>(
+            env, jcf_handles, &has_exception);
+    if (has_exception) {
+      // exception occurred
+      return;
+    }
   }
+  auto s = db->Flush(*flush_opts, cf_handles);
   if (!s.ok()) {
-    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    flush
- * Signature: (JJ)V
+ * Method:    flushWal
+ * Signature: (JZ)V
  */
-void Java_org_rocksdb_RocksDB_flush__JJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jlong jflush_options) {
+void Java_org_rocksdb_RocksDB_flushWal(
+    JNIEnv* env, jobject, jlong jdb_handle, jboolean jsync) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* flush_options =
-      reinterpret_cast<rocksdb::FlushOptions*>(jflush_options);
-  rocksdb_flush_helper(env, db, *flush_options, nullptr);
+  auto s = db->FlushWAL(jsync == JNI_TRUE);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    flush
- * Signature: (JJJ)V
+ * Method:    syncWal
+ * Signature: (J)V
  */
-void Java_org_rocksdb_RocksDB_flush__JJJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jlong jflush_options, jlong jcf_handle) {
+void Java_org_rocksdb_RocksDB_syncWal(
+    JNIEnv* env, jobject, jlong jdb_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* flush_options =
-      reinterpret_cast<rocksdb::FlushOptions*>(jflush_options);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  rocksdb_flush_helper(env, db, *flush_options, cf_handle);
+  auto s = db->SyncWAL();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::CompactRange - Full
-
-void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db,
-    rocksdb::ColumnFamilyHandle* cf_handle, jboolean jreduce_level,
-    jint jtarget_level, jint jtarget_path_id) {
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getLatestSequenceNumber
+ * Signature: (J)V
+ */
+jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(
+    JNIEnv*, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  return db->GetLatestSequenceNumber();
+}
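Taken together, the flush/flushWal/syncWal bridges above let a Java caller force memtable contents into SST files and then make the WAL durable. A sketch, assuming the corresponding RocksJava wrappers added with these natives (handles are illustrative):

    try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) {
      // flushes the listed column families; a null list maps to the default CF
      db.flush(flushOptions, Arrays.asList(defaultCfHandle, otherCfHandle));
    }
    db.flushWal(true);  // true == also fsync the WAL, like FlushWAL(true)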
 
-  rocksdb::Status s;
-  rocksdb::CompactRangeOptions compact_options;
-  compact_options.change_level = jreduce_level;
-  compact_options.target_level = jtarget_level;
-  compact_options.target_path_id = static_cast<uint32_t>(jtarget_path_id);
-
-  if (cf_handle != nullptr) {
-    s = db->CompactRange(compact_options, cf_handle, nullptr, nullptr);
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    setPreserveDeletesSequenceNumber
+ * Signature: (JJ)Z
+ */
+jboolean JNICALL Java_org_rocksdb_RocksDB_setPreserveDeletesSequenceNumber(
+    JNIEnv*, jobject, jlong jdb_handle, jlong jseq_number) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  if (db->SetPreserveDeletesSequenceNumber(
+      static_cast<rocksdb::SequenceNumber>(jseq_number))) {
+    return JNI_TRUE;
   } else {
-    // backwards compatibility
-    s = db->CompactRange(compact_options, nullptr, nullptr);
+    return JNI_FALSE;
   }
+}
 
-  if (s.ok()) {
-    return;
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    disableFileDeletions
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_RocksDB_disableFileDeletions(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::Status s = db->DisableFileDeletions();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    compactRange0
- * Signature: (JZII)V
+ * Method:    enableFileDeletions
+ * Signature: (JZ)V
  */
-void Java_org_rocksdb_RocksDB_compactRange0__JZII(JNIEnv* env,
-    jobject jdb, jlong jdb_handle, jboolean jreduce_level,
-    jint jtarget_level, jint jtarget_path_id) {
+void Java_org_rocksdb_RocksDB_enableFileDeletions(
+    JNIEnv* env, jobject, jlong jdb_handle, jboolean jforce) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb_compactrange_helper(env, db, nullptr, jreduce_level,
-      jtarget_level, jtarget_path_id);
+  rocksdb::Status s = db->EnableFileDeletions(jforce);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    compactRange
- * Signature: (JZIIJ)V
+ * Method:    getLiveFiles
+ * Signature: (JZ)[Ljava/lang/String;
  */
-void Java_org_rocksdb_RocksDB_compactRange__JZIIJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle,
-    jboolean jreduce_level, jint jtarget_level,
-    jint jtarget_path_id, jlong jcf_handle) {
+jobjectArray Java_org_rocksdb_RocksDB_getLiveFiles(
+    JNIEnv* env, jobject, jlong jdb_handle, jboolean jflush_memtable) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  rocksdb_compactrange_helper(env, db, cf_handle, jreduce_level,
-      jtarget_level, jtarget_path_id);
-}
+  std::vector<std::string> live_files;
+  uint64_t manifest_file_size = 0;
+  auto s = db->GetLiveFiles(
+      live_files, &manifest_file_size, jflush_memtable == JNI_TRUE);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::CompactRange - Range
+  // append the manifest_file_size to the vector
+  // for passing back to java
+  live_files.push_back(std::to_string(manifest_file_size));
 
-/**
- * @return true if the compact range succeeded, false if a Java Exception
- *     was thrown
- */
-bool rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db,
-    rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len,
-    jbyteArray jend, jint jend_len, jboolean jreduce_level, jint jtarget_level,
-    jint jtarget_path_id) {
+  return rocksdb::JniUtil::toJavaStrings(env, &live_files);
+}
 
-  jbyte* begin = env->GetByteArrayElements(jbegin, nullptr);
-  if(begin == nullptr) {
-    // exception thrown: OutOfMemoryError
-    return false;
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getSortedWalFiles
+ * Signature: (J)[Lorg/rocksdb/LogFile;
+ */
+jobjectArray Java_org_rocksdb_RocksDB_getSortedWalFiles(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  std::vector<std::unique_ptr<rocksdb::LogFile>> sorted_wal_files;
+  auto s = db->GetSortedWalFiles(sorted_wal_files);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
   }
 
-  jbyte* end = env->GetByteArrayElements(jend, nullptr);
-  if(end == nullptr) {
+  // convert to Java type
+  const jsize jlen = static_cast<jsize>(sorted_wal_files.size());
+  jobjectArray jsorted_wal_files = env->NewObjectArray(
+      jlen, rocksdb::LogFileJni::getJClass(env), nullptr);
+  if(jsorted_wal_files == nullptr) {
     // exception thrown: OutOfMemoryError
-    env->ReleaseByteArrayElements(jbegin, begin, JNI_ABORT);
-    return false;
+    return nullptr;
   }
 
-  const rocksdb::Slice begin_slice(reinterpret_cast<char*>(begin), jbegin_len);
-  const rocksdb::Slice end_slice(reinterpret_cast<char*>(end), jend_len);
+  jsize i = 0;
+  for (auto it = sorted_wal_files.begin(); it != sorted_wal_files.end(); ++it) {
+    jobject jlog_file = rocksdb::LogFileJni::fromCppLogFile(env, it->get());
+    if (jlog_file == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jsorted_wal_files);
+      return nullptr;
+    }
+
+    env->SetObjectArrayElement(jsorted_wal_files, i++, jlog_file);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jlog_file);
+      env->DeleteLocalRef(jsorted_wal_files);
+      return nullptr;
+    }
 
-  rocksdb::Status s;
-  rocksdb::CompactRangeOptions compact_options;
-  compact_options.change_level = jreduce_level;
-  compact_options.target_level = jtarget_level;
-  compact_options.target_path_id = static_cast<uint32_t>(jtarget_path_id);
-  if (cf_handle != nullptr) {
-    s = db->CompactRange(compact_options, cf_handle, &begin_slice, &end_slice);
-  } else {
-    // backwards compatibility
-    s = db->CompactRange(compact_options, &begin_slice, &end_slice);
+    env->DeleteLocalRef(jlog_file);
   }
 
-  env->ReleaseByteArrayElements(jend, end, JNI_ABORT);
-  env->ReleaseByteArrayElements(jbegin, begin, JNI_ABORT);
+  return jsorted_wal_files;
+}
 
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getUpdatesSince
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_RocksDB_getUpdatesSince(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jsequence_number) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::SequenceNumber sequence_number =
+      static_cast<rocksdb::SequenceNumber>(jsequence_number);
+  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
+  rocksdb::Status s = db->GetUpdatesSince(sequence_number, &iter);
   if (s.ok()) {
-    return true;
+    return reinterpret_cast<jlong>(iter.release());
   }
 
   rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  return false;
+  return 0;
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    compactRange0
- * Signature: (J[BI[BIZII)V
+ * Method:    deleteFile
+ * Signature: (JLjava/lang/String;)V
  */
-void Java_org_rocksdb_RocksDB_compactRange0__J_3BI_3BIZII(JNIEnv* env,
-    jobject jdb, jlong jdb_handle, jbyteArray jbegin, jint jbegin_len,
-    jbyteArray jend, jint jend_len, jboolean jreduce_level,
-    jint jtarget_level, jint jtarget_path_id) {
+void Java_org_rocksdb_RocksDB_deleteFile(
+    JNIEnv* env, jobject, jlong jdb_handle, jstring jname) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb_compactrange_helper(env, db, nullptr, jbegin, jbegin_len,
-      jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id);
+  jboolean has_exception = JNI_FALSE;
+  std::string name =
+      rocksdb::JniUtil::copyStdString(env, jname, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+  db->DeleteFile(name);
 }
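The getSortedWalFiles/getUpdatesSince bridges above expose WAL inspection and tailing; the jlong returned by getUpdatesSince is the released native TransactionLogIterator, which the Java wrapper owns and must close. A sketch, assuming the usual RocksJava types:

    final List<LogFile> wals = db.getSortedWalFiles();
    try (final TransactionLogIterator it =
             db.getUpdatesSince(db.getLatestSequenceNumber())) {
      while (it.isValid()) {
        final TransactionLogIterator.BatchResult batch = it.getBatch();
        // inspect batch.sequenceNumber() and batch.writeBatch()
        it.next();
      }
    }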
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    compactRange
- * Signature: (JJ[BI[BIZII)V
+ * Method:    getLiveFilesMetaData
+ * Signature: (J)[Lorg/rocksdb/LiveFileMetaData;
  */
-void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin,
-    jint jbegin_len, jbyteArray jend, jint jend_len,
-    jboolean jreduce_level, jint jtarget_level,
-    jint jtarget_path_id, jlong jcf_handle) {
+jobjectArray Java_org_rocksdb_RocksDB_getLiveFilesMetaData(
+    JNIEnv* env, jobject, jlong jdb_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len,
-      jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id);
+  std::vector<rocksdb::LiveFileMetaData> live_files_meta_data;
+  db->GetLiveFilesMetaData(&live_files_meta_data);
+
+  // convert to Java type
+  const jsize jlen = static_cast<jsize>(live_files_meta_data.size());
+  jobjectArray jlive_files_meta_data = env->NewObjectArray(
+      jlen, rocksdb::LiveFileMetaDataJni::getJClass(env), nullptr);
+  if(jlive_files_meta_data == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  jsize i = 0;
+  for (auto it = live_files_meta_data.begin(); it != live_files_meta_data.end(); ++it) {
+    jobject jlive_file_meta_data =
+        rocksdb::LiveFileMetaDataJni::fromCppLiveFileMetaData(env, &(*it));
+    if (jlive_file_meta_data == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jlive_files_meta_data);
+      return nullptr;
+    }
+
+    env->SetObjectArrayElement(jlive_files_meta_data, i++, jlive_file_meta_data);
+    if (env->ExceptionCheck()) {
+      // exception occurred
+      env->DeleteLocalRef(jlive_file_meta_data);
+      env->DeleteLocalRef(jlive_files_meta_data);
+      return nullptr;
+    }
+
+    env->DeleteLocalRef(jlive_file_meta_data);
+  }
+
+  return jlive_files_meta_data;
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::PauseBackgroundWork
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    getColumnFamilyMetaData
+ * Signature: (JJ)Lorg/rocksdb/ColumnFamilyMetaData;
+ */
+jobject Java_org_rocksdb_RocksDB_getColumnFamilyMetaData(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
+  rocksdb::ColumnFamilyMetaData cf_metadata;
+  db->GetColumnFamilyMetaData(cf_handle, &cf_metadata);
+  return rocksdb::ColumnFamilyMetaDataJni::fromCppColumnFamilyMetaData(
+      env, &cf_metadata);
+}
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    pauseBackgroundWork
- * Signature: (J)V
+ * Method:    ingestExternalFile
+ * Signature: (JJ[Ljava/lang/String;IJ)V
  */
-void Java_org_rocksdb_RocksDB_pauseBackgroundWork(
-    JNIEnv* env, jobject jobj, jlong jdb_handle) {
+void Java_org_rocksdb_RocksDB_ingestExternalFile(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jobjectArray jfile_path_list, jint jfile_path_list_len,
+    jlong jingest_external_file_options_handle) {
+  jboolean has_exception = JNI_FALSE;
+  std::vector<std::string> file_path_list = rocksdb::JniUtil::copyStrings(
+      env, jfile_path_list, jfile_path_list_len, &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return;
+  }
+
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto s = db->PauseBackgroundWork();
+  auto* column_family =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  auto* ifo = reinterpret_cast<rocksdb::IngestExternalFileOptions*>(
+      jingest_external_file_options_handle);
+  rocksdb::Status s =
+      db->IngestExternalFile(column_family, file_path_list, *ifo);
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::ContinueBackgroundWork
-
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    continueBackgroundWork
+ * Method:    verifyChecksum
  * Signature: (J)V
  */
-void Java_org_rocksdb_RocksDB_continueBackgroundWork(
-    JNIEnv* env, jobject jobj, jlong jdb_handle) {
+void Java_org_rocksdb_RocksDB_verifyChecksum(
+    JNIEnv* env, jobject, jlong jdb_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto s = db->ContinueBackgroundWork();
+  auto s = db->VerifyChecksum();
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::GetLatestSequenceNumber
-
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getLatestSequenceNumber
- * Signature: (J)V
+ * Method:    getDefaultColumnFamily
+ * Signature: (J)J
  */
-jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env,
-    jobject jdb, jlong jdb_handle) {
-  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  return db->GetLatestSequenceNumber();
+jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily(
+    JNIEnv*, jobject, jlong jdb_handle) {
+  auto* db_handle = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto* cf_handle = db_handle->DefaultColumnFamily();
+  return reinterpret_cast<jlong>(cf_handle);
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB enable/disable file deletions
-
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    enableFileDeletions
- * Signature: (J)V
+ * Method:    getPropertiesOfAllTables
+ * Signature: (JJ)Ljava/util/Map;
  */
-void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env,
-    jobject jdb, jlong jdb_handle) {
+jobject Java_org_rocksdb_RocksDB_getPropertiesOfAllTables(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb::Status s = db->DisableFileDeletions();
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
+  rocksdb::TablePropertiesCollection table_properties_collection;
+  auto s = db->GetPropertiesOfAllTables(cf_handle,
+      &table_properties_collection);
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
+
+  // convert to Java type
+  jobject jhash_map = rocksdb::HashMapJni::construct(
+      env, static_cast<uint32_t>(table_properties_collection.size()));
+  if (jhash_map == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const rocksdb::HashMapJni::FnMapKV<const std::string, std::shared_ptr<const rocksdb::TableProperties>, jobject, jobject> fn_map_kv =
+      [env](const std::pair<const std::string, std::shared_ptr<const rocksdb::TableProperties>>& kv) {
+    jstring jkey = rocksdb::JniUtil::toJavaString(env, &(kv.first), false);
+    if (env->ExceptionCheck()) {
+      // an error occurred
+      return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+    }
+
+    jobject jtable_properties = rocksdb::TablePropertiesJni::fromCppTableProperties(env, *(kv.second.get()));
+    if (jtable_properties == nullptr) {
+      // an error occurred
+      env->DeleteLocalRef(jkey);
+      return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+    }
+
+    return std::unique_ptr<std::pair<jobject, jobject>>(new std::pair<jobject, jobject>(static_cast<jobject>(jkey), static_cast<jobject>(jtable_properties)));
+  };
+
+  if (!rocksdb::HashMapJni::putAll(env, jhash_map, table_properties_collection.begin(), table_properties_collection.end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jhash_map;
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    enableFileDeletions
- * Signature: (JZ)V
+ * Method:    getPropertiesOfTablesInRange
+ * Signature: (JJ[J)Ljava/util/Map;
  */
-void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env,
-    jobject jdb, jlong jdb_handle, jboolean jforce) {
+jobject Java_org_rocksdb_RocksDB_getPropertiesOfTablesInRange(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle,
+    jlongArray jrange_slice_handles) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb::Status s = db->EnableFileDeletions(jforce);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
+  const jsize jlen = env->GetArrayLength(jrange_slice_handles);
+  jboolean jrange_slice_handles_is_copy = JNI_FALSE;
+  jlong *jrange_slice_handle = env->GetLongArrayElements(
+      jrange_slice_handles, &jrange_slice_handles_is_copy);
+  if (jrange_slice_handle == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const size_t ranges_len = static_cast<size_t>(jlen / 2);
+  auto ranges = std::unique_ptr<rocksdb::Range[]>(new rocksdb::Range[ranges_len]);
+  for (jsize i = 0, j = 0; i < jlen; ++i) {
+    auto* start = reinterpret_cast<const rocksdb::Slice*>(
+        jrange_slice_handle[i]);
+    auto* limit = reinterpret_cast<const rocksdb::Slice*>(
+        jrange_slice_handle[++i]);
+    ranges[j++] = rocksdb::Range(*start, *limit);
+  }
+
+  rocksdb::TablePropertiesCollection table_properties_collection;
+  auto s = db->GetPropertiesOfTablesInRange(
+      cf_handle, ranges.get(), ranges_len, &table_properties_collection);
   if (!s.ok()) {
+    // error occurred
+    env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, JNI_ABORT);
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
   }
-}
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::GetUpdatesSince
+  // cleanup
+  env->ReleaseLongArrayElements(jrange_slice_handles, jrange_slice_handle, JNI_ABORT);
+
+  return jrange_slice_handles;
+}
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    getUpdatesSince
- * Signature: (JJ)J
+ * Method:    suggestCompactRange
+ * Signature: (JJ)[J
  */
-jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env,
-    jobject jdb, jlong jdb_handle, jlong jsequence_number) {
+jlongArray Java_org_rocksdb_RocksDB_suggestCompactRange(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  rocksdb::SequenceNumber sequence_number =
-      static_cast<rocksdb::SequenceNumber>(jsequence_number);
-  std::unique_ptr<rocksdb::TransactionLogIterator> iter;
-  rocksdb::Status s = db->GetUpdatesSince(sequence_number, &iter);
-  if (s.ok()) {
-    return reinterpret_cast<jlong>(iter.release());
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
+  }
+  auto* begin = new rocksdb::Slice();
+  auto* end = new rocksdb::Slice();
+  auto s = db->SuggestCompactRange(cf_handle, begin, end);
+  if (!s.ok()) {
+    // error occurred
+    delete begin;
+    delete end;
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
   }
-  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
-  return 0;
+
+  jlongArray jslice_handles = env->NewLongArray(2);
+  if (jslice_handles == nullptr) {
+    // exception thrown: OutOfMemoryError
+    delete begin;
+    delete end;
+    return nullptr;
+  }
+
+  jlong slice_handles[2];
+  slice_handles[0] = reinterpret_cast<jlong>(begin);
+  slice_handles[1] = reinterpret_cast<jlong>(end);
+  env->SetLongArrayRegion(jslice_handles, 0, 2, slice_handles);
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    delete begin;
+    delete end;
+    env->DeleteLocalRef(jslice_handles);
+    return nullptr;
+  }
+
+  return jslice_handles;
 }
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    setOptions
- * Signature: (JJ[Ljava/lang/String;[Ljava/lang/String;)V
+ * Method:    promoteL0
+ * Signature: (JJI)V
  */
-void Java_org_rocksdb_RocksDB_setOptions(JNIEnv* env, jobject jdb,
-    jlong jdb_handle, jlong jcf_handle, jobjectArray jkeys,
-    jobjectArray jvalues) {
-  const jsize len = env->GetArrayLength(jkeys);
-  assert(len == env->GetArrayLength(jvalues));
-
-  std::unordered_map<std::string, std::string> options_map;
-  for (jsize i = 0; i < len; i++) {
-    jobject jobj_key = env->GetObjectArrayElement(jkeys, i);
-    if(env->ExceptionCheck()) {
-      // exception thrown: ArrayIndexOutOfBoundsException
-      return;
-    }
-
-    jobject jobj_value = env->GetObjectArrayElement(jvalues, i);
-    if(env->ExceptionCheck()) {
-      // exception thrown: ArrayIndexOutOfBoundsException
-      env->DeleteLocalRef(jobj_key);
-      return;
-    }
-
-    jstring jkey = reinterpret_cast<jstring>(jobj_key);
-    jstring jval = reinterpret_cast<jstring>(jobj_value);
-
-    const char* key = env->GetStringUTFChars(jkey, nullptr);
-    if(key == nullptr) {
-      // exception thrown: OutOfMemoryError
-      env->DeleteLocalRef(jobj_value);
-      env->DeleteLocalRef(jobj_key);
-      return;
-    }
-
-    const char* value = env->GetStringUTFChars(jval, nullptr);
-    if(value == nullptr) {
-      // exception thrown: OutOfMemoryError
-      env->ReleaseStringUTFChars(jkey, key);
-      env->DeleteLocalRef(jobj_value);
-      env->DeleteLocalRef(jobj_key);
-      return;
-    }
-
-    std::string s_key(key);
-    std::string s_value(value);
-    options_map[s_key] = s_value;
-
-    env->ReleaseStringUTFChars(jkey, key);
-    env->ReleaseStringUTFChars(jval, value);
-    env->DeleteLocalRef(jobj_key);
-    env->DeleteLocalRef(jobj_value);
+void Java_org_rocksdb_RocksDB_promoteL0(
+    JNIEnv*, jobject, jlong jdb_handle, jlong jcf_handle, jint jtarget_level) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  rocksdb::ColumnFamilyHandle* cf_handle;
+  if (jcf_handle == 0) {
+    cf_handle = db->DefaultColumnFamily();
+  } else {
+    cf_handle =
+        reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
   }
+  db->PromoteL0(cf_handle, static_cast<int>(jtarget_level));
+}
 
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    startTrace
+ * Signature: (JJJ)V
+ */
+void Java_org_rocksdb_RocksDB_startTrace(
+    JNIEnv* env, jobject, jlong jdb_handle, jlong jmax_trace_file_size,
+    jlong jtrace_writer_jnicallback_handle) {
   auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* cf_handle = reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  db->SetOptions(cf_handle, options_map);
+  rocksdb::TraceOptions trace_options;
+  trace_options.max_trace_file_size =
+      static_cast<uint64_t>(jmax_trace_file_size);
+  // transfer ownership of trace writer from Java to C++
+  auto trace_writer = std::unique_ptr<rocksdb::TraceWriter>(
+      reinterpret_cast<rocksdb::TraceWriterJniCallback*>(
+          jtrace_writer_jnicallback_handle));
+  auto s = db->StartTrace(trace_options, std::move(trace_writer));
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
 }
 
-//////////////////////////////////////////////////////////////////////////////
-// rocksdb::DB::IngestExternalFile
+/*
+ * Class:     org_rocksdb_RocksDB
+ * Method:    endTrace
+ * Signature: (J)V
+ */
+JNIEXPORT void JNICALL Java_org_rocksdb_RocksDB_endTrace(
+    JNIEnv* env, jobject, jlong jdb_handle) {
+  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
+  auto s = db->EndTrace();
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
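Note that startTrace above hands ownership of the TraceWriter callback from Java to C++ (the std::unique_ptr adopts the raw callback handle), so the Java side must not free that native handle itself. destroyDB, added just below, is the static counterpart to open; a sketch, assuming the RocksJava wrapper introduced with this bridge:

    try (final Options options = new Options()) {
      RocksDB.destroyDB("/tmp/example-db", options);  // the DB must be closed first
    }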
 
 /*
  * Class:     org_rocksdb_RocksDB
- * Method:    ingestExternalFile
- * Signature: (JJ[Ljava/lang/String;IJ)V
+ * Method:    destroyDB
+ * Signature: (Ljava/lang/String;J)V
  */
-void Java_org_rocksdb_RocksDB_ingestExternalFile(
-    JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle,
-    jobjectArray jfile_path_list, jint jfile_path_list_len,
-    jlong jingest_external_file_options_handle) {
-  jboolean has_exception = JNI_FALSE;
-  std::vector<std::string> file_path_list =
-      rocksdb::JniUtil::copyStrings(env, jfile_path_list, jfile_path_list_len,
-          &has_exception);
-  if(has_exception == JNI_TRUE) {
-    // exception occurred
+void Java_org_rocksdb_RocksDB_destroyDB(
+    JNIEnv* env, jclass, jstring jdb_path, jlong joptions_handle) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
     return;
   }
 
-  auto* db = reinterpret_cast<rocksdb::DB*>(jdb_handle);
-  auto* column_family =
-      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcf_handle);
-  auto* ifo =
-      reinterpret_cast<rocksdb::IngestExternalFileOptions*>(
-          jingest_external_file_options_handle);
-  rocksdb::Status s =
-      db->IngestExternalFile(column_family, file_path_list, *ifo);
+  auto* options = reinterpret_cast<rocksdb::Options*>(joptions_handle);
+  if (options == nullptr) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(
+        env, rocksdb::Status::InvalidArgument("Invalid Options."));
+    // return early: ThrowNew does not interrupt C++ control flow, and
+    // *options must not be dereferenced below when the handle is null
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return;
+  }
+
+  rocksdb::Status s = rocksdb::DestroyDB(db_path, *options);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
 }
diff --git a/thirdparty/rocksdb/java/rocksjni/slice.cc b/thirdparty/rocksdb/java/rocksjni/slice.cc
index ef0e384f1a..e617cde25a 100644
--- a/thirdparty/rocksdb/java/rocksjni/slice.cc
+++ b/thirdparty/rocksdb/java/rocksjni/slice.cc
@@ -6,14 +6,14 @@
 // This file implements the "bridge" between Java and C++ for
 // rocksdb::Slice.
 
+#include <jni.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <jni.h>
 #include <string>
 
 #include "include/org_rocksdb_AbstractSlice.h"
-#include "include/org_rocksdb_Slice.h"
 #include "include/org_rocksdb_DirectSlice.h"
+#include "include/org_rocksdb_Slice.h"
 #include "rocksdb/slice.h"
 #include "rocksjni/portal.h"
 
@@ -24,10 +24,11 @@
  * Method:    createNewSliceFromString
  * Signature: (Ljava/lang/String;)J
 */
-jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString(
-    JNIEnv * env, jclass jcls, jstring jstr) {
+jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString(JNIEnv* env,
+                                                              jclass /*jcls*/,
+                                                              jstring jstr) {
   const auto* str = env->GetStringUTFChars(jstr, nullptr);
-  if(str == nullptr) {
+  if (str == nullptr) {
     // exception thrown: OutOfMemoryError
     return 0;
   }
@@ -51,8 +52,8 @@ jlong Java_org_rocksdb_AbstractSlice_createNewSliceFromString(
  * Method:    size0
  * Signature: (J)I
  */
-jint Java_org_rocksdb_AbstractSlice_size0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jint Java_org_rocksdb_AbstractSlice_size0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   return static_cast<jint>(slice->size());
 }
@@ -62,8 +63,8 @@ jint Java_org_rocksdb_AbstractSlice_size0(
  * Method:    empty0
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_AbstractSlice_empty0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jboolean Java_org_rocksdb_AbstractSlice_empty0(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong handle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   return slice->empty();
 }
@@ -73,8 +74,8 @@ jboolean Java_org_rocksdb_AbstractSlice_empty0(
  * Method:    toString0
  * Signature: (JZ)Ljava/lang/String;
  */
-jstring Java_org_rocksdb_AbstractSlice_toString0(
-    JNIEnv* env, jobject jobj, jlong handle, jboolean hex) {
+jstring Java_org_rocksdb_AbstractSlice_toString0(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong handle, jboolean hex) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   const std::string s = slice->ToString(hex);
   return env->NewStringUTF(s.c_str());
@@ -85,11 +86,10 @@ jstring Java_org_rocksdb_AbstractSlice_toString0(
  * Method:    compare0
 * Signature: (JJ)I;
  */
-jint Java_org_rocksdb_AbstractSlice_compare0(
-    JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) {
+jint Java_org_rocksdb_AbstractSlice_compare0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                             jlong handle, jlong otherHandle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
-  const auto* otherSlice =
-      reinterpret_cast<const rocksdb::Slice*>(otherHandle);
+  const auto* otherSlice = reinterpret_cast<const rocksdb::Slice*>(otherHandle);
   return slice->compare(*otherSlice);
 }
 
@@ -98,11 +98,12 @@ jint Java_org_rocksdb_AbstractSlice_compare0(
  * Method:    startsWith0
 * Signature: (JJ)Z;
  */
-jboolean Java_org_rocksdb_AbstractSlice_startsWith0(
-    JNIEnv* env, jobject jobj, jlong handle, jlong otherHandle) {
+jboolean Java_org_rocksdb_AbstractSlice_startsWith0(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle,
+                                                    jlong otherHandle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
-  const auto* otherSlice =
-      reinterpret_cast<const rocksdb::Slice*>(otherHandle);
+  const auto* otherSlice = reinterpret_cast<const rocksdb::Slice*>(otherHandle);
   return slice->starts_with(*otherSlice);
 }
 
@@ -111,8 +112,9 @@ jboolean Java_org_rocksdb_AbstractSlice_startsWith0(
  * Method:    disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_AbstractSlice_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_AbstractSlice_disposeInternal(JNIEnv* /*env*/,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
   delete reinterpret_cast<rocksdb::Slice*>(handle);
 }
 
@@ -125,15 +127,16 @@ void Java_org_rocksdb_AbstractSlice_disposeInternal(
  * Method:    createNewSlice0
  * Signature: ([BI)J
  */
-jlong Java_org_rocksdb_Slice_createNewSlice0(
-    JNIEnv * env, jclass jcls, jbyteArray data, jint offset) {
+jlong Java_org_rocksdb_Slice_createNewSlice0(JNIEnv* env, jclass /*jcls*/,
+                                             jbyteArray data, jint offset) {
   const jsize dataSize = env->GetArrayLength(data);
   const int len = dataSize - offset;
 
-  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf method
+  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf
+  // method
   jbyte* buf = new jbyte[len];
   env->GetByteArrayRegion(data, offset, len, buf);
-  if(env->ExceptionCheck()) {
+  if (env->ExceptionCheck()) {
     // exception thrown: ArrayIndexOutOfBoundsException
     return 0;
   }
@@ -147,22 +150,22 @@ jlong Java_org_rocksdb_Slice_createNewSlice0(
  * Method:    createNewSlice1
  * Signature: ([B)J
  */
-jlong Java_org_rocksdb_Slice_createNewSlice1(
-    JNIEnv * env, jclass jcls, jbyteArray data) {
+jlong Java_org_rocksdb_Slice_createNewSlice1(JNIEnv* env, jclass /*jcls*/,
+                                             jbyteArray data) {
   jbyte* ptrData = env->GetByteArrayElements(data, nullptr);
-  if(ptrData == nullptr) {
+  if (ptrData == nullptr) {
     // exception thrown: OutOfMemoryError
     return 0;
   }
   const int len = env->GetArrayLength(data) + 1;
 
-  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf method
+  // NOTE: buf will be deleted in the Java_org_rocksdb_Slice_disposeInternalBuf
+  // method
   char* buf = new char[len];
   memcpy(buf, ptrData, len - 1);
-  buf[len-1] = '\0';
+  buf[len - 1] = '\0';
 
-  const auto* slice =
-      new rocksdb::Slice(buf, len - 1);
+  const auto* slice = new rocksdb::Slice(buf, len - 1);
 
   env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT);
 
@@ -174,19 +177,20 @@ jlong Java_org_rocksdb_Slice_createNewSlice1(
  * Method:    data0
  * Signature: (J)[B
 */
-jbyteArray Java_org_rocksdb_Slice_data0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jbyteArray Java_org_rocksdb_Slice_data0(JNIEnv* env, jobject /*jobj*/,
+                                        jlong handle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   const jsize len = static_cast<jsize>(slice->size());
   const jbyteArray data = env->NewByteArray(len);
-  if(data == nullptr) {
+  if (data == nullptr) {
     // exception thrown: OutOfMemoryError
     return nullptr;
   }
-
-  env->SetByteArrayRegion(data, 0, len,
-      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(slice->data())));
-  if(env->ExceptionCheck()) {
+
+  env->SetByteArrayRegion(
+      data, 0, len,
+      const_cast<jbyte*>(reinterpret_cast<const jbyte*>(slice->data())));
+  if (env->ExceptionCheck()) {
     // exception thrown: ArrayIndexOutOfBoundsException
     env->DeleteLocalRef(data);
     return nullptr;
@@ -200,13 +204,13 @@ jbyteArray Java_org_rocksdb_Slice_data0(
  * Method:    clear0
  * Signature: (JZJ)V
  */
-void Java_org_rocksdb_Slice_clear0(
-    JNIEnv * env, jobject jobj, jlong handle, jboolean shouldRelease,
-    jlong internalBufferOffset) {
+void Java_org_rocksdb_Slice_clear0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                   jlong handle, jboolean shouldRelease,
+                                   jlong internalBufferOffset) {
   auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
-  if(shouldRelease == JNI_TRUE) {
+  if (shouldRelease == JNI_TRUE) {
     const char* buf = slice->data_ - internalBufferOffset;
-    delete [] buf;
+    delete[] buf;
   }
   slice->clear();
 }
@@ -216,8 +220,8 @@ void Java_org_rocksdb_Slice_clear0(
  * Method:    removePrefix0
  * Signature: (JI)V
  */
-void Java_org_rocksdb_Slice_removePrefix0(
-    JNIEnv * env, jobject jobj, jlong handle, jint length) {
+void Java_org_rocksdb_Slice_removePrefix0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                          jlong handle, jint length) {
   auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
   slice->remove_prefix(length);
 }
@@ -227,11 +231,12 @@ void Java_org_rocksdb_Slice_removePrefix0(
  * Method:    disposeInternalBuf
  * Signature: (JJ)V
  */
-void Java_org_rocksdb_Slice_disposeInternalBuf(
-    JNIEnv * env, jobject jobj, jlong handle, jlong internalBufferOffset) {
+void Java_org_rocksdb_Slice_disposeInternalBuf(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong handle,
+                                               jlong internalBufferOffset) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   const char* buf = slice->data_ - internalBufferOffset;
-  delete [] buf;
+  delete[] buf;
 }
 
 //
@@ -243,21 +248,21 @@ void Java_org_rocksdb_Slice_disposeInternalBuf(
  * Method:    createNewDirectSlice0
  * Signature: (Ljava/nio/ByteBuffer;I)J
  */
-jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice0(
-    JNIEnv* env, jclass jcls, jobject data, jint length) {
+jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice0(JNIEnv* env,
+                                                         jclass /*jcls*/,
+                                                         jobject data,
+                                                         jint length) {
   assert(data != nullptr);
   void* data_addr = env->GetDirectBufferAddress(data);
-  if(data_addr == nullptr) {
+  if (data_addr == nullptr) {
     // error: memory region is undefined, given object is not a direct
     // java.nio.Buffer, or JNI access to direct buffers is not supported by JVM
-    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env,
-        rocksdb::Status::InvalidArgument(
-            "Could not access DirectBuffer"));
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(
+        env, rocksdb::Status::InvalidArgument("Could not access DirectBuffer"));
     return 0;
   }
 
-  const auto* ptrData =
-      reinterpret_cast<char*>(data_addr);
+  const auto* ptrData = reinterpret_cast<char*>(data_addr);
   const auto* slice = new rocksdb::Slice(ptrData, length);
   return reinterpret_cast<jlong>(slice);
 }
@@ -267,15 +272,15 @@ jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice0(
  * Method:    createNewDirectSlice1
  * Signature: (Ljava/nio/ByteBuffer;)J
  */
-jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice1(
-    JNIEnv* env, jclass jcls, jobject data) {
+jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice1(JNIEnv* env,
+                                                         jclass /*jcls*/,
+                                                         jobject data) {
   void* data_addr = env->GetDirectBufferAddress(data);
-  if(data_addr == nullptr) {
+  if (data_addr == nullptr) {
     // error: memory region is undefined, given object is not a direct
     // java.nio.Buffer, or JNI access to direct buffers is not supported by JVM
-    rocksdb::IllegalArgumentExceptionJni::ThrowNew(env,
-        rocksdb::Status::InvalidArgument(
-            "Could not access DirectBuffer"));
+    rocksdb::IllegalArgumentExceptionJni::ThrowNew(
+        env, rocksdb::Status::InvalidArgument("Could not access DirectBuffer"));
     return 0;
   }
 
@@ -289,11 +294,11 @@ jlong Java_org_rocksdb_DirectSlice_createNewDirectSlice1(
  * Method:    data0
  * Signature: (J)Ljava/lang/Object;
  */
-jobject Java_org_rocksdb_DirectSlice_data0(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jobject Java_org_rocksdb_DirectSlice_data0(JNIEnv* env, jobject /*jobj*/,
+                                           jlong handle) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   return env->NewDirectByteBuffer(const_cast<char*>(slice->data()),
-      slice->size());
+                                  slice->size());
 }
 
 /*
@@ -301,8 +306,8 @@ jobject Java_org_rocksdb_DirectSlice_data0(
  * Method:    get0
  * Signature: (JI)B
 */
-jbyte Java_org_rocksdb_DirectSlice_get0(
-    JNIEnv* env, jobject jobj, jlong handle, jint offset) {
+jbyte Java_org_rocksdb_DirectSlice_get0(JNIEnv* /*env*/, jobject /*jobj*/,
                                        jlong handle, jint offset) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   return (*slice)[offset];
 }
@@ -312,13 +317,13 @@ jbyte Java_org_rocksdb_DirectSlice_get0(
  * Method:    clear0
  * Signature: (JZJ)V
  */
-void Java_org_rocksdb_DirectSlice_clear0(
-    JNIEnv* env, jobject jobj, jlong handle,
-    jboolean shouldRelease, jlong internalBufferOffset) {
+void Java_org_rocksdb_DirectSlice_clear0(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong handle, jboolean shouldRelease,
+                                         jlong internalBufferOffset) {
   auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
-  if(shouldRelease == JNI_TRUE) {
+  if (shouldRelease == JNI_TRUE) {
     const char* buf = slice->data_ - internalBufferOffset;
-    delete [] buf;
+    delete[] buf;
   }
   slice->clear();
 }
@@ -328,8 +333,9 @@ void Java_org_rocksdb_DirectSlice_clear0(
  * Method:    removePrefix0
  * Signature: (JI)V
  */
-void Java_org_rocksdb_DirectSlice_removePrefix0(
-    JNIEnv* env, jobject jobj, jlong handle, jint length) {
+void Java_org_rocksdb_DirectSlice_removePrefix0(JNIEnv* /*env*/,
+                                                jobject /*jobj*/, jlong handle,
+                                                jint length) {
   auto* slice = reinterpret_cast<rocksdb::Slice*>(handle);
   slice->remove_prefix(length);
 }
@@ -340,10 +346,11 @@ void Java_org_rocksdb_DirectSlice_removePrefix0(
 * Signature: (JJ)V
  */
 void Java_org_rocksdb_DirectSlice_disposeInternalBuf(
-    JNIEnv* env, jobject jobj, jlong handle, jlong internalBufferOffset) {
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong handle,
+    jlong internalBufferOffset) {
   const auto* slice = reinterpret_cast<const rocksdb::Slice*>(handle);
   const char* buf = slice->data_ - internalBufferOffset;
-  delete [] buf;
+  delete[] buf;
 }
 
 //
diff --git a/thirdparty/rocksdb/java/rocksjni/snapshot.cc b/thirdparty/rocksdb/java/rocksjni/snapshot.cc
index 04a0ebfbaf..679271db87 100644
--- a/thirdparty/rocksdb/java/rocksjni/snapshot.cc
+++ b/thirdparty/rocksdb/java/rocksjni/snapshot.cc
@@ -18,9 +18,9 @@
  * Method:    getSequenceNumber
  * Signature: (J)J
  */
-jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* env,
-    jobject jobj, jlong jsnapshot_handle) {
-  auto* snapshot = reinterpret_cast<const rocksdb::Snapshot*>(
-      jsnapshot_handle);
+jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jsnapshot_handle) {
+  auto* snapshot = reinterpret_cast<const rocksdb::Snapshot*>(jsnapshot_handle);
   return snapshot->GetSequenceNumber();
 }
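The new sst_file_manager.cc bridge that follows wraps rocksdb::SstFileManager behind a std::shared_ptr handle; on the Java side it is typically used to cap total SST disk usage. A sketch, assuming the RocksJava SstFileManager wrapper (path and limit are illustrative):

    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) {
      sstFileManager.setMaxAllowedSpaceUsage(10L * 1024 * 1024 * 1024);  // 10 GiB
      try (final Options options = new Options()
               .setCreateIfMissing(true)
               .setSstFileManager(sstFileManager);
           final RocksDB db = RocksDB.open(options, "/tmp/example-db")) {
        // writes that would exceed the cap fail with an IO error status
      }
    }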
a/thirdparty/rocksdb/java/rocksjni/sst_file_manager.cc b/thirdparty/rocksdb/java/rocksjni/sst_file_manager.cc new file mode 100644 index 0000000000..3df3c9966c --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/sst_file_manager.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ and enables +// calling C++ rocksdb::SstFileManager methods +// from Java side. + +#include +#include + +#include "include/org_rocksdb_SstFileManager.h" +#include "rocksdb/sst_file_manager.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_SstFileManager + * Method: newSstFileManager + * Signature: (JJJDJ)J + */ +jlong Java_org_rocksdb_SstFileManager_newSstFileManager( + JNIEnv* jnienv, jclass /*jcls*/, jlong jenv_handle, jlong jlogger_handle, + jlong jrate_bytes, jdouble jmax_trash_db_ratio, + jlong jmax_delete_chunk_bytes) { + auto* env = reinterpret_cast(jenv_handle); + rocksdb::Status s; + rocksdb::SstFileManager* sst_file_manager = nullptr; + + if (jlogger_handle != 0) { + auto* sptr_logger = + reinterpret_cast*>(jlogger_handle); + sst_file_manager = rocksdb::NewSstFileManager( + env, *sptr_logger, "", jrate_bytes, true, &s, jmax_trash_db_ratio, + jmax_delete_chunk_bytes); + } else { + sst_file_manager = rocksdb::NewSstFileManager(env, nullptr, "", jrate_bytes, + true, &s, jmax_trash_db_ratio, + jmax_delete_chunk_bytes); + } + + if (!s.ok()) { + if (sst_file_manager != nullptr) { + delete sst_file_manager; + } + rocksdb::RocksDBExceptionJni::ThrowNew(jnienv, s); + } + auto* sptr_sst_file_manager = + new std::shared_ptr(sst_file_manager); + + return reinterpret_cast(sptr_sst_file_manager); +} + +/* + * Class: org_rocksdb_SstFileManager + * Method: setMaxAllowedSpaceUsage + * Signature: (JJ)V + */ +void Java_org_rocksdb_SstFileManager_setMaxAllowedSpaceUsage( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jmax_allowed_space) { + auto* sptr_sst_file_manager = + reinterpret_cast*>(jhandle); + sptr_sst_file_manager->get()->SetMaxAllowedSpaceUsage(jmax_allowed_space); +} + +/* + * Class: org_rocksdb_SstFileManager + * Method: setCompactionBufferSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_SstFileManager_setCompactionBufferSize( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jcompaction_buffer_size) { + auto* sptr_sst_file_manager = + reinterpret_cast*>(jhandle); + sptr_sst_file_manager->get()->SetCompactionBufferSize( + jcompaction_buffer_size); +} + +/* + * Class: org_rocksdb_SstFileManager + * Method: isMaxAllowedSpaceReached + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReached( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* sptr_sst_file_manager = + reinterpret_cast*>(jhandle); + return sptr_sst_file_manager->get()->IsMaxAllowedSpaceReached(); +} + +/* + * Class: org_rocksdb_SstFileManager + * Method: isMaxAllowedSpaceReachedIncludingCompactions + * Signature: (J)Z + */ +jboolean +Java_org_rocksdb_SstFileManager_isMaxAllowedSpaceReachedIncludingCompactions( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* sptr_sst_file_manager = + reinterpret_cast*>(jhandle); + return sptr_sst_file_manager->get() + ->IsMaxAllowedSpaceReachedIncludingCompactions(); +} + +/* + * Class: org_rocksdb_SstFileManager + * Method: 
getTotalSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileManager_getTotalSize(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  return sptr_sst_file_manager->get()->GetTotalSize();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getTrackedFiles
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_SstFileManager_getTrackedFiles(JNIEnv* env,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  auto tracked_files = sptr_sst_file_manager->get()->GetTrackedFiles();
+
+  //TODO(AR) could refactor to share code with rocksdb::HashMapJni::fromCppMap(env, tracked_files);
+
+  const jobject jtracked_files = rocksdb::HashMapJni::construct(
+      env, static_cast<uint32_t>(tracked_files.size()));
+  if (jtracked_files == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const rocksdb::HashMapJni::FnMapKV<const std::string, const uint64_t>
+      fn_map_kv =
+          [env](const std::pair<const std::string, const uint64_t>& pair) {
+            const jstring jtracked_file_path =
+                env->NewStringUTF(pair.first.c_str());
+            if (jtracked_file_path == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            const jobject jtracked_file_size =
+                rocksdb::LongJni::valueOf(env, pair.second);
+            if (jtracked_file_size == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(jtracked_file_path,
+                                                jtracked_file_size));
+          };
+
+  if (!rocksdb::HashMapJni::putAll(env, jtracked_files, tracked_files.begin(),
+                                   tracked_files.end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jtracked_files;
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getDeleteRateBytesPerSecond
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_SstFileManager_getDeleteRateBytesPerSecond(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  return sptr_sst_file_manager->get()->GetDeleteRateBytesPerSecond();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setDeleteRateBytesPerSecond
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_SstFileManager_setDeleteRateBytesPerSecond(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jdelete_rate) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  sptr_sst_file_manager->get()->SetDeleteRateBytesPerSecond(jdelete_rate);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    getMaxTrashDBRatio
+ * Signature: (J)D
+ */
+jdouble Java_org_rocksdb_SstFileManager_getMaxTrashDBRatio(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  return sptr_sst_file_manager->get()->GetMaxTrashDBRatio();
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    setMaxTrashDBRatio
+ * Signature: (JD)V
+ */
+void Java_org_rocksdb_SstFileManager_setMaxTrashDBRatio(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jdouble jratio) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  sptr_sst_file_manager->get()->SetMaxTrashDBRatio(jratio);
+}
+
+/*
+ * Class:     org_rocksdb_SstFileManager
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_SstFileManager_disposeInternal(JNIEnv* /*env*/,
+                                                     jobject /*jobj*/,
+                                                     jlong jhandle) {
+  auto* sptr_sst_file_manager =
+      reinterpret_cast<std::shared_ptr<rocksdb::SstFileManager>*>(jhandle);
+  delete sptr_sst_file_manager;
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/sst_file_writerjni.cc b/thirdparty/rocksdb/java/rocksjni/sst_file_writerjni.cc
index
ceb93384ac..76212ed899 100644 --- a/thirdparty/rocksdb/java/rocksjni/sst_file_writerjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/sst_file_writerjni.cc @@ -20,16 +20,33 @@ /* * Class: org_rocksdb_SstFileWriter * Method: newSstFileWriter - * Signature: (JJJ)J + * Signature: (JJJB)J */ -jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJ(JNIEnv *env, jclass jcls, - jlong jenvoptions, - jlong joptions, - jlong jcomparator) { +jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJB( + JNIEnv * /*env*/, jclass /*jcls*/, jlong jenvoptions, jlong joptions, + jlong jcomparator_handle, jbyte jcomparator_type) { + rocksdb::Comparator *comparator = nullptr; + switch (jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + comparator = reinterpret_cast( + jcomparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + comparator = reinterpret_cast( + jcomparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER + case 0x2: + comparator = reinterpret_cast(jcomparator_handle); + break; + } auto *env_options = reinterpret_cast(jenvoptions); auto *options = reinterpret_cast(joptions); - auto *comparator = reinterpret_cast(jcomparator); rocksdb::SstFileWriter *sst_file_writer = new rocksdb::SstFileWriter(*env_options, *options, comparator); return reinterpret_cast(sst_file_writer); @@ -40,9 +57,10 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJJ(JNIEnv *env, jclass j * Method: newSstFileWriter * Signature: (JJ)J */ -jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv *env, jclass jcls, - jlong jenvoptions, - jlong joptions) { +jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv * /*env*/, + jclass /*jcls*/, + jlong jenvoptions, + jlong joptions) { auto *env_options = reinterpret_cast(jenvoptions); auto *options = reinterpret_cast(joptions); @@ -56,10 +74,10 @@ jlong Java_org_rocksdb_SstFileWriter_newSstFileWriter__JJ(JNIEnv *env, jclass jc * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject jobj, +void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject /*jobj*/, jlong jhandle, jstring jfile_path) { const char *file_path = env->GetStringUTFChars(jfile_path, nullptr); - if(file_path == nullptr) { + if (file_path == nullptr) { // exception thrown: OutOfMemoryError return; } @@ -77,14 +95,13 @@ void Java_org_rocksdb_SstFileWriter_open(JNIEnv *env, jobject jobj, * Method: put * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject jobj, +void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject /*jobj*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); auto *value_slice = reinterpret_cast(jvalue_handle); - rocksdb::Status s = - reinterpret_cast(jhandle)->Put(*key_slice, - *value_slice); + rocksdb::Status s = reinterpret_cast(jhandle)->Put( + *key_slice, *value_slice); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } @@ -95,29 +112,28 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject jobj, * Method: put * Signature: (JJJ)V */ - void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject jobj, - jlong jhandle, jbyteArray jkey, - jbyteArray jval) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(key == nullptr) { +void Java_org_rocksdb_SstFileWriter_put__J_3B_3B(JNIEnv *env, jobject /*jobj*/, + jlong jhandle, jbyteArray jkey, + jbyteArray jval) { + jbyte *key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { 
// exception thrown: OutOfMemoryError return; } - rocksdb::Slice key_slice( - reinterpret_cast(key), env->GetArrayLength(jkey)); + rocksdb::Slice key_slice(reinterpret_cast(key), + env->GetArrayLength(jkey)); - jbyte* value = env->GetByteArrayElements(jval, nullptr); - if(value == nullptr) { + jbyte *value = env->GetByteArrayElements(jval, nullptr); + if (value == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); return; } - rocksdb::Slice value_slice( - reinterpret_cast(value), env->GetArrayLength(jval)); + rocksdb::Slice value_slice(reinterpret_cast(value), + env->GetArrayLength(jval)); - rocksdb::Status s = - reinterpret_cast(jhandle)->Put(key_slice, - value_slice); + rocksdb::Status s = reinterpret_cast(jhandle)->Put( + key_slice, value_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); env->ReleaseByteArrayElements(jval, value, JNI_ABORT); @@ -132,14 +148,14 @@ void Java_org_rocksdb_SstFileWriter_put__JJJ(JNIEnv *env, jobject jobj, * Method: merge * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject jobj, +void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject /*jobj*/, jlong jhandle, jlong jkey_handle, jlong jvalue_handle) { auto *key_slice = reinterpret_cast(jkey_handle); auto *value_slice = reinterpret_cast(jvalue_handle); rocksdb::Status s = - reinterpret_cast(jhandle)->Merge(*key_slice, - *value_slice); + reinterpret_cast(jhandle)->Merge(*key_slice, + *value_slice); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } @@ -150,30 +166,31 @@ void Java_org_rocksdb_SstFileWriter_merge__JJJ(JNIEnv *env, jobject jobj, * Method: merge * Signature: (J[B[B)V */ -void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jobject jobj, - jlong jhandle, jbyteArray jkey, +void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, + jobject /*jobj*/, + jlong jhandle, + jbyteArray jkey, jbyteArray jval) { - - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(key == nullptr) { + jbyte *key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { // exception thrown: OutOfMemoryError return; } - rocksdb::Slice key_slice( - reinterpret_cast(key), env->GetArrayLength(jkey)); + rocksdb::Slice key_slice(reinterpret_cast(key), + env->GetArrayLength(jkey)); - jbyte* value = env->GetByteArrayElements(jval, nullptr); - if(value == nullptr) { + jbyte *value = env->GetByteArrayElements(jval, nullptr); + if (value == nullptr) { // exception thrown: OutOfMemoryError env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); return; } - rocksdb::Slice value_slice( - reinterpret_cast(value), env->GetArrayLength(jval)); + rocksdb::Slice value_slice(reinterpret_cast(value), + env->GetArrayLength(jval)); rocksdb::Status s = - reinterpret_cast(jhandle)->Merge(key_slice, - value_slice); + reinterpret_cast(jhandle)->Merge(key_slice, + value_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); env->ReleaseByteArrayElements(jval, value, JNI_ABORT); @@ -188,18 +205,19 @@ void Java_org_rocksdb_SstFileWriter_merge__J_3B_3B(JNIEnv *env, jobject jobj, * Method: delete * Signature: (JJJ)V */ -void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject jobj, - jlong jhandle, jbyteArray jkey) { - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - if(key == nullptr) { +void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject /*jobj*/, + jlong jhandle, + jbyteArray jkey) { + jbyte *key = env->GetByteArrayElements(jkey, nullptr); + if (key == 
nullptr) { // exception thrown: OutOfMemoryError return; } - rocksdb::Slice key_slice( - reinterpret_cast(key), env->GetArrayLength(jkey)); + rocksdb::Slice key_slice(reinterpret_cast(key), + env->GetArrayLength(jkey)); rocksdb::Status s = - reinterpret_cast(jhandle)->Delete(key_slice); + reinterpret_cast(jhandle)->Delete(key_slice); env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); @@ -213,11 +231,12 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject jobj, * Method: delete * Signature: (JJJ)V */ - void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject jobj, - jlong jhandle, jlong jkey_handle) { +void Java_org_rocksdb_SstFileWriter_delete__JJ(JNIEnv *env, jobject /*jobj*/, + jlong jhandle, + jlong jkey_handle) { auto *key_slice = reinterpret_cast(jkey_handle); rocksdb::Status s = - reinterpret_cast(jhandle)->Delete(*key_slice); + reinterpret_cast(jhandle)->Delete(*key_slice); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } @@ -228,7 +247,7 @@ void Java_org_rocksdb_SstFileWriter_delete__J_3B(JNIEnv *env, jobject jobj, * Method: finish * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject jobj, +void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject /*jobj*/, jlong jhandle) { rocksdb::Status s = reinterpret_cast(jhandle)->Finish(); @@ -242,7 +261,8 @@ void Java_org_rocksdb_SstFileWriter_finish(JNIEnv *env, jobject jobj, * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_SstFileWriter_disposeInternal(JNIEnv *env, jobject jobj, +void Java_org_rocksdb_SstFileWriter_disposeInternal(JNIEnv * /*env*/, + jobject /*jobj*/, jlong jhandle) { delete reinterpret_cast(jhandle); } diff --git a/thirdparty/rocksdb/java/rocksjni/statistics.cc b/thirdparty/rocksdb/java/rocksjni/statistics.cc index 7b657ada7b..ae7ad53529 100644 --- a/thirdparty/rocksdb/java/rocksjni/statistics.cc +++ b/thirdparty/rocksdb/java/rocksjni/statistics.cc @@ -11,16 +11,17 @@ #include #include "include/org_rocksdb_Statistics.h" +#include "rocksdb/statistics.h" #include "rocksjni/portal.h" #include "rocksjni/statisticsjni.h" -#include "rocksdb/statistics.h" /* * Class: org_rocksdb_Statistics * Method: newStatistics * Signature: ()J */ -jlong Java_org_rocksdb_Statistics_newStatistics__(JNIEnv* env, jclass jcls) { +jlong Java_org_rocksdb_Statistics_newStatistics__( + JNIEnv* env, jclass jcls) { return Java_org_rocksdb_Statistics_newStatistics___3BJ( env, jcls, nullptr, 0); } @@ -53,9 +54,7 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3B( * Signature: ([BJ)J */ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( - JNIEnv* env, jclass jcls, jbyteArray jhistograms, - jlong jother_statistics_handle) { - + JNIEnv* env, jclass, jbyteArray jhistograms, jlong jother_statistics_handle) { std::shared_ptr* pSptr_other_statistics = nullptr; if (jother_statistics_handle > 0) { pSptr_other_statistics = @@ -68,7 +67,7 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( const jsize len = env->GetArrayLength(jhistograms); if (len > 0) { jbyte* jhistogram = env->GetByteArrayElements(jhistograms, nullptr); - if (jhistogram == nullptr ) { + if (jhistogram == nullptr) { // exception thrown: OutOfMemoryError return 0; } @@ -85,7 +84,7 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( std::shared_ptr sptr_other_statistics = nullptr; if (pSptr_other_statistics != nullptr) { - sptr_other_statistics = *pSptr_other_statistics; + sptr_other_statistics = *pSptr_other_statistics; } auto* pSptr_statistics = new std::shared_ptr( @@ 
-100,8 +99,8 @@ jlong Java_org_rocksdb_Statistics_newStatistics___3BJ( * Signature: (J)V */ void Java_org_rocksdb_Statistics_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { - if(jhandle > 0) { + JNIEnv*, jobject, jlong jhandle) { + if (jhandle > 0) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); delete pSptr_statistics; @@ -114,11 +113,12 @@ void Java_org_rocksdb_Statistics_disposeInternal( * Signature: (J)B */ jbyte Java_org_rocksdb_Statistics_statsLevel( - JNIEnv* env, jobject jobj, jlong jhandle) { + JNIEnv*, jobject, jlong jhandle) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); - return rocksdb::StatsLevelJni::toJavaStatsLevel(pSptr_statistics->get()->stats_level_); + return rocksdb::StatsLevelJni::toJavaStatsLevel( + pSptr_statistics->get()->get_stats_level()); } /* @@ -127,12 +127,12 @@ jbyte Java_org_rocksdb_Statistics_statsLevel( * Signature: (JB)V */ void Java_org_rocksdb_Statistics_setStatsLevel( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jstats_level) { + JNIEnv*, jobject, jlong jhandle, jbyte jstats_level) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); auto stats_level = rocksdb::StatsLevelJni::toCppStatsLevel(jstats_level); - pSptr_statistics->get()->stats_level_ = stats_level; + pSptr_statistics->get()->set_stats_level(stats_level); } /* @@ -141,12 +141,13 @@ void Java_org_rocksdb_Statistics_setStatsLevel( * Signature: (JB)J */ jlong Java_org_rocksdb_Statistics_getTickerCount( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jticker_type) { + JNIEnv*, jobject, jlong jhandle, jbyte jticker_type) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); auto ticker = rocksdb::TickerTypeJni::toCppTickers(jticker_type); - return pSptr_statistics->get()->getTickerCount(ticker); + uint64_t count = pSptr_statistics->get()->getTickerCount(ticker); + return static_cast(count); } /* @@ -155,7 +156,7 @@ jlong Java_org_rocksdb_Statistics_getTickerCount( * Signature: (JB)J */ jlong Java_org_rocksdb_Statistics_getAndResetTickerCount( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jticker_type) { + JNIEnv*, jobject, jlong jhandle, jbyte jticker_type) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); @@ -169,33 +170,35 @@ jlong Java_org_rocksdb_Statistics_getAndResetTickerCount( * Signature: (JB)Lorg/rocksdb/HistogramData; */ jobject Java_org_rocksdb_Statistics_getHistogramData( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jhistogram_type) { + JNIEnv* env, jobject, jlong jhandle, jbyte jhistogram_type) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); - rocksdb::HistogramData data; // TODO(AR) perhaps better to construct a Java Object Wrapper that uses ptr to C++ `new HistogramData` + // TODO(AR) perhaps better to construct a Java Object Wrapper that + // uses ptr to C++ `new HistogramData` + rocksdb::HistogramData data; + auto histogram = rocksdb::HistogramTypeJni::toCppHistograms(jhistogram_type); pSptr_statistics->get()->histogramData( static_cast(histogram), &data); jclass jclazz = rocksdb::HistogramDataJni::getJClass(env); - if(jclazz == nullptr) { + if (jclazz == nullptr) { // exception occurred accessing class return nullptr; } - jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId( - env); - if(mid == nullptr) { + jmethodID mid = rocksdb::HistogramDataJni::getConstructorMethodId(env); + if (mid == nullptr) { // exception 
occurred accessing method return nullptr; } - return env->NewObject( - jclazz, - mid, data.median, data.percentile95,data.percentile99, data.average, - data.standard_deviation); + return env->NewObject(jclazz, mid, data.median, data.percentile95, + data.percentile99, data.average, + data.standard_deviation, data.max, data.count, + data.sum, data.min); } /* @@ -204,7 +207,7 @@ jobject Java_org_rocksdb_Statistics_getHistogramData( * Signature: (JB)Ljava/lang/String; */ jstring Java_org_rocksdb_Statistics_getHistogramString( - JNIEnv* env, jobject jobj, jlong jhandle, jbyte jhistogram_type) { + JNIEnv* env, jobject, jlong jhandle, jbyte jhistogram_type) { auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); @@ -219,13 +222,13 @@ jstring Java_org_rocksdb_Statistics_getHistogramString( * Signature: (J)V */ void Java_org_rocksdb_Statistics_reset( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto* pSptr_statistics = + JNIEnv* env, jobject, jlong jhandle) { + auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); rocksdb::Status s = pSptr_statistics->get()->Reset(); if (!s.ok()) { - rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -235,8 +238,8 @@ void Java_org_rocksdb_Statistics_reset( * Signature: (J)Ljava/lang/String; */ jstring Java_org_rocksdb_Statistics_toString( - JNIEnv* env, jobject jobj, jlong jhandle) { - auto* pSptr_statistics = + JNIEnv* env, jobject, jlong jhandle) { + auto* pSptr_statistics = reinterpret_cast*>(jhandle); assert(pSptr_statistics != nullptr); auto str = pSptr_statistics->get()->ToString(); diff --git a/thirdparty/rocksdb/java/rocksjni/statisticsjni.cc b/thirdparty/rocksdb/java/rocksjni/statisticsjni.cc index 584ab5aa61..f59ace4dfc 100644 --- a/thirdparty/rocksdb/java/rocksjni/statisticsjni.cc +++ b/thirdparty/rocksdb/java/rocksjni/statisticsjni.cc @@ -10,24 +10,23 @@ namespace rocksdb { - StatisticsJni::StatisticsJni(std::shared_ptr stats) - : StatisticsImpl(stats, false), m_ignore_histograms() { - } +StatisticsJni::StatisticsJni(std::shared_ptr stats) + : StatisticsImpl(stats), m_ignore_histograms() {} - StatisticsJni::StatisticsJni(std::shared_ptr stats, - const std::set ignore_histograms) : StatisticsImpl(stats, false), - m_ignore_histograms(ignore_histograms) { - } +StatisticsJni::StatisticsJni(std::shared_ptr stats, + const std::set ignore_histograms) + : StatisticsImpl(stats), m_ignore_histograms(ignore_histograms) {} - bool StatisticsJni::HistEnabledForType(uint32_t type) const { - if (type >= HISTOGRAM_ENUM_MAX) { - return false; - } - - if (m_ignore_histograms.count(type) > 0) { - return false; - } +bool StatisticsJni::HistEnabledForType(uint32_t type) const { + if (type >= HISTOGRAM_ENUM_MAX) { + return false; + } - return true; + if (m_ignore_histograms.count(type) > 0) { + return false; } + + return true; +} +// @lint-ignore TXT4 T25377293 Grandfathered in }; \ No newline at end of file diff --git a/thirdparty/rocksdb/java/rocksjni/statisticsjni.h b/thirdparty/rocksdb/java/rocksjni/statisticsjni.h index 600d9a6763..56186789a9 100644 --- a/thirdparty/rocksdb/java/rocksjni/statisticsjni.h +++ b/thirdparty/rocksdb/java/rocksjni/statisticsjni.h @@ -30,4 +30,5 @@ namespace rocksdb { } // namespace rocksdb +// @lint-ignore TXT4 T25377293 Grandfathered in #endif // JAVA_ROCKSJNI_STATISTICSJNI_H_ \ No newline at end of file diff --git a/thirdparty/rocksdb/java/rocksjni/table.cc b/thirdparty/rocksdb/java/rocksjni/table.cc index 
5f0a4735fe..1ccc550ab6 100644 --- a/thirdparty/rocksdb/java/rocksjni/table.cc +++ b/thirdparty/rocksdb/java/rocksjni/table.cc @@ -5,10 +5,11 @@ // // This file implements the "bridge" between Java and C++ for rocksdb::Options. +#include "rocksdb/table.h" #include -#include "include/org_rocksdb_PlainTableConfig.h" #include "include/org_rocksdb_BlockBasedTableConfig.h" -#include "rocksdb/table.h" +#include "include/org_rocksdb_PlainTableConfig.h" +#include "portal.h" #include "rocksdb/cache.h" #include "rocksdb/filter_policy.h" @@ -18,18 +19,17 @@ * Signature: (IIDIIBZZ)J */ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( - JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key, - jdouble jhash_table_ratio, jint jindex_sparseness, - jint jhuge_page_tlb_size, jbyte jencoding_type, - jboolean jfull_scan_mode, jboolean jstore_index_in_file) { + JNIEnv * /*env*/, jobject /*jobj*/, jint jkey_size, + jint jbloom_bits_per_key, jdouble jhash_table_ratio, jint jindex_sparseness, + jint jhuge_page_tlb_size, jbyte jencoding_type, jboolean jfull_scan_mode, + jboolean jstore_index_in_file) { rocksdb::PlainTableOptions options = rocksdb::PlainTableOptions(); options.user_key_len = jkey_size; options.bloom_bits_per_key = jbloom_bits_per_key; options.hash_table_ratio = jhash_table_ratio; options.index_sparseness = jindex_sparseness; options.huge_page_tlb_size = jhuge_page_tlb_size; - options.encoding_type = static_cast( - jencoding_type); + options.encoding_type = static_cast(jencoding_type); options.full_scan_mode = jfull_scan_mode; options.store_index_in_file = jstore_index_in_file; return reinterpret_cast(rocksdb::NewPlainTableFactory(options)); @@ -38,55 +38,102 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZZJIBBI)J + * Signature: (ZZZZBBDBZJJJJIIIJZZJZZIIZZJIJI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( - JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, - jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, - jint block_restart_interval, jboolean whole_key_filtering, - jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, - jboolean pin_l0_filter_and_index_blocks_in_cache, - jboolean hash_index_allow_collision, jlong block_cache_compressed_size, - jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, - jbyte jindex_type, jint jformat_version) { + JNIEnv*, jobject, jboolean jcache_index_and_filter_blocks, + jboolean jcache_index_and_filter_blocks_with_high_priority, + jboolean jpin_l0_filter_and_index_blocks_in_cache, + jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value, + jbyte jdata_block_index_type_value, + jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value, + jboolean jno_block_cache, jlong jblock_cache_handle, + jlong jpersistent_cache_handle, + jlong jblock_cache_compressed_handle, jlong jblock_size, + jint jblock_size_deviation, jint jblock_restart_interval, + jint jindex_block_restart_interval, jlong jmetadata_block_size, + jboolean jpartition_filters, jboolean juse_delta_encoding, + jlong jfilter_policy_handle, jboolean jwhole_key_filtering, + jboolean jverify_compression, jint jread_amp_bytes_per_bit, + jint jformat_version, jboolean jenable_index_compression, + jboolean jblock_align, jlong jblock_cache_size, + jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size, + jint 
jblock_cache_compressed_num_shard_bits) { rocksdb::BlockBasedTableOptions options; - options.no_block_cache = no_block_cache; - - if (!no_block_cache && block_cache_size > 0) { - if (block_cache_num_shardbits > 0) { - options.block_cache = - rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); + options.cache_index_and_filter_blocks = + static_cast(jcache_index_and_filter_blocks); + options.cache_index_and_filter_blocks_with_high_priority = + static_cast(jcache_index_and_filter_blocks_with_high_priority); + options.pin_l0_filter_and_index_blocks_in_cache = + static_cast(jpin_l0_filter_and_index_blocks_in_cache); + options.pin_top_level_index_and_filter = + static_cast(jpin_top_level_index_and_filter); + options.index_type = + rocksdb::IndexTypeJni::toCppIndexType(jindex_type_value); + options.data_block_index_type = + rocksdb::DataBlockIndexTypeJni::toCppDataBlockIndexType( + jdata_block_index_type_value); + options.data_block_hash_table_util_ratio = + static_cast(jdata_block_hash_table_util_ratio); + options.checksum = + rocksdb::ChecksumTypeJni::toCppChecksumType(jchecksum_type_value); + options.no_block_cache = static_cast(jno_block_cache); + if (options.no_block_cache) { + options.block_cache = nullptr; + } else { + if (jblock_cache_handle > 0) { + std::shared_ptr *pCache = + reinterpret_cast *>(jblock_cache_handle); + options.block_cache = *pCache; + } else if (jblock_cache_size > 0) { + if (jblock_cache_num_shard_bits > 0) { + options.block_cache = rocksdb::NewLRUCache( + static_cast(jblock_cache_size), + static_cast(jblock_cache_num_shard_bits)); + } else { + options.block_cache = rocksdb::NewLRUCache( + static_cast(jblock_cache_size)); + } + } + } + if (jpersistent_cache_handle > 0) { + std::shared_ptr *pCache = + reinterpret_cast *>(jpersistent_cache_handle); + options.persistent_cache = *pCache; + } + if (jblock_cache_compressed_handle > 0) { + std::shared_ptr *pCache = + reinterpret_cast *>(jblock_cache_compressed_handle); + options.block_cache_compressed = *pCache; + } else if (jblock_cache_compressed_size > 0) { + if (jblock_cache_compressed_num_shard_bits > 0) { + options.block_cache_compressed = rocksdb::NewLRUCache( + static_cast(jblock_cache_compressed_size), + static_cast(jblock_cache_compressed_num_shard_bits)); } else { - options.block_cache = rocksdb::NewLRUCache(block_cache_size); + options.block_cache_compressed = rocksdb::NewLRUCache( + static_cast(jblock_cache_compressed_size)); } } - options.block_size = block_size; - options.block_size_deviation = block_size_deviation; - options.block_restart_interval = block_restart_interval; - options.whole_key_filtering = whole_key_filtering; - if (jfilterPolicy > 0) { + options.block_size = static_cast(jblock_size); + options.block_size_deviation = static_cast(jblock_size_deviation); + options.block_restart_interval = static_cast(jblock_restart_interval); + options.index_block_restart_interval = static_cast(jindex_block_restart_interval); + options.metadata_block_size = static_cast(jmetadata_block_size); + options.partition_filters = static_cast(jpartition_filters); + options.use_delta_encoding = static_cast(juse_delta_encoding); + if (jfilter_policy_handle > 0) { std::shared_ptr *pFilterPolicy = reinterpret_cast *>( - jfilterPolicy); + jfilter_policy_handle); options.filter_policy = *pFilterPolicy; } - options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; - options.pin_l0_filter_and_index_blocks_in_cache = - pin_l0_filter_and_index_blocks_in_cache; - options.hash_index_allow_collision = 
hash_index_allow_collision; - if (block_cache_compressed_size > 0) { - if (block_cache_compressd_num_shard_bits > 0) { - options.block_cache = - rocksdb::NewLRUCache(block_cache_compressed_size, - block_cache_compressd_num_shard_bits); - } else { - options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); - } - } - options.checksum = static_cast(jchecksum_type); - options.index_type = static_cast< - rocksdb::BlockBasedTableOptions::IndexType>(jindex_type); - options.format_version = jformat_version; + options.whole_key_filtering = static_cast(jwhole_key_filtering); + options.verify_compression = static_cast(jverify_compression); + options.read_amp_bytes_per_bit = static_cast(jread_amp_bytes_per_bit); + options.format_version = static_cast(jformat_version); + options.enable_index_compression = static_cast(jenable_index_compression); + options.block_align = static_cast(jblock_align); return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } diff --git a/thirdparty/rocksdb/java/rocksjni/table_filter.cc b/thirdparty/rocksdb/java/rocksjni/table_filter.cc new file mode 100644 index 0000000000..e5b3556213 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/table_filter.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// org.rocksdb.AbstractTableFilter. + +#include +#include + +#include "include/org_rocksdb_AbstractTableFilter.h" +#include "rocksjni/table_filter_jnicallback.h" + +/* + * Class: org_rocksdb_AbstractTableFilter + * Method: createNewTableFilter + * Signature: ()J + */ +jlong Java_org_rocksdb_AbstractTableFilter_createNewTableFilter( + JNIEnv* env, jobject jtable_filter) { + auto* table_filter_jnicallback = + new rocksdb::TableFilterJniCallback(env, jtable_filter); + return reinterpret_cast(table_filter_jnicallback); +} \ No newline at end of file diff --git a/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.cc new file mode 100644 index 0000000000..680c014459 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::TableFilter. 
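The std::function built by the callback class below is ultimately assigned to rocksdb::ReadOptions::table_filter, which RocksDB consults once per SST file during reads; returning false makes the scan skip that file. A minimal pure-C++ sketch of the same hook, with an illustrative predicate (the helper name and threshold are assumptions, not part of this patch):

#include <cstdint>
#include "rocksdb/options.h"
#include "rocksdb/table_properties.h"

// Sketch: build ReadOptions whose iterators skip SST files holding fewer
// than min_entries keys. Returning true keeps a table in the scan.
rocksdb::ReadOptions MakeFilteredReadOptions(uint64_t min_entries) {
  rocksdb::ReadOptions read_options;
  read_options.table_filter =
      [min_entries](const rocksdb::TableProperties& props) {
        return props.num_entries >= min_entries;
      };
  return read_options;
}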
+
+#include "rocksjni/table_filter_jnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+TableFilterJniCallback::TableFilterJniCallback(
+    JNIEnv* env, jobject jtable_filter)
+    : JniCallback(env, jtable_filter) {
+  m_jfilter_methodid =
+      AbstractTableFilterJni::getFilterMethod(env);
+  if(m_jfilter_methodid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  // create the function reference
+  /*
+     Note the JNI ENV must be obtained/released
+     on each call to the function itself as
+     it may be called from multiple threads
+  */
+  m_table_filter_function = [this](const rocksdb::TableProperties& table_properties) {
+    jboolean attached_thread = JNI_FALSE;
+    JNIEnv* thread_env = getJniEnv(&attached_thread);
+    assert(thread_env != nullptr);
+
+    // create a Java TableProperties object
+    jobject jtable_properties = TablePropertiesJni::fromCppTableProperties(thread_env, table_properties);
+    if (jtable_properties == nullptr) {
+      // exception thrown from fromCppTableProperties
+      thread_env->ExceptionDescribe();  // print out exception to stderr
+      releaseJniEnv(attached_thread);
+      return false;
+    }
+
+    jboolean result = thread_env->CallBooleanMethod(m_jcallback_obj, m_jfilter_methodid, jtable_properties);
+    if (thread_env->ExceptionCheck()) {
+      // exception thrown from CallBooleanMethod
+      thread_env->DeleteLocalRef(jtable_properties);
+      thread_env->ExceptionDescribe();  // print out exception to stderr
+      releaseJniEnv(attached_thread);
+      return false;
+    }
+
+    // ok... cleanup and then return
+    releaseJniEnv(attached_thread);
+    return static_cast<bool>(result);
+  };
+}
+
+std::function<bool(const rocksdb::TableProperties&)> TableFilterJniCallback::GetTableFilterFunction() {
+  return m_table_filter_function;
+}
+
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.h b/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.h
new file mode 100644
index 0000000000..39a0c90e0e
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/table_filter_jnicallback.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::TableFilter.
+
+#ifndef JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+#include <functional>
+#include <memory>
+
+#include "rocksdb/table_properties.h"
+#include "rocksjni/jnicallback.h"
+
+namespace rocksdb {
+
+class TableFilterJniCallback : public JniCallback {
+ public:
+    TableFilterJniCallback(
+        JNIEnv* env, jobject jtable_filter);
+    std::function<bool(const rocksdb::TableProperties&)> GetTableFilterFunction();
+
+ private:
+    jmethodID m_jfilter_methodid;
+    std::function<bool(const rocksdb::TableProperties&)> m_table_filter_function;
+};
+
+}  //namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_TABLE_FILTER_JNICALLBACK_H_
diff --git a/thirdparty/rocksdb/java/rocksjni/thread_status.cc b/thirdparty/rocksdb/java/rocksjni/thread_status.cc
new file mode 100644
index 0000000000..f70d515a5b
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/thread_status.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
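The getJniEnv()/releaseJniEnv() calls used by the lambda above come from the JniCallback base class; the underlying discipline is plain JNI: look up the JNIEnv for the calling native thread, attach the thread if the JVM does not know it yet, and detach only when this call performed the attach. A self-contained sketch of that pattern against the raw invocation API, assuming the JavaVM* was cached in JNI_OnLoad (the function names are illustrative, not the RocksDB helpers):

#include <jni.h>

// Sketch: fetch a JNIEnv for the current native thread, attaching on
// demand. '*attached' tells the caller whether it owes a detach.
JNIEnv* GetEnvForThisThread(JavaVM* vm, bool* attached) {
  JNIEnv* env = nullptr;
  *attached = false;
  const jint rc = vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6);
  if (rc == JNI_EDETACHED) {
    if (vm->AttachCurrentThread(reinterpret_cast<void**>(&env), nullptr) !=
        JNI_OK) {
      return nullptr;  // attach failed
    }
    *attached = true;  // this call attached; caller must detach later
  } else if (rc != JNI_OK) {
    return nullptr;  // e.g. JNI_EVERSION
  }
  return env;
}

void ReleaseEnvForThisThread(JavaVM* vm, bool attached) {
  if (attached) {
    vm->DetachCurrentThread();  // only detach what this call attached
  }
}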
+// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::ThreadStatus methods from Java side. + +#include + +#include "portal.h" +#include "include/org_rocksdb_ThreadStatus.h" +#include "rocksdb/thread_status.h" + +/* + * Class: org_rocksdb_ThreadStatus + * Method: getThreadTypeName + * Signature: (B)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ThreadStatus_getThreadTypeName( + JNIEnv* env, jclass, jbyte jthread_type_value) { + auto name = rocksdb::ThreadStatus::GetThreadTypeName( + rocksdb::ThreadTypeJni::toCppThreadType(jthread_type_value)); + return rocksdb::JniUtil::toJavaString(env, &name, true); +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: getOperationName + * Signature: (B)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ThreadStatus_getOperationName( + JNIEnv* env, jclass, jbyte joperation_type_value) { + auto name = rocksdb::ThreadStatus::GetOperationName( + rocksdb::OperationTypeJni::toCppOperationType(joperation_type_value)); + return rocksdb::JniUtil::toJavaString(env, &name, true); +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: microsToStringNative + * Signature: (J)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ThreadStatus_microsToStringNative( + JNIEnv* env, jclass, jlong jmicros) { + auto str = + rocksdb::ThreadStatus::MicrosToString(static_cast(jmicros)); + return rocksdb::JniUtil::toJavaString(env, &str, true); +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: getOperationStageName + * Signature: (B)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ThreadStatus_getOperationStageName( + JNIEnv* env, jclass, jbyte joperation_stage_value) { + auto name = rocksdb::ThreadStatus::GetOperationStageName( + rocksdb::OperationStageJni::toCppOperationStage(joperation_stage_value)); + return rocksdb::JniUtil::toJavaString(env, &name, true); +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: getOperationPropertyName + * Signature: (BI)Ljava/lang/String; + */ +jstring Java_org_rocksdb_ThreadStatus_getOperationPropertyName( + JNIEnv* env, jclass, jbyte joperation_type_value, jint jindex) { + auto name = rocksdb::ThreadStatus::GetOperationPropertyName( + rocksdb::OperationTypeJni::toCppOperationType(joperation_type_value), + static_cast(jindex)); + return rocksdb::JniUtil::toJavaString(env, &name, true); +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: interpretOperationProperties + * Signature: (B[J)Ljava/util/Map; + */ +jobject Java_org_rocksdb_ThreadStatus_interpretOperationProperties( + JNIEnv* env, jclass, jbyte joperation_type_value, + jlongArray joperation_properties) { + + //convert joperation_properties + const jsize len = env->GetArrayLength(joperation_properties); + const std::unique_ptr op_properties(new uint64_t[len]); + jlong* jop = env->GetLongArrayElements(joperation_properties, nullptr); + if (jop == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + for (jsize i = 0; i < len; i++) { + op_properties[i] = static_cast(jop[i]); + } + env->ReleaseLongArrayElements(joperation_properties, jop, JNI_ABORT); + + // call the function + auto result = rocksdb::ThreadStatus::InterpretOperationProperties( + rocksdb::OperationTypeJni::toCppOperationType(joperation_type_value), + op_properties.get()); + jobject jresult = rocksdb::HashMapJni::fromCppMap(env, &result); + if (env->ExceptionCheck()) { + // exception occurred + return nullptr; + } + + return jresult; +} + +/* + * Class: org_rocksdb_ThreadStatus + * Method: getStateName + * Signature: (B)Ljava/lang/String; + 
*/
+jstring Java_org_rocksdb_ThreadStatus_getStateName(
+    JNIEnv* env, jclass, jbyte jstate_type_value) {
+  auto name = rocksdb::ThreadStatus::GetStateName(
+      rocksdb::StateTypeJni::toCppStateType(jstate_type_value));
+  return rocksdb::JniUtil::toJavaString(env, &name, true);
+}
\ No newline at end of file
diff --git a/thirdparty/rocksdb/java/rocksjni/trace_writer.cc b/thirdparty/rocksdb/java/rocksjni/trace_writer.cc
new file mode 100644
index 0000000000..5d47cfcb3a
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/trace_writer.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::TraceWriter.
+
+#include <memory>
+
+#include "include/org_rocksdb_AbstractTraceWriter.h"
+#include "rocksjni/trace_writer_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractTraceWriter
+ * Method:    createNewTraceWriter
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTraceWriter_createNewTraceWriter(
+    JNIEnv* env, jobject jobj) {
+  auto* trace_writer = new rocksdb::TraceWriterJniCallback(env, jobj);
+  return reinterpret_cast<jlong>(trace_writer);
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.cc
new file mode 100644
index 0000000000..d547fb3f87
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::TraceWriter.
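In the Write() and Close() implementations that follow, the Java proxy methods hand back a RocksDB status as a single jshort, with the Status code in the high byte and the sub-code in the low byte, unpacked via (jstatus >> 8) & 0xFF and jstatus & 0xFF. A small standalone sketch of that encoding (the helper names are illustrative, not part of this patch):

#include <cstdint>

// Sketch: pack a status code and sub-code into one 16-bit value, the way
// the Java proxy methods do before crossing JNI, and unpack it again.
inline int16_t PackStatus(uint8_t code, uint8_t sub_code) {
  return static_cast<int16_t>((static_cast<uint16_t>(code) << 8) | sub_code);
}

inline void UnpackStatus(int16_t packed, uint8_t* code, uint8_t* sub_code) {
  *code = static_cast<uint8_t>((packed >> 8) & 0xFF);  // high byte
  *sub_code = static_cast<uint8_t>(packed & 0xFF);     // low byte
}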
+ +#include "rocksjni/trace_writer_jnicallback.h" +#include "rocksjni/portal.h" + +namespace rocksdb { +TraceWriterJniCallback::TraceWriterJniCallback( + JNIEnv* env, jobject jtrace_writer) + : JniCallback(env, jtrace_writer) { + m_jwrite_proxy_methodid = + AbstractTraceWriterJni::getWriteProxyMethodId(env); + if(m_jwrite_proxy_methodid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return; + } + + m_jclose_writer_proxy_methodid = + AbstractTraceWriterJni::getCloseWriterProxyMethodId(env); + if(m_jclose_writer_proxy_methodid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return; + } + + m_jget_file_size_methodid = + AbstractTraceWriterJni::getGetFileSizeMethodId(env); + if(m_jget_file_size_methodid == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return; + } +} + +Status TraceWriterJniCallback::Write(const Slice& data) { + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + if (env == nullptr) { + return Status::IOError("Unable to attach JNI Environment"); + } + + jshort jstatus = env->CallShortMethod(m_jcallback_obj, + m_jwrite_proxy_methodid, + &data); + + if(env->ExceptionCheck()) { + // exception thrown from CallShortMethod + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return Status::IOError("Unable to call AbstractTraceWriter#writeProxy(long)"); + } + + // unpack status code and status sub-code from jstatus + jbyte jcode_value = (jstatus >> 8) & 0xFF; + jbyte jsub_code_value = jstatus & 0xFF; + std::unique_ptr s = StatusJni::toCppStatus(jcode_value, jsub_code_value); + + releaseJniEnv(attached_thread); + + return Status(*s); +} + +Status TraceWriterJniCallback::Close() { + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + if (env == nullptr) { + return Status::IOError("Unable to attach JNI Environment"); + } + + jshort jstatus = env->CallShortMethod(m_jcallback_obj, + m_jclose_writer_proxy_methodid); + + if(env->ExceptionCheck()) { + // exception thrown from CallShortMethod + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return Status::IOError("Unable to call AbstractTraceWriter#closeWriterProxy()"); + } + + // unpack status code and status sub-code from jstatus + jbyte code_value = (jstatus >> 8) & 0xFF; + jbyte sub_code_value = jstatus & 0xFF; + std::unique_ptr s = StatusJni::toCppStatus(code_value, sub_code_value); + + releaseJniEnv(attached_thread); + + return Status(*s); +} + +uint64_t TraceWriterJniCallback::GetFileSize() { + jboolean attached_thread = JNI_FALSE; + JNIEnv* env = getJniEnv(&attached_thread); + if (env == nullptr) { + return 0; + } + + jlong jfile_size = env->CallLongMethod(m_jcallback_obj, + m_jget_file_size_methodid); + + if(env->ExceptionCheck()) { + // exception thrown from CallLongMethod + env->ExceptionDescribe(); // print out exception to stderr + releaseJniEnv(attached_thread); + return 0; + } + + releaseJniEnv(attached_thread); + + return static_cast(jfile_size); +} + +} // namespace rocksdb \ No newline at end of file diff --git a/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.h b/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.h new file mode 100644 index 0000000000..610b6c465f --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/trace_writer_jnicallback.h @@ -0,0 +1,36 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::TraceWriter. + +#ifndef JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_ +#define JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_ + +#include +#include + +#include "rocksdb/trace_reader_writer.h" +#include "rocksjni/jnicallback.h" + +namespace rocksdb { + +class TraceWriterJniCallback : public JniCallback, public TraceWriter { + public: + TraceWriterJniCallback( + JNIEnv* env, jobject jtrace_writer); + virtual Status Write(const Slice& data); + virtual Status Close(); + virtual uint64_t GetFileSize(); + + private: + jmethodID m_jwrite_proxy_methodid; + jmethodID m_jclose_writer_proxy_methodid; + jmethodID m_jget_file_size_methodid; +}; + +} //namespace rocksdb + +#endif // JAVA_ROCKSJNI_TRACE_WRITER_JNICALLBACK_H_ diff --git a/thirdparty/rocksdb/java/rocksjni/transaction.cc b/thirdparty/rocksdb/java/rocksjni/transaction.cc new file mode 100644 index 0000000000..04eb654df7 --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/transaction.cc @@ -0,0 +1,1584 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::Transaction. + +#include +#include + +#include "include/org_rocksdb_Transaction.h" + +#include "rocksdb/utilities/transaction.h" +#include "rocksjni/portal.h" + +using namespace std::placeholders; + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4503) // identifier' : decorated name length + // exceeded, name was truncated +#endif + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshot + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSnapshot(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSnapshot(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshotOnNextOperation + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__J( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSnapshotOnNextOperation(nullptr); +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSnapshotOnNextOperation + * Signature: (JJ)V + */ +void Java_org_rocksdb_Transaction_setSnapshotOnNextOperation__JJ( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, + jlong jtxn_notifier_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* txn_notifier = + reinterpret_cast*>( + jtxn_notifier_handle); + txn->SetSnapshotOnNextOperation(*txn_notifier); +} + +/* + * Class: org_rocksdb_Transaction + * Method: getSnapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_Transaction_getSnapshot(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + const rocksdb::Snapshot* snapshot = txn->GetSnapshot(); + return reinterpret_cast(snapshot); +} + +/* + * Class: org_rocksdb_Transaction + * Method: clearSnapshot + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_clearSnapshot(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->ClearSnapshot(); +} + +/* + * Class: 
org_rocksdb_Transaction + * Method: prepare + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_prepare(JNIEnv* env, jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Prepare(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: commit + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_commit(JNIEnv* env, jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Commit(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: rollback + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_rollback(JNIEnv* env, jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->Rollback(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_Transaction + * Method: setSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_setSavePoint(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + txn->SetSavePoint(); +} + +/* + * Class: org_rocksdb_Transaction + * Method: rollbackToSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_Transaction_rollbackToSavePoint(JNIEnv* env, + jobject /*jobj*/, + jlong jhandle) { + auto* txn = reinterpret_cast(jhandle); + rocksdb::Status s = txn->RollbackToSavePoint(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +typedef std::function + FnGet; + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +jbyteArray txn_get_helper(JNIEnv* env, const FnGet& fn_get, + const jlong& jread_options_handle, + const jbyteArray& jkey, const jint& jkey_part_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + if (key == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_part_len); + + auto* read_options = + reinterpret_cast(jread_options_handle); + std::string value; + rocksdb::Status s = fn_get(*read_options, key_slice, &value); + + // trigger java unref on key. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
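+  // (Aside: JNI_ABORT frees any native copy of the elements without
+  // writing changes back; mode 0 writes back and frees; JNI_COMMIT
+  // writes back but keeps the buffer. The key bytes are only read
+  // here, so JNI_ABORT avoids a pointless copy.)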
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + + if (s.IsNotFound()) { + return nullptr; + } + + if (s.ok()) { + jbyteArray jret_value = env->NewByteArray(static_cast(value.size())); + if (jret_value == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), + const_cast(reinterpret_cast(value.c_str()))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + return nullptr; + } + return jret_value; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BIJ)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BIJ( + JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle) { + auto* txn = reinterpret_cast(jhandle); + auto* column_family_handle = + reinterpret_cast(jcolumn_family_handle); + FnGet fn_get = std::bind(&rocksdb::Transaction::Get, txn, _1, + column_family_handle, _2, _3); + return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); +} + +/* + * Class: org_rocksdb_Transaction + * Method: get + * Signature: (JJ[BI)[B + */ +jbyteArray Java_org_rocksdb_Transaction_get__JJ_3BI( + JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + jbyteArray jkey, jint jkey_part_len) { + auto* txn = reinterpret_cast(jhandle); + FnGet fn_get = std::bind( + &rocksdb::Transaction::Get, txn, _1, _2, _3); + return txn_get_helper(env, fn_get, jread_options_handle, jkey, jkey_part_len); +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +// used by txn_multi_get_helper below +std::vector txn_column_families_helper( + JNIEnv* env, jlongArray jcolumn_family_handles, bool* has_exception) { + std::vector cf_handles; + if (jcolumn_family_handles != nullptr) { + const jsize len_cols = env->GetArrayLength(jcolumn_family_handles); + if (len_cols > 0) { + if (env->EnsureLocalCapacity(len_cols) != 0) { + // out of memory + *has_exception = JNI_TRUE; + return std::vector(); + } + + jlong* jcfh = env->GetLongArrayElements(jcolumn_family_handles, nullptr); + if (jcfh == nullptr) { + // exception thrown: OutOfMemoryError + *has_exception = JNI_TRUE; + return std::vector(); + } + for (int i = 0; i < len_cols; i++) { + auto* cf_handle = + reinterpret_cast(jcfh[i]); + cf_handles.push_back(cf_handle); + } + env->ReleaseLongArrayElements(jcolumn_family_handles, jcfh, JNI_ABORT); + } + } + return cf_handles; +} + +typedef std::function( + const rocksdb::ReadOptions&, const std::vector&, + std::vector*)> + FnMultiGet; + +void free_parts( + JNIEnv* env, + std::vector>& parts_to_free) { + for (auto& value : parts_to_free) { + jobject jk; + jbyteArray jk_ba; + jbyte* jk_val; + std::tie(jk_ba, jk_val, jk) = value; + env->ReleaseByteArrayElements(jk_ba, jk_val, JNI_ABORT); + env->DeleteLocalRef(jk); + } +} + +// TODO(AR) consider refactoring to share this between here and rocksjni.cc +// cf multi get +jobjectArray txn_multi_get_helper(JNIEnv* env, const FnMultiGet& fn_multi_get, + const jlong& jread_options_handle, + const jobjectArray& jkey_parts) { + const jsize len_key_parts = env->GetArrayLength(jkey_parts); + if (env->EnsureLocalCapacity(len_key_parts) != 0) { + // out of memory + return nullptr; + } + + std::vector key_parts; + std::vector> key_parts_to_free; + for (int i = 0; i < len_key_parts; i++) { + const jobject jk = 
env->GetObjectArrayElement(jkey_parts, i); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + free_parts(env, key_parts_to_free); + return nullptr; + } + jbyteArray jk_ba = reinterpret_cast(jk); + const jsize len_key = env->GetArrayLength(jk_ba); + if (env->EnsureLocalCapacity(len_key) != 0) { + // out of memory + env->DeleteLocalRef(jk); + free_parts(env, key_parts_to_free); + return nullptr; + } + jbyte* jk_val = env->GetByteArrayElements(jk_ba, nullptr); + if (jk_val == nullptr) { + // exception thrown: OutOfMemoryError + env->DeleteLocalRef(jk); + free_parts(env, key_parts_to_free); + return nullptr; + } + + rocksdb::Slice key_slice(reinterpret_cast(jk_val), len_key); + key_parts.push_back(key_slice); + + key_parts_to_free.push_back(std::make_tuple(jk_ba, jk_val, jk)); + } + + auto* read_options = + reinterpret_cast(jread_options_handle); + std::vector value_parts; + std::vector s = + fn_multi_get(*read_options, key_parts, &value_parts); + + // free up allocated byte arrays + free_parts(env, key_parts_to_free); + + // prepare the results + const jclass jcls_ba = env->FindClass("[B"); + jobjectArray jresults = + env->NewObjectArray(static_cast(s.size()), jcls_ba, nullptr); + if (jresults == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + // add to the jresults + for (std::vector::size_type i = 0; i != s.size(); i++) { + if (s[i].ok()) { + jbyteArray jentry_value = + env->NewByteArray(static_cast(value_parts[i].size())); + if (jentry_value == nullptr) { + // exception thrown: OutOfMemoryError + return nullptr; + } + + env->SetByteArrayRegion( + jentry_value, 0, static_cast(value_parts[i].size()), + const_cast(reinterpret_cast(value_parts[i].c_str()))); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + env->DeleteLocalRef(jentry_value); + return nullptr; + } + + env->SetObjectArrayElement(jresults, static_cast(i), jentry_value); + env->DeleteLocalRef(jentry_value); + } + } + + return jresults; +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGet + * Signature: (JJ[[B[J)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B_3J( + JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts, jlongArray jcolumn_family_handles) { + bool has_exception = false; + const std::vector column_family_handles = + txn_column_families_helper(env, jcolumn_family_handles, &has_exception); + if (has_exception) { + // exception thrown: OutOfMemoryError + return nullptr; + } + auto* txn = reinterpret_cast(jhandle); + FnMultiGet fn_multi_get = + std::bind (rocksdb::Transaction::*)( + const rocksdb::ReadOptions&, + const std::vector&, + const std::vector&, std::vector*)>( + &rocksdb::Transaction::MultiGet, txn, _1, column_family_handles, _2, + _3); + return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, + jkey_parts); +} + +/* + * Class: org_rocksdb_Transaction + * Method: multiGet + * Signature: (JJ[[B)[[B + */ +jobjectArray Java_org_rocksdb_Transaction_multiGet__JJ_3_3B( + JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle, + jobjectArray jkey_parts) { + auto* txn = reinterpret_cast(jhandle); + FnMultiGet fn_multi_get = + std::bind (rocksdb::Transaction::*)( + const rocksdb::ReadOptions&, const std::vector&, + std::vector*)>(&rocksdb::Transaction::MultiGet, txn, _1, + _2, _3); + return txn_multi_get_helper(env, fn_multi_get, jread_options_handle, + jkey_parts); +} + +/* + * Class: org_rocksdb_Transaction + 
+ * Method: getForUpdate
+ * Signature: (JJ[BIJZZ)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIJZZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len, jlong jcolumn_family_handle,
+    jboolean jexclusive, jboolean jdo_validate) {
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnGet fn_get_for_update = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::ReadOptions&, rocksdb::ColumnFamilyHandle*,
+      const rocksdb::Slice&, std::string*, bool, bool)>(
+      &rocksdb::Transaction::GetForUpdate, txn, _1, column_family_handle, _2,
+      _3, jexclusive, jdo_validate);
+  return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey,
+                        jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getForUpdate
+ * Signature: (JJ[BIZZ)[B
+ */
+jbyteArray Java_org_rocksdb_Transaction_getForUpdate__JJ_3BIZZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jbyteArray jkey, jint jkey_part_len, jboolean jexclusive,
+    jboolean jdo_validate) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnGet fn_get_for_update = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::ReadOptions&, const rocksdb::Slice&, std::string*, bool,
+      bool)>(&rocksdb::Transaction::GetForUpdate, txn, _1, _2, _3, jexclusive,
+             jdo_validate);
+  return txn_get_helper(env, fn_get_for_update, jread_options_handle, jkey,
+                        jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: multiGetForUpdate
+ * Signature: (JJ[[B[J)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B_3J(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts, jlongArray jcolumn_family_handles) {
+  bool has_exception = false;
+  const std::vector<rocksdb::ColumnFamilyHandle*> column_family_handles =
+      txn_column_families_helper(env, jcolumn_family_handles, &has_exception);
+  if (has_exception) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get_for_update =
+      std::bind<std::vector<rocksdb::Status> (rocksdb::Transaction::*)(
+          const rocksdb::ReadOptions&,
+          const std::vector<rocksdb::ColumnFamilyHandle*>&,
+          const std::vector<rocksdb::Slice>&, std::vector<std::string>*)>(
+          &rocksdb::Transaction::MultiGetForUpdate, txn, _1,
+          column_family_handles, _2, _3);
+  return txn_multi_get_helper(env, fn_multi_get_for_update,
+                              jread_options_handle, jkey_parts);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: multiGetForUpdate
+ * Signature: (JJ[[B)[[B
+ */
+jobjectArray Java_org_rocksdb_Transaction_multiGetForUpdate__JJ_3_3B(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jread_options_handle,
+    jobjectArray jkey_parts) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnMultiGet fn_multi_get_for_update =
+      std::bind<std::vector<rocksdb::Status> (rocksdb::Transaction::*)(
+          const rocksdb::ReadOptions&, const std::vector<rocksdb::Slice>&,
+          std::vector<std::string>*)>(&rocksdb::Transaction::MultiGetForUpdate,
+                                      txn, _1, _2, _3);
+  return txn_multi_get_helper(env, fn_multi_get_for_update,
+                              jread_options_handle, jkey_parts);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getIterator
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJ(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle,
+                                                   jlong jread_options_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<rocksdb::ReadOptions*>(jread_options_handle);
+  return reinterpret_cast<jlong>(txn->GetIterator(*read_options));
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getIterator
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_Transaction_getIterator__JJJ(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jread_options_handle, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* read_options =
+      reinterpret_cast<rocksdb::ReadOptions*>(jread_options_handle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  return reinterpret_cast<jlong>(
+      txn->GetIterator(*read_options, column_family_handle));
+}
+
+typedef std::function<rocksdb::Status(const rocksdb::Slice&,
+                                      const rocksdb::Slice&)>
+    FnWriteKV;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_kv_helper(JNIEnv* env, const FnWriteKV& fn_write_kv,
+                         const jbyteArray& jkey, const jint& jkey_part_len,
+                         const jbyteArray& jval, const jint& jval_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  jbyte* value = env->GetByteArrayElements(jval, nullptr);
+  if (value == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+    return;
+  }
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+  rocksdb::Slice value_slice(reinterpret_cast<char*>(value), jval_len);
+
+  rocksdb::Status s = fn_write_kv(key_slice, value_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jval, value, JNI_ABORT);
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: put
+ * Signature: (J[BI[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3BI_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKV fn_put = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&,
+      const rocksdb::Slice&, bool)>(&rocksdb::Transaction::Put, txn,
+                                    column_family_handle, _1, _2,
+                                    jassume_tracked);
+  txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: put
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3BI_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                 jlong jhandle, jbyteArray jkey,
+                                                 jint jkey_part_len,
+                                                 jbyteArray jval,
+                                                 jint jval_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKV fn_put = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&, const rocksdb::Slice&)>(
+      &rocksdb::Transaction::Put, txn, _1, _2);
+  txn_write_kv_helper(env, fn_put, jkey, jkey_part_len, jval, jval_len);
+}
+
+typedef std::function<rocksdb::Status(const rocksdb::SliceParts&,
+                                      const rocksdb::SliceParts&)>
+    FnWriteKVParts;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_kv_parts_helper(JNIEnv* env,
+                               const FnWriteKVParts& fn_write_kv_parts,
+                               const jobjectArray& jkey_parts,
+                               const jint& jkey_parts_len,
+                               const jobjectArray& jvalue_parts,
+                               const jint& jvalue_parts_len) {
+#ifndef DEBUG
+  (void) jvalue_parts_len;
+#else
+  assert(jkey_parts_len == jvalue_parts_len);
+#endif
+
+  auto key_parts = std::vector<rocksdb::Slice>();
+  auto value_parts = std::vector<rocksdb::Slice>();
+  auto jparts_to_free = std::vector<std::tuple<jbyteArray, jbyte*, jobject>>();
+
+  // convert java key_parts/value_parts byte[][] to Slice(s)
+  for (jsize i = 0; i < jkey_parts_len; ++i) {
+    const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      free_parts(env, jparts_to_free);
+      return;
+    }
+    const jobject jobj_value_part = env->GetObjectArrayElement(jvalue_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_key_part = reinterpret_cast<jbyteArray>(jobj_key_part);
+    const jsize jkey_part_len = env->GetArrayLength(jba_key_part);
+    if (env->EnsureLocalCapacity(jkey_part_len) != 0) {
+      // out of memory
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+    jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr);
+    if (jkey_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_value_part =
+        reinterpret_cast<jbyteArray>(jobj_value_part);
+    const jsize jvalue_part_len = env->GetArrayLength(jba_value_part);
+    if (env->EnsureLocalCapacity(jvalue_part_len) != 0) {
+      // out of memory
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+    jbyte* jvalue_part = env->GetByteArrayElements(jba_value_part, nullptr);
+    if (jvalue_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      // (nothing to release for the value part: GetByteArrayElements failed)
+      env->DeleteLocalRef(jobj_value_part);
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jparts_to_free);
+      return;
+    }
+
+    jparts_to_free.push_back(
+        std::make_tuple(jba_key_part, jkey_part, jobj_key_part));
+    jparts_to_free.push_back(
+        std::make_tuple(jba_value_part, jvalue_part, jobj_value_part));
+
+    key_parts.push_back(
+        rocksdb::Slice(reinterpret_cast<char*>(jkey_part), jkey_part_len));
+    value_parts.push_back(
+        rocksdb::Slice(reinterpret_cast<char*>(jvalue_part), jvalue_part_len));
+  }
+
+  // call the write_multi function
+  rocksdb::Status s = fn_write_kv_parts(
+      rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()),
+      rocksdb::SliceParts(value_parts.data(), (int)value_parts.size()));
+
+  // cleanup temporary memory
+  free_parts(env, jparts_to_free);
+
+  // return
+  if (s.ok()) {
+    return;
+  }
+
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: put
+ * Signature: (J[[BI[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKVParts fn_put_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::SliceParts&,
+          const rocksdb::SliceParts&, bool)>(&rocksdb::Transaction::Put, txn,
+                                             column_family_handle, _1, _2,
+                                             jassume_tracked);
+  txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len,
+                            jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: put
+ * Signature: (J[[BI[[BI)V
+ */
+void Java_org_rocksdb_Transaction_put__J_3_3BI_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKVParts fn_put_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          const rocksdb::SliceParts&, const rocksdb::SliceParts&)>(
+          &rocksdb::Transaction::Put, txn, _1, _2);
+  txn_write_kv_parts_helper(env, fn_put_parts, jkey_parts, jkey_parts_len,
+                            jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: merge
+ * Signature: (J[BI[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_merge__J_3BI_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKV fn_merge = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&,
+      const rocksdb::Slice&, bool)>(&rocksdb::Transaction::Merge, txn,
+                                    column_family_handle, _1, _2,
+                                    jassume_tracked);
+  txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: merge
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_merge__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKV fn_merge = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&, const rocksdb::Slice&)>(
+      &rocksdb::Transaction::Merge, txn, _1, _2);
+  txn_write_kv_helper(env, fn_merge, jkey, jkey_part_len, jval, jval_len);
+}
+
+typedef std::function<rocksdb::Status(const rocksdb::Slice&)> FnWriteK;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_k_helper(JNIEnv* env, const FnWriteK& fn_write_k,
+                        const jbyteArray& jkey, const jint& jkey_part_len) {
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+
+  rocksdb::Status s = fn_write_k(key_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: delete
+ * Signature: (J[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteK fn_delete = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&, bool)>(
+      &rocksdb::Transaction::Delete, txn, column_family_handle, _1,
+      jassume_tracked);
+  txn_write_k_helper(env, fn_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: delete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                jlong jhandle, jbyteArray jkey,
+                                                jint jkey_part_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteK fn_delete = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&)>(&rocksdb::Transaction::Delete, txn, _1);
+  txn_write_k_helper(env, fn_delete, jkey, jkey_part_len);
+}
+
+typedef std::function<rocksdb::Status(const rocksdb::SliceParts&)>
+    FnWriteKParts;
+
+// TODO(AR) consider refactoring to share this between here and rocksjni.cc
+void txn_write_k_parts_helper(JNIEnv* env,
+                              const FnWriteKParts& fn_write_k_parts,
+                              const jobjectArray& jkey_parts,
+                              const jint& jkey_parts_len) {
+  std::vector<rocksdb::Slice> key_parts;
+  std::vector<std::tuple<jbyteArray, jbyte*, jobject>> jkey_parts_to_free;
+
+  // convert java key_parts byte[][] to Slice(s)
+  for (jint i = 0; i < jkey_parts_len; ++i) {
+    const jobject jobj_key_part = env->GetObjectArrayElement(jkey_parts, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      free_parts(env, jkey_parts_to_free);
+      return;
+    }
+
+    const jbyteArray jba_key_part = reinterpret_cast<jbyteArray>(jobj_key_part);
+    const jsize jkey_part_len = env->GetArrayLength(jba_key_part);
+    if (env->EnsureLocalCapacity(jkey_part_len) != 0) {
+      // out of memory
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jkey_parts_to_free);
+      return;
+    }
+    jbyte* jkey_part = env->GetByteArrayElements(jba_key_part, nullptr);
+    if (jkey_part == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jobj_key_part);
+      free_parts(env, jkey_parts_to_free);
+      return;
+    }
+
+    jkey_parts_to_free.push_back(std::tuple<jbyteArray, jbyte*, jobject>(
+        jba_key_part, jkey_part, jobj_key_part));
+
+    key_parts.push_back(
+        rocksdb::Slice(reinterpret_cast<char*>(jkey_part), jkey_part_len));
+  }
+
+  // call the write_multi function
+  rocksdb::Status s = fn_write_k_parts(
+      rocksdb::SliceParts(key_parts.data(), (int)key_parts.size()));
+
+  // cleanup temporary memory
+  free_parts(env, jkey_parts_to_free);
+
+  // return
+  if (s.ok()) {
+    return;
+  }
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: delete
+ * Signature: (J[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle,
+    jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKParts fn_delete_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::SliceParts&, bool)>(
+          &rocksdb::Transaction::Delete, txn, column_family_handle, _1,
+          jassume_tracked);
+  txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: delete
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_delete__J_3_3BI(JNIEnv* env, jobject /*jobj*/,
+                                                  jlong jhandle,
+                                                  jobjectArray jkey_parts,
+                                                  jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKParts fn_delete_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          const rocksdb::SliceParts&)>(&rocksdb::Transaction::Delete, txn, _1);
+  txn_write_k_parts_helper(env, fn_delete_parts, jkey_parts, jkey_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: singleDelete
+ * Signature: (J[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle, jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteK fn_single_delete =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&, bool)>(
+          &rocksdb::Transaction::SingleDelete, txn, column_family_handle, _1,
+          jassume_tracked);
+  txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: singleDelete
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3BI(JNIEnv* env,
+                                                      jobject /*jobj*/,
+                                                      jlong jhandle,
+                                                      jbyteArray jkey,
+                                                      jint jkey_part_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteK fn_single_delete =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          const rocksdb::Slice&)>(&rocksdb::Transaction::SingleDelete, txn, _1);
+  txn_write_k_helper(env, fn_single_delete, jkey, jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: singleDelete
+ * Signature: (J[[BIJZ)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3_3BIJZ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle,
+    jboolean jassume_tracked) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKParts fn_single_delete_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::SliceParts&, bool)>(
+          &rocksdb::Transaction::SingleDelete, txn, column_family_handle, _1,
+          jassume_tracked);
+  txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
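+// Note: free_parts() is the cleanup helper defined earlier in this file (not
+// shown in this excerpt). A minimal sketch of what it is assumed to do for
+// each (jbyteArray, jbyte*, jobject) tuple collected above:
+//
+//   void free_parts(
+//       JNIEnv* env,
+//       std::vector<std::tuple<jbyteArray, jbyte*, jobject>>& parts_to_free) {
+//     for (auto& part : parts_to_free) {
+//       // release the pinned byte[] contents without copying them back
+//       env->ReleaseByteArrayElements(std::get<0>(part), std::get<1>(part),
+//                                     JNI_ABORT);
+//       // drop the local reference obtained from GetObjectArrayElement
+//       env->DeleteLocalRef(std::get<2>(part));
+//     }
+//   }
+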
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: singleDelete
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_singleDelete__J_3_3BI(JNIEnv* env,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jobjectArray jkey_parts,
+                                                        jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKParts fn_single_delete_parts = std::bind<rocksdb::Status (
+      rocksdb::Transaction::*)(const rocksdb::SliceParts&)>(
+      &rocksdb::Transaction::SingleDelete, txn, _1);
+  txn_write_k_parts_helper(env, fn_single_delete_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: putUntracked
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKV fn_put_untracked = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&,
+      const rocksdb::Slice&)>(&rocksdb::Transaction::PutUntracked, txn,
+                              column_family_handle, _1, _2);
+  txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: putUntracked
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKV fn_put_untracked = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&, const rocksdb::Slice&)>(
+      &rocksdb::Transaction::PutUntracked, txn, _1, _2);
+  txn_write_kv_helper(env, fn_put_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: putUntracked
+ * Signature: (J[[BI[[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKVParts fn_put_parts_untracked =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::SliceParts&,
+          const rocksdb::SliceParts&)>(&rocksdb::Transaction::PutUntracked, txn,
+                                       column_family_handle, _1, _2);
+  txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts,
+                            jkey_parts_len, jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: putUntracked
+ * Signature: (J[[BI[[BI)V
+ */
+void Java_org_rocksdb_Transaction_putUntracked__J_3_3BI_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jobjectArray jvalue_parts, jint jvalue_parts_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKVParts fn_put_parts_untracked =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          const rocksdb::SliceParts&, const rocksdb::SliceParts&)>(
+          &rocksdb::Transaction::PutUntracked, txn, _1, _2);
+  txn_write_kv_parts_helper(env, fn_put_parts_untracked, jkey_parts,
+                            jkey_parts_len, jvalue_parts, jvalue_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: mergeUntracked
+ * Signature: (J[BI[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len,
+    jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKV fn_merge_untracked = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&,
+      const rocksdb::Slice&)>(&rocksdb::Transaction::MergeUntracked, txn,
+                              column_family_handle, _1, _2);
+  txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
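+// Note on the std::bind pattern used throughout this file: Transaction
+// exposes several overloads of Put/Merge/Delete/SingleDelete (with and
+// without a column family, Slice vs. SliceParts), so each bind names the
+// exact member-function-pointer type as an explicit template argument to
+// select one overload. An equivalent lambda formulation would be (sketch
+// only, not the upstream code):
+//
+//   FnWriteKV fn_put = [txn, column_family_handle, jassume_tracked](
+//                          const rocksdb::Slice& key,
+//                          const rocksdb::Slice& value) {
+//     return txn->Put(column_family_handle, key, value, jassume_tracked);
+//   };
+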
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: mergeUntracked
+ * Signature: (J[BI[BI)V
+ */
+void Java_org_rocksdb_Transaction_mergeUntracked__J_3BI_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jbyteArray jval, jint jval_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKV fn_merge_untracked = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&, const rocksdb::Slice&)>(
+      &rocksdb::Transaction::MergeUntracked, txn, _1, _2);
+  txn_write_kv_helper(env, fn_merge_untracked, jkey, jkey_part_len, jval,
+                      jval_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: deleteUntracked
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteK fn_delete_untracked =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::Slice&)>(
+          &rocksdb::Transaction::DeleteUntracked, txn, column_family_handle,
+          _1);
+  txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: deleteUntracked
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3BI(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle,
+                                                         jbyteArray jkey,
+                                                         jint jkey_part_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteK fn_delete_untracked = std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+      const rocksdb::Slice&)>(&rocksdb::Transaction::DeleteUntracked, txn, _1);
+  txn_write_k_helper(env, fn_delete_untracked, jkey, jkey_part_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: deleteUntracked
+ * Signature: (J[[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  FnWriteKParts fn_delete_untracked_parts =
+      std::bind<rocksdb::Status (rocksdb::Transaction::*)(
+          rocksdb::ColumnFamilyHandle*, const rocksdb::SliceParts&)>(
+          &rocksdb::Transaction::DeleteUntracked, txn, column_family_handle,
+          _1);
+  txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: deleteUntracked
+ * Signature: (J[[BI)V
+ */
+void Java_org_rocksdb_Transaction_deleteUntracked__J_3_3BI(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jobjectArray jkey_parts,
+    jint jkey_parts_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  FnWriteKParts fn_delete_untracked_parts = std::bind<rocksdb::Status (
+      rocksdb::Transaction::*)(const rocksdb::SliceParts&)>(
+      &rocksdb::Transaction::DeleteUntracked, txn, _1);
+  txn_write_k_parts_helper(env, fn_delete_untracked_parts, jkey_parts,
+                           jkey_parts_len);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: putLogData
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_putLogData(JNIEnv* env, jobject /*jobj*/,
+                                             jlong jhandle, jbyteArray jkey,
+                                             jint jkey_part_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+  txn->PutLogData(key_slice);
+
+  // trigger java unref on key.
+  // by passing JNI_ABORT, it will simply release the reference without
+  // copying the result back to the java byte array.
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: disableIndexing
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_disableIndexing(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  txn->DisableIndexing();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: enableIndexing
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_enableIndexing(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  txn->EnableIndexing();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getNumKeys
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumKeys(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetNumKeys();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getNumPuts
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumPuts(JNIEnv* /*env*/, jobject /*jobj*/,
+                                              jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetNumPuts();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getNumDeletes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumDeletes(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetNumDeletes();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getNumMerges
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getNumMerges(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetNumMerges();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getElapsedTime
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getElapsedTime(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetElapsedTime();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getWriteBatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getWriteBatch(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return reinterpret_cast<jlong>(txn->GetWriteBatch());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: setLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setLockTimeout(JNIEnv* /*env*/,
+                                                 jobject /*jobj*/,
+                                                 jlong jhandle,
+                                                 jlong jlock_timeout) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  txn->SetLockTimeout(jlock_timeout);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getWriteOptions
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getWriteOptions(JNIEnv* /*env*/,
+                                                   jobject /*jobj*/,
+                                                   jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return reinterpret_cast<jlong>(txn->GetWriteOptions());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: setWriteOptions
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setWriteOptions(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle,
+                                                  jlong jwrite_options_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  txn->SetWriteOptions(*write_options);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: undoGetForUpdate
+ * Signature: (J[BIJ)V
+ */
+void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BIJ(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jbyteArray jkey,
+    jint jkey_part_len, jlong jcolumn_family_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* column_family_handle =
+      reinterpret_cast<rocksdb::ColumnFamilyHandle*>(jcolumn_family_handle);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+  txn->UndoGetForUpdate(column_family_handle, key_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: undoGetForUpdate
+ * Signature: (J[BI)V
+ */
+void Java_org_rocksdb_Transaction_undoGetForUpdate__J_3BI(JNIEnv* env,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle,
+                                                          jbyteArray jkey,
+                                                          jint jkey_part_len) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  jbyte* key = env->GetByteArrayElements(jkey, nullptr);
+  if (key == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  rocksdb::Slice key_slice(reinterpret_cast<char*>(key), jkey_part_len);
+  txn->UndoGetForUpdate(key_slice);
+
+  env->ReleaseByteArrayElements(jkey, key, JNI_ABORT);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: rebuildFromWriteBatch
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_rebuildFromWriteBatch(
+    JNIEnv* env, jobject /*jobj*/, jlong jhandle, jlong jwrite_batch_handle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  auto* write_batch =
+      reinterpret_cast<rocksdb::WriteBatch*>(jwrite_batch_handle);
+  rocksdb::Status s = txn->RebuildFromWriteBatch(write_batch);
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getCommitTimeWriteBatch
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getCommitTimeWriteBatch(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return reinterpret_cast<jlong>(txn->GetCommitTimeWriteBatch());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: setLogNumber
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_Transaction_setLogNumber(JNIEnv* /*env*/,
+                                               jobject /*jobj*/, jlong jhandle,
+                                               jlong jlog_number) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  txn->SetLogNumber(jlog_number);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getLogNumber
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getLogNumber(JNIEnv* /*env*/,
+                                                jobject /*jobj*/,
+                                                jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return txn->GetLogNumber();
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: setName
+ * Signature: (JLjava/lang/String;)V
+ */
+void Java_org_rocksdb_Transaction_setName(JNIEnv* env, jobject /*jobj*/,
+                                          jlong jhandle, jstring jname) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  const char* name = env->GetStringUTFChars(jname, nullptr);
+  if (name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return;
+  }
+
+  rocksdb::Status s = txn->SetName(name);
+
+  env->ReleaseStringUTFChars(jname, name);
+
+  if (!s.ok()) {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+  }
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getName
+ * Signature: (J)Ljava/lang/String;
+ */
+jstring Java_org_rocksdb_Transaction_getName(JNIEnv* env, jobject /*jobj*/,
+                                             jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  rocksdb::TransactionName name = txn->GetName();
+  return env->NewStringUTF(name.data());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getID
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getID(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  rocksdb::TransactionID id = txn->GetID();
+  return static_cast<jlong>(id);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: isDeadlockDetect
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_Transaction_isDeadlockDetect(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  return static_cast<jboolean>(txn->IsDeadlockDetect());
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getWaitingTxns
+ * Signature: (J)Lorg/rocksdb/Transaction/WaitingTransactions;
+ */
+jobject Java_org_rocksdb_Transaction_getWaitingTxns(JNIEnv* env,
+                                                    jobject jtransaction_obj,
+                                                    jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  uint32_t column_family_id;
+  std::string key;
+  std::vector<rocksdb::TransactionID> waiting_txns =
+      txn->GetWaitingTxns(&column_family_id, &key);
+  jobject jwaiting_txns = rocksdb::TransactionJni::newWaitingTransactions(
+      env, jtransaction_obj, column_family_id, key, waiting_txns);
+  return jwaiting_txns;
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getState
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_Transaction_getState(JNIEnv* /*env*/, jobject /*jobj*/,
+                                            jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  rocksdb::Transaction::TransactionState txn_status = txn->GetState();
+  switch (txn_status) {
+    case rocksdb::Transaction::TransactionState::STARTED:
+      return 0x0;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_PREPARE:
+      return 0x1;
+
+    case rocksdb::Transaction::TransactionState::PREPARED:
+      return 0x2;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_COMMIT:
+      return 0x3;
+
+    case rocksdb::Transaction::TransactionState::COMMITED:
+      return 0x4;
+
+    case rocksdb::Transaction::TransactionState::AWAITING_ROLLBACK:
+      return 0x5;
+
+    case rocksdb::Transaction::TransactionState::ROLLEDBACK:
+      return 0x6;
+
+    case rocksdb::Transaction::TransactionState::LOCKS_STOLEN:
+      return 0x7;
+  }
+
+  assert(false);
+  return static_cast<jbyte>(-1);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: getId
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_Transaction_getId(JNIEnv* /*env*/, jobject /*jobj*/,
+                                         jlong jhandle) {
+  auto* txn = reinterpret_cast<rocksdb::Transaction*>(jhandle);
+  uint64_t id = txn->GetId();
+  return static_cast<jlong>(id);
+}
+
+/*
+ * Class: org_rocksdb_Transaction
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_Transaction_disposeInternal(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong jhandle) {
+  delete reinterpret_cast<rocksdb::Transaction*>(jhandle);
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_db.cc b/thirdparty/rocksdb/java/rocksjni/transaction_db.cc
new file mode 100644
index 0000000000..c2c40bf10e
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_db.cc
@@ -0,0 +1,453 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionDB.
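+//
+// A note on the exported symbol names used throughout these bridge files
+// (standard JNI naming convention, not something introduced by this patch):
+// a native method is bound to "Java_" plus the fully-qualified class name
+// with '.' replaced by '_', followed by the method name. Overloaded natives
+// additionally append "__" plus the mangled JNI descriptor of their Java
+// parameters, where "_3" encodes '[' (array) and "_2" encodes ';'. For
+// example, Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J
+// below corresponds to the Java signature (JJLjava/lang/String;[[B[J), i.e.
+// open(long, long, String, byte[][], long[]).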
+
+#include <jni.h>
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "include/org_rocksdb_TransactionDB.h"
+
+#include "rocksdb/options.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+
+#include "rocksjni/portal.h"
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: open
+ * Signature: (JJLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2(
+    JNIEnv* env, jclass, jlong joptions_handle, jlong jtxn_db_options_handle,
+    jstring jdb_path) {
+  auto* options = reinterpret_cast<rocksdb::Options*>(joptions_handle);
+  auto* txn_db_options =
+      reinterpret_cast<rocksdb::TransactionDBOptions*>(jtxn_db_options_handle);
+  rocksdb::TransactionDB* tdb = nullptr;
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  rocksdb::Status s =
+      rocksdb::TransactionDB::Open(*options, *txn_db_options, db_path, &tdb);
+  env->ReleaseStringUTFChars(jdb_path, db_path);
+
+  if (s.ok()) {
+    return reinterpret_cast<jlong>(tdb);
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return 0;
+  }
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: open
+ * Signature: (JJLjava/lang/String;[[B[J)[J
+ */
+jlongArray Java_org_rocksdb_TransactionDB_open__JJLjava_lang_String_2_3_3B_3J(
+    JNIEnv* env, jclass, jlong jdb_options_handle, jlong jtxn_db_options_handle,
+    jstring jdb_path, jobjectArray jcolumn_names,
+    jlongArray jcolumn_options_handles) {
+  const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
+  if (db_path == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+
+  const jsize len_cols = env->GetArrayLength(jcolumn_names);
+  if (env->EnsureLocalCapacity(len_cols) != 0) {
+    // out of memory
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+
+  jlong* jco = env->GetLongArrayElements(jcolumn_options_handles, nullptr);
+  if (jco == nullptr) {
+    // exception thrown: OutOfMemoryError
+    env->ReleaseStringUTFChars(jdb_path, db_path);
+    return nullptr;
+  }
+  std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
+  for (int i = 0; i < len_cols; i++) {
+    const jobject jcn = env->GetObjectArrayElement(jcolumn_names, i);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+    const jbyteArray jcn_ba = reinterpret_cast<jbyteArray>(jcn);
+    jbyte* jcf_name = env->GetByteArrayElements(jcn_ba, nullptr);
+    if (jcf_name == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jcn);
+      env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+
+    const int jcf_name_len = env->GetArrayLength(jcn_ba);
+    if (env->EnsureLocalCapacity(jcf_name_len) != 0) {
+      // out of memory
+      env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT);
+      env->DeleteLocalRef(jcn);
+      env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+      env->ReleaseStringUTFChars(jdb_path, db_path);
+      return nullptr;
+    }
+    const std::string cf_name(reinterpret_cast<char*>(jcf_name), jcf_name_len);
+    const rocksdb::ColumnFamilyOptions* cf_options =
+        reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[i]);
+    column_families.push_back(
+        rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options));
+
+    env->ReleaseByteArrayElements(jcn_ba, jcf_name, JNI_ABORT);
+    env->DeleteLocalRef(jcn);
+  }
+  env->ReleaseLongArrayElements(jcolumn_options_handles, jco, JNI_ABORT);
+
+  auto* db_options =
+      reinterpret_cast<rocksdb::DBOptions*>(jdb_options_handle);
+  auto* txn_db_options =
+      reinterpret_cast<rocksdb::TransactionDBOptions*>(jtxn_db_options_handle);
+  std::vector<rocksdb::ColumnFamilyHandle*> handles;
+  rocksdb::TransactionDB* tdb = nullptr;
+  const rocksdb::Status s = rocksdb::TransactionDB::Open(
+      *db_options, *txn_db_options, db_path, column_families, &handles, &tdb);
+
+  // check if open operation was successful
+  if (s.ok()) {
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
+    std::unique_ptr<jlong[]> results =
+        std::unique_ptr<jlong[]>(new jlong[resultsLen]);
+    results[0] = reinterpret_cast<jlong>(tdb);
+    for (int i = 1; i <= len_cols; i++) {
+      results[i] = reinterpret_cast<jlong>(handles[i - 1]);
+    }
+
+    jlongArray jresults = env->NewLongArray(resultsLen);
+    if (jresults == nullptr) {
+      // exception thrown: OutOfMemoryError
+      return nullptr;
+    }
+    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException
+      env->DeleteLocalRef(jresults);
+      return nullptr;
+    }
+    return jresults;
+  } else {
+    rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+    return nullptr;
+  }
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDB_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  assert(txn_db != nullptr);
+  delete txn_db;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDB_closeDatabase(
+    JNIEnv* env, jclass, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  assert(txn_db != nullptr);
+  rocksdb::Status s = txn_db->Close();
+  rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: beginTransaction
+ * Signature: (JJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  rocksdb::Transaction* txn = txn_db->BeginTransaction(*write_options);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: beginTransaction
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jtxn_options_handle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* txn_options =
+      reinterpret_cast<rocksdb::TransactionOptions*>(jtxn_options_handle);
+  rocksdb::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, *txn_options);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: beginTransaction_withOld
+ * Signature: (JJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jold_txn_handle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* old_txn = reinterpret_cast<rocksdb::Transaction*>(jold_txn_handle);
+  rocksdb::TransactionOptions txn_options;
+  rocksdb::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_txn
+  assert(txn == old_txn);
+
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: beginTransaction_withOld
+ * Signature: (JJJJ)J
+ */
+jlong Java_org_rocksdb_TransactionDB_beginTransaction_1withOld__JJJJ(
+    JNIEnv*, jobject, jlong jhandle, jlong jwrite_options_handle,
+    jlong jtxn_options_handle, jlong jold_txn_handle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  auto* write_options =
+      reinterpret_cast<rocksdb::WriteOptions*>(jwrite_options_handle);
+  auto* txn_options =
+      reinterpret_cast<rocksdb::TransactionOptions*>(jtxn_options_handle);
+  auto* old_txn = reinterpret_cast<rocksdb::Transaction*>(jold_txn_handle);
+  rocksdb::Transaction* txn =
+      txn_db->BeginTransaction(*write_options, *txn_options, old_txn);
+
+  // RocksJava relies on the assumption that
+  // we do not allocate a new Transaction object
+  // when providing an old_txn
+  assert(txn == old_txn);
+
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getTransactionByName
+ * Signature: (JLjava/lang/String;)J
+ */
+jlong Java_org_rocksdb_TransactionDB_getTransactionByName(
+    JNIEnv* env, jobject, jlong jhandle, jstring jname) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const char* name = env->GetStringUTFChars(jname, nullptr);
+  if (name == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return 0;
+  }
+  rocksdb::Transaction* txn = txn_db->GetTransactionByName(name);
+  env->ReleaseStringUTFChars(jname, name);
+  return reinterpret_cast<jlong>(txn);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getAllPreparedTransactions
+ * Signature: (J)[J
+ */
+jlongArray Java_org_rocksdb_TransactionDB_getAllPreparedTransactions(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  std::vector<rocksdb::Transaction*> txns;
+  txn_db->GetAllPreparedTransactions(&txns);
+
+  const size_t size = txns.size();
+  assert(size < UINT32_MAX);  // does it fit in a jint?
+
+  const jsize len = static_cast<jsize>(size);
+  std::vector<jlong> tmp(len);
+  for (jsize i = 0; i < len; ++i) {
+    tmp[i] = reinterpret_cast<jlong>(txns[i]);
+  }
+
+  jlongArray jtxns = env->NewLongArray(len);
+  if (jtxns == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  env->SetLongArrayRegion(jtxns, 0, len, tmp.data());
+  if (env->ExceptionCheck()) {
+    // exception thrown: ArrayIndexOutOfBoundsException
+    env->DeleteLocalRef(jtxns);
+    return nullptr;
+  }
+
+  return jtxns;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getLockStatusData
+ * Signature: (J)Ljava/util/Map;
+ */
+jobject Java_org_rocksdb_TransactionDB_getLockStatusData(
+    JNIEnv* env, jobject, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const std::unordered_multimap<uint32_t, rocksdb::KeyLockInfo>
+      lock_status_data = txn_db->GetLockStatusData();
+  const jobject jlock_status_data = rocksdb::HashMapJni::construct(
+      env, static_cast<uint32_t>(lock_status_data.size()));
+  if (jlock_status_data == nullptr) {
+    // exception occurred
+    return nullptr;
+  }
+
+  const rocksdb::HashMapJni::FnMapKV<const int32_t, const rocksdb::KeyLockInfo,
+                                     jobject, jobject>
+      fn_map_kv =
+          [env](const std::pair<const int32_t, const rocksdb::KeyLockInfo>&
+                    pair) {
+            const jobject jlong_column_family_id =
+                rocksdb::LongJni::valueOf(env, pair.first);
+            if (jlong_column_family_id == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            const jobject jkey_lock_info =
+                rocksdb::KeyLockInfoJni::construct(env, pair.second);
+            if (jkey_lock_info == nullptr) {
+              // an error occurred
+              return std::unique_ptr<std::pair<jobject, jobject>>(nullptr);
+            }
+            return std::unique_ptr<std::pair<jobject, jobject>>(
+                new std::pair<jobject, jobject>(jlong_column_family_id,
+                                                jkey_lock_info));
+          };
+
+  if (!rocksdb::HashMapJni::putAll(env, jlock_status_data,
+                                   lock_status_data.begin(),
+                                   lock_status_data.end(), fn_map_kv)) {
+    // exception occurred
+    return nullptr;
+  }
+
+  return jlock_status_data;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: getDeadlockInfoBuffer
+ * Signature: (J)[Lorg/rocksdb/TransactionDB/DeadlockPath;
+ */
+jobjectArray Java_org_rocksdb_TransactionDB_getDeadlockInfoBuffer(
+    JNIEnv* env, jobject jobj, jlong jhandle) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  const std::vector<rocksdb::DeadlockPath> deadlock_info_buffer =
+      txn_db->GetDeadlockInfoBuffer();
+
+  const jsize deadlock_info_buffer_len =
+      static_cast<jsize>(deadlock_info_buffer.size());
+  jobjectArray jdeadlock_info_buffer =
+      env->NewObjectArray(deadlock_info_buffer_len,
+                          rocksdb::DeadlockPathJni::getJClass(env), nullptr);
+  if (jdeadlock_info_buffer == nullptr) {
+    // exception thrown: OutOfMemoryError
+    return nullptr;
+  }
+  jsize jdeadlock_info_buffer_offset = 0;
+
+  auto buf_end = deadlock_info_buffer.end();
+  for (auto buf_it = deadlock_info_buffer.begin(); buf_it != buf_end;
+       ++buf_it) {
+    const rocksdb::DeadlockPath deadlock_path = *buf_it;
+    const std::vector<rocksdb::DeadlockInfo> deadlock_infos =
+        deadlock_path.path;
+    const jsize deadlock_infos_len = static_cast<jsize>(deadlock_infos.size());
+    jobjectArray jdeadlock_infos = env->NewObjectArray(
+        deadlock_infos_len, rocksdb::DeadlockInfoJni::getJClass(env), nullptr);
+    if (jdeadlock_infos == nullptr) {
+      // exception thrown: OutOfMemoryError
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    jsize jdeadlock_infos_offset = 0;
+
+    auto infos_end = deadlock_infos.end();
+    for (auto infos_it = deadlock_infos.begin(); infos_it != infos_end;
+         ++infos_it) {
+      const rocksdb::DeadlockInfo deadlock_info = *infos_it;
+      const jobject jdeadlock_info = rocksdb::TransactionDBJni::newDeadlockInfo(
+          env, jobj, deadlock_info.m_txn_id, deadlock_info.m_cf_id,
+          deadlock_info.m_waiting_key, deadlock_info.m_exclusive);
+      if (jdeadlock_info == nullptr) {
+        // exception occurred
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+      env->SetObjectArrayElement(jdeadlock_infos, jdeadlock_infos_offset++,
+                                 jdeadlock_info);
+      if (env->ExceptionCheck()) {
+        // exception thrown: ArrayIndexOutOfBoundsException or
+        // ArrayStoreException
+        env->DeleteLocalRef(jdeadlock_info);
+        env->DeleteLocalRef(jdeadlock_info_buffer);
+        return nullptr;
+      }
+    }
+
+    const jobject jdeadlock_path = rocksdb::DeadlockPathJni::construct(
+        env, jdeadlock_infos, deadlock_path.limit_exceeded);
+    if (jdeadlock_path == nullptr) {
+      // exception occurred
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+    env->SetObjectArrayElement(jdeadlock_info_buffer,
+                               jdeadlock_info_buffer_offset++, jdeadlock_path);
+    if (env->ExceptionCheck()) {
+      // exception thrown: ArrayIndexOutOfBoundsException or ArrayStoreException
+      env->DeleteLocalRef(jdeadlock_path);
+      env->DeleteLocalRef(jdeadlock_info_buffer);
+      return nullptr;
+    }
+  }
+
+  return jdeadlock_info_buffer;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDB
+ * Method: setDeadlockInfoBufferSize
+ * Signature: (JI)V
+ */
+void Java_org_rocksdb_TransactionDB_setDeadlockInfoBufferSize(
+    JNIEnv*, jobject, jlong jhandle, jint jdeadlock_info_buffer_size) {
+  auto* txn_db = reinterpret_cast<rocksdb::TransactionDB*>(jhandle);
+  txn_db->SetDeadlockInfoBufferSize(jdeadlock_info_buffer_size);
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_db_options.cc b/thirdparty/rocksdb/java/rocksjni/transaction_db_options.cc
new file mode 100644
index 0000000000..391accc375
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_db_options.cc
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionDBOptions.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionDBOptions.h"
+
+#include "rocksdb/utilities/transaction_db.h"
+
+#include "rocksjni/portal.h"
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: newTransactionDBOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_newTransactionDBOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  rocksdb::TransactionDBOptions* opts = new rocksdb::TransactionDBOptions();
+  return reinterpret_cast<jlong>(opts);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getMaxNumLocks
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getMaxNumLocks(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->max_num_locks;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setMaxNumLocks
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setMaxNumLocks(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jlong jmax_num_locks) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->max_num_locks = jmax_num_locks;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getNumStripes
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getNumStripes(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->num_stripes;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setNumStripes
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setNumStripes(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle,
+                                                         jlong jnum_stripes) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->num_stripes = jnum_stripes;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getTransactionLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getTransactionLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->transaction_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setTransactionLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setTransactionLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jtransaction_lock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->transaction_lock_timeout = jtransaction_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getDefaultLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionDBOptions_getDefaultLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return opts->default_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setDefaultLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setDefaultLockTimeout(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jdefault_lock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->default_lock_timeout = jdefault_lock_timeout;
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: getWritePolicy
+ * Signature: (J)B
+ */
+jbyte Java_org_rocksdb_TransactionDBOptions_getWritePolicy(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  return rocksdb::TxnDBWritePolicyJni::toJavaTxnDBWritePolicy(
+      opts->write_policy);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: setWritePolicy
+ * Signature: (JB)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_setWritePolicy(JNIEnv* /*env*/,
+                                                          jobject /*jobj*/,
+                                                          jlong jhandle,
+                                                          jbyte jwrite_policy) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+  opts->write_policy =
+      rocksdb::TxnDBWritePolicyJni::toCppTxnDBWritePolicy(jwrite_policy);
+}
+
+/*
+ * Class: org_rocksdb_TransactionDBOptions
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionDBOptions_disposeInternal(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  delete reinterpret_cast<rocksdb::TransactionDBOptions*>(jhandle);
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_log.cc b/thirdparty/rocksdb/java/rocksjni/transaction_log.cc
index a5049e3b26..8186a846bd 100644
--- a/thirdparty/rocksdb/java/rocksjni/transaction_log.cc
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_log.cc
@@ -19,8 +19,9 @@
  * Method: disposeInternal
  * Signature: (J)V
  */
-void Java_org_rocksdb_TransactionLogIterator_disposeInternal(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_TransactionLogIterator_disposeInternal(JNIEnv* /*env*/,
+                                                             jobject /*jobj*/,
+                                                             jlong handle) {
   delete reinterpret_cast<rocksdb::TransactionLogIterator*>(handle);
 }
 
@@ -29,8 +30,9 @@ void Java_org_rocksdb_TransactionLogIterator_disposeInternal(
  * Method: isValid
  * Signature: (J)Z
  */
-jboolean Java_org_rocksdb_TransactionLogIterator_isValid(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jboolean Java_org_rocksdb_TransactionLogIterator_isValid(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
   return reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->Valid();
 }
 
@@ -39,8 +41,9 @@ jboolean Java_org_rocksdb_TransactionLogIterator_isValid(
  * Method: next
  * Signature: (J)V
  */
-void Java_org_rocksdb_TransactionLogIterator_next(
-    JNIEnv* env, jobject jobj, jlong handle) {
+void Java_org_rocksdb_TransactionLogIterator_next(JNIEnv* /*env*/,
+                                                  jobject /*jobj*/,
+                                                  jlong handle) {
   reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->Next();
 }
 
@@ -49,10 +52,11 @@ void Java_org_rocksdb_TransactionLogIterator_next(
  * Method: status
  * Signature: (J)V
  */
-void Java_org_rocksdb_TransactionLogIterator_status(
-    JNIEnv* env, jobject jobj, jlong handle) {
-  rocksdb::Status s = reinterpret_cast<
-      rocksdb::TransactionLogIterator*>(handle)->status();
+void Java_org_rocksdb_TransactionLogIterator_status(JNIEnv* env,
+                                                    jobject /*jobj*/,
+                                                    jlong handle) {
+  rocksdb::Status s =
+      reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->status();
   if (!s.ok()) {
     rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
   }
@@ -63,8 +67,9 @@ void Java_org_rocksdb_TransactionLogIterator_status(
  * Method: getBatch
  * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult
  */
-jobject Java_org_rocksdb_TransactionLogIterator_getBatch(
-    JNIEnv* env, jobject jobj, jlong handle) {
+jobject Java_org_rocksdb_TransactionLogIterator_getBatch(JNIEnv* env,
+                                                         jobject /*jobj*/,
+                                                         jlong handle) {
   rocksdb::BatchResult batch_result =
       reinterpret_cast<rocksdb::TransactionLogIterator*>(handle)->GetBatch();
   return rocksdb::BatchResultJni::construct(env, batch_result);
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_notifier.cc b/thirdparty/rocksdb/java/rocksjni/transaction_notifier.cc
new file mode 100644
index 0000000000..b60076e100
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_notifier.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionNotifier.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractTransactionNotifier.h"
+#include "rocksjni/transaction_notifier_jnicallback.h"
+
+/*
+ * Class: org_rocksdb_AbstractTransactionNotifier
+ * Method: createNewTransactionNotifier
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractTransactionNotifier_createNewTransactionNotifier(
+    JNIEnv* env, jobject jobj) {
+  auto* transaction_notifier =
+      new rocksdb::TransactionNotifierJniCallback(env, jobj);
+  auto* sptr_transaction_notifier =
+      new std::shared_ptr<rocksdb::TransactionNotifierJniCallback>(
+          transaction_notifier);
+  return reinterpret_cast<jlong>(sptr_transaction_notifier);
+}
+
+/*
+ * Class: org_rocksdb_AbstractTransactionNotifier
+ * Method: disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_AbstractTransactionNotifier_disposeInternal(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  // TODO(AR) refactor to use JniCallback::JniCallback
+  // when https://github.com/facebook/rocksdb/pull/1241/ is merged
+  std::shared_ptr<rocksdb::TransactionNotifierJniCallback>* handle =
+      reinterpret_cast<
+          std::shared_ptr<rocksdb::TransactionNotifierJniCallback>*>(jhandle);
+  delete handle;
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc
new file mode 100644
index 0000000000..85f2a194be
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::TransactionNotifier.
+
+#include "rocksjni/transaction_notifier_jnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+
+TransactionNotifierJniCallback::TransactionNotifierJniCallback(JNIEnv* env,
+    jobject jtransaction_notifier) : JniCallback(env, jtransaction_notifier) {
+  // we cache the method id for the JNI callback
+  m_jsnapshot_created_methodID =
+      AbstractTransactionNotifierJni::getSnapshotCreatedMethodId(env);
+}
+
+void TransactionNotifierJniCallback::SnapshotCreated(
+    const Snapshot* newSnapshot) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  assert(env != nullptr);
+
+  env->CallVoidMethod(m_jcallback_obj, m_jsnapshot_created_methodID,
+                      reinterpret_cast<jlong>(newSnapshot));
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallVoidMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  releaseJniEnv(attached_thread);
+}
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h b/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h
new file mode 100644
index 0000000000..8f67cdb8bc
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_notifier_jnicallback.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::TransactionNotifier.
+
+#ifndef JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
+
+#include <jni.h>
+
+#include "rocksdb/utilities/transaction.h"
+#include "rocksjni/jnicallback.h"
+
+namespace rocksdb {
+
+/**
+ * This class acts as a bridge between C++
+ * and Java. The methods in this class will be
+ * called back from the RocksDB TransactionDB or OptimisticTransactionDB
+ * (C++); we then call back to the appropriate Java method.
+ * This enables TransactionNotifier to be implemented in Java.
+ *
+ * Unlike RocksJava's Comparator JNI Callback, we do not attempt
+ * to reduce Java object allocations by caching the Snapshot object
+ * presented to the callback. This could be revisited in the future
+ * if performance is lacking.
+ */
+class TransactionNotifierJniCallback: public JniCallback,
+    public TransactionNotifier {
+ public:
+  TransactionNotifierJniCallback(JNIEnv* env, jobject jtransaction_notifier);
+  virtual void SnapshotCreated(const Snapshot* newSnapshot);
+
+ private:
+  jmethodID m_jsnapshot_created_methodID;
+};
+}  // namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_TRANSACTION_NOTIFIER_JNICALLBACK_H_
diff --git a/thirdparty/rocksdb/java/rocksjni/transaction_options.cc b/thirdparty/rocksdb/java/rocksjni/transaction_options.cc
new file mode 100644
index 0000000000..d18a5294af
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/transaction_options.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++
+// for rocksdb::TransactionOptions.
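[For context: every JNI callback class added by this patch follows the same bridge pattern described in the header above. The constructor pins the Java callback object with a global reference and caches the jmethodID it will later invoke; the C++ virtual method then calls back into Java and checks for a pending Java exception before returning. A minimal self-contained sketch of that pattern follows; BridgeCallback and onEvent are illustrative names, not RocksJava API, and error handling is elided.]

    #include <jni.h>

    class BridgeCallback {
     public:
      BridgeCallback(JNIEnv* env, jobject jcallback) {
        // Pin the Java object with a global ref so it outlives this call frame.
        m_jcallback_obj = env->NewGlobalRef(jcallback);
        // Cache the method id once; repeated lookups are comparatively costly.
        jclass jclazz = env->GetObjectClass(m_jcallback_obj);
        m_on_event_mid = env->GetMethodID(jclazz, "onEvent", "(J)V");
      }

      // Invoked from C++ (e.g. by the database engine); forwards into Java.
      void OnEvent(JNIEnv* env, jlong native_handle) {
        env->CallVoidMethod(m_jcallback_obj, m_on_event_mid, native_handle);
        if (env->ExceptionCheck()) {
          env->ExceptionDescribe();  // report, but never propagate into C++
        }
      }

      void Dispose(JNIEnv* env) { env->DeleteGlobalRef(m_jcallback_obj); }

     private:
      jobject m_jcallback_obj;
      jmethodID m_on_event_mid;
    };

[The real classes in this patch obtain the same effect through the JniCallback base class and the method-id helpers in rocksjni/portal.h, plus getJniEnv()/releaseJniEnv() so that callbacks arriving on non-JVM threads are attached to the JVM first.]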
+
+#include <jni.h>
+
+#include "include/org_rocksdb_TransactionOptions.h"
+
+#include "rocksdb/utilities/transaction_db.h"
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    newTransactionOptions
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_TransactionOptions_newTransactionOptions(
+    JNIEnv* /*env*/, jclass /*jcls*/) {
+  auto* opts = new rocksdb::TransactionOptions();
+  return reinterpret_cast<jlong>(opts);
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    isSetSnapshot
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isSetSnapshot(JNIEnv* /*env*/,
+                                                           jobject /*jobj*/,
+                                                           jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->set_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setSetSnapshot
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setSetSnapshot(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean jset_snapshot) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->set_snapshot = jset_snapshot;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    isDeadlockDetect
+ * Signature: (J)Z
+ */
+jboolean Java_org_rocksdb_TransactionOptions_isDeadlockDetect(JNIEnv* /*env*/,
+                                                              jobject /*jobj*/,
+                                                              jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setDeadlockDetect
+ * Signature: (JZ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetect(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jboolean jdeadlock_detect) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->deadlock_detect = jdeadlock_detect;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getLockTimeout
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getLockTimeout(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->lock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setLockTimeout
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setLockTimeout(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle,
+                                                        jlong jlock_timeout) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->lock_timeout = jlock_timeout;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getExpiration
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getExpiration(JNIEnv* /*env*/,
+                                                        jobject /*jobj*/,
+                                                        jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->expiration;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setExpiration
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setExpiration(JNIEnv* /*env*/,
+                                                       jobject /*jobj*/,
+                                                       jlong jhandle,
+                                                       jlong jexpiration) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->expiration = jexpiration;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getDeadlockDetectDepth
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getDeadlockDetectDepth(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->deadlock_detect_depth;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setDeadlockDetectDepth
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setDeadlockDetectDepth(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jdeadlock_detect_depth) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->deadlock_detect_depth = jdeadlock_detect_depth;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    getMaxWriteBatchSize
+ * Signature: (J)J
+ */
+jlong Java_org_rocksdb_TransactionOptions_getMaxWriteBatchSize(JNIEnv* /*env*/,
+                                                               jobject /*jobj*/,
+                                                               jlong jhandle) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  return opts->max_write_batch_size;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    setMaxWriteBatchSize
+ * Signature: (JJ)V
+ */
+void Java_org_rocksdb_TransactionOptions_setMaxWriteBatchSize(
+    JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle,
+    jlong jmax_write_batch_size) {
+  auto* opts = reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+  opts->max_write_batch_size = jmax_write_batch_size;
+}
+
+/*
+ * Class:     org_rocksdb_TransactionOptions
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TransactionOptions_disposeInternal(JNIEnv* /*env*/,
+                                                         jobject /*jobj*/,
+                                                         jlong jhandle) {
+  delete reinterpret_cast<rocksdb::TransactionOptions*>(jhandle);
+}
diff --git a/thirdparty/rocksdb/java/rocksjni/ttl.cc b/thirdparty/rocksdb/java/rocksjni/ttl.cc
index a66ad86d62..4b071e7b33 100644
--- a/thirdparty/rocksdb/java/rocksjni/ttl.cc
+++ b/thirdparty/rocksdb/java/rocksjni/ttl.cc
@@ -10,9 +10,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <jni.h>
+#include <memory>
 #include <string>
 #include <vector>
-#include <memory>
 
 #include "include/org_rocksdb_TtlDB.h"
 #include "rocksdb/utilities/db_ttl.h"
@@ -23,19 +23,19 @@
  * Method:    open
  * Signature: (JLjava/lang/String;IZ)J
  */
-jlong Java_org_rocksdb_TtlDB_open(JNIEnv* env,
-    jclass jcls, jlong joptions_handle, jstring jdb_path,
-    jint jttl, jboolean jread_only) {
+jlong Java_org_rocksdb_TtlDB_open(
+    JNIEnv* env, jclass, jlong joptions_handle, jstring jdb_path, jint jttl,
+    jboolean jread_only) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
-  if(db_path == nullptr) {
+  if (db_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return 0;
   }
 
   auto* opt = reinterpret_cast<rocksdb::Options*>(joptions_handle);
   rocksdb::DBWithTTL* db = nullptr;
-  rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, &db,
-      jttl, jread_only);
+  rocksdb::Status s =
+      rocksdb::DBWithTTL::Open(*opt, db_path, &db, jttl, jread_only);
   env->ReleaseStringUTFChars(jdb_path, db_path);
 
   // as TTLDB extends RocksDB on the java side, we can reuse
@@ -53,20 +53,19 @@ jlong Java_org_rocksdb_TtlDB_open(JNIEnv* env,
  * Method:    openCF
  * Signature: (JLjava/lang/String;[[B[J[IZ)[J
  */
-jlongArray
-    Java_org_rocksdb_TtlDB_openCF(
-    JNIEnv* env, jclass jcls, jlong jopt_handle, jstring jdb_path,
+jlongArray Java_org_rocksdb_TtlDB_openCF(
+    JNIEnv* env, jclass, jlong jopt_handle, jstring jdb_path,
     jobjectArray jcolumn_names, jlongArray jcolumn_options,
     jintArray jttls, jboolean jread_only) {
   const char* db_path = env->GetStringUTFChars(jdb_path, nullptr);
-  if(db_path == nullptr) {
+  if (db_path == nullptr) {
     // exception thrown: OutOfMemoryError
     return 0;
   }
 
   const jsize len_cols = env->GetArrayLength(jcolumn_names);
   jlong* jco = env->GetLongArrayElements(jcolumn_options, nullptr);
-  if(jco == nullptr) {
+  if (jco == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseStringUTFChars(jdb_path, db_path);
     return nullptr;
@@ -75,22 +74,21 @@ jlongArray
   std::vector<rocksdb::ColumnFamilyDescriptor> column_families;
   jboolean has_exception = JNI_FALSE;
   rocksdb::JniUtil::byteStrings<std::string>(
-      env,
-      jcolumn_names,
-      [](const char* str_data, const size_t str_len) {
-        return std::string(str_data, str_len);
-      },
-      [&jco, &column_families](size_t idx, std::string cf_name) {
-        rocksdb::ColumnFamilyOptions* cf_options =
-            reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[idx]);
-        column_families.push_back(
-            rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options));
-      },
-      &has_exception);
+      env, jcolumn_names,
+      [](const char* str_data, const size_t str_len) {
+        return std::string(str_data, str_len);
+      },
+      [&jco, &column_families](size_t idx, std::string cf_name) {
+        rocksdb::ColumnFamilyOptions* cf_options =
+            reinterpret_cast<rocksdb::ColumnFamilyOptions*>(jco[idx]);
+        column_families.push_back(
+            rocksdb::ColumnFamilyDescriptor(cf_name, *cf_options));
+      },
+      &has_exception);
 
   env->ReleaseLongArrayElements(jcolumn_options, jco, JNI_ABORT);
 
-  if(has_exception == JNI_TRUE) {
+  if (has_exception == JNI_TRUE) {
     // exception occurred
     env->ReleaseStringUTFChars(jdb_path, db_path);
     return nullptr;
@@ -98,13 +96,13 @@ jlongArray
 
   std::vector<int32_t> ttl_values;
   jint* jttlv = env->GetIntArrayElements(jttls, nullptr);
-  if(jttlv == nullptr) {
+  if (jttlv == nullptr) {
     // exception thrown: OutOfMemoryError
     env->ReleaseStringUTFChars(jdb_path, db_path);
     return nullptr;
   }
   const jsize len_ttls = env->GetArrayLength(jttls);
-  for(jsize i = 0; i < len_ttls; i++) {
+  for (jsize i = 0; i < len_ttls; i++) {
     ttl_values.push_back(jttlv[i]);
   }
   env->ReleaseIntArrayElements(jttls, jttlv, JNI_ABORT);
@@ -112,30 +110,30 @@ jlongArray
   auto* opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
   std::vector<rocksdb::ColumnFamilyHandle*> handles;
   rocksdb::DBWithTTL* db = nullptr;
-  rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, column_families,
-      &handles, &db, ttl_values, jread_only);
+  rocksdb::Status s = rocksdb::DBWithTTL::Open(
+      *opt, db_path, column_families, &handles, &db, ttl_values, jread_only);
 
   // we have now finished with db_path
   env->ReleaseStringUTFChars(jdb_path, db_path);
 
   // check if open operation was successful
   if (s.ok()) {
-    const jsize resultsLen = 1 + len_cols;  //db handle + column family handles
+    const jsize resultsLen = 1 + len_cols;  // db handle + column family handles
     std::unique_ptr<jlong[]> results =
         std::unique_ptr<jlong[]>(new jlong[resultsLen]);
     results[0] = reinterpret_cast<jlong>(db);
-    for(int i = 1; i <= len_cols; i++) {
+    for (int i = 1; i <= len_cols; i++) {
       results[i] = reinterpret_cast<jlong>(handles[i - 1]);
     }
 
     jlongArray jresults = env->NewLongArray(resultsLen);
-    if(jresults == nullptr) {
+    if (jresults == nullptr) {
       // exception thrown: OutOfMemoryError
       return nullptr;
     }
 
    env->SetLongArrayRegion(jresults, 0, resultsLen, results.get());
-    if(env->ExceptionCheck()) {
+    if (env->ExceptionCheck()) {
      // exception thrown: ArrayIndexOutOfBoundsException
      env->DeleteLocalRef(jresults);
      return nullptr;
@@ -148,17 +146,43 @@
   }
 }
 
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    disposeInternal
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TtlDB_disposeInternal(
+    JNIEnv*, jobject, jlong jhandle) {
+  auto* ttl_db = reinterpret_cast<rocksdb::DBWithTTL*>(jhandle);
+  assert(ttl_db != nullptr);
+  delete ttl_db;
+}
+
+/*
+ * Class:     org_rocksdb_TtlDB
+ * Method:    closeDatabase
+ * Signature: (J)V
+ */
+void Java_org_rocksdb_TtlDB_closeDatabase(
+    JNIEnv* /* env */, jclass, jlong /* jhandle */) {
+  //auto* ttl_db = reinterpret_cast<rocksdb::DBWithTTL*>(jhandle);
+  //assert(ttl_db != nullptr);
+  //rocksdb::Status s = ttl_db->Close();
+  //rocksdb::RocksDBExceptionJni::ThrowNew(env, s);
+
+  //TODO(AR) this is disabled until https://github.com/facebook/rocksdb/issues/4818 is resolved!
+}
+
 /*
  * Class:     org_rocksdb_TtlDB
  * Method:    createColumnFamilyWithTtl
  * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;[BJI)J;
  */
 jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(
-    JNIEnv* env, jobject jobj, jlong jdb_handle,
-    jbyteArray jcolumn_name, jlong jcolumn_options, jint jttl) {
-
+    JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jcolumn_name,
+    jlong jcolumn_options, jint jttl) {
   jbyte* cfname = env->GetByteArrayElements(jcolumn_name, nullptr);
-  if(cfname == nullptr) {
+  if (cfname == nullptr) {
     // exception thrown: OutOfMemoryError
     return 0;
   }
@@ -170,8 +194,8 @@ jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl(
   auto* db_handle = reinterpret_cast<rocksdb::DBWithTTL*>(jdb_handle);
   rocksdb::ColumnFamilyHandle* handle;
   rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl(
-      *cfOptions, std::string(reinterpret_cast<char*>(cfname),
-          len), &handle, jttl);
+      *cfOptions, std::string(reinterpret_cast<char*>(cfname), len), &handle,
+      jttl);
 
   env->ReleaseByteArrayElements(jcolumn_name, cfname, 0);
diff --git a/thirdparty/rocksdb/java/rocksjni/wal_filter.cc b/thirdparty/rocksdb/java/rocksjni/wal_filter.cc
new file mode 100644
index 0000000000..c74e54252e
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/wal_filter.cc
@@ -0,0 +1,23 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the "bridge" between Java and C++ for
+// rocksdb::WalFilter.
+
+#include <jni.h>
+
+#include "include/org_rocksdb_AbstractWalFilter.h"
+#include "rocksjni/wal_filter_jnicallback.h"
+
+/*
+ * Class:     org_rocksdb_AbstractWalFilter
+ * Method:    createNewWalFilter
+ * Signature: ()J
+ */
+jlong Java_org_rocksdb_AbstractWalFilter_createNewWalFilter(
+    JNIEnv* env, jobject jobj) {
+  auto* wal_filter = new rocksdb::WalFilterJniCallback(env, jobj);
+  return reinterpret_cast<jlong>(wal_filter);
+}
\ No newline at end of file
diff --git a/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.cc b/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.cc
new file mode 100644
index 0000000000..8fd909258f
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::WalFilter.
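[One detail worth calling out before the callback implementation that follows: LogRecordFound must hand two results back to C++ (the WalProcessingOption and the batch_changed flag), and the Java proxy packs both into a single jshort, with the option in the high byte and the flag in the low byte, which the C++ side then unpacks with shifts and masks. A standalone sketch of that packing scheme, under the assumption that option values fit in one byte; PackResult and UnpackResult are illustrative helper names, not part of this patch.]

    #include <cassert>
    #include <cstdint>

    // Pack: what the Java proxy conceptually does before returning the jshort.
    static int16_t PackResult(uint8_t processing_option, bool batch_changed) {
      return static_cast<int16_t>((processing_option << 8) |
                                  (batch_changed ? 0x1 : 0x0));
    }

    // Unpack: mirrors the shift-and-mask done in LogRecordFound below.
    static void UnpackResult(int16_t packed, uint8_t* processing_option,
                             bool* batch_changed) {
      *processing_option = static_cast<uint8_t>((packed >> 8) & 0xFF);
      *batch_changed = (packed & 0xFF) != 0;
    }

    int main() {
      uint8_t option = 0;
      bool changed = false;
      UnpackResult(PackResult(2, true), &option, &changed);
      assert(option == 2 && changed);  // round-trips both values
      return 0;
    }

[Packing into one jshort keeps the proxy down to a single JNI call per WAL record, which matters on this hot recovery path.]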
+
+#include "rocksjni/wal_filter_jnicallback.h"
+#include "rocksjni/portal.h"
+
+namespace rocksdb {
+WalFilterJniCallback::WalFilterJniCallback(
+    JNIEnv* env, jobject jwal_filter)
+    : JniCallback(env, jwal_filter) {
+  // Note: The name of a WalFilter will not change during its lifetime,
+  // so we cache it in a global var
+  jmethodID jname_mid = AbstractWalFilterJni::getNameMethodId(env);
+  if(jname_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+  jstring jname = (jstring)env->CallObjectMethod(m_jcallback_obj, jname_mid);
+  if(env->ExceptionCheck()) {
+    // exception thrown
+    return;
+  }
+  jboolean has_exception = JNI_FALSE;
+  m_name = JniUtil::copyString(env, jname,
+      &has_exception);  // also releases jname
+  if (has_exception == JNI_TRUE) {
+    // exception thrown
+    return;
+  }
+
+  m_column_family_log_number_map_mid =
+      AbstractWalFilterJni::getColumnFamilyLogNumberMapMethodId(env);
+  if(m_column_family_log_number_map_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+
+  m_log_record_found_proxy_mid =
+      AbstractWalFilterJni::getLogRecordFoundProxyMethodId(env);
+  if(m_log_record_found_proxy_mid == nullptr) {
+    // exception thrown: NoSuchMethodException or OutOfMemoryError
+    return;
+  }
+}
+
+void WalFilterJniCallback::ColumnFamilyLogNumberMap(
+    const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+    const std::map<std::string, uint32_t>& cf_name_id_map) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return;
+  }
+
+  jobject jcf_lognumber_map =
+      rocksdb::HashMapJni::fromCppMap(env, &cf_lognumber_map);
+  if (jcf_lognumber_map == nullptr) {
+    // exception occurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  jobject jcf_name_id_map =
+      rocksdb::HashMapJni::fromCppMap(env, &cf_name_id_map);
+  if (jcf_name_id_map == nullptr) {
+    // exception occurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    env->DeleteLocalRef(jcf_lognumber_map);
+    releaseJniEnv(attached_thread);
+    return;
+  }
+
+  env->CallVoidMethod(m_jcallback_obj,
+      m_column_family_log_number_map_mid,
+      jcf_lognumber_map,
+      jcf_name_id_map);
+
+  env->DeleteLocalRef(jcf_lognumber_map);
+  env->DeleteLocalRef(jcf_name_id_map);
+
+  if(env->ExceptionCheck()) {
+    // exception thrown from CallVoidMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+  }
+
+  releaseJniEnv(attached_thread);
+}
+
+ WalFilter::WalProcessingOption WalFilterJniCallback::LogRecordFound(
+    unsigned long long log_number, const std::string& log_file_name,
+    const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) {
+  jboolean attached_thread = JNI_FALSE;
+  JNIEnv* env = getJniEnv(&attached_thread);
+  if (env == nullptr) {
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  jstring jlog_file_name = JniUtil::toJavaString(env, &log_file_name);
+  if (jlog_file_name == nullptr) {
+    // exception occurred
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  jshort jlog_record_found_result = env->CallShortMethod(m_jcallback_obj,
+      m_log_record_found_proxy_mid,
+      static_cast<jlong>(log_number),
+      jlog_file_name,
+      reinterpret_cast<jlong>(&batch),
+      reinterpret_cast<jlong>(new_batch));
+
+  env->DeleteLocalRef(jlog_file_name);
+
+  if (env->ExceptionCheck()) {
+    // exception thrown from CallShortMethod
+    env->ExceptionDescribe();  // print out exception to stderr
+    releaseJniEnv(attached_thread);
+    return WalFilter::WalProcessingOption::kCorruptedRecord;
+  }
+
+  // unpack WalProcessingOption and batch_changed from jlog_record_found_result
+  jbyte jwal_processing_option_value = (jlog_record_found_result >> 8) & 0xFF;
+  jbyte jbatch_changed_value = jlog_record_found_result & 0xFF;
+
+  releaseJniEnv(attached_thread);
+
+  *batch_changed = jbatch_changed_value == JNI_TRUE;
+
+  return WalProcessingOptionJni::toCppWalProcessingOption(
+      jwal_processing_option_value);
+}
+
+const char* WalFilterJniCallback::Name() const {
+  return m_name.get();
+}
+
+}  // namespace rocksdb
\ No newline at end of file
diff --git a/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.h b/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.h
new file mode 100644
index 0000000000..df6394cef2
--- /dev/null
+++ b/thirdparty/rocksdb/java/rocksjni/wal_filter_jnicallback.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file implements the callback "bridge" between Java and C++ for
+// rocksdb::WalFilter.
+
+#ifndef JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+#define JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
+
+#include <jni.h>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "rocksdb/wal_filter.h"
+#include "rocksjni/jnicallback.h"
+
+namespace rocksdb {
+
+class WalFilterJniCallback : public JniCallback, public WalFilter {
+ public:
+  WalFilterJniCallback(
+      JNIEnv* env, jobject jwal_filter);
+  virtual void ColumnFamilyLogNumberMap(
+      const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+      const std::map<std::string, uint32_t>& cf_name_id_map);
+  virtual WalFilter::WalProcessingOption LogRecordFound(
+      unsigned long long log_number, const std::string& log_file_name,
+      const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed);
+  virtual const char* Name() const;
+
+ private:
+  std::unique_ptr<const char[]> m_name;
+  jmethodID m_column_family_log_number_map_mid;
+  jmethodID m_log_record_found_proxy_mid;
+};
+
+}  // namespace rocksdb
+
+#endif  // JAVA_ROCKSJNI_WAL_FILTER_JNICALLBACK_H_
diff --git a/thirdparty/rocksdb/java/rocksjni/write_batch.cc b/thirdparty/rocksdb/java/rocksjni/write_batch.cc
index e84f6ed7d1..f1b77446c0 100644
--- a/thirdparty/rocksdb/java/rocksjni/write_batch.cc
+++ b/thirdparty/rocksdb/java/rocksjni/write_batch.cc
@@ -27,19 +27,43 @@
  * Method:    newWriteBatch
  * Signature: (I)J
  */
-jlong Java_org_rocksdb_WriteBatch_newWriteBatch(
-    JNIEnv* env, jclass jcls, jint jreserved_bytes) {
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch__I(JNIEnv* /*env*/,
+                                                   jclass /*jcls*/,
+                                                   jint jreserved_bytes) {
   auto* wb = new rocksdb::WriteBatch(static_cast<size_t>(jreserved_bytes));
   return reinterpret_cast<jlong>(wb);
 }
 
+/*
+ * Class:     org_rocksdb_WriteBatch
+ * Method:    newWriteBatch
+ * Signature: ([BI)J
+ */
+jlong Java_org_rocksdb_WriteBatch_newWriteBatch___3BI(JNIEnv* env,
+                                                      jclass /*jcls*/,
+                                                      jbyteArray jserialized,
+                                                      jint jserialized_length) {
+  jboolean has_exception = JNI_FALSE;
+  std::string serialized = rocksdb::JniUtil::byteString<std::string>(
+      env, jserialized, jserialized_length,
+      [](const char* str, const size_t len) { return std::string(str, len); },
+      &has_exception);
+  if (has_exception == JNI_TRUE) {
+    // exception occurred
+    return 0;
+  }
+
+  auto* wb = new rocksdb::WriteBatch(serialized);
+  return reinterpret_cast<jlong>(wb);
+}
+
 /*
  * Class:     org_rocksdb_WriteBatch
  * Method:    count0
 * Signature:
(J)I */ -jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj, - jlong jwb_handle) { +jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -51,8 +75,8 @@ jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj, * Method: clear0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj, - jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -64,8 +88,9 @@ void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj, * Method: setSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_setSavePoint0( - JNIEnv* env, jobject jobj, jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_setSavePoint0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -77,8 +102,9 @@ void Java_org_rocksdb_WriteBatch_setSavePoint0( * Method: rollbackToSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0( - JNIEnv* env, jobject jobj, jlong jwb_handle) { +void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0(JNIEnv* env, + jobject /*jobj*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -90,22 +116,58 @@ void Java_org_rocksdb_WriteBatch_rollbackToSavePoint0( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_WriteBatch + * Method: popSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_popSavePoint(JNIEnv* env, jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto s = wb->PopSavePoint(); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: setMaxBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_WriteBatch_setMaxBytes(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jwb_handle, + jlong jmax_bytes) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + wb->SetMaxBytes(static_cast(jmax_bytes)); +} + /* * Class: org_rocksdb_WriteBatch * Method: put * Signature: (J[BI[BI)V */ -void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len, - jbyteArray jentry_value, jint jentry_value_len) { +void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI(JNIEnv* env, jobject jobj, + jlong jwb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, + jint jentry_value_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); - auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Put(key, value); + auto put = [&wb](rocksdb::Slice key, rocksdb::Slice value) { + return wb->Put(key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -114,18 +176,20 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BI( * Signature: (J[BI[BIJ)V */ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len, + JNIEnv* env, jobject jobj, jlong jwb_handle, 
jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto put = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Put(cf_handle, key, value); + auto put = [&wb, &cf_handle](rocksdb::Slice key, rocksdb::Slice value) { + return wb->Put(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -134,16 +198,18 @@ void Java_org_rocksdb_WriteBatch_put__J_3BI_3BIJ( * Signature: (J[BI[BI)V */ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len, + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); - auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Merge(key, value); + auto merge = [&wb](rocksdb::Slice key, rocksdb::Slice value) { + return wb->Merge(key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + merge, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -152,52 +218,106 @@ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BI( * Signature: (J[BI[BIJ)V */ void Java_org_rocksdb_WriteBatch_merge__J_3BI_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len, + JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jkey, jint jkey_len, jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto merge = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wb->Merge(cf_handle, key, value); + auto merge = [&wb, &cf_handle](rocksdb::Slice key, rocksdb::Slice value) { + return wb->Merge(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + merge, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatch - * Method: remove + * Method: delete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_remove__J_3BI( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len) { +void Java_org_rocksdb_WriteBatch_delete__J_3BI(JNIEnv* env, jobject jobj, + jlong jwb_handle, + jbyteArray jkey, jint jkey_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); - auto remove = [&wb] (rocksdb::Slice key) { - wb->Delete(key); + auto remove = [&wb](rocksdb::Slice key) { return wb->Delete(key); }; + std::unique_ptr status = + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, 
status); + } +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: delete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_delete__J_3BIJ(JNIEnv* env, jobject jobj, + jlong jwb_handle, + jbyteArray jkey, jint jkey_len, + jlong jcf_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto remove = [&wb, &cf_handle](rocksdb::Slice key) { + return wb->Delete(cf_handle, key); + }; + std::unique_ptr status = + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: singleDelete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WriteBatch_singleDelete__J_3BI(JNIEnv* env, jobject jobj, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + auto single_delete = [&wb](rocksdb::Slice key) { + return wb->SingleDelete(key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(single_delete, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatch - * Method: remove + * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( - JNIEnv* env, jobject jobj, jlong jwb_handle, - jbyteArray jkey, jint jkey_len, jlong jcf_handle) { +void Java_org_rocksdb_WriteBatch_singleDelete__J_3BIJ(JNIEnv* env, jobject jobj, + jlong jwb_handle, + jbyteArray jkey, + jint jkey_len, + jlong jcf_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto remove = [&wb, &cf_handle] (rocksdb::Slice key) { - wb->Delete(cf_handle, key); + auto single_delete = [&wb, &cf_handle](rocksdb::Slice key) { + return wb->SingleDelete(cf_handle, key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(single_delete, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -205,19 +325,20 @@ void Java_org_rocksdb_WriteBatch_remove__J_3BIJ( * Method: deleteRange * Signature: (J[BI[BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( - JNIEnv*, jobject, jlong, jbyteArray, jint, jbyteArray, jint); - void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BI( JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jbegin_key, jint jbegin_key_len, jbyteArray jend_key, jint jend_key_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); auto deleteRange = [&wb](rocksdb::Slice beginKey, rocksdb::Slice endKey) { - wb->DeleteRange(beginKey, endKey); + return wb->DeleteRange(beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = + rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + jbegin_key_len, jend_key, jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -235,10 +356,14 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( assert(cf_handle != nullptr); auto deleteRange = [&wb, &cf_handle](rocksdb::Slice beginKey, 
rocksdb::Slice endKey) { - wb->DeleteRange(cf_handle, beginKey, endKey); + return wb->DeleteRange(cf_handle, beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = + rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + jbegin_key_len, jend_key, jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -246,15 +371,17 @@ void Java_org_rocksdb_WriteBatch_deleteRange__J_3BI_3BIJ( * Method: putLogData * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatch_putLogData( - JNIEnv* env, jobject jobj, jlong jwb_handle, jbyteArray jblob, - jint jblob_len) { +void Java_org_rocksdb_WriteBatch_putLogData(JNIEnv* env, jobject jobj, + jlong jwb_handle, jbyteArray jblob, + jint jblob_len) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); - auto putLogData = [&wb] (rocksdb::Slice blob) { - wb->PutLogData(blob); - }; - rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + auto putLogData = [&wb](rocksdb::Slice blob) { return wb->PutLogData(blob); }; + std::unique_ptr status = + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -262,13 +389,14 @@ void Java_org_rocksdb_WriteBatch_putLogData( * Method: iterate * Signature: (JJ)V */ -void Java_org_rocksdb_WriteBatch_iterate( - JNIEnv* env , jobject jobj, jlong jwb_handle, jlong handlerHandle) { +void Java_org_rocksdb_WriteBatch_iterate(JNIEnv* env, jobject /*jobj*/, + jlong jwb_handle, + jlong handlerHandle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); rocksdb::Status s = wb->Iterate( - reinterpret_cast(handlerHandle)); + reinterpret_cast(handlerHandle)); if (s.ok()) { return; @@ -276,13 +404,189 @@ void Java_org_rocksdb_WriteBatch_iterate( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_WriteBatch + * Method: data + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_WriteBatch_data(JNIEnv* env, jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto data = wb->Data(); + return rocksdb::JniUtil::copyBytes(env, data); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: getDataSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_WriteBatch_getDataSize(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto data_size = wb->GetDataSize(); + return static_cast(data_size); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasPut + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteBatch_hasPut(JNIEnv* /*env*/, jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasPut(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasDelete + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteBatch_hasDelete(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasDelete(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasSingleDelete + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasSingleDelete( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return 
wb->HasSingleDelete(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasDeleteRange + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasDeleteRange( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasDeleteRange(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasMerge + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasMerge( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasMerge(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasBeginPrepare + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasBeginPrepare( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasBeginPrepare(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasEndPrepare + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasEndPrepare( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasEndPrepare(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasCommit + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasCommit( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasCommit(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: hasRollback + * Signature: (J)Z + */ +JNIEXPORT jboolean JNICALL Java_org_rocksdb_WriteBatch_hasRollback( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + return wb->HasRollback(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: markWalTerminationPoint + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatch_markWalTerminationPoint(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + wb->MarkWalTerminationPoint(); +} + +/* + * Class: org_rocksdb_WriteBatch + * Method: getWalTerminationPoint + * Signature: (J)Lorg/rocksdb/WriteBatch/SavePoint; + */ +jobject Java_org_rocksdb_WriteBatch_getWalTerminationPoint(JNIEnv* env, + jobject /*jobj*/, + jlong jwb_handle) { + auto* wb = reinterpret_cast(jwb_handle); + assert(wb != nullptr); + + auto save_point = wb->GetWalTerminationPoint(); + return rocksdb::WriteBatchSavePointJni::construct(env, save_point); +} + /* * Class: org_rocksdb_WriteBatch * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WriteBatch_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { auto* wb = reinterpret_cast(handle); assert(wb != nullptr); delete wb; @@ -293,21 +597,8 @@ void Java_org_rocksdb_WriteBatch_disposeInternal( * Method: createNewHandler0 * Signature: ()J */ -jlong Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( - JNIEnv* env, jobject jobj) { +jlong Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0(JNIEnv* env, + jobject jobj) { auto* wbjnic = new rocksdb::WriteBatchHandlerJniCallback(env, jobj); return reinterpret_cast(wbjnic); } - -/* - * Class: org_rocksdb_WriteBatch_Handler - * Method: disposeInternal - * 
Signature: (J)V - */ -void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - auto* wbjnic = - reinterpret_cast(handle); - assert(wbjnic != nullptr); - delete wbjnic; -} diff --git a/thirdparty/rocksdb/java/rocksjni/write_batch_test.cc b/thirdparty/rocksdb/java/rocksjni/write_batch_test.cc index 199ad239d7..266fb4abf7 100644 --- a/thirdparty/rocksdb/java/rocksjni/write_batch_test.cc +++ b/thirdparty/rocksdb/java/rocksjni/write_batch_test.cc @@ -30,8 +30,9 @@ * Method: getContents * Signature: (J)[B */ -jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( - JNIEnv* env, jclass jclazz, jlong jwb_handle) { +jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(JNIEnv* env, + jclass /*jclazz*/, + jlong jwb_handle) { auto* b = reinterpret_cast(jwb_handle); assert(b != nullptr); @@ -55,8 +56,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr); int count = 0; rocksdb::Arena arena; - rocksdb::ScopedArenaIterator iter(mem->NewIterator( - rocksdb::ReadOptions(), &arena)); + rocksdb::ScopedArenaIterator iter( + mem->NewIterator(rocksdb::ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; ikey.clear(); @@ -87,8 +88,32 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( state.append(")"); count++; break; + case rocksdb::kTypeSingleDeletion: + state.append("SingleDelete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeRangeDeletion: + state.append("DeleteRange("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeLogData: + state.append("LogData("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; default: assert(false); + state.append("Err:Expected("); + state.append(std::to_string(ikey.type)); + state.append(")"); + count++; break; } state.append("@"); @@ -96,20 +121,25 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( } if (!s.ok()) { state.append(s.ToString()); - } else if (count != rocksdb::WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); + } else if (rocksdb::WriteBatchInternal::Count(b) != count) { + state.append("Err:CountMismatch(expected="); + state.append(std::to_string(rocksdb::WriteBatchInternal::Count(b))); + state.append(", actual="); + state.append(std::to_string(count)); + state.append(")"); } delete mem->Unref(); jbyteArray jstate = env->NewByteArray(static_cast(state.size())); - if(jstate == nullptr) { + if (jstate == nullptr) { // exception thrown: OutOfMemoryError return nullptr; } - env->SetByteArrayRegion(jstate, 0, static_cast(state.size()), - const_cast(reinterpret_cast(state.c_str()))); - if(env->ExceptionCheck()) { + env->SetByteArrayRegion( + jstate, 0, static_cast(state.size()), + const_cast(reinterpret_cast(state.c_str()))); + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jstate); return nullptr; @@ -124,7 +154,7 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( * Signature: (JJ)V */ void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( - JNIEnv* env, jclass jclazz, jlong jwb_handle, jlong jsn) { + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jwb_handle, jlong jsn) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -137,8 +167,9 @@ void 
Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( * Method: sequence * Signature: (J)J */ -jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( - JNIEnv* env, jclass jclazz, jlong jwb_handle) { +jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence(JNIEnv* /*env*/, + jclass /*jclazz*/, + jlong jwb_handle) { auto* wb = reinterpret_cast(jwb_handle); assert(wb != nullptr); @@ -150,8 +181,10 @@ jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( * Method: append * Signature: (JJ)V */ -void Java_org_rocksdb_WriteBatchTestInternalHelper_append( - JNIEnv* env, jclass jclazz, jlong jwb_handle_1, jlong jwb_handle_2) { +void Java_org_rocksdb_WriteBatchTestInternalHelper_append(JNIEnv* /*env*/, + jclass /*jclazz*/, + jlong jwb_handle_1, + jlong jwb_handle_2) { auto* wb1 = reinterpret_cast(jwb_handle_1); assert(wb1 != nullptr); auto* wb2 = reinterpret_cast(jwb_handle_2); diff --git a/thirdparty/rocksdb/java/rocksjni/write_batch_with_index.cc b/thirdparty/rocksdb/java/rocksjni/write_batch_with_index.cc index 53f2a11d12..12ca299a9d 100644 --- a/thirdparty/rocksdb/java/rocksjni/write_batch_with_index.cc +++ b/thirdparty/rocksdb/java/rocksjni/write_batch_with_index.cc @@ -6,10 +6,10 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::WriteBatchWithIndex methods from Java side. +#include "rocksdb/utilities/write_batch_with_index.h" #include "include/org_rocksdb_WBWIRocksIterator.h" #include "include/org_rocksdb_WriteBatchWithIndex.h" #include "rocksdb/comparator.h" -#include "rocksdb/utilities/write_batch_with_index.h" #include "rocksjni/portal.h" /* @@ -18,7 +18,7 @@ * Signature: ()J */ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( - JNIEnv* env, jclass jcls) { + JNIEnv* /*env*/, jclass /*jcls*/) { auto* wbwi = new rocksdb::WriteBatchWithIndex(); return reinterpret_cast(wbwi); } @@ -29,25 +29,44 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( * Signature: (Z)J */ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( - JNIEnv* env, jclass jcls, jboolean joverwrite_key) { - auto* wbwi = - new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0, - static_cast(joverwrite_key)); + JNIEnv* /*env*/, jclass /*jcls*/, jboolean joverwrite_key) { + auto* wbwi = new rocksdb::WriteBatchWithIndex( + rocksdb::BytewiseComparator(), 0, static_cast(joverwrite_key)); return reinterpret_cast(wbwi); } /* * Class: org_rocksdb_WriteBatchWithIndex * Method: newWriteBatchWithIndex - * Signature: (JIZ)J - */ -jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( - JNIEnv* env, jclass jcls, jlong jfallback_index_comparator_handle, - jint jreserved_bytes, jboolean joverwrite_key) { - auto* wbwi = - new rocksdb::WriteBatchWithIndex( - reinterpret_cast(jfallback_index_comparator_handle), - static_cast(jreserved_bytes), static_cast(joverwrite_key)); + * Signature: (JBIZ)J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JBIZ( + JNIEnv* /*env*/, jclass /*jcls*/, jlong jfallback_index_comparator_handle, + jbyte jcomparator_type, jint jreserved_bytes, jboolean joverwrite_key) { + rocksdb::Comparator* fallback_comparator = nullptr; + switch (jcomparator_type) { + // JAVA_COMPARATOR + case 0x0: + fallback_comparator = reinterpret_cast( + jfallback_index_comparator_handle); + break; + + // JAVA_DIRECT_COMPARATOR + case 0x1: + fallback_comparator = + reinterpret_cast( + jfallback_index_comparator_handle); + break; + + // JAVA_NATIVE_COMPARATOR_WRAPPER 
+ case 0x2: + fallback_comparator = reinterpret_cast( + jfallback_index_comparator_handle); + break; + } + auto* wbwi = new rocksdb::WriteBatchWithIndex( + fallback_comparator, static_cast(jreserved_bytes), + static_cast(joverwrite_key)); return reinterpret_cast(wbwi); } @@ -56,8 +75,9 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( * Method: count0 * Signature: (J)I */ -jint Java_org_rocksdb_WriteBatchWithIndex_count0( - JNIEnv* env, jobject jobj, jlong jwbwi_handle) { +jint Java_org_rocksdb_WriteBatchWithIndex_count0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -74,11 +94,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BI( jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); - auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Put(key, value); + auto put = [&wbwi](rocksdb::Slice key, rocksdb::Slice value) { + return wbwi->Put(key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -94,11 +117,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_put__J_3BI_3BIJ( assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto put = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Put(cf_handle, key, value); + auto put = [&wbwi, &cf_handle](rocksdb::Slice key, rocksdb::Slice value) { + return wbwi->Put(cf_handle, key, value); }; - rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + put, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -111,11 +137,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BI( jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); - auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Merge(key, value); + auto merge = [&wbwi](rocksdb::Slice key, rocksdb::Slice value) { + return wbwi->Merge(key, value); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = rocksdb::JniUtil::kv_op( + merge, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -131,45 +160,98 @@ void Java_org_rocksdb_WriteBatchWithIndex_merge__J_3BI_3BIJ( assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto merge = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { - wbwi->Merge(cf_handle, key, value); + auto merge = [&wbwi, &cf_handle](rocksdb::Slice key, rocksdb::Slice value) { + return wbwi->Merge(cf_handle, key, value); + }; + std::unique_ptr status = rocksdb::JniUtil::kv_op( + merge, env, jobj, jkey, jkey_len, jentry_value, jentry_value_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + 
* Class: org_rocksdb_WriteBatchWithIndex + * Method: delete + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BI(JNIEnv* env, + jobject jobj, + jlong jwbwi_handle, + jbyteArray jkey, + jint jkey_len) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + auto remove = [&wbwi](rocksdb::Slice key) { return wbwi->Delete(key); }; + std::unique_ptr status = + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: delete + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_delete__J_3BIJ( + JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, + jint jkey_len, jlong jcf_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto remove = [&wbwi, &cf_handle](rocksdb::Slice key) { + return wbwi->Delete(cf_handle, key); }; - rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, - jentry_value_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: remove + * Method: singleDelete * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BI( +void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BI( JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); - auto remove = [&wbwi] (rocksdb::Slice key) { - wbwi->Delete(key); + auto single_delete = [&wbwi](rocksdb::Slice key) { + return wbwi->SingleDelete(key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(single_delete, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* * Class: org_rocksdb_WriteBatchWithIndex - * Method: remove + * Method: singleDelete * Signature: (J[BIJ)V */ -void Java_org_rocksdb_WriteBatchWithIndex_remove__J_3BIJ( +void Java_org_rocksdb_WriteBatchWithIndex_singleDelete__J_3BIJ( JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto* cf_handle = reinterpret_cast(jcf_handle); assert(cf_handle != nullptr); - auto remove = [&wbwi, &cf_handle] (rocksdb::Slice key) { - wbwi->Delete(cf_handle, key); + auto single_delete = [&wbwi, &cf_handle](rocksdb::Slice key) { + return wbwi->SingleDelete(cf_handle, key); }; - rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(single_delete, env, jobj, jkey, jkey_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -183,10 +265,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BI( auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); auto deleteRange = [&wbwi](rocksdb::Slice beginKey, rocksdb::Slice endKey) { - wbwi->DeleteRange(beginKey, endKey); + return wbwi->DeleteRange(beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, 
jend_key_len); + std::unique_ptr status = + rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + jbegin_key_len, jend_key, jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -204,10 +290,14 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( assert(cf_handle != nullptr); auto deleteRange = [&wbwi, &cf_handle](rocksdb::Slice beginKey, rocksdb::Slice endKey) { - wbwi->DeleteRange(cf_handle, beginKey, endKey); + return wbwi->DeleteRange(cf_handle, beginKey, endKey); }; - rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, jbegin_key_len, - jend_key, jend_key_len); + std::unique_ptr status = + rocksdb::JniUtil::kv_op(deleteRange, env, jobj, jbegin_key, + jbegin_key_len, jend_key, jend_key_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -215,15 +305,20 @@ void Java_org_rocksdb_WriteBatchWithIndex_deleteRange__J_3BI_3BIJ( * Method: putLogData * Signature: (J[BI)V */ -void Java_org_rocksdb_WriteBatchWithIndex_putLogData( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jbyteArray jblob, - jint jblob_len) { +void Java_org_rocksdb_WriteBatchWithIndex_putLogData(JNIEnv* env, jobject jobj, + jlong jwbwi_handle, + jbyteArray jblob, + jint jblob_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); - auto putLogData = [&wbwi] (rocksdb::Slice blob) { - wbwi->PutLogData(blob); + auto putLogData = [&wbwi](rocksdb::Slice blob) { + return wbwi->PutLogData(blob); }; - rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + std::unique_ptr status = + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); + if (status != nullptr && !status->ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, status); + } } /* @@ -231,8 +326,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_putLogData( * Method: clear * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_clear0( - JNIEnv* env, jobject jobj, jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_clear0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -244,8 +340,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_clear0( * Method: setSavePoint0 * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0( - JNIEnv* env, jobject jobj, jlong jwbwi_handle) { +void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -258,7 +355,7 @@ void Java_org_rocksdb_WriteBatchWithIndex_setSavePoint0( * Signature: (J)V */ void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( - JNIEnv* env, jobject jobj, jlong jwbwi_handle) { + JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); assert(wbwi != nullptr); @@ -271,13 +368,66 @@ void Java_org_rocksdb_WriteBatchWithIndex_rollbackToSavePoint0( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: popSavePoint + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_popSavePoint(JNIEnv* env, + jobject /*jobj*/, + jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + auto s = wbwi->PopSavePoint(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: 
org_rocksdb_WriteBatchWithIndex + * Method: setMaxBytes + * Signature: (JJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_setMaxBytes(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle, + jlong jmax_bytes) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + wbwi->SetMaxBytes(static_cast(jmax_bytes)); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: getWriteBatch + * Signature: (J)Lorg/rocksdb/WriteBatch; + */ +jobject Java_org_rocksdb_WriteBatchWithIndex_getWriteBatch(JNIEnv* env, + jobject /*jobj*/, + jlong jwbwi_handle) { + auto* wbwi = reinterpret_cast(jwbwi_handle); + assert(wbwi != nullptr); + + auto* wb = wbwi->GetWriteBatch(); + + // TODO(AR) is the `wb` object owned by us? + return rocksdb::WriteBatchJni::construct(env, wb); +} + /* * Class: org_rocksdb_WriteBatchWithIndex * Method: iterator0 * Signature: (J)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( - JNIEnv* env, jobject jobj, jlong jwbwi_handle) { +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* wbwi_iterator = wbwi->NewIterator(); return reinterpret_cast(wbwi_iterator); @@ -288,8 +438,10 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( * Method: iterator1 * Signature: (JJ)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jcf_handle) { +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle, + jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = reinterpret_cast(jcf_handle); auto* wbwi_iterator = wbwi->NewIterator(cf_handle); @@ -301,9 +453,11 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( * Method: iteratorWithBase * Signature: (JJJ)J */ -jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jcf_handle, - jlong jbi_handle) { +jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong jwbwi_handle, + jlong jcf_handle, + jlong jbi_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* cf_handle = reinterpret_cast(jcf_handle); auto* base_iterator = reinterpret_cast(jbi_handle); @@ -317,7 +471,7 @@ jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( * Signature: (JJ[BI)[B */ jbyteArray JNICALL Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jdbopt_handle, + JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* dbopt = reinterpret_cast(jdbopt_handle); @@ -335,17 +489,16 @@ jbyteArray JNICALL Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BI( * Signature: (JJ[BIJ)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jdbopt_handle, + JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdbopt_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* dbopt = reinterpret_cast(jdbopt_handle); auto* cf_handle = reinterpret_cast(jcf_handle); - auto getter = - [&wbwi, &cf_handle, &dbopt](const rocksdb::Slice& key, - std::string* value) { - return wbwi->GetFromBatch(cf_handle, *dbopt, key, value); - }; + auto getter = [&wbwi, &cf_handle, &dbopt](const 
rocksdb::Slice& key, + std::string* value) { + return wbwi->GetFromBatch(cf_handle, *dbopt, key, value); + }; return rocksdb::JniUtil::v_op(getter, env, jkey, jkey_len); } @@ -356,16 +509,16 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatch__JJ_3BIJ( * Signature: (JJJ[BI)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BI( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jdb_handle, + JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle, jlong jreadopt_handle, jbyteArray jkey, jint jkey_len) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* db = reinterpret_cast(jdb_handle); auto* readopt = reinterpret_cast(jreadopt_handle); - auto getter = - [&wbwi, &db, &readopt](const rocksdb::Slice& key, std::string* value) { - return wbwi->GetFromBatchAndDB(db, *readopt, key, value); - }; + auto getter = [&wbwi, &db, &readopt](const rocksdb::Slice& key, + std::string* value) { + return wbwi->GetFromBatchAndDB(db, *readopt, key, value); + }; return rocksdb::JniUtil::v_op(getter, env, jkey, jkey_len); } @@ -376,18 +529,17 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BI( * Signature: (JJJ[BIJ)[B */ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BIJ( - JNIEnv* env, jobject jobj, jlong jwbwi_handle, jlong jdb_handle, + JNIEnv* env, jobject /*jobj*/, jlong jwbwi_handle, jlong jdb_handle, jlong jreadopt_handle, jbyteArray jkey, jint jkey_len, jlong jcf_handle) { auto* wbwi = reinterpret_cast(jwbwi_handle); auto* db = reinterpret_cast(jdb_handle); auto* readopt = reinterpret_cast(jreadopt_handle); auto* cf_handle = reinterpret_cast(jcf_handle); - auto getter = - [&wbwi, &db, &cf_handle, &readopt](const rocksdb::Slice& key, - std::string* value) { - return wbwi->GetFromBatchAndDB(db, *readopt, cf_handle, key, value); - }; + auto getter = [&wbwi, &db, &cf_handle, &readopt](const rocksdb::Slice& key, + std::string* value) { + return wbwi->GetFromBatchAndDB(db, *readopt, cf_handle, key, value); + }; return rocksdb::JniUtil::v_op(getter, env, jkey, jkey_len); } @@ -397,8 +549,9 @@ jbyteArray Java_org_rocksdb_WriteBatchWithIndex_getFromBatchAndDB__JJJ_3BIJ( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { auto* wbwi = reinterpret_cast(handle); assert(wbwi != nullptr); delete wbwi; @@ -411,8 +564,9 @@ void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal( * Method: disposeInternal * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_disposeInternal(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); assert(it != nullptr); delete it; @@ -423,8 +577,9 @@ void Java_org_rocksdb_WBWIRocksIterator_disposeInternal( * Method: isValid0 * Signature: (J)Z */ -jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0( - JNIEnv* env, jobject jobj, jlong handle) { +jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { return reinterpret_cast(handle)->Valid(); } @@ -433,8 +588,9 @@ jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0( * Method: seekToFirst0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0( - JNIEnv* env, jobject jobj, jlong handle) { +void 
Java_org_rocksdb_WBWIRocksIterator_seekToFirst0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { reinterpret_cast(handle)->SeekToFirst(); } @@ -443,8 +599,9 @@ void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0( * Method: seekToLast0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_seekToLast0( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_seekToLast0(JNIEnv* /*env*/, + jobject /*jobj*/, + jlong handle) { reinterpret_cast(handle)->SeekToLast(); } @@ -453,8 +610,8 @@ void Java_org_rocksdb_WBWIRocksIterator_seekToLast0( * Method: next0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_next0( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_next0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle) { reinterpret_cast(handle)->Next(); } @@ -463,8 +620,8 @@ void Java_org_rocksdb_WBWIRocksIterator_next0( * Method: prev0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_prev0( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_prev0(JNIEnv* /*env*/, jobject /*jobj*/, + jlong handle) { reinterpret_cast(handle)->Prev(); } @@ -473,31 +630,54 @@ void Java_org_rocksdb_WBWIRocksIterator_prev0( * Method: seek0 * Signature: (J[BI)V */ -void Java_org_rocksdb_WBWIRocksIterator_seek0( - JNIEnv* env, jobject jobj, jlong handle, jbyteArray jtarget, - jint jtarget_len) { +void Java_org_rocksdb_WBWIRocksIterator_seek0(JNIEnv* env, jobject /*jobj*/, + jlong handle, jbyteArray jtarget, + jint jtarget_len) { auto* it = reinterpret_cast(handle); jbyte* target = env->GetByteArrayElements(jtarget, nullptr); - if(target == nullptr) { + if (target == nullptr) { // exception thrown: OutOfMemoryError return; } - rocksdb::Slice target_slice( - reinterpret_cast(target), jtarget_len); + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); it->Seek(target_slice); env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); } +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seekForPrev0 + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seekForPrev0(JNIEnv* env, + jobject /*jobj*/, + jlong handle, + jbyteArray jtarget, + jint jtarget_len) { + auto* it = reinterpret_cast(handle); + jbyte* target = env->GetByteArrayElements(jtarget, nullptr); + if (target == nullptr) { + // exception thrown: OutOfMemoryError + return; + } + + rocksdb::Slice target_slice(reinterpret_cast(target), jtarget_len); + + it->SeekForPrev(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + /* * Class: org_rocksdb_WBWIRocksIterator * Method: status0 * Signature: (J)V */ -void Java_org_rocksdb_WBWIRocksIterator_status0( - JNIEnv* env, jobject jobj, jlong handle) { +void Java_org_rocksdb_WBWIRocksIterator_status0(JNIEnv* env, jobject /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); rocksdb::Status s = it->status(); @@ -513,41 +693,25 @@ void Java_org_rocksdb_WBWIRocksIterator_status0( * Method: entry1 * Signature: (J)[J */ -jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1( - JNIEnv* env, jobject jobj, jlong handle) { +jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1(JNIEnv* env, + jobject /*jobj*/, + jlong handle) { auto* it = reinterpret_cast(handle); const rocksdb::WriteEntry& we = it->Entry(); jlong results[3]; - //set the type of the write entry - switch (we.type) { - case rocksdb::kPutRecord: - results[0] = 0x1; - break; - - case rocksdb::kMergeRecord: - results[0] = 0x2; - break; - - case 
rocksdb::kDeleteRecord: - results[0] = 0x4; - break; - - case rocksdb::kLogDataRecord: - results[0] = 0x8; - break; - - default: - results[0] = 0x0; - } + // set the type of the write entry + results[0] = rocksdb::WriteTypeJni::toJavaWriteType(we.type); - // key_slice and value_slice will be freed by org.rocksdb.DirectSlice#close + // NOTE: key_slice and value_slice will be freed by + // org.rocksdb.DirectSlice#close auto* key_slice = new rocksdb::Slice(we.key.data(), we.key.size()); results[1] = reinterpret_cast(key_slice); - if (we.type == rocksdb::kDeleteRecord - || we.type == rocksdb::kLogDataRecord) { + if (we.type == rocksdb::kDeleteRecord || + we.type == rocksdb::kSingleDeleteRecord || + we.type == rocksdb::kLogDataRecord) { // set native handle of value slice to null if no value available results[2] = 0; } else { @@ -556,9 +720,9 @@ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1( } jlongArray jresults = env->NewLongArray(3); - if(jresults == nullptr) { + if (jresults == nullptr) { // exception thrown: OutOfMemoryError - if(results[2] != 0) { + if (results[2] != 0) { auto* value_slice = reinterpret_cast(results[2]); delete value_slice; } @@ -567,10 +731,10 @@ jlongArray Java_org_rocksdb_WBWIRocksIterator_entry1( } env->SetLongArrayRegion(jresults, 0, 3, results); - if(env->ExceptionCheck()) { + if (env->ExceptionCheck()) { // exception thrown: ArrayIndexOutOfBoundsException env->DeleteLocalRef(jresults); - if(results[2] != 0) { + if (results[2] != 0) { auto* value_slice = reinterpret_cast(results[2]); delete value_slice; } diff --git a/thirdparty/rocksdb/java/rocksjni/write_buffer_manager.cc b/thirdparty/rocksdb/java/rocksjni/write_buffer_manager.cc new file mode 100644 index 0000000000..043f69031c --- /dev/null +++ b/thirdparty/rocksdb/java/rocksjni/write_buffer_manager.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
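The new rocksjni/write_buffer_manager.cc below exposes rocksdb::WriteBufferManager to Java; it caps total memtable memory and can optionally charge that memory against a block cache. As a rough sketch of what the binding constructs (the sizes are illustrative; NewLRUCache, WriteBufferManager and DBOptions::write_buffer_manager are from RocksDB's public C++ headers):

    #include <memory>

    #include "rocksdb/cache.h"
    #include "rocksdb/options.h"
    #include "rocksdb/write_buffer_manager.h"

    int main() {
      // Share one LRU cache between block-cache usage and memtable accounting.
      std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(256 << 20);

      // Cap total memtable memory at 64 MiB, charged against the shared cache;
      // this mirrors what newWriteBufferManager() below heap-allocates for the JVM.
      auto wbm = std::make_shared<rocksdb::WriteBufferManager>(64 << 20, cache);

      rocksdb::Options options;
      options.write_buffer_manager = wbm;  // takes effect when a DB is opened
      return 0;
    }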
+ +#include <jni.h> + +#include "include/org_rocksdb_WriteBufferManager.h" + +#include "rocksdb/cache.h" +#include "rocksdb/write_buffer_manager.h" + +/* + * Class: org_rocksdb_WriteBufferManager + * Method: newWriteBufferManager + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_WriteBufferManager_newWriteBufferManager( + JNIEnv* /*env*/, jclass /*jclazz*/, jlong jbuffer_size, jlong jcache_handle) { + auto* cache_ptr = + reinterpret_cast<std::shared_ptr<rocksdb::Cache> *>(jcache_handle); + auto* write_buffer_manager = new std::shared_ptr<rocksdb::WriteBufferManager>( + std::make_shared<rocksdb::WriteBufferManager>(jbuffer_size, *cache_ptr)); + return reinterpret_cast<jlong>(write_buffer_manager); +} + +/* + * Class: org_rocksdb_WriteBufferManager + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBufferManager_disposeInternal( + JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { + auto* write_buffer_manager = + reinterpret_cast<std::shared_ptr<rocksdb::WriteBufferManager> *>(jhandle); + assert(write_buffer_manager != nullptr); + delete write_buffer_manager; +} diff --git a/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc b/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc index 0f00766c53..bf9001110a 100644 --- a/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.cc @@ -12,14 +12,11 @@ namespace rocksdb { WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( JNIEnv* env, jobject jWriteBatchHandler) - : m_env(env) { - - // Note: we want to access the Java WriteBatchHandler instance - // across multiple method calls, so we create a global ref - assert(jWriteBatchHandler != nullptr); - m_jWriteBatchHandler = env->NewGlobalRef(jWriteBatchHandler); - if(m_jWriteBatchHandler == nullptr) { - // exception thrown: OutOfMemoryError + : JniCallback(env, jWriteBatchHandler), m_env(env) { + + m_jPutCfMethodId = WriteBatchHandlerJni::getPutCfMethodId(env); + if(m_jPutCfMethodId == nullptr) { + // exception thrown return; }
// exception thrown return; } -} -void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { + m_jMarkBeginPrepareMethodId = + WriteBatchHandlerJni::getMarkBeginPrepareMethodId(env); + if(m_jMarkBeginPrepareMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } return; } - const jbyteArray j_value = sliceToJArray(value); - if(j_value == nullptr) { + m_jMarkEndPrepareMethodId = + WriteBatchHandlerJni::getMarkEndPrepareMethodId(env); + if(m_jMarkEndPrepareMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } - m_env->CallVoidMethod( - m_jWriteBatchHandler, - m_jPutMethodId, - j_key, - j_value); - if(m_env->ExceptionCheck()) { + m_jMarkNoopMethodId = WriteBatchHandlerJni::getMarkNoopMethodId(env); + if(m_jMarkNoopMethodId == nullptr) { // exception thrown - m_env->ExceptionDescribe(); - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } - - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); + + m_jMarkRollbackMethodId = WriteBatchHandlerJni::getMarkRollbackMethodId(env); + if(m_jMarkRollbackMethodId == nullptr) { + // exception thrown + return; } -} -void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { + m_jMarkCommitMethodId = WriteBatchHandlerJni::getMarkCommitMethodId(env); + if(m_jMarkCommitMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } return; } - const jbyteArray j_value = sliceToJArray(value); - if(j_value == nullptr) { + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); + if(m_jContinueMethodId == nullptr) { // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } return; } +} - m_env->CallVoidMethod( - m_jWriteBatchHandler, - m_jMergeMethodId, +rocksdb::Status WriteBatchHandlerJniCallback::PutCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto put = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jPutCfMethodId, + static_cast(column_family_id), j_key, j_value); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } - return; + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, put); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
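// (Note: a nullptr from kv_op means a Java exception was raised but could
// not be converted to a rocksdb::Status; returning OK() here swallows that
// failure and lets iteration over the write batch continue.)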
+ } else { + return rocksdb::Status(*status); } +} - if(j_value != nullptr) { - m_env->DeleteLocalRef(j_value); - } - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); +void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { + auto put = [this] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jPutMethodId, + j_key, + j_value); + }; + WriteBatchHandlerJniCallback::kv_op(key, value, put); +} + +rocksdb::Status WriteBatchHandlerJniCallback::MergeCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto merge = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMergeCfMethodId, + static_cast(column_family_id), + j_key, + j_value); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, merge); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } -void WriteBatchHandlerJniCallback::Delete(const Slice& key) { - const jbyteArray j_key = sliceToJArray(key); - if(j_key == nullptr) { - // exception thrown - if(m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; +void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { + auto merge = [this] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMergeMethodId, + j_key, + j_value); + }; + WriteBatchHandlerJniCallback::kv_op(key, value, merge); +} + +rocksdb::Status WriteBatchHandlerJniCallback::DeleteCF(uint32_t column_family_id, + const Slice& key) { + auto remove = [this, column_family_id] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteCfMethodId, + static_cast(column_family_id), + j_key); + }; + auto status = WriteBatchHandlerJniCallback::k_op(key, remove); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } +} - m_env->CallVoidMethod( - m_jWriteBatchHandler, +void WriteBatchHandlerJniCallback::Delete(const Slice& key) { + auto remove = [this] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, m_jDeleteMethodId, j_key); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); - } - return; + }; + WriteBatchHandlerJniCallback::k_op(key, remove); +} + +rocksdb::Status WriteBatchHandlerJniCallback::SingleDeleteCF(uint32_t column_family_id, + const Slice& key) { + auto singleDelete = [this, column_family_id] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jSingleDeleteCfMethodId, + static_cast(column_family_id), + j_key); + }; + auto status = WriteBatchHandlerJniCallback::k_op(key, singleDelete); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
+ } else { + return rocksdb::Status(*status); } +} - if(j_key != nullptr) { - m_env->DeleteLocalRef(j_key); +void WriteBatchHandlerJniCallback::SingleDelete(const Slice& key) { + auto singleDelete = [this] (jbyteArray j_key) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jSingleDeleteMethodId, + j_key); + }; + WriteBatchHandlerJniCallback::k_op(key, singleDelete); +} + +rocksdb::Status WriteBatchHandlerJniCallback::DeleteRangeCF(uint32_t column_family_id, + const Slice& beginKey, const Slice& endKey) { + auto deleteRange = [this, column_family_id] ( + jbyteArray j_beginKey, jbyteArray j_endKey) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteRangeCfMethodId, + static_cast(column_family_id), + j_beginKey, + j_endKey); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); } } void WriteBatchHandlerJniCallback::DeleteRange(const Slice& beginKey, - const Slice& endKey) { - const jbyteArray j_beginKey = sliceToJArray(beginKey); - if (j_beginKey == nullptr) { - // exception thrown - if (m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; - } + const Slice& endKey) { + auto deleteRange = [this] ( + jbyteArray j_beginKey, jbyteArray j_endKey) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jDeleteRangeMethodId, + j_beginKey, + j_endKey); + }; + WriteBatchHandlerJniCallback::kv_op(beginKey, endKey, deleteRange); +} - const jbyteArray j_endKey = sliceToJArray(beginKey); - if (j_endKey == nullptr) { - // exception thrown - if (m_env->ExceptionCheck()) { - m_env->ExceptionDescribe(); - } - return; +void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { + auto logData = [this] (jbyteArray j_blob) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jLogDataMethodId, + j_blob); + }; + WriteBatchHandlerJniCallback::k_op(blob, logData); +} + +rocksdb::Status WriteBatchHandlerJniCallback::PutBlobIndexCF(uint32_t column_family_id, + const Slice& key, const Slice& value) { + auto putBlobIndex = [this, column_family_id] ( + jbyteArray j_key, jbyteArray j_value) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jPutBlobIndexCfMethodId, + static_cast(column_family_id), + j_key, + j_value); + }; + auto status = WriteBatchHandlerJniCallback::kv_op(key, value, putBlobIndex); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? 
+ } else { + return rocksdb::Status(*status); + } +} + +rocksdb::Status WriteBatchHandlerJniCallback::MarkBeginPrepare(bool unprepare) { +#ifndef DEBUG + (void) unprepare; +#else + assert(!unprepare); +#endif + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkBeginPrepareMethodId); - m_env->CallVoidMethod(m_jWriteBatchHandler, m_jDeleteRangeMethodId, - j_beginKey, j_endKey); + // check for Exception, in particular RocksDBException if (m_env->ExceptionCheck()) { // exception thrown - m_env->ExceptionDescribe(); - if (j_beginKey != nullptr) { - m_env->DeleteLocalRef(j_beginKey); - } - if (j_endKey != nullptr) { - m_env->DeleteLocalRef(j_endKey); + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr<rocksdb::Status> status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unknown status or exception occurred extracting status + m_env->ExceptionDescribe(); + return rocksdb::Status::OK(); // TODO(AR) probably need a better error code here + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return rocksdb::Status(*status); } - return; } - if (j_beginKey != nullptr) { - m_env->DeleteLocalRef(j_beginKey); - } + return rocksdb::Status::OK(); +} - if (j_endKey != nullptr) { - m_env->DeleteLocalRef(j_endKey); +rocksdb::Status WriteBatchHandlerJniCallback::MarkEndPrepare(const Slice& xid) { + auto markEndPrepare = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMarkEndPrepareMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markEndPrepare); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); + } } -void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { - const jbyteArray j_blob = sliceToJArray(blob); - if(j_blob == nullptr) { +rocksdb::Status WriteBatchHandlerJniCallback::MarkNoop(bool empty_batch) { + m_env->CallVoidMethod(m_jcallback_obj, m_jMarkNoopMethodId, static_cast<jboolean>(empty_batch)); + + // check for Exception, in particular RocksDBException + if (m_env->ExceptionCheck()) { // exception thrown - if(m_env->ExceptionCheck()) { + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr<rocksdb::Status> status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unknown status or exception occurred extracting status m_env->ExceptionDescribe(); + return rocksdb::Status::OK(); // TODO(AR) probably need a better error code here + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return rocksdb::Status(*status); } - return; } - m_env->CallVoidMethod( - m_jWriteBatchHandler, - m_jLogDataMethodId, - j_blob); - if(m_env->ExceptionCheck()) { - // exception thrown - m_env->ExceptionDescribe(); - if(j_blob != nullptr) { - m_env->DeleteLocalRef(j_blob); - } - return; + return rocksdb::Status::OK(); +} + +rocksdb::Status WriteBatchHandlerJniCallback::MarkRollback(const Slice& xid) { + auto markRollback = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMarkRollbackMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markRollback); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status?
+ } else { + return rocksdb::Status(*status); + } +} - - if(j_blob != nullptr) { - m_env->DeleteLocalRef(j_blob); +rocksdb::Status WriteBatchHandlerJniCallback::MarkCommit(const Slice& xid) { + auto markCommit = [this] ( + jbyteArray j_xid) { + m_env->CallVoidMethod( + m_jcallback_obj, + m_jMarkCommitMethodId, + j_xid); + }; + auto status = WriteBatchHandlerJniCallback::k_op(xid, markCommit); + if(status == nullptr) { + return rocksdb::Status::OK(); // TODO(AR) what to do if there is an Exception but we don't know the rocksdb::Status? + } else { + return rocksdb::Status(*status); + } } bool WriteBatchHandlerJniCallback::Continue() { jboolean jContinue = m_env->CallBooleanMethod( - m_jWriteBatchHandler, + m_jcallback_obj, m_jContinueMethodId); if(m_env->ExceptionCheck()) { // exception thrown @@ -265,42 +414,101 @@ bool WriteBatchHandlerJniCallback::Continue() { return static_cast<bool>(jContinue == JNI_TRUE); } -/* - * Creates a Java Byte Array from the data in a Slice - * - * When calling this function - * you must remember to call env->DeleteLocalRef - * on the result after you have finished with it - * - * @param s A Slice to convery to a Java byte array - * - * @return A reference to a Java byte array, or a nullptr if an - * exception occurs - */ -jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { - jbyteArray ja = m_env->NewByteArray(static_cast<jsize>(s.size())); - if(ja == nullptr) { - // exception thrown: OutOfMemoryError +std::unique_ptr<rocksdb::Status> WriteBatchHandlerJniCallback::kv_op(const Slice& key, const Slice& value, std::function<void(jbyteArray, jbyteArray)> kvFn) { + const jbyteArray j_key = JniUtil::copyBytes(m_env, key); + if (j_key == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); + } return nullptr; } - m_env->SetByteArrayRegion( - ja, 0, static_cast<jsize>(s.size()), - const_cast<jbyte*>(reinterpret_cast<const jbyte*>(s.data()))); - if(m_env->ExceptionCheck()) { - if(ja != nullptr) { - m_env->DeleteLocalRef(ja); + const jbyteArray j_value = JniUtil::copyBytes(m_env, value); + if (j_value == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); } - // exception thrown: ArrayIndexOutOfBoundsException return nullptr; } - return ja; + kvFn(j_key, j_value); + + // check for Exception, in particular RocksDBException + if (m_env->ExceptionCheck()) { + if (j_value != nullptr) { + m_env->DeleteLocalRef(j_value); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // exception thrown + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr<rocksdb::Status> status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unknown status or exception occurred extracting status + m_env->ExceptionDescribe(); + return nullptr; + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return status; + } + } + + if (j_value != nullptr) { + m_env->DeleteLocalRef(j_value); + } + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // all OK + return std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::OK())); } -WriteBatchHandlerJniCallback::~WriteBatchHandlerJniCallback() { - if(m_jWriteBatchHandler != nullptr) { - m_env->DeleteGlobalRef(m_jWriteBatchHandler); +std::unique_ptr<rocksdb::Status> WriteBatchHandlerJniCallback::k_op(const Slice& key, std::function<void(jbyteArray)> kFn) { + const jbyteArray j_key = JniUtil::copyBytes(m_env, key); + if (j_key == nullptr) { + // exception thrown + if (m_env->ExceptionCheck()) { + m_env->ExceptionDescribe(); + } + return nullptr; + } + + kFn(j_key); + + // check for Exception, in particular RocksDBException + if (m_env->ExceptionCheck()) { + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); + } + + // exception thrown + jthrowable exception = m_env->ExceptionOccurred(); + std::unique_ptr<rocksdb::Status> status = rocksdb::RocksDBExceptionJni::toCppStatus(m_env, exception); + if (status == nullptr) { + // unknown status or exception occurred extracting status + m_env->ExceptionDescribe(); + return nullptr; + + } else { + m_env->ExceptionClear(); // clear the exception, as we have extracted the status + return status; + } + } + + if (j_key != nullptr) { + m_env->DeleteLocalRef(j_key); } + + // all OK + return std::unique_ptr<rocksdb::Status>(new rocksdb::Status(rocksdb::Status::OK())); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h b/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h index 5d3dee3b1a..720f1693cb 100644 --- a/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h +++ b/thirdparty/rocksdb/java/rocksjni/writebatchhandlerjnicallback.h @@ -9,7 +9,10 @@ #ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ #define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ +#include <functional> #include <jni.h> +#include <memory> +#include "rocksjni/jnicallback.h" #include "rocksdb/write_batch.h" namespace rocksdb { @@ -20,28 +23,61 @@ * which calls the appropriate Java method. * This enables Write Batch Handlers to be implemented in Java. */ -class WriteBatchHandlerJniCallback : public WriteBatch::Handler { +class WriteBatchHandlerJniCallback : public JniCallback, public WriteBatch::Handler { public: WriteBatchHandlerJniCallback( JNIEnv* env, jobject jWriteBackHandler); - ~WriteBatchHandlerJniCallback(); + Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value); void Put(const Slice& key, const Slice& value); + Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value); void Merge(const Slice& key, const Slice& value); + Status DeleteCF(uint32_t column_family_id, const Slice& key); void Delete(const Slice& key); + Status SingleDeleteCF(uint32_t column_family_id, const Slice& key); + void SingleDelete(const Slice& key); + Status DeleteRangeCF(uint32_t column_family_id, const Slice& beginKey, + const Slice& endKey); void DeleteRange(const Slice& beginKey, const Slice& endKey); void LogData(const Slice& blob); + Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key, + const Slice& value); + Status MarkBeginPrepare(bool); + Status MarkEndPrepare(const Slice& xid); + Status MarkNoop(bool empty_batch); + Status MarkRollback(const Slice& xid); + Status MarkCommit(const Slice& xid); bool Continue(); private: JNIEnv* m_env; - jobject m_jWriteBatchHandler; - jbyteArray sliceToJArray(const Slice& s); + jmethodID m_jPutCfMethodId; jmethodID m_jPutMethodId; + jmethodID m_jMergeCfMethodId; jmethodID m_jMergeMethodId; + jmethodID m_jDeleteCfMethodId; jmethodID m_jDeleteMethodId; + jmethodID m_jSingleDeleteCfMethodId; + jmethodID m_jSingleDeleteMethodId; + jmethodID m_jDeleteRangeCfMethodId; jmethodID m_jDeleteRangeMethodId; jmethodID m_jLogDataMethodId; + jmethodID m_jPutBlobIndexCfMethodId; + jmethodID m_jMarkBeginPrepareMethodId; + jmethodID m_jMarkEndPrepareMethodId; + jmethodID m_jMarkNoopMethodId; + jmethodID m_jMarkRollbackMethodId; + jmethodID m_jMarkCommitMethodId; jmethodID m_jContinueMethodId; + /** + * @return A pointer to a rocksdb::Status or nullptr if an unexpected exception
occurred + */ + std::unique_ptr kv_op(const Slice& key, const Slice& value, std::function kvFn); + /** + * @return A pointer to a rocksdb::Status or nullptr if an unexpected exception occurred + */ + std::unique_ptr k_op(const Slice& key, std::function kFn); }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java b/thirdparty/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java new file mode 100644 index 0000000000..1633d1f2bd --- /dev/null +++ b/thirdparty/rocksdb/java/samples/src/main/java/OptimisticTransactionSample.java @@ -0,0 +1,184 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +import org.rocksdb.*; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Demonstrates using Transactions on an OptimisticTransactionDB with + * varying isolation guarantees + */ +public class OptimisticTransactionSample { + private static final String dbPath = "/tmp/rocksdb_optimistic_transaction_example"; + + public static final void main(final String args[]) throws RocksDBException { + + try(final Options options = new Options() + .setCreateIfMissing(true); + final OptimisticTransactionDB txnDb = + OptimisticTransactionDB.open(options, dbPath)) { + + try (final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + //////////////////////////////////////////////////////// + // + // Simple OptimisticTransaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + readCommitted(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + repeatableRead(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions); + } + } + } + + /** + * Demonstrates "Read Committed" isolation + */ + private static void readCommitted(final OptimisticTransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + final byte key1[] = "abc".getBytes(UTF_8); + final byte value1[] = "def".getBytes(UTF_8); + + final byte key2[] = "xyz".getBytes(UTF_8); + final byte value2[] = "zzz".getBytes(UTF_8); + + // Start a transaction + try(final Transaction txn = txnDb.beginTransaction(writeOptions)) { + // Read a key in this transaction + byte[] value = txn.get(readOptions, key1); + assert(value == null); + + // Write a key in this transaction + txn.put(key1, value1); + + // Read a key OUTSIDE this transaction. Does not affect txn. + value = txnDb.get(readOptions, key1); + assert(value == null); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. + // If we wrote key 'abc' here, the transaction would fail to commit. 
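        // (Note: OptimisticTransactionDB takes no locks up front; a conflicting
        //  write to the same key is only detected later, inside txn.commit(),
        //  which then returns Status.Code.Busy, as the repeatableRead example
        //  below demonstrates.)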
+ txnDb.put(writeOptions, key2, value2); + + // Commit transaction + txn.commit(); + } + } + + /** + * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation + */ + private static void repeatableRead(final OptimisticTransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + + final byte key1[] = "ghi".getBytes(UTF_8); + final byte value1[] = "jkl".getBytes(UTF_8); + + // Set a snapshot at start of transaction by setting setSnapshot(true) + try(final OptimisticTransactionOptions txnOptions = + new OptimisticTransactionOptions().setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + final Snapshot snapshot = txn.getSnapshot(); + + // Write a key OUTSIDE of transaction + txnDb.put(writeOptions, key1, value1); + + // Read a key using the snapshot. + readOptions.setSnapshot(snapshot); + final byte[] value = txn.getForUpdate(readOptions, key1, true); + assert(value == value1); + + try { + // Attempt to commit transaction + txn.commit(); + throw new IllegalStateException(); + } catch(final RocksDBException e) { + // Transaction could not commit since the write outside of the txn + // conflicted with the read! + assert(e.getStatus().getCode() == Status.Code.Busy); + } + + txn.rollback(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } + + /** + * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation + * + * In this example, we set the snapshot multiple times. This is probably + * only necessary if you have very strict isolation requirements to + * implement. + */ + private static void readCommitted_monotonicAtomicViews( + final OptimisticTransactionDB txnDb, final WriteOptions writeOptions, + final ReadOptions readOptions) throws RocksDBException { + + final byte keyX[] = "x".getBytes(UTF_8); + final byte valueX[] = "x".getBytes(UTF_8); + + final byte keyY[] = "y".getBytes(UTF_8); + final byte valueY[] = "y".getBytes(UTF_8); + + try (final OptimisticTransactionOptions txnOptions = + new OptimisticTransactionOptions().setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + // Do some reads and writes to key "x" + Snapshot snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + byte[] value = txn.get(readOptions, keyX); + txn.put(valueX, valueX); + + // Do a write outside of the transaction to key "y" + txnDb.put(writeOptions, keyY, valueY); + + // Set a new snapshot in the transaction + txn.setSnapshot(); + snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + + // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. + value = txn.getForUpdate(readOptions, keyY, true); + txn.put(keyY, valueY); + + // Commit. Since the snapshot was advanced, the write done outside of the + // transaction does not prevent this transaction from Committing. + txn.commit(); + + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } +} diff --git a/thirdparty/rocksdb/java/samples/src/main/java/TransactionSample.java b/thirdparty/rocksdb/java/samples/src/main/java/TransactionSample.java new file mode 100644 index 0000000000..b88a68f123 --- /dev/null +++ b/thirdparty/rocksdb/java/samples/src/main/java/TransactionSample.java @@ -0,0 +1,183 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +import org.rocksdb.*; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Demonstrates using Transactions on a TransactionDB with + * varying isolation guarantees + */ +public class TransactionSample { + private static final String dbPath = "/tmp/rocksdb_transaction_example"; + + public static final void main(final String args[]) throws RocksDBException { + + try(final Options options = new Options() + .setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB txnDb = + TransactionDB.open(options, txnDbOptions, dbPath)) { + + try (final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + //////////////////////////////////////////////////////// + // + // Simple Transaction Example ("Read Committed") + // + //////////////////////////////////////////////////////// + readCommitted(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Repeatable Read" (Snapshot Isolation) Example + // -- Using a single Snapshot + // + //////////////////////////////////////////////////////// + repeatableRead(txnDb, writeOptions, readOptions); + + + //////////////////////////////////////////////////////// + // + // "Read Committed" (Monotonic Atomic Views) Example + // --Using multiple Snapshots + // + //////////////////////////////////////////////////////// + readCommitted_monotonicAtomicViews(txnDb, writeOptions, readOptions); + } + } + } + + /** + * Demonstrates "Read Committed" isolation + */ + private static void readCommitted(final TransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + final byte key1[] = "abc".getBytes(UTF_8); + final byte value1[] = "def".getBytes(UTF_8); + + final byte key2[] = "xyz".getBytes(UTF_8); + final byte value2[] = "zzz".getBytes(UTF_8); + + // Start a transaction + try(final Transaction txn = txnDb.beginTransaction(writeOptions)) { + // Read a key in this transaction + byte[] value = txn.get(readOptions, key1); + assert(value == null); + + // Write a key in this transaction + txn.put(key1, value1); + + // Read a key OUTSIDE this transaction. Does not affect txn. + value = txnDb.get(readOptions, key1); + assert(value == null); + + // Write a key OUTSIDE of this transaction. + // Does not affect txn since this is an unrelated key. + // If we wrote key 'abc' here, the transaction would fail to commit. 
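        // (Note: unlike the optimistic sample, a pessimistic TransactionDB
        //  already holds a lock on 'abc' here, so a conflicting outside put
        //  would block on that lock and fail with a timeout rather than
        //  surfacing at txn.commit().)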
+ txnDb.put(writeOptions, key2, value2); + + // Commit transaction + txn.commit(); + } + } + + /** + * Demonstrates "Repeatable Read" (Snapshot Isolation) isolation + */ + private static void repeatableRead(final TransactionDB txnDb, + final WriteOptions writeOptions, final ReadOptions readOptions) + throws RocksDBException { + + final byte key1[] = "ghi".getBytes(UTF_8); + final byte value1[] = "jkl".getBytes(UTF_8); + + // Set a snapshot at start of transaction by setting setSnapshot(true) + try(final TransactionOptions txnOptions = new TransactionOptions() + .setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + final Snapshot snapshot = txn.getSnapshot(); + + // Write a key OUTSIDE of transaction + txnDb.put(writeOptions, key1, value1); + + // Attempt to read a key using the snapshot. This will fail since + // the previous write outside this txn conflicts with this read. + readOptions.setSnapshot(snapshot); + + try { + final byte[] value = txn.getForUpdate(readOptions, key1, true); + throw new IllegalStateException(); + } catch(final RocksDBException e) { + assert(e.getStatus().getCode() == Status.Code.Busy); + } + + txn.rollback(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } + + /** + * Demonstrates "Read Committed" (Monotonic Atomic Views) isolation + * + * In this example, we set the snapshot multiple times. This is probably + * only necessary if you have very strict isolation requirements to + * implement. + */ + private static void readCommitted_monotonicAtomicViews( + final TransactionDB txnDb, final WriteOptions writeOptions, + final ReadOptions readOptions) throws RocksDBException { + + final byte keyX[] = "x".getBytes(UTF_8); + final byte valueX[] = "x".getBytes(UTF_8); + + final byte keyY[] = "y".getBytes(UTF_8); + final byte valueY[] = "y".getBytes(UTF_8); + + try (final TransactionOptions txnOptions = new TransactionOptions() + .setSetSnapshot(true); + final Transaction txn = + txnDb.beginTransaction(writeOptions, txnOptions)) { + + // Do some reads and writes to key "x" + Snapshot snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + byte[] value = txn.get(readOptions, keyX); + txn.put(valueX, valueX); + + // Do a write outside of the transaction to key "y" + txnDb.put(writeOptions, keyY, valueY); + + // Set a new snapshot in the transaction + txn.setSnapshot(); + txn.setSavePoint(); + snapshot = txnDb.getSnapshot(); + readOptions.setSnapshot(snapshot); + + // Do some reads and writes to key "y" + // Since the snapshot was advanced, the write done outside of the + // transaction does not conflict. + value = txn.getForUpdate(readOptions, keyY, true); + txn.put(keyY, valueY); + + // Decide we want to revert the last write from this transaction. + txn.rollbackToSavePoint(); + + // Commit. 
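      // (Note: rollbackToSavePoint() above reverted the write to keyY, so only
      //  the work done before txn.setSavePoint() is included in this commit.)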
+ txn.commit(); + } finally { + // Clear snapshot from read options since it is no longer valid + readOptions.setSnapshot(null); + } + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java index 976401fba0..2f0d4f3ca4 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java @@ -14,6 +14,35 @@ public abstract class AbstractCompactionFilter> extends RocksObject { + public static class Context { + private final boolean fullCompaction; + private final boolean manualCompaction; + + public Context(final boolean fullCompaction, final boolean manualCompaction) { + this.fullCompaction = fullCompaction; + this.manualCompaction = manualCompaction; + } + + /** + * Does this compaction run include all data files + * + * @return true if this is a full compaction run + */ + public boolean isFullCompaction() { + return fullCompaction; + } + + /** + * Is this compaction requested by the client, + * or is it occurring as an automatic compaction process + * + * @return true if the compaction was initiated by the client + */ + public boolean isManualCompaction() { + return manualCompaction; + } + } + protected AbstractCompactionFilter(final long nativeHandle) { super(nativeHandle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java new file mode 100644 index 0000000000..380b4461d0 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java @@ -0,0 +1,77 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Each compaction will create a new {@link AbstractCompactionFilter} + * allowing the application to know about different compactions + * + * @param The concrete type of the compaction filter + */ +public abstract class AbstractCompactionFilterFactory> + extends RocksCallbackObject { + + public AbstractCompactionFilterFactory() { + super(null); + } + + @Override + protected long initializeNative(final long... 
nativeParameterHandles) { + return createNewCompactionFilterFactory0(); + } + + /** + * Called from JNI, see compaction_filter_factory_jnicallback.cc + * + * @param fullCompaction {@link AbstractCompactionFilter.Context#fullCompaction} + * @param manualCompaction {@link AbstractCompactionFilter.Context#manualCompaction} + * + * @return native handle of the CompactionFilter + */ + private long createCompactionFilter(final boolean fullCompaction, + final boolean manualCompaction) { + final T filter = createCompactionFilter( + new AbstractCompactionFilter.Context(fullCompaction, manualCompaction)); + + // CompactionFilterFactory::CreateCompactionFilter returns a std::unique_ptr + // which therefore has ownership of the underlying native object + filter.disOwnNativeHandle(); + + return filter.nativeHandle_; + } + + /** + * Create a new compaction filter + * + * @param context The context describing the need for a new compaction filter + * + * @return A new instance of {@link AbstractCompactionFilter} + */ + public abstract T createCompactionFilter( + final AbstractCompactionFilter.Context context); + + /** + * A name which identifies this compaction filter + * + * The name will be printed to the LOG file on start up for diagnosis + * + * @return name which identifies this compaction filter. + */ + public abstract String name(); + + /** + * We override {@link RocksCallbackObject#disposeInternal()} + * as disposing of a rocksdb::AbstractCompactionFilterFactory requires + * a slightly different approach as it is a std::shared_ptr + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native long createNewCompactionFilterFactory0(); + private native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java index 0fc4a19dfb..9310397b0c 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -15,12 +15,25 @@ * @see org.rocksdb.DirectComparator */ public abstract class AbstractComparator> - extends AbstractImmutableNativeReference { + extends RocksCallbackObject { protected AbstractComparator() { - super(true); + super(); } + protected AbstractComparator(final ComparatorOptions copt) { + super(copt.nativeHandle_); + } + + /** + * Get the type of this comparator. + * + * Used for determining the correct C++ cast in native code. + * + * @return The type of the comparator. + */ + abstract ComparatorType getComparatorType(); + /** * The name of the comparator. Used to check for comparator * mismatches (i.e., a DB created with one comparator is @@ -87,20 +100,4 @@ public String findShortestSeparator(final String start, final T limit) { public String findShortSuccessor(final String key) { return null; } - - /** - * Deletes underlying C++ comparator pointer. - * - * Note that this function should be called only after all - * RocksDB instances referencing the comparator are closed. - * Otherwise an undefined behavior will occur. 
- */ - @Override - protected void disposeInternal() { - disposeInternal(getNativeHandle()); - } - - protected abstract long getNativeHandle(); - - private native void disposeInternal(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java index b1dc1ef379..8532debf80 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractImmutableNativeReference.java @@ -12,6 +12,7 @@ * {@link AbstractNativeReference} which have an immutable reference to the * underlying native C++ object */ +//@ThreadSafe public abstract class AbstractImmutableNativeReference extends AbstractNativeReference { @@ -19,7 +20,7 @@ public abstract class AbstractImmutableNativeReference * A flag indicating whether the current {@code AbstractNativeReference} is * responsible to free the underlying C++ object */ - private final AtomicBoolean owningHandle_; + protected final AtomicBoolean owningHandle_; protected AbstractImmutableNativeReference(final boolean owningHandle) { this.owningHandle_ = new AtomicBoolean(owningHandle); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java new file mode 100644 index 0000000000..63015c39a9 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -0,0 +1,254 @@ +package org.rocksdb; + +import java.util.*; + +public abstract class AbstractMutableOptions { + + protected static final String KEY_VALUE_PAIR_SEPARATOR = ";"; + protected static final char KEY_VALUE_SEPARATOR = '='; + static final String INT_ARRAY_INT_SEPARATOR = ","; + + protected final String[] keys; + private final String[] values; + + /** + * User must use builder pattern, or parser. + * + * @param keys the keys + * @param values the values + */ + protected AbstractMutableOptions(final String[] keys, final String[] values) { + this.keys = keys; + this.values = values; + } + + String[] getKeys() { + return keys; + } + + String[] getValues() { + return values; + } + + /** + * Returns a string representation of MutableOptions which + * is suitable for consumption by {@code #parse(String)}. + * + * @return String representation of MutableOptions + */ + @Override + public String toString() { + final StringBuilder buffer = new StringBuilder(); + for(int i = 0; i < keys.length; i++) { + buffer + .append(keys[i]) + .append(KEY_VALUE_SEPARATOR) + .append(values[i]); + + if(i + 1 < keys.length) { + buffer.append(KEY_VALUE_PAIR_SEPARATOR); + } + } + return buffer.toString(); + } + + public static abstract class AbstractMutableOptionsBuilder< + T extends AbstractMutableOptions, + U extends AbstractMutableOptionsBuilder, + K extends MutableOptionKey> { + + private final Map> options = new LinkedHashMap<>(); + + protected abstract U self(); + + /** + * Get all of the possible keys + * + * @return A map of all keys, indexed by name. + */ + protected abstract Map allKeys(); + + /** + * Construct a sub-class instance of {@link AbstractMutableOptions}. + * + * @param keys the keys + * @param values the values + * + * @return an instance of the options. 
+ */ + protected abstract T build(final String[] keys, final String[] values); + + public T build() { + final String keys[] = new String[options.size()]; + final String values[] = new String[options.size()]; + + int i = 0; + for (final Map.Entry> option : options.entrySet()) { + keys[i] = option.getKey().name(); + values[i] = option.getValue().asString(); + i++; + } + + return build(keys, values); + } + + protected U setDouble( + final K key, final double value) { + if (key.getValueType() != MutableOptionKey.ValueType.DOUBLE) { + throw new IllegalArgumentException( + key + " does not accept a double value"); + } + options.put(key, MutableOptionValue.fromDouble(value)); + return self(); + } + + protected double getDouble(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + return value.asDouble(); + } + + protected U setLong( + final K key, final long value) { + if(key.getValueType() != MutableOptionKey.ValueType.LONG) { + throw new IllegalArgumentException( + key + " does not accept a long value"); + } + options.put(key, MutableOptionValue.fromLong(value)); + return self(); + } + + protected long getLong(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + return value.asLong(); + } + + protected U setInt( + final K key, final int value) { + if(key.getValueType() != MutableOptionKey.ValueType.INT) { + throw new IllegalArgumentException( + key + " does not accept an integer value"); + } + options.put(key, MutableOptionValue.fromInt(value)); + return self(); + } + + protected int getInt(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + return value.asInt(); + } + + protected U setBoolean( + final K key, final boolean value) { + if(key.getValueType() != MutableOptionKey.ValueType.BOOLEAN) { + throw new IllegalArgumentException( + key + " does not accept a boolean value"); + } + options.put(key, MutableOptionValue.fromBoolean(value)); + return self(); + } + + protected boolean getBoolean(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + return value.asBoolean(); + } + + protected U setIntArray( + final K key, final int[] value) { + if(key.getValueType() != MutableOptionKey.ValueType.INT_ARRAY) { + throw new IllegalArgumentException( + key + " does not accept an int array value"); + } + options.put(key, MutableOptionValue.fromIntArray(value)); + return self(); + } + + protected int[] getIntArray(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + return value.asIntArray(); + } + + protected > U setEnum( + final K key, final N value) { + if(key.getValueType() != MutableOptionKey.ValueType.ENUM) { + throw new IllegalArgumentException( + key + " does not accept a Enum value"); + } + options.put(key, MutableOptionValue.fromEnum(value)); + return self(); + } + + 
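
(Editor's sketch, not part of the diff.) AbstractMutableOptionsBuilder keys every
option by a MutableOptionKey that declares its ValueType, and each typed setter
checks that declaration before storing the value, so a type mismatch fails fast
with an IllegalArgumentException instead of surfacing later in native code. A
minimal self-contained Java sketch of the same pattern; the Demo* names and keys
below are hypothetical:

    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.NoSuchElementException;

    public class DemoMutableOptions {
      enum ValueType { LONG, BOOLEAN }

      // Each key declares the value type it accepts, mirroring MutableOptionKey.
      enum Key {
        WRITE_BUFFER_SIZE(ValueType.LONG),
        DISABLE_AUTO_COMPACTIONS(ValueType.BOOLEAN);

        final ValueType valueType;
        Key(final ValueType valueType) { this.valueType = valueType; }
      }

      private final Map<Key, Object> options = new LinkedHashMap<>();

      // The setter validates the declared type up front, as the setLong/setBoolean/...
      // methods in the builder above do.
      DemoMutableOptions setLong(final Key key, final long value) {
        if (key.valueType != ValueType.LONG) {
          throw new IllegalArgumentException(key + " does not accept a long value");
        }
        options.put(key, value);
        return this;
      }

      long getLong(final Key key) {
        final Object value = options.get(key);
        if (value == null) {
          throw new NoSuchElementException(key.name() + " has not been set");
        }
        return (Long) value;
      }

      public static void main(final String[] args) {
        final DemoMutableOptions opts = new DemoMutableOptions()
            .setLong(Key.WRITE_BUFFER_SIZE, 64L << 20);
        System.out.println(opts.getLong(Key.WRITE_BUFFER_SIZE)); // 67108864
        // opts.setLong(Key.DISABLE_AUTO_COMPACTIONS, 1L);       // would throw
      }
    }
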
protected > N getEnum(final K key) + throws NoSuchElementException, NumberFormatException { + final MutableOptionValue value = options.get(key); + if(value == null) { + throw new NoSuchElementException(key.name() + " has not been set"); + } + + if(!(value instanceof MutableOptionValue.MutableOptionEnumValue)) { + throw new NoSuchElementException(key.name() + " is not of Enum type"); + } + + return ((MutableOptionValue.MutableOptionEnumValue)value).asObject(); + } + + public U fromString( + final String keyStr, final String valueStr) + throws IllegalArgumentException { + Objects.requireNonNull(keyStr); + Objects.requireNonNull(valueStr); + + final K key = allKeys().get(keyStr); + switch(key.getValueType()) { + case DOUBLE: + return setDouble(key, Double.parseDouble(valueStr)); + + case LONG: + return setLong(key, Long.parseLong(valueStr)); + + case INT: + return setInt(key, Integer.parseInt(valueStr)); + + case BOOLEAN: + return setBoolean(key, Boolean.parseBoolean(valueStr)); + + case INT_ARRAY: + final String[] strInts = valueStr + .trim().split(INT_ARRAY_INT_SEPARATOR); + if(strInts == null || strInts.length == 0) { + throw new IllegalArgumentException( + "int array value is not correctly formatted"); + } + + final int value[] = new int[strInts.length]; + int i = 0; + for(final String strInt : strInts) { + value[i++] = Integer.parseInt(strInt); + } + return setIntArray(key, value); + } + + throw new IllegalStateException( + key + " has unknown value type: " + key.getValueType()); + } + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java index 52bd00f47c..2819b6c70c 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractRocksIterator.java @@ -58,6 +58,12 @@ public void seek(byte[] target) { seek0(nativeHandle_, target, target.length); } + @Override + public void seekForPrev(byte[] target) { + assert (isOwningHandle()); + seekForPrev0(nativeHandle_, target, target.length); + } + @Override public void next() { assert (isOwningHandle()); @@ -97,5 +103,6 @@ protected void disposeInternal() { abstract void next0(long handle); abstract void prev0(long handle); abstract void seek0(long handle, byte[] target, int targetLen); + abstract void seekForPrev0(long handle, byte[] target, int targetLen); abstract void status0(long handle) throws RocksDBException; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java new file mode 100644 index 0000000000..627e1ae1f7 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTableFilter.java @@ -0,0 +1,19 @@ +package org.rocksdb; + +/** + * Base class for Table Filters. + */ +public abstract class AbstractTableFilter + extends RocksCallbackObject implements TableFilter { + + protected AbstractTableFilter() { + super(); + } + + @Override + protected long initializeNative(final long... 
nativeParameterHandles) { + return createNewTableFilter(); + } + + private native long createNewTableFilter(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java new file mode 100644 index 0000000000..806709b1f7 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTraceWriter.java @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Base class for TraceWriters. + */ +public abstract class AbstractTraceWriter + extends RocksCallbackObject implements TraceWriter { + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewTraceWriter(); + } + + /** + * Called from JNI, proxy for {@link TraceWriter#write(Slice)}. + * + * @param sliceHandle the native handle of the slice (which we do not own) + * + * @return short (2 bytes) where the first byte is the + * {@link Status.Code#getValue()} and the second byte is the + * {@link Status.SubCode#getValue()}. + */ + private short writeProxy(final long sliceHandle) { + try { + write(new Slice(sliceHandle)); + return statusToShort(Status.Code.Ok, Status.SubCode.None); + } catch (final RocksDBException e) { + return statusToShort(e.getStatus()); + } + } + + /** + * Called from JNI, proxy for {@link TraceWriter#closeWriter()}. + * + * @return short (2 bytes) where the first byte is the + * {@link Status.Code#getValue()} and the second byte is the + * {@link Status.SubCode#getValue()}. + */ + private short closeWriterProxy() { + try { + closeWriter(); + return statusToShort(Status.Code.Ok, Status.SubCode.None); + } catch (final RocksDBException e) { + return statusToShort(e.getStatus()); + } + } + + private static short statusToShort(/*@Nullable*/ final Status status) { + final Status.Code code = status != null && status.getCode() != null + ? status.getCode() + : Status.Code.IOError; + final Status.SubCode subCode = status != null && status.getSubCode() != null + ? status.getSubCode() + : Status.SubCode.None; + return statusToShort(code, subCode); + } + + private static short statusToShort(final Status.Code code, + final Status.SubCode subCode) { + short result = (short)(code.getValue() << 8); + return (short)(result | subCode.getValue()); + } + + private native long createNewTraceWriter(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java new file mode 100644 index 0000000000..cbb49836d1 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
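
(Editor's sketch, not part of the diff.) The statusToShort helpers in
AbstractTraceWriter above, like logRecordFoundResultToShort in AbstractWalFilter
below, squeeze two byte-sized values into a single short so one JNI return value
carries both: the Status.Code lands in the high byte and the Status.SubCode in
the low byte. A self-contained sketch of the packing and its inverse; the byte
values are illustrative only:

    public class StatusPackingDemo {
      // Mirrors the packing in statusToShort: code in the high byte,
      // sub-code in the low byte.
      static short pack(final byte code, final byte subCode) {
        final short result = (short) (code << 8);
        return (short) (result | subCode);
      }

      public static void main(final String[] args) {
        final byte code = 0x5;     // e.g. a Status.Code byte value
        final byte subCode = 0x0;  // e.g. a Status.SubCode byte value
        final short packed = pack(code, subCode);
        // Unpacking reverses the shift and masks each byte.
        System.out.println((packed >> 8) & 0xFF); // 5
        System.out.println(packed & 0xFF);        // 0
      }
    }
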
+ +package org.rocksdb; + +/** + * Provides notification to the caller of SetSnapshotOnNextOperation when + * the actual snapshot gets created + */ +public abstract class AbstractTransactionNotifier + extends RocksCallbackObject { + + protected AbstractTransactionNotifier() { + super(); + } + + /** + * Implement this method to receive notification when a snapshot is + * requested via {@link Transaction#setSnapshotOnNextOperation()}. + * + * @param newSnapshot the snapshot that has been created. + */ + public abstract void snapshotCreated(final Snapshot newSnapshot); + + /** + * This is intentionally private as it is the callback hook + * from JNI + */ + private void snapshotCreated(final long snapshotHandle) { + snapshotCreated(new Snapshot(snapshotHandle)); + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewTransactionNotifier(); + } + + private native long createNewTransactionNotifier(); + + /** + * Deletes underlying C++ TransactionNotifier pointer. + * + * Note that this function should be called only after all + * Transactions referencing the comparator are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java new file mode 100644 index 0000000000..d525045c6b --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWalFilter.java @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Base class for WAL Filters. + */ +public abstract class AbstractWalFilter + extends RocksCallbackObject implements WalFilter { + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewWalFilter(); + } + + /** + * Called from JNI, proxy for + * {@link WalFilter#logRecordFound(long, String, WriteBatch, WriteBatch)}. + * + * @param logNumber the log handle. + * @param logFileName the log file name + * @param batchHandle the native handle of a WriteBatch (which we do not own) + * @param newBatchHandle the native handle of a + * new WriteBatch (which we do not own) + * + * @return short (2 bytes) where the first byte is the + * {@link WalFilter.LogRecordFoundResult#walProcessingOption} + * {@link WalFilter.LogRecordFoundResult#batchChanged}. + */ + private short logRecordFoundProxy(final long logNumber, + final String logFileName, final long batchHandle, + final long newBatchHandle) { + final LogRecordFoundResult logRecordFoundResult = logRecordFound( + logNumber, logFileName, new WriteBatch(batchHandle), + new WriteBatch(newBatchHandle)); + return logRecordFoundResultToShort(logRecordFoundResult); + } + + private static short logRecordFoundResultToShort( + final LogRecordFoundResult logRecordFoundResult) { + short result = (short)(logRecordFoundResult.walProcessingOption.getValue() << 8); + return (short)(result | (logRecordFoundResult.batchChanged ? 
1 : 0)); + } + + private native long createNewWalFilter(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java index b2e5571809..9de0eb43c5 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -18,52 +18,80 @@ public int count() { } @Override - public void put(byte[] key, byte[] value) { + public void put(byte[] key, byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); } @Override public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) { + byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @Override - public void merge(byte[] key, byte[] value) { + public void merge(byte[] key, byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length); } @Override public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) { + byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @Override - public void remove(byte[] key) { - remove(nativeHandle_, key, key.length); + @Deprecated + public void remove(byte[] key) throws RocksDBException { + delete(nativeHandle_, key, key.length); } @Override - public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { - remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + @Deprecated + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @Override - public void deleteRange(byte[] beginKey, byte[] endKey) { + public void delete(byte[] key) throws RocksDBException { + delete(nativeHandle_, key, key.length); + } + + @Override + public void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + + @Override + public void singleDelete(byte[] key) throws RocksDBException { + singleDelete(nativeHandle_, key, key.length); + } + + @Override + public void singleDelete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void deleteRange(byte[] beginKey, byte[] endKey) + throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length); } @Override - public void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey) { + public void deleteRange(ColumnFamilyHandle columnFamilyHandle, + byte[] beginKey, byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length, columnFamilyHandle.nativeHandle_); } @Override - public void putLogData(byte[] blob) { + public void putLogData(byte[] blob) throws RocksDBException { putLogData(nativeHandle_, blob, blob.length); } @@ -82,38 +110,67 @@ public void rollbackToSavePoint() throws RocksDBException { rollbackToSavePoint0(nativeHandle_); } + @Override + public void popSavePoint() throws RocksDBException { + popSavePoint(nativeHandle_); + } + + @Override + public void setMaxBytes(final long maxBytes) { + 
setMaxBytes(nativeHandle_, maxBytes); + } + + @Override + public WriteBatch getWriteBatch() { + return getWriteBatch(nativeHandle_); + } + abstract int count0(final long handle); abstract void put(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen); + final byte[] value, final int valueLen) throws RocksDBException; abstract void put(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen, final long cfHandle); + final byte[] value, final int valueLen, final long cfHandle) + throws RocksDBException; abstract void merge(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen); + final byte[] value, final int valueLen) throws RocksDBException; abstract void merge(final long handle, final byte[] key, final int keyLen, - final byte[] value, final int valueLen, final long cfHandle); + final byte[] value, final int valueLen, final long cfHandle) + throws RocksDBException; + + abstract void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + + abstract void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; - abstract void remove(final long handle, final byte[] key, - final int keyLen); + abstract void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; - abstract void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + abstract void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen); + final byte[] endKey, final int endKeyLen) throws RocksDBException; abstract void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, - final byte[] endKey, final int endKeyLen, final long cfHandle); + final byte[] endKey, final int endKeyLen, final long cfHandle) throws RocksDBException; abstract void putLogData(final long handle, final byte[] blob, - final int blobLen); + final int blobLen) throws RocksDBException; abstract void clear0(final long handle); abstract void setSavePoint0(final long handle); abstract void rollbackToSavePoint0(final long handle); + + abstract void popSavePoint(final long handle) throws RocksDBException; + + abstract void setMaxBytes(final long handle, long maxBytes); + + abstract WriteBatch getWriteBatch(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index d3908d1a37..ac8550f3ef 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -441,7 +441,7 @@ T setOptimizeFiltersForHits( boolean optimizeFiltersForHits(); /** - * In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + * In debug mode, RocksDB run consistency checks on the LSM every time the LSM * change (Flush, Compaction, AddFile). These checks are disabled in release * mode, use this option to enable them in release mode as well. 
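
(Editor's sketch, not part of the diff.) The AbstractWriteBatch changes above
deprecate remove() in favour of delete() and the new singleDelete(), and every
mutation method now declares RocksDBException. A hedged usage sketch against the
public org.rocksdb API; the database path is illustrative, and singleDelete
assumes the key was written at most once:

    import static java.nio.charset.StandardCharsets.UTF_8;

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.WriteBatch;
    import org.rocksdb.WriteOptions;

    public class WriteBatchDemo {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        try (final Options options = new Options().setCreateIfMissing(true);
             final RocksDB db = RocksDB.open(options, "/tmp/writebatch-demo");
             final WriteBatch batch = new WriteBatch();
             final WriteOptions writeOpts = new WriteOptions()) {
          batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8));
          batch.delete("k2".getBytes(UTF_8));       // replaces the deprecated remove(...)
          batch.singleDelete("k3".getBytes(UTF_8)); // valid only for keys written at most once
          db.write(writeOpts, batch);               // the whole batch is applied atomically
        }
      }
    }
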
* @@ -455,7 +455,7 @@ T setForceConsistencyChecks( boolean forceConsistencyChecks); /** - * In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + * In debug mode, RocksDB run consistency checks on the LSM every time the LSM * change (Flush, Compaction, AddFile). These checks are disabled in release * mode. * diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index 092fe37843..3ec4671238 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -434,4 +434,32 @@ T setReportBgIoStats( * @return true if reporting is enabled */ boolean reportBgIoStats(); + + /** + * Non-bottom-level files older than TTL will go through the compaction + * process. This needs {@link MutableDBOptionsInterface#maxOpenFiles()} to be + * set to -1. + * + * Enabled only for level compaction for now. + * + * Default: 0 (disabled) + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. + * + * @param ttl the time-to-live. + * + * @return the reference to the current options. + */ + T setTtl(final long ttl); + + /** + * Get the TTL for Non-bottom-level files that will go through the compaction + * process. + * + * See {@link #setTtl(long)}. + * + * @return the time-to-live. + */ + long ttl(); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java index 763994575c..a028edea0a 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupEngine.java @@ -65,7 +65,10 @@ public void createNewBackup(final RocksDB db) throws RocksDBException { * When false, the Backup Engine will not issue a * flush before starting the backup. In that case, * the backup will also include log files - * corresponding to live memtables. The backup will + * corresponding to live memtables. If writes have + * been performed with the write ahead log disabled, + * set flushBeforeBackup to true to prevent those + * writes from being lost. Otherwise, the backup will * always be consistent with the current state of the * database regardless of the flushBeforeBackup * parameter. @@ -81,6 +84,38 @@ public void createNewBackup( createNewBackup(nativeHandle_, db.nativeHandle_, flushBeforeBackup); } + /** + * Captures the state of the database in the latest backup along with + * application specific metadata. + * + * @param db The database to backup + * @param metadata Application metadata + * @param flushBeforeBackup When true, the Backup Engine will first issue a + * memtable flush and only then copy the DB files to + * the backup directory. Doing so will prevent log + * files from being copied to the backup directory + * (since flush will delete them). + * When false, the Backup Engine will not issue a + * flush before starting the backup. In that case, + * the backup will also include log files + * corresponding to live memtables. If writes have + * been performed with the write ahead log disabled, + * set flushBeforeBackup to true to prevent those + * writes from being lost. 
Otherwise, the backup will + * always be consistent with the current state of the + * database regardless of the flushBeforeBackup + * parameter. + * + * Note - This method is not thread safe + * + * @throws RocksDBException thrown if a new backup could not be created + */ + public void createNewBackupWithMetadata(final RocksDB db, final String metadata, + final boolean flushBeforeBackup) throws RocksDBException { + assert (isOwningHandle()); + createNewBackupWithMetadata(nativeHandle_, db.nativeHandle_, metadata, flushBeforeBackup); + } + /** * Gets information about the available * backups @@ -197,6 +232,9 @@ private native static long open(final long env, private native void createNewBackup(final long handle, final long dbHandle, final boolean flushBeforeBackup) throws RocksDBException; + private native void createNewBackupWithMetadata(final long handle, final long dbHandle, + final String metadata, final boolean flushBeforeBackup) throws RocksDBException; + private native List getBackupInfo(final long handle); private native int[] getCorruptedBackups(final long handle); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java index 10f418629a..9244e4eb19 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BackupInfo.java @@ -19,12 +19,13 @@ public class BackupInfo { * @param size size of backup * @param numberFiles number of files related to this backup. */ - BackupInfo(final int backupId, final long timestamp, final long size, - final int numberFiles) { + BackupInfo(final int backupId, final long timestamp, final long size, final int numberFiles, + final String app_metadata) { backupId_ = backupId; timestamp_ = timestamp; size_ = size; numberFiles_ = numberFiles; + app_metadata_ = app_metadata; } /** @@ -59,8 +60,17 @@ public int numberFiles() { return numberFiles_; } + /** + * + * @return the associated application metadata, or null + */ + public String appMetadata() { + return app_metadata_; + } + private int backupId_; private long timestamp_; private long size_; private int numberFiles_; + private String app_metadata_; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 2d847de29d..7a4ff14bfe 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -9,96 +9,326 @@ * * BlockBasedTable is a RocksDB's default SST file format. 
*/ +//TODO(AR) should be renamed BlockBasedTableOptions public class BlockBasedTableConfig extends TableFormatConfig { public BlockBasedTableConfig() { - noBlockCache_ = false; - blockCacheSize_ = 8 * 1024 * 1024; - blockCacheNumShardBits_ = 0; - blockSize_ = 4 * 1024; - blockSizeDeviation_ = 10; - blockRestartInterval_ = 16; - wholeKeyFiltering_ = true; - filter_ = null; - cacheIndexAndFilterBlocks_ = false; - pinL0FilterAndIndexBlocksInCache_ = false; - hashIndexAllowCollision_ = true; - blockCacheCompressedSize_ = 0; - blockCacheCompressedNumShardBits_ = 0; - checksumType_ = ChecksumType.kCRC32c; - indexType_ = IndexType.kBinarySearch; - formatVersion_ = 0; + //TODO(AR) flushBlockPolicyFactory + cacheIndexAndFilterBlocks = false; + cacheIndexAndFilterBlocksWithHighPriority = false; + pinL0FilterAndIndexBlocksInCache = false; + pinTopLevelIndexAndFilter = true; + indexType = IndexType.kBinarySearch; + dataBlockIndexType = DataBlockIndexType.kDataBlockBinarySearch; + dataBlockHashTableUtilRatio = 0.75; + checksumType = ChecksumType.kCRC32c; + noBlockCache = false; + blockCache = null; + persistentCache = null; + blockCacheCompressed = null; + blockSize = 4 * 1024; + blockSizeDeviation = 10; + blockRestartInterval = 16; + indexBlockRestartInterval = 1; + metadataBlockSize = 4096; + partitionFilters = false; + useDeltaEncoding = true; + filterPolicy = null; + wholeKeyFiltering = true; + verifyCompression = true; + readAmpBytesPerBit = 0; + formatVersion = 2; + enableIndexCompression = true; + blockAlign = false; + + // NOTE: ONLY used if blockCache == null + blockCacheSize = 8 * 1024 * 1024; + blockCacheNumShardBits = 0; + + // NOTE: ONLY used if blockCacheCompressed == null + blockCacheCompressedSize = 0; + blockCacheCompressedNumShardBits = 0; } /** - * Disable block cache. If this is set to true, - * then no block cache should be used, and the block_cache should - * point to a {@code nullptr} object. - * Default: false + * Indicating if we'd put index/filter blocks to the block cache. + * If not specified, each "table reader" object will pre-load index/filter + * block during table initialization. * - * @param noBlockCache if use block cache + * @return if index and filter blocks should be put in block cache. + */ + public boolean cacheIndexAndFilterBlocks() { + return cacheIndexAndFilterBlocks; + } + + /** + * Indicating if we'd put index/filter blocks to the block cache. + * If not specified, each "table reader" object will pre-load index/filter + * block during table initialization. + * + * @param cacheIndexAndFilterBlocks and filter blocks should be put in block cache. * @return the reference to the current config. */ - public BlockBasedTableConfig setNoBlockCache(final boolean noBlockCache) { - noBlockCache_ = noBlockCache; + public BlockBasedTableConfig setCacheIndexAndFilterBlocks( + final boolean cacheIndexAndFilterBlocks) { + this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks; + return this; + } + + /** + * Indicates if index and filter blocks will be treated as high-priority in the block cache. + * See note below about applicability. If not specified, defaults to false. + * + * @return if index and filter blocks will be treated as high-priority. + */ + public boolean cacheIndexAndFilterBlocksWithHighPriority() { + return cacheIndexAndFilterBlocksWithHighPriority; + } + + /** + * If true, cache index and filter blocks with high priority. 
If set to true, + * depending on implementation of block cache, index and filter blocks may be + * less likely to be evicted than data blocks. + * + * @param cacheIndexAndFilterBlocksWithHighPriority if index and filter blocks + * will be treated as high-priority. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setCacheIndexAndFilterBlocksWithHighPriority( + final boolean cacheIndexAndFilterBlocksWithHighPriority) { + this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority; + return this; + } + + /** + * Indicating if we'd like to pin L0 index/filter blocks to the block cache. + If not specified, defaults to false. + * + * @return if L0 index and filter blocks should be pinned to the block cache. + */ + public boolean pinL0FilterAndIndexBlocksInCache() { + return pinL0FilterAndIndexBlocksInCache; + } + + /** + * Indicating if we'd like to pin L0 index/filter blocks to the block cache. + If not specified, defaults to false. + * + * @param pinL0FilterAndIndexBlocksInCache pin blocks in block cache + * @return the reference to the current config. + */ + public BlockBasedTableConfig setPinL0FilterAndIndexBlocksInCache( + final boolean pinL0FilterAndIndexBlocksInCache) { + this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache; + return this; + } + + /** + * Indicates if top-level index and filter blocks should be pinned. + * + * @return if top-level index and filter blocks should be pinned. + */ + public boolean pinTopLevelIndexAndFilter() { + return pinTopLevelIndexAndFilter; + } + + /** + * If cacheIndexAndFilterBlocks is true and the below is true, then + * the top-level index of partitioned filter and index blocks are stored in + * the cache, but a reference is held in the "table reader" object so the + * blocks are pinned and only evicted from cache when the table reader is + * freed. This is not limited to l0 in LSM tree. + * + * @param pinTopLevelIndexAndFilter if top-level index and filter blocks should be pinned. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setPinTopLevelIndexAndFilter(final boolean pinTopLevelIndexAndFilter) { + this.pinTopLevelIndexAndFilter = pinTopLevelIndexAndFilter; + return this; + } + + /** + * Get the index type. + * + * @return the currently set index type + */ + public IndexType indexType() { + return indexType; + } + + /** + * Sets the index type to used with this table. + * + * @param indexType {@link org.rocksdb.IndexType} value + * @return the reference to the current option. + */ + public BlockBasedTableConfig setIndexType( + final IndexType indexType) { + this.indexType = indexType; + return this; + } + + /** + * Get the data block index type. + * + * @return the currently set data block index type + */ + public DataBlockIndexType dataBlockIndexType() { + return dataBlockIndexType; + } + + /** + * Sets the data block index type to used with this table. + * + * @param dataBlockIndexType {@link org.rocksdb.DataBlockIndexType} value + * @return the reference to the current option. + */ + public BlockBasedTableConfig setDataBlockIndexType( + final DataBlockIndexType dataBlockIndexType) { + this.dataBlockIndexType = dataBlockIndexType; + return this; + } + + /** + * Get the #entries/#buckets. It is valid only when {@link #dataBlockIndexType()} is + * {@link DataBlockIndexType#kDataBlockBinaryAndHash}. + * + * @return the #entries/#buckets. 
+ */ + public double dataBlockHashTableUtilRatio() { + return dataBlockHashTableUtilRatio; + } + + /** + * Set the #entries/#buckets. It is valid only when {@link #dataBlockIndexType()} is + * {@link DataBlockIndexType#kDataBlockBinaryAndHash}. + * + * @param dataBlockHashTableUtilRatio #entries/#buckets + * @return the reference to the current option. + */ + public BlockBasedTableConfig setDataBlockHashTableUtilRatio( + final double dataBlockHashTableUtilRatio) { + this.dataBlockHashTableUtilRatio = dataBlockHashTableUtilRatio; + return this; + } + + /** + * Get the checksum type to be used with this table. + * + * @return the currently set checksum type + */ + public ChecksumType checksumType() { + return checksumType; + } + + /** + * Sets + * + * @param checksumType {@link org.rocksdb.ChecksumType} value. + * @return the reference to the current option. + */ + public BlockBasedTableConfig setChecksumType( + final ChecksumType checksumType) { + this.checksumType = checksumType; return this; } /** + * Determine if the block cache is disabled. + * * @return if block cache is disabled */ public boolean noBlockCache() { - return noBlockCache_; + return noBlockCache; } /** - * Set the amount of cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. - * DEFAULT: 8M + * Disable block cache. If this is set to true, + * then no block cache should be used, and the {@link #setBlockCache(Cache)} + * should point to a {@code null} object. * - * @param blockCacheSize block cache size in bytes + * Default: false + * + * @param noBlockCache if use block cache * @return the reference to the current config. */ - public BlockBasedTableConfig setBlockCacheSize(final long blockCacheSize) { - blockCacheSize_ = blockCacheSize; + public BlockBasedTableConfig setNoBlockCache(final boolean noBlockCache) { + this.noBlockCache = noBlockCache; return this; } /** - * @return block cache size in bytes + * Use the specified cache for blocks. + * When not null this take precedence even if the user sets a block cache size. + * + * {@link org.rocksdb.Cache} should not be disposed before options instances + * using this cache is disposed. + * + * {@link org.rocksdb.Cache} instance can be re-used in multiple options + * instances. + * + * @param blockCache {@link org.rocksdb.Cache} Cache java instance + * (e.g. LRUCache). + * + * @return the reference to the current config. */ - public long blockCacheSize() { - return blockCacheSize_; + public BlockBasedTableConfig setBlockCache(final Cache blockCache) { + this.blockCache = blockCache; + return this; } /** - * Controls the number of shards for the block cache. - * This is applied only if cacheSize is set to non-negative. + * Use the specified persistent cache. * - * @param blockCacheNumShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings." - * @return the reference to the current option. + * If {@code !null} use the specified cache for pages read from device, + * otherwise no page cache is used. + * + * @param persistentCache the persistent cache + * + * @return the reference to the current config. 
*/ - public BlockBasedTableConfig setCacheNumShardBits( - final int blockCacheNumShardBits) { - blockCacheNumShardBits_ = blockCacheNumShardBits; + public BlockBasedTableConfig setPersistentCache( + final PersistentCache persistentCache) { + this.persistentCache = persistentCache; return this; } /** - * Returns the number of shard bits used in the block cache. - * The resulting number of shards would be 2 ^ (returned value). - * Any negative number means use default settings. + * Use the specified cache for compressed blocks. * - * @return the number of shard bits used in the block cache. + * If {@code null}, RocksDB will not use a compressed block cache. + * + * Note: though it looks similar to {@link #setBlockCache(Cache)}, RocksDB + * doesn't put the same type of object there. + * + * {@link org.rocksdb.Cache} should not be disposed before options instances + * using this cache is disposed. + * + * {@link org.rocksdb.Cache} instance can be re-used in multiple options + * instances. + * + * @param blockCacheCompressed {@link org.rocksdb.Cache} Cache java instance + * (e.g. LRUCache). + * + * @return the reference to the current config. */ - public int cacheNumShardBits() { - return blockCacheNumShardBits_; + public BlockBasedTableConfig setBlockCacheCompressed( + final Cache blockCacheCompressed) { + this.blockCacheCompressed = blockCacheCompressed; + return this; } /** - * Approximate size of user data packed per block. Note that the + * Get the approximate size of user data packed per block. + * + * @return block size in bytes + */ + public long blockSize() { + return blockSize; + } + + /** + * Approximate size of user data packed per block. Note that the * block size specified here corresponds to uncompressed data. The * actual size of the unit read from disk may be smaller if * compression is enabled. This parameter can be changed dynamically. @@ -108,23 +338,24 @@ public int cacheNumShardBits() { * @return the reference to the current config. */ public BlockBasedTableConfig setBlockSize(final long blockSize) { - blockSize_ = blockSize; + this.blockSize = blockSize; return this; } /** - * @return block size in bytes + * @return the hash table ratio. */ - public long blockSize() { - return blockSize_; + public int blockSizeDeviation() { + return blockSizeDeviation; } /** * This is used to close a block before it reaches the configured - * 'block_size'. If the percentage of free space in the current block is less - * than this specified number and adding a new record to the block will - * exceed the configured block size, then this block will be closed and the - * new record will be written to the next block. + * {@link #blockSize()}. If the percentage of free space in the current block + * is less than this specified number and adding a new record to the block + * will exceed the configured block size, then this block will be closed and + * the new record will be written to the next block. + * * Default is 10. * * @param blockSizeDeviation the deviation to block size allowed @@ -132,55 +363,120 @@ public long blockSize() { */ public BlockBasedTableConfig setBlockSizeDeviation( final int blockSizeDeviation) { - blockSizeDeviation_ = blockSizeDeviation; + this.blockSizeDeviation = blockSizeDeviation; return this; } /** - * @return the hash table ratio. + * Get the block restart interval. 
+ * + * @return block restart interval */ - public int blockSizeDeviation() { - return blockSizeDeviation_; + public int blockRestartInterval() { + return blockRestartInterval; } /** - * Set block restart interval + * Set the block restart interval. * * @param restartInterval block restart interval. * @return the reference to the current config. */ public BlockBasedTableConfig setBlockRestartInterval( final int restartInterval) { - blockRestartInterval_ = restartInterval; + blockRestartInterval = restartInterval; return this; } /** - * @return block restart interval + * Get the index block restart interval. + * + * @return index block restart interval */ - public int blockRestartInterval() { - return blockRestartInterval_; + public int indexBlockRestartInterval() { + return indexBlockRestartInterval; } /** - * If true, place whole keys in the filter (not just prefixes). - * This must generally be true for gets to be efficient. - * Default: true + * Set the index block restart interval * - * @param wholeKeyFiltering if enable whole key filtering + * @param restartInterval index block restart interval. * @return the reference to the current config. */ - public BlockBasedTableConfig setWholeKeyFiltering( - final boolean wholeKeyFiltering) { - wholeKeyFiltering_ = wholeKeyFiltering; + public BlockBasedTableConfig setIndexBlockRestartInterval( + final int restartInterval) { + indexBlockRestartInterval = restartInterval; return this; } /** - * @return if whole key filtering is enabled + * Get the block size for partitioned metadata. + * + * @return block size for partitioned metadata. */ - public boolean wholeKeyFiltering() { - return wholeKeyFiltering_; + public long metadataBlockSize() { + return metadataBlockSize; + } + + /** + * Set block size for partitioned metadata. + * + * @param metadataBlockSize Partitioned metadata block size. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setMetadataBlockSize( + final long metadataBlockSize) { + this.metadataBlockSize = metadataBlockSize; + return this; + } + + /** + * Indicates if we're using partitioned filters. + * + * @return if we're using partition filters. + */ + public boolean partitionFilters() { + return partitionFilters; + } + + /** + * Use partitioned full filters for each SST file. This option is incompatible + * with block-based filters. + * + * Defaults to false. + * + * @param partitionFilters use partition filters. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setPartitionFilters(final boolean partitionFilters) { + this.partitionFilters = partitionFilters; + return this; + } + + /** + * Determine if delta encoding is being used to compress block keys. + * + * @return true if delta encoding is enabled, false otherwise. + */ + public boolean useDeltaEncoding() { + return useDeltaEncoding; + } + + /** + * Use delta encoding to compress keys in blocks. + * + * NOTE: {@link ReadOptions#pinData()} requires this option to be disabled. + * + * Default: true + * + * @param useDeltaEncoding true to enable delta encoding + * + * @return the reference to the current config. + */ + public BlockBasedTableConfig setUseDeltaEncoding( + final boolean useDeltaEncoding) { + this.useDeltaEncoding = useDeltaEncoding; + return this; } /** @@ -193,87 +489,274 @@ public boolean wholeKeyFiltering() { * {@link org.rocksdb.Filter} instance can be re-used in multiple options * instances. * - * @param filter {@link org.rocksdb.Filter} Filter Policy java instance. 
+ * @param filterPolicy {@link org.rocksdb.Filter} Filter Policy java instance. * @return the reference to the current config. */ + public BlockBasedTableConfig setFilterPolicy( + final Filter filterPolicy) { + this.filterPolicy = filterPolicy; + return this; + } + + /* + * @deprecated Use {@link #setFilterPolicy(Filter)} + */ + @Deprecated public BlockBasedTableConfig setFilter( final Filter filter) { - filter_ = filter; + return setFilterPolicy(filter); + } + + /** + * Determine if whole keys as opposed to prefixes are placed in the filter. + * + * @return if whole key filtering is enabled + */ + public boolean wholeKeyFiltering() { + return wholeKeyFiltering; + } + + /** + * If true, place whole keys in the filter (not just prefixes). + * This must generally be true for gets to be efficient. + * Default: true + * + * @param wholeKeyFiltering if enable whole key filtering + * @return the reference to the current config. + */ + public BlockBasedTableConfig setWholeKeyFiltering( + final boolean wholeKeyFiltering) { + this.wholeKeyFiltering = wholeKeyFiltering; return this; } /** - * Indicating if we'd put index/filter blocks to the block cache. - If not specified, each "table reader" object will pre-load index/filter - block during table initialization. + * Returns true when compression verification is enabled. * - * @return if index and filter blocks should be put in block cache. + * See {@link #setVerifyCompression(boolean)}. + * + * @return true if compression verification is enabled. */ - public boolean cacheIndexAndFilterBlocks() { - return cacheIndexAndFilterBlocks_; + public boolean verifyCompression() { + return verifyCompression; } /** - * Indicating if we'd put index/filter blocks to the block cache. - If not specified, each "table reader" object will pre-load index/filter - block during table initialization. + * Verify that decompressing the compressed block gives back the input. This + * is a verification mode that we use to detect bugs in compression + * algorithms. + * + * @param verifyCompression true to enable compression verification. * - * @param cacheIndexAndFilterBlocks and filter blocks should be put in block cache. * @return the reference to the current config. */ - public BlockBasedTableConfig setCacheIndexAndFilterBlocks( - final boolean cacheIndexAndFilterBlocks) { - cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks; + public BlockBasedTableConfig setVerifyCompression( + final boolean verifyCompression) { + this.verifyCompression = verifyCompression; return this; } /** - * Indicating if we'd like to pin L0 index/filter blocks to the block cache. - If not specified, defaults to false. + * Get the Read amplification bytes per-bit. * - * @return if L0 index and filter blocks should be pinned to the block cache. + * See {@link #setReadAmpBytesPerBit(int)}. + * + * @return the bytes per-bit. */ - public boolean pinL0FilterAndIndexBlocksInCache() { - return pinL0FilterAndIndexBlocksInCache_; + public int readAmpBytesPerBit() { + return readAmpBytesPerBit; } /** - * Indicating if we'd like to pin L0 index/filter blocks to the block cache. - If not specified, defaults to false. + * Set the Read amplification bytes per-bit. + * + * If used, For every data block we load into memory, we will create a bitmap + * of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap + * will be used to figure out the percentage we actually read of the blocks. 
+ * + * When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and + * Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the + * read amplification using this formula + * (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) + * + * value => memory usage (percentage of loaded blocks memory) + * 1 => 12.50 % + * 2 => 06.25 % + * 4 => 03.12 % + * 8 => 01.56 % + * 16 => 00.78 % + * + * Note: This number must be a power of 2, if not it will be sanitized + * to be the next lowest power of 2, for example a value of 7 will be + * treated as 4, a value of 19 will be treated as 16. + * + * Default: 0 (disabled) + * + * @param readAmpBytesPerBit the bytes per-bit * - * @param pinL0FilterAndIndexBlocksInCache pin blocks in block cache * @return the reference to the current config. */ - public BlockBasedTableConfig setPinL0FilterAndIndexBlocksInCache( - final boolean pinL0FilterAndIndexBlocksInCache) { - pinL0FilterAndIndexBlocksInCache_ = pinL0FilterAndIndexBlocksInCache; + public BlockBasedTableConfig setReadAmpBytesPerBit(final int readAmpBytesPerBit) { + this.readAmpBytesPerBit = readAmpBytesPerBit; return this; } /** - * Influence the behavior when kHashSearch is used. - if false, stores a precise prefix to block range mapping - if true, does not store prefix and allows prefix hash collision - (less memory consumption) + * Get the format version. + * See {@link #setFormatVersion(int)}. * - * @return if hash collisions should be allowed. + * @return the currently configured format version. */ - public boolean hashIndexAllowCollision() { - return hashIndexAllowCollision_; + public int formatVersion() { + return formatVersion; } /** - * Influence the behavior when kHashSearch is used. - if false, stores a precise prefix to block range mapping - if true, does not store prefix and allows prefix hash collision - (less memory consumption) + *
<p>We currently have five versions:</p>
+   *
+   * <ul>
+   * <li><strong>0</strong> - This version is currently written
+   * out by all RocksDB's versions by default. Can be read by really old
+   * RocksDB's. Doesn't support changing checksum (default is CRC32).</li>
+   * <li><strong>1</strong> - Can be read by RocksDB's versions since 3.0.
+   * Supports non-default checksum, like xxHash. It is written by RocksDB when
+   * BlockBasedTableOptions::checksum is something other than kCRC32c. (version
+   * 0 is silently upconverted)</li>
+   * <li><strong>2</strong> - Can be read by RocksDB's versions since 3.10.
+   * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib
+   * compression. If you don't plan to run RocksDB before version 3.10,
+   * you should probably use this.</li>
+   * <li><strong>3</strong> - Can be read by RocksDB's versions since 5.15. Changes the way we
+   * encode the keys in index blocks. If you don't plan to run RocksDB before
+   * version 5.15, you should probably use this.
+   * This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</li>
+   * <li><strong>4</strong> - Can be read by RocksDB's versions since 5.16. Changes the way we
+   * encode the values in index blocks. If you don't plan to run RocksDB before
+   * version 5.16 and you are using index_block_restart_interval > 1, you should
+   * probably use this as it would reduce the index size.</li>
+   * </ul>
+   * <p>This option only affects newly written tables. When reading existing
+   * tables, the information about version is read from the footer.</p>
+ * + * @param formatVersion integer representing the version to be used. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setFormatVersion( + final int formatVersion) { + assert(formatVersion >= 0 && formatVersion <= 4); + this.formatVersion = formatVersion; + return this; + } + + /** + * Determine if index compression is enabled. + * + * See {@link #setEnableIndexCompression(boolean)}. + * + * @return true if index compression is enabled, false otherwise + */ + public boolean enableIndexCompression() { + return enableIndexCompression; + } + + /** + * Store index blocks on disk in compressed format. + * + * Changing this option to false will avoid the overhead of decompression + * if index blocks are evicted and read back. + * + * @param enableIndexCompression true to enable index compression, + * false to disable + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setEnableIndexCompression( + final boolean enableIndexCompression) { + this.enableIndexCompression = enableIndexCompression; + return this; + } + + /** + * Determines whether data blocks are aligned on the lesser of page size + * and block size. + * + * @return true if data blocks are aligned on the lesser of page size + * and block size. + */ + public boolean blockAlign() { + return blockAlign; + } + + /** + * Set whether data blocks should be aligned on the lesser of page size + * and block size. + * + * @param blockAlign true to align data blocks on the lesser of page size + * and block size. + * + * @return the reference to the current option. + */ + public BlockBasedTableConfig setBlockAlign(final boolean blockAlign) { + this.blockAlign = blockAlign; + return this; + } + + + /** + * Get the size of the cache in bytes that will be used by RocksDB. + * + * @return block cache size in bytes + */ + @Deprecated + public long blockCacheSize() { + return blockCacheSize; + } + + /** + * Set the size of the cache in bytes that will be used by RocksDB. + * If cacheSize is non-positive, then cache will not be used. + * DEFAULT: 8M + * + * @param blockCacheSize block cache size in bytes * @return the reference to the current config. + * + * @deprecated Use {@link #setBlockCache(Cache)}. */ - public BlockBasedTableConfig setHashIndexAllowCollision( - final boolean hashIndexAllowCollision) { - hashIndexAllowCollision_ = hashIndexAllowCollision; + @Deprecated + public BlockBasedTableConfig setBlockCacheSize(final long blockCacheSize) { + this.blockCacheSize = blockCacheSize; + return this; + } + + /** + * Returns the number of shard bits used in the block cache. + * The resulting number of shards would be 2 ^ (returned value). + * Any negative number means use default settings. + * + * @return the number of shard bits used in the block cache. + */ + @Deprecated + public int cacheNumShardBits() { + return blockCacheNumShardBits; + } + + /** + * Controls the number of shards for the block cache. + * This is applied only if cacheSize is set to non-negative. + * + * @param blockCacheNumShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings." + * @return the reference to the current option. + * + * @deprecated Use {@link #setBlockCache(Cache)}. 
+ */ + @Deprecated + public BlockBasedTableConfig setCacheNumShardBits( + final int blockCacheNumShardBits) { + this.blockCacheNumShardBits = blockCacheNumShardBits; return this; } @@ -283,8 +766,9 @@ public BlockBasedTableConfig setHashIndexAllowCollision( * * @return size of compressed block cache. */ + @Deprecated public long blockCacheCompressedSize() { - return blockCacheCompressedSize_; + return blockCacheCompressedSize; } /** @@ -293,10 +777,13 @@ public long blockCacheCompressedSize() { * * @param blockCacheCompressedSize of compressed block cache. * @return the reference to the current config. + * + * @deprecated Use {@link #setBlockCacheCompressed(Cache)}. */ + @Deprecated public BlockBasedTableConfig setBlockCacheCompressedSize( final long blockCacheCompressedSize) { - blockCacheCompressedSize_ = blockCacheCompressedSize; + this.blockCacheCompressedSize = blockCacheCompressedSize; return this; } @@ -308,8 +795,9 @@ public BlockBasedTableConfig setBlockCacheCompressedSize( * number of shards would be 2 ^ numShardBits. Any negative * number means use default settings. */ + @Deprecated public int blockCacheCompressedNumShardBits() { - return blockCacheCompressedNumShardBits_; + return blockCacheCompressedNumShardBits; } /** @@ -320,133 +808,166 @@ public int blockCacheCompressedNumShardBits() { * number of shards would be 2 ^ numShardBits. Any negative * number means use default settings." * @return the reference to the current option. + * + * @deprecated Use {@link #setBlockCacheCompressed(Cache)}. */ + @Deprecated public BlockBasedTableConfig setBlockCacheCompressedNumShardBits( final int blockCacheCompressedNumShardBits) { - blockCacheCompressedNumShardBits_ = blockCacheCompressedNumShardBits; - return this; - } - - /** - * Sets the checksum type to be used with this table. - * - * @param checksumType {@link org.rocksdb.ChecksumType} value. - * @return the reference to the current option. - */ - public BlockBasedTableConfig setChecksumType( - final ChecksumType checksumType) { - checksumType_ = checksumType; + this.blockCacheCompressedNumShardBits = blockCacheCompressedNumShardBits; return this; } /** + * Influence the behavior when kHashSearch is used. + * if false, stores a precise prefix to block range mapping + * if true, does not store prefix and allows prefix hash collision + * (less memory consumption) * - * @return the currently set checksum type - */ - public ChecksumType checksumType() { - return checksumType_; - } - - /** - * Sets the index type to used with this table. + * @return if hash collisions should be allowed. * - * @param indexType {@link org.rocksdb.IndexType} value - * @return the reference to the current option. + * @deprecated This option is now deprecated. No matter what value it + * is set to, it will behave as + * if {@link #hashIndexAllowCollision()} == true. */ - public BlockBasedTableConfig setIndexType( - final IndexType indexType) { - indexType_ = indexType; - return this; + @Deprecated + public boolean hashIndexAllowCollision() { + return true; } /** + * Influence the behavior when kHashSearch is used. + * if false, stores a precise prefix to block range mapping + * if true, does not store prefix and allows prefix hash collision + * (less memory consumption) * - * @return the currently set index type - */ - public IndexType indexType() { - return indexType_; - } - - /** - *

-   * <p>We currently have three versions:</p>
+   * @param hashIndexAllowCollision points out if hash collisions should be allowed.
    *
-   * <ul>
-   * <li><b>0</b> - This version is currently written
-   * out by all RocksDB's versions by default. Can be read by really old
-   * RocksDB's. Doesn't support changing checksum (default is CRC32).</li>
-   * <li><b>1</b> - Can be read by RocksDB's versions since 3.0.
-   * Supports non-default checksum, like xxHash. It is written by RocksDB when
-   * BlockBasedTableOptions::checksum is something other than kCRC32c. (version
-   * 0 is silently upconverted)</li>
-   * <li><b>2</b> - Can be read by RocksDB's versions since 3.10.
-   * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib
-   * compression. If you don't plan to run RocksDB before version 3.10,
-   * you should probably use this.</li>
-   * </ul>
-   *
-   * <p>This option only affects newly written tables. When reading existing
-   * tables, the information about version is read from the footer.</p>

+ * @return the reference to the current config. * - * @param formatVersion integer representing the version to be used. - * @return the reference to the current option. + * @deprecated This option is now deprecated. No matter what value it + * is set to, it will behave as + * if {@link #hashIndexAllowCollision()} == true. */ - public BlockBasedTableConfig setFormatVersion( - final int formatVersion) { - assert(formatVersion >= 0 && formatVersion <= 2); - formatVersion_ = formatVersion; + @Deprecated + public BlockBasedTableConfig setHashIndexAllowCollision( + final boolean hashIndexAllowCollision) { + // no-op return this; } - /** - * - * @return the currently configured format version. - * See also: {@link #setFormatVersion(int)}. - */ - public int formatVersion() { - return formatVersion_; - } + @Override protected long newTableFactoryHandle() { + final long filterPolicyHandle; + if (filterPolicy != null) { + filterPolicyHandle = filterPolicy.nativeHandle_; + } else { + filterPolicyHandle = 0; + } + final long blockCacheHandle; + if (blockCache != null) { + blockCacheHandle = blockCache.nativeHandle_; + } else { + blockCacheHandle = 0; + } + final long persistentCacheHandle; + if (persistentCache != null) { + persistentCacheHandle = persistentCache.nativeHandle_; + } else { + persistentCacheHandle = 0; + } - @Override protected long newTableFactoryHandle() { - long filterHandle = 0; - if (filter_ != null) { - filterHandle = filter_.nativeHandle_; + final long blockCacheCompressedHandle; + if (blockCacheCompressed != null) { + blockCacheCompressedHandle = blockCacheCompressed.nativeHandle_; + } else { + blockCacheCompressedHandle = 0; } - return newTableFactoryHandle(noBlockCache_, blockCacheSize_, - blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, - blockRestartInterval_, wholeKeyFiltering_, - filterHandle, cacheIndexAndFilterBlocks_, - pinL0FilterAndIndexBlocksInCache_, - hashIndexAllowCollision_, blockCacheCompressedSize_, - blockCacheCompressedNumShardBits_, - checksumType_.getValue(), indexType_.getValue(), - formatVersion_); + return newTableFactoryHandle(cacheIndexAndFilterBlocks, + cacheIndexAndFilterBlocksWithHighPriority, + pinL0FilterAndIndexBlocksInCache, pinTopLevelIndexAndFilter, + indexType.getValue(), dataBlockIndexType.getValue(), + dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, + blockCacheHandle, persistentCacheHandle, blockCacheCompressedHandle, + blockSize, blockSizeDeviation, blockRestartInterval, + indexBlockRestartInterval, metadataBlockSize, partitionFilters, + useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, + verifyCompression, readAmpBytesPerBit, formatVersion, + enableIndexCompression, blockAlign, + blockCacheSize, blockCacheNumShardBits, + blockCacheCompressedSize, blockCacheCompressedNumShardBits); } private native long newTableFactoryHandle( - boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, - long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, long filterPolicyHandle, - boolean cacheIndexAndFilterBlocks, boolean pinL0FilterAndIndexBlocksInCache, - boolean hashIndexAllowCollision, long blockCacheCompressedSize, - int blockCacheCompressedNumShardBits, byte checkSumType, - byte indexType, int formatVersion); - - private boolean cacheIndexAndFilterBlocks_; - private boolean pinL0FilterAndIndexBlocksInCache_; - private IndexType indexType_; - private boolean hashIndexAllowCollision_; - private ChecksumType checksumType_; - private boolean noBlockCache_; - 
private long blockSize_; - private long blockCacheSize_; - private int blockCacheNumShardBits_; - private long blockCacheCompressedSize_; - private int blockCacheCompressedNumShardBits_; - private int blockSizeDeviation_; - private int blockRestartInterval_; - private Filter filter_; - private boolean wholeKeyFiltering_; - private int formatVersion_; + final boolean cacheIndexAndFilterBlocks, + final boolean cacheIndexAndFilterBlocksWithHighPriority, + final boolean pinL0FilterAndIndexBlocksInCache, + final boolean pinTopLevelIndexAndFilter, + final byte indexTypeValue, + final byte dataBlockIndexTypeValue, + final double dataBlockHashTableUtilRatio, + final byte checksumTypeValue, + final boolean noBlockCache, + final long blockCacheHandle, + final long persistentCacheHandle, + final long blockCacheCompressedHandle, + final long blockSize, + final int blockSizeDeviation, + final int blockRestartInterval, + final int indexBlockRestartInterval, + final long metadataBlockSize, + final boolean partitionFilters, + final boolean useDeltaEncoding, + final long filterPolicyHandle, + final boolean wholeKeyFiltering, + final boolean verifyCompression, + final int readAmpBytesPerBit, + final int formatVersion, + final boolean enableIndexCompression, + final boolean blockAlign, + + @Deprecated final long blockCacheSize, + @Deprecated final int blockCacheNumShardBits, + + @Deprecated final long blockCacheCompressedSize, + @Deprecated final int blockCacheCompressedNumShardBits + ); + + //TODO(AR) flushBlockPolicyFactory + private boolean cacheIndexAndFilterBlocks; + private boolean cacheIndexAndFilterBlocksWithHighPriority; + private boolean pinL0FilterAndIndexBlocksInCache; + private boolean pinTopLevelIndexAndFilter; + private IndexType indexType; + private DataBlockIndexType dataBlockIndexType; + private double dataBlockHashTableUtilRatio; + private ChecksumType checksumType; + private boolean noBlockCache; + private Cache blockCache; + private PersistentCache persistentCache; + private Cache blockCacheCompressed; + private long blockSize; + private int blockSizeDeviation; + private int blockRestartInterval; + private int indexBlockRestartInterval; + private long metadataBlockSize; + private boolean partitionFilters; + private boolean useDeltaEncoding; + private Filter filterPolicy; + private boolean wholeKeyFiltering; + private boolean verifyCompression; + private int readAmpBytesPerBit; + private int formatVersion; + private boolean enableIndexCompression; + private boolean blockAlign; + + // NOTE: ONLY used if blockCache == null + @Deprecated private long blockCacheSize; + @Deprecated private int blockCacheNumShardBits; + + // NOTE: ONLY used if blockCacheCompressed == null + @Deprecated private long blockCacheCompressedSize; + @Deprecated private int blockCacheCompressedNumShardBits; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java index 26bf358835..6c87cc1884 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java @@ -10,9 +10,10 @@ */ public class CassandraCompactionFilter extends AbstractCompactionFilter { - public CassandraCompactionFilter(boolean purgeTtlOnExpiration) { - super(createNewCassandraCompactionFilter0(purgeTtlOnExpiration)); + public CassandraCompactionFilter(boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds) { + 
super(createNewCassandraCompactionFilter0(purgeTtlOnExpiration, gcGracePeriodInSeconds)); } - private native static long createNewCassandraCompactionFilter0(boolean purgeTtlOnExpiration); + private native static long createNewCassandraCompactionFilter0( + boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java index a09556a2b8..4b0c71ba5a 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -10,11 +10,16 @@ * values. */ public class CassandraValueMergeOperator extends MergeOperator { - public CassandraValueMergeOperator() { - super(newSharedCassandraValueMergeOperator()); + public CassandraValueMergeOperator(int gcGracePeriodInSeconds) { + super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, 0)); } - private native static long newSharedCassandraValueMergeOperator(); + public CassandraValueMergeOperator(int gcGracePeriodInSeconds, int operandsLimit) { + super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, operandsLimit)); + } + + private native static long newSharedCassandraValueMergeOperator( + int gcGracePeriodInSeconds, int limit); @Override protected final native void disposeInternal(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java index d932fd9a92..8bb570e5d3 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Arrays; + /** *

 * <p>Describes a column family with a
 * name and respective Options.</p>
 */
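A minimal usage sketch of the accessors and value-based equality this diff adds to ColumnFamilyDescriptor (illustrative only, not part of the patch; the UTF-8 name encoding is an assumption):

    final ColumnFamilyOptions opts = new ColumnFamilyOptions();
    final ColumnFamilyDescriptor a =
        new ColumnFamilyDescriptor("cf".getBytes(java.nio.charset.StandardCharsets.UTF_8), opts);
    final ColumnFamilyDescriptor b =
        new ColumnFamilyDescriptor("cf".getBytes(java.nio.charset.StandardCharsets.UTF_8), opts);
    // equal: same name bytes and the same native options handle
    assert a.equals(b) && a.hashCode() == b.hashCode();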

@@ -32,7 +34,7 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName) { * @since 3.10.0 */ public ColumnFamilyDescriptor(final byte[] columnFamilyName, - final ColumnFamilyOptions columnFamilyOptions) { + final ColumnFamilyOptions columnFamilyOptions) { columnFamilyName_ = columnFamilyName; columnFamilyOptions_ = columnFamilyOptions; } @@ -43,19 +45,65 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName, * @return column family name. * @since 3.10.0 */ - public byte[] columnFamilyName() { + public byte[] getName() { return columnFamilyName_; } + /** + * Retrieve name of column family. + * + * @return column family name. + * @since 3.10.0 + * + * @deprecated Use {@link #getName()} instead. + */ + @Deprecated + public byte[] columnFamilyName() { + return getName(); + } + /** * Retrieve assigned options instance. * * @return Options instance assigned to this instance. */ - public ColumnFamilyOptions columnFamilyOptions() { + public ColumnFamilyOptions getOptions() { return columnFamilyOptions_; } + /** + * Retrieve assigned options instance. + * + * @return Options instance assigned to this instance. + * + * @deprecated Use {@link #getOptions()} instead. + */ + @Deprecated + public ColumnFamilyOptions columnFamilyOptions() { + return getOptions(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + final ColumnFamilyDescriptor that = (ColumnFamilyDescriptor) o; + return Arrays.equals(columnFamilyName_, that.columnFamilyName_) + && columnFamilyOptions_.nativeHandle_ == that.columnFamilyOptions_.nativeHandle_; + } + + @Override + public int hashCode() { + int result = (int) (columnFamilyOptions_.nativeHandle_ ^ (columnFamilyOptions_.nativeHandle_ >>> 32)); + result = 31 * result + Arrays.hashCode(columnFamilyName_); + return result; + } + private final byte[] columnFamilyName_; private final ColumnFamilyOptions columnFamilyOptions_; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 7726cc62d7..9cda136b79 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -5,6 +5,9 @@ package org.rocksdb; +import java.util.Arrays; +import java.util.Objects; + /** * ColumnFamilyHandle class to hold handles to underlying rocksdb * ColumnFamily Pointers. @@ -21,6 +24,73 @@ public class ColumnFamilyHandle extends RocksObject { this.rocksDB_ = rocksDB; } + /** + * Gets the name of the Column Family. + * + * @return The name of the Column Family. + * + * @throws RocksDBException if an error occurs whilst retrieving the name. + */ + public byte[] getName() throws RocksDBException { + return getName(nativeHandle_); + } + + /** + * Gets the ID of the Column Family. + * + * @return the ID of the Column Family. + */ + public int getID() { + return getID(nativeHandle_); + } + + /** + * Gets the up-to-date descriptor of the column family + * associated with this handle. Since it fills "*desc" with the up-to-date + * information, this call might internally lock and release DB mutex to + * access the up-to-date CF options. In addition, all the pointer-typed + * options cannot be referenced any longer than the original options exist. + * + * Note that this function is not supported in RocksDBLite. + * + * @return the up-to-date descriptor. 
+ * + * @throws RocksDBException if an error occurs whilst retrieving the + * descriptor. + */ + public ColumnFamilyDescriptor getDescriptor() throws RocksDBException { + assert(isOwningHandle()); + return getDescriptor(nativeHandle_); + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + final ColumnFamilyHandle that = (ColumnFamilyHandle) o; + try { + return rocksDB_.nativeHandle_ == that.rocksDB_.nativeHandle_ && + getID() == that.getID() && + Arrays.equals(getName(), that.getName()); + } catch (RocksDBException e) { + throw new RuntimeException("Cannot compare column family handles", e); + } + } + + @Override + public int hashCode() { + try { + return Objects.hash(getName(), getID(), rocksDB_.nativeHandle_); + } catch (RocksDBException e) { + throw new RuntimeException("Cannot calculate hash code of column family handle", e); + } + } + /** *

   * <p>Deletes underlying C++ iterator pointer.</p>

* @@ -36,6 +106,9 @@ protected void disposeInternal() { } } + private native byte[] getName(final long handle) throws RocksDBException; + private native int getID(final long handle); + private native ColumnFamilyDescriptor getDescriptor(final long handle) throws RocksDBException; @Override protected final native void disposeInternal(final long handle); private final RocksDB rocksDB_; diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java new file mode 100644 index 0000000000..1919040172 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java @@ -0,0 +1,70 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Arrays; +import java.util.List; + +/** + * The metadata that describes a column family. + */ +public class ColumnFamilyMetaData { + private final long size; + private final long fileCount; + private final byte[] name; + private final LevelMetaData[] levels; + + /** + * Called from JNI C++ + */ + private ColumnFamilyMetaData( + final long size, + final long fileCount, + final byte[] name, + final LevelMetaData[] levels) { + this.size = size; + this.fileCount = fileCount; + this.name = name; + this.levels = levels; + } + + /** + * The size of this column family in bytes, which is equal to the sum of + * the file size of its {@link #levels()}. + * + * @return the size of this column family + */ + public long size() { + return size; + } + + /** + * The number of files in this column family. + * + * @return the number of files + */ + public long fileCount() { + return fileCount; + } + + /** + * The name of the column family. + * + * @return the name + */ + public byte[] name() { + return name; + } + + /** + * The metadata of all levels in this column family. + * + * @return the levels metadata + */ + public List levels() { + return Arrays.asList(levels); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 647b92e16c..e577524637 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -27,12 +27,54 @@ public class ColumnFamilyOptions extends RocksObject * Construct ColumnFamilyOptions. * * This constructor will create (by allocating a block of memory) - * an {@code rocksdb::DBOptions} in the c++ side. + * an {@code rocksdb::ColumnFamilyOptions} in the c++ side. */ public ColumnFamilyOptions() { super(newColumnFamilyOptions()); } + /** + * Copy constructor for ColumnFamilyOptions. + * + * NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter, + * compaction_filter_factory and other pointers will be cloned! + * + * @param other The ColumnFamilyOptions to copy. 
+ */ + public ColumnFamilyOptions(ColumnFamilyOptions other) { + super(copyColumnFamilyOptions(other.nativeHandle_)); + this.memTableConfig_ = other.memTableConfig_; + this.tableFormatConfig_ = other.tableFormatConfig_; + this.comparator_ = other.comparator_; + this.compactionFilter_ = other.compactionFilter_; + this.compactionFilterFactory_ = other.compactionFilterFactory_; + this.compactionOptionsUniversal_ = other.compactionOptionsUniversal_; + this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_; + this.bottommostCompressionOptions_ = other.bottommostCompressionOptions_; + this.compressionOptions_ = other.compressionOptions_; + } + + /** + * Constructor from Options + * + * @param options The options. + */ + public ColumnFamilyOptions(final Options options) { + super(newColumnFamilyOptionsFromOptions(options.nativeHandle_)); + } + + /** + *

+   * <p>Constructor to be used by
+   * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)},
+   * {@link ColumnFamilyDescriptor#columnFamilyOptions()}
+   * and also called via JNI.</p>

+ * + * @param handle native handle to ColumnFamilyOptions instance. + */ + ColumnFamilyOptions(final long handle) { + super(handle); + } + /** *

Method to get a options instance by using pre-configured * property values. If one or many values are undefined in @@ -130,7 +172,8 @@ public ColumnFamilyOptions setComparator( public ColumnFamilyOptions setComparator( final AbstractComparator> comparator) { assert (isOwningHandle()); - setComparatorHandle(nativeHandle_, comparator.getNativeHandle()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_, + comparator.getComparatorType().getValue()); comparator_ = comparator; return this; } @@ -153,6 +196,7 @@ public ColumnFamilyOptions setMergeOperator( return this; } + @Override public ColumnFamilyOptions setCompactionFilter( final AbstractCompactionFilter> compactionFilter) { @@ -161,6 +205,26 @@ public ColumnFamilyOptions setCompactionFilter( return this; } + @Override + public AbstractCompactionFilter> compactionFilter() { + assert (isOwningHandle()); + return compactionFilter_; + } + + @Override + public ColumnFamilyOptions setCompactionFilterFactory(final AbstractCompactionFilterFactory> compactionFilterFactory) { + assert (isOwningHandle()); + setCompactionFilterFactoryHandle(nativeHandle_, compactionFilterFactory.nativeHandle_); + compactionFilterFactory_ = compactionFilterFactory; + return this; + } + + @Override + public AbstractCompactionFilterFactory> compactionFilterFactory() { + assert (isOwningHandle()); + return compactionFilterFactory_; + } + @Override public ColumnFamilyOptions setWriteBufferSize(final long writeBufferSize) { assert(isOwningHandle()); @@ -264,6 +328,20 @@ public CompressionType bottommostCompressionType() { bottommostCompressionType(nativeHandle_)); } + @Override + public ColumnFamilyOptions setBottommostCompressionOptions( + final CompressionOptions bottommostCompressionOptions) { + setBottommostCompressionOptions(nativeHandle_, + bottommostCompressionOptions.nativeHandle_); + this.bottommostCompressionOptions_ = bottommostCompressionOptions; + return this; + } + + @Override + public CompressionOptions bottommostCompressionOptions() { + return this.bottommostCompressionOptions_; + } + @Override public ColumnFamilyOptions setCompressionOptions( final CompressionOptions compressionOptions) { @@ -428,7 +506,7 @@ public ColumnFamilyOptions setCompactionStyle( @Override public CompactionStyle compactionStyle() { - return CompactionStyle.values()[compactionStyle(nativeHandle_)]; + return CompactionStyle.fromValue(compactionStyle(nativeHandle_)); } @Override @@ -697,6 +775,17 @@ public boolean reportBgIoStats() { return reportBgIoStats(nativeHandle_); } + @Override + public ColumnFamilyOptions setTtl(final long ttl) { + setTtl(nativeHandle_, ttl); + return this; + } + + @Override + public long ttl() { + return ttl(nativeHandle_); + } + @Override public ColumnFamilyOptions setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { @@ -735,20 +824,13 @@ public boolean forceConsistencyChecks() { return forceConsistencyChecks(nativeHandle_); } - /** - *

-   * <p>Private constructor to be used by
-   * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}</p>

- * - * @param handle native handle to ColumnFamilyOptions instance. - */ - private ColumnFamilyOptions(final long handle) { - super(handle); - } - private static native long getColumnFamilyOptionsFromProps( String optString); private static native long newColumnFamilyOptions(); + private static native long copyColumnFamilyOptions(final long handle); + private static native long newColumnFamilyOptionsFromOptions( + final long optionsHandle); @Override protected final native void disposeInternal(final long handle); private native void optimizeForSmallDb(final long handle); @@ -760,11 +842,13 @@ private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); private native void setComparatorHandle(long optHandle, - long comparatorHandle); + long comparatorHandle, byte comparatorType); private native void setMergeOperatorName(long handle, String name); private native void setMergeOperator(long handle, long mergeOperatorHandle); private native void setCompactionFilterHandle(long handle, long compactionFilterHandle); + private native void setCompactionFilterFactoryHandle(long handle, + long compactionFilterFactoryHandle); private native void setWriteBufferSize(long handle, long writeBufferSize) throws IllegalArgumentException; private native long writeBufferSize(long handle); @@ -782,6 +866,8 @@ private native void setCompressionPerLevel(long handle, private native void setBottommostCompressionType(long handle, byte bottommostCompressionType); private native byte bottommostCompressionType(long handle); + private native void setBottommostCompressionOptions(final long handle, + final long bottommostCompressionOptionsHandle); private native void setCompressionOptions(long handle, long compressionOptionsHandle); private native void useFixedLengthPrefixExtractor( @@ -889,6 +975,8 @@ private native void setCompactionPriority(final long handle, private native void setReportBgIoStats(final long handle, final boolean reportBgIoStats); private native boolean reportBgIoStats(final long handle); + private native void setTtl(final long handle, final long ttl); + private native long ttl(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -898,12 +986,16 @@ private native void setForceConsistencyChecks(final long handle, private native boolean forceConsistencyChecks(final long handle); // instance variables + // NOTE: If you add new member variables, please update the copy constructor above! 
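  // Illustrative sketch, not part of this patch: the copy constructor above is a
  // shallow copy, so native-pointer-backed members end up shared between the two
  // objects (StringAppendOperator here is assumed from the wider RocksDB Java API):
  //
  //   final ColumnFamilyOptions base = new ColumnFamilyOptions()
  //       .setMergeOperator(new StringAppendOperator());
  //   final ColumnFamilyOptions copy = new ColumnFamilyOptions(base);
  //   // base and copy now reference the same merge operator instance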
private MemTableConfig memTableConfig_; private TableFormatConfig tableFormatConfig_; private AbstractComparator> comparator_; private AbstractCompactionFilter> compactionFilter_; + private AbstractCompactionFilterFactory> + compactionFilterFactory_; private CompactionOptionsUniversal compactionOptionsUniversal_; private CompactionOptionsFIFO compactionOptionsFIFO_; + private CompressionOptions bottommostCompressionOptions_; private CompressionOptions compressionOptions_; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index 5cb68b4614..f88a21af2b 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -151,6 +151,60 @@ T setComparator( */ T setMergeOperator(MergeOperator mergeOperator); + /** + * A single CompactionFilter instance to call into during compaction. + * Allows an application to modify/delete a key-value during background + * compaction. + * + * If the client requires a new compaction filter to be used for different + * compaction runs, it can specify call + * {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)} + * instead. + * + * The client should specify only set one of the two. + * {@link #setCompactionFilter(AbstractCompactionFilter)} takes precedence + * over {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)} + * if the client specifies both. + * + * If multithreaded compaction is being used, the supplied CompactionFilter + * instance may be used from different threads concurrently and so should be thread-safe. + * + * @param compactionFilter {@link AbstractCompactionFilter} instance. + * @return the instance of the current object. + */ + T setCompactionFilter( + final AbstractCompactionFilter> compactionFilter); + + /** + * Accessor for the CompactionFilter instance in use. + * + * @return Reference to the CompactionFilter, or null if one hasn't been set. + */ + AbstractCompactionFilter> compactionFilter(); + + /** + * This is a factory that provides {@link AbstractCompactionFilter} objects + * which allow an application to modify/delete a key-value during background + * compaction. + * + * A new filter will be created on each compaction run. If multithreaded + * compaction is being used, each created CompactionFilter will only be used + * from a single thread and so does not need to be thread-safe. + * + * @param compactionFilterFactory {@link AbstractCompactionFilterFactory} instance. + * @return the instance of the current object. + */ + T setCompactionFilterFactory( + final AbstractCompactionFilterFactory> + compactionFilterFactory); + + /** + * Accessor for the CompactionFilterFactory instance in use. + * + * @return Reference to the CompactionFilterFactory, or null if one hasn't been set. + */ + AbstractCompactionFilterFactory> compactionFilterFactory(); + /** * This prefix-extractor uses the first n bytes of a key as its prefix. * @@ -345,6 +399,28 @@ T setBottommostCompressionType( */ CompressionType bottommostCompressionType(); + /** + * Set the options for compression algorithms used by + * {@link #bottommostCompressionType()} if it is enabled. + * + * To enable it, please see the definition of + * {@link CompressionOptions}. + * + * @param compressionOptions the bottom most compression options. + * + * @return the reference of the current options. 
+ */ + T setBottommostCompressionOptions( + final CompressionOptions compressionOptions); + + /** + * Get the bottom most compression options. + * + * See {@link #setBottommostCompressionOptions(CompressionOptions)}. + * + * @return the bottom most compression options. + */ + CompressionOptions bottommostCompressionOptions(); /** * Set the different options for compression algorithms diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java new file mode 100644 index 0000000000..c07bd96a55 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactRangeOptions.java @@ -0,0 +1,237 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * CompactRangeOptions is used by CompactRange() call. In the documentation of the methods "the compaction" refers to + * any compaction that is using this CompactRangeOptions. + */ +public class CompactRangeOptions extends RocksObject { + + private final static byte VALUE_kSkip = 0; + private final static byte VALUE_kIfHaveCompactionFilter = 1; + private final static byte VALUE_kForce = 2; + + // For level based compaction, we can configure if we want to skip/force bottommost level compaction. + // The order of this neum MUST follow the C++ layer. See BottommostLevelCompaction in db/options.h + public enum BottommostLevelCompaction { + /** + * Skip bottommost level compaction + */ + kSkip((byte)VALUE_kSkip), + /** + * Only compact bottommost level if there is a compaction filter. This is the default option + */ + kIfHaveCompactionFilter(VALUE_kIfHaveCompactionFilter), + /** + * Always compact bottommost level + */ + kForce(VALUE_kForce); + + private final byte value; + + BottommostLevelCompaction(final byte value) { + this.value = value; + } + + /** + *

+     * <p>Returns the byte value of the enumerations value.</p>
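     * <p>Illustrative note, not part of this patch: {@code getValue()} and
     * {@link #fromRocksId(int)} round-trip, e.g.
     * {@code fromRocksId(kForce.getValue()) == kForce}.</p>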

+ * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + * Returns the BottommostLevelCompaction for the given C++ rocks enum value. + * @param bottommostLevelCompaction The value of the BottommostLevelCompaction + * @return BottommostLevelCompaction instance, or null if none matches + */ + public static BottommostLevelCompaction fromRocksId(final int bottommostLevelCompaction) { + switch (bottommostLevelCompaction) { + case VALUE_kSkip: return kSkip; + case VALUE_kIfHaveCompactionFilter: return kIfHaveCompactionFilter; + case VALUE_kForce: return kForce; + default: return null; + } + } + } + + /** + * Construct CompactRangeOptions. + */ + public CompactRangeOptions() { + super(newCompactRangeOptions()); + } + + /** + * Returns whether the compaction is exclusive or other compactions may run concurrently at the same time. + * + * @return true if exclusive, false if concurrent + */ + public boolean exclusiveManualCompaction() { + return exclusiveManualCompaction(nativeHandle_); + } + + /** + * Sets whether the compaction is exclusive or other compaction are allowed run concurrently at the same time. + * + * @param exclusiveCompaction true if compaction should be exclusive + * @return This CompactRangeOptions + */ + public CompactRangeOptions setExclusiveManualCompaction(final boolean exclusiveCompaction) { + setExclusiveManualCompaction(nativeHandle_, exclusiveCompaction); + return this; + } + + /** + * Returns whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * @return true, if compacted files will be moved to the minimum level + */ + public boolean changeLevel() { + return changeLevel(nativeHandle_); + } + + /** + * Whether compacted files will be moved to the minimum level capable of holding the data or given level + * (specified non-negative target_level). + * + * @param changeLevel If true, compacted files will be moved to the minimum level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setChangeLevel(final boolean changeLevel) { + setChangeLevel(nativeHandle_, changeLevel); + return this; + } + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. + * @return The target level for the compacted files + */ + public int targetLevel() { + return targetLevel(nativeHandle_); + } + + + /** + * If change_level is true and target_level have non-negative value, compacted files will be moved to target_level. + * + * @param targetLevel target level for the compacted files + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetLevel(final int targetLevel) { + setTargetLevel(nativeHandle_, targetLevel); + return this; + } + + /** + * target_path_id for compaction output. Compaction outputs will be placed in options.db_paths[target_path_id]. + * + * @return target_path_id + */ + public int targetPathId() { + return targetPathId(nativeHandle_); + } + + /** + * Compaction outputs will be placed in options.db_paths[target_path_id]. Behavior is undefined if target_path_id is + * out of range. 
+ * + * @param targetPathId target path id + * @return This CompactRangeOptions + */ + public CompactRangeOptions setTargetPathId(final int targetPathId) { + setTargetPathId(nativeHandle_, targetPathId); + return this; + } + + /** + * Returns the policy for compacting the bottommost level + * @return The BottommostLevelCompaction policy + */ + public BottommostLevelCompaction bottommostLevelCompaction() { + return BottommostLevelCompaction.fromRocksId(bottommostLevelCompaction(nativeHandle_)); + } + + /** + * Sets the policy for compacting the bottommost level + * + * @param bottommostLevelCompaction The policy for compacting the bottommost level + * @return This CompactRangeOptions + */ + public CompactRangeOptions setBottommostLevelCompaction(final BottommostLevelCompaction bottommostLevelCompaction) { + setBottommostLevelCompaction(nativeHandle_, bottommostLevelCompaction.getValue()); + return this; + } + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * @return true if compaction will execute immediately + */ + public boolean allowWriteStall() { + return allowWriteStall(nativeHandle_); + } + + + /** + * If true, compaction will execute immediately even if doing so would cause the DB to + * enter write stall mode. Otherwise, it'll sleep until load is low enough. + * + * @return This CompactRangeOptions + * @param allowWriteStall true if compaction should execute immediately + */ + public CompactRangeOptions setAllowWriteStall(final boolean allowWriteStall) { + setAllowWriteStall(nativeHandle_, allowWriteStall); + return this; + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * @return number of subcompactions + */ + public int maxSubcompactions() { + return maxSubcompactions(nativeHandle_); + } + + /** + * If > 0, it will replace the option in the DBOptions for this compaction + * + * @param maxSubcompactions number of subcompactions + * @return This CompactRangeOptions + */ + public CompactRangeOptions setMaxSubcompactions(final int maxSubcompactions) { + setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; + } + + private native static long newCompactRangeOptions(); + @Override protected final native void disposeInternal(final long handle); + + private native boolean exclusiveManualCompaction(final long handle); + private native void setExclusiveManualCompaction(final long handle, + final boolean exclusive_manual_compaction); + private native boolean changeLevel(final long handle); + private native void setChangeLevel(final long handle, + final boolean changeLevel); + private native int targetLevel(final long handle); + private native void setTargetLevel(final long handle, + final int targetLevel); + private native int targetPathId(final long handle); + private native void setTargetPathId(final long handle, + final int targetPathId); + private native int bottommostLevelCompaction(final long handle); + private native void setBottommostLevelCompaction(final long handle, + final int bottommostLevelCompaction); + private native boolean allowWriteStall(final long handle); + private native void setAllowWriteStall(final long handle, + final boolean allowWriteStall); + private native void setMaxSubcompactions(final long handle, + final int maxSubcompactions); + private native int maxSubcompactions(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java 
b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java new file mode 100644 index 0000000000..8b59edc91d --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobInfo.java @@ -0,0 +1,159 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class CompactionJobInfo extends RocksObject { + + public CompactionJobInfo() { + super(newCompactionJobInfo()); + } + + /** + * Private as called from JNI C++ + */ + private CompactionJobInfo(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Get the name of the column family where the compaction happened. + * + * @return the name of the column family + */ + public byte[] columnFamilyName() { + return columnFamilyName(nativeHandle_); + } + + /** + * Get the status indicating whether the compaction was successful or not. + * + * @return the status + */ + public Status status() { + return status(nativeHandle_); + } + + /** + * Get the id of the thread that completed this compaction job. + * + * @return the id of the thread + */ + public long threadId() { + return threadId(nativeHandle_); + } + + /** + * Get the job id, which is unique in the same thread. + * + * @return the id of the thread + */ + public int jobId() { + return jobId(nativeHandle_); + } + + /** + * Get the smallest input level of the compaction. + * + * @return the input level + */ + public int baseInputLevel() { + return baseInputLevel(nativeHandle_); + } + + /** + * Get the output level of the compaction. + * + * @return the output level + */ + public int outputLevel() { + return outputLevel(nativeHandle_); + } + + /** + * Get the names of the compaction input files. + * + * @return the names of the input files. + */ + public List inputFiles() { + return Arrays.asList(inputFiles(nativeHandle_)); + } + + /** + * Get the names of the compaction output files. + * + * @return the names of the output files. + */ + public List outputFiles() { + return Arrays.asList(outputFiles(nativeHandle_)); + } + + /** + * Get the table properties for the input and output tables. + * + * The map is keyed by values from {@link #inputFiles()} and + * {@link #outputFiles()}. + * + * @return the table properties + */ + public Map tableProperties() { + return tableProperties(nativeHandle_); + } + + /** + * Get the Reason for running the compaction. + * + * @return the reason. + */ + public CompactionReason compactionReason() { + return CompactionReason.fromValue(compactionReason(nativeHandle_)); + } + + // + /** + * Get the compression algorithm used for output files. + * + * @return the compression algorithm + */ + public CompressionType compression() { + return CompressionType.getCompressionType(compression(nativeHandle_)); + } + + /** + * Get detailed information about this compaction. + * + * @return the detailed information, or null if not available. 
+ */ + public /* @Nullable */ CompactionJobStats stats() { + final long statsHandle = stats(nativeHandle_); + if (statsHandle == 0) { + return null; + } + + return new CompactionJobStats(statsHandle); + } + + + private static native long newCompactionJobInfo(); + @Override protected native void disposeInternal(final long handle); + + private static native byte[] columnFamilyName(final long handle); + private static native Status status(final long handle); + private static native long threadId(final long handle); + private static native int jobId(final long handle); + private static native int baseInputLevel(final long handle); + private static native int outputLevel(final long handle); + private static native String[] inputFiles(final long handle); + private static native String[] outputFiles(final long handle); + private static native Map tableProperties( + final long handle); + private static native byte compactionReason(final long handle); + private static native byte compression(final long handle); + private static native long stats(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java new file mode 100644 index 0000000000..3d53b5565e --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionJobStats.java @@ -0,0 +1,295 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class CompactionJobStats extends RocksObject { + + public CompactionJobStats() { + super(newCompactionJobStats()); + } + + /** + * Private as called from JNI C++ + */ + CompactionJobStats(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Reset the stats. + */ + public void reset() { + reset(nativeHandle_); + } + + /** + * Aggregate the CompactionJobStats from another instance with this one. + * + * @param compactionJobStats another instance of stats. + */ + public void add(final CompactionJobStats compactionJobStats) { + add(nativeHandle_, compactionJobStats.nativeHandle_); + } + + /** + * Get the elapsed time in micro of this compaction. + * + * @return the elapsed time in micro of this compaction. + */ + public long elapsedMicros() { + return elapsedMicros(nativeHandle_); + } + + /** + * Get the number of compaction input records. + * + * @return the number of compaction input records. + */ + public long numInputRecords() { + return numInputRecords(nativeHandle_); + } + + /** + * Get the number of compaction input files. + * + * @return the number of compaction input files. + */ + public long numInputFiles() { + return numInputFiles(nativeHandle_); + } + + /** + * Get the number of compaction input files at the output level. + * + * @return the number of compaction input files at the output level. + */ + public long numInputFilesAtOutputLevel() { + return numInputFilesAtOutputLevel(nativeHandle_); + } + + /** + * Get the number of compaction output records. + * + * @return the number of compaction output records. + */ + public long numOutputRecords() { + return numOutputRecords(nativeHandle_); + } + + /** + * Get the number of compaction output files. + * + * @return the number of compaction output files. 
+ */ + public long numOutputFiles() { + return numOutputFiles(nativeHandle_); + } + + /** + * Determine if the compaction is a manual compaction. + * + * @return true if the compaction is a manual compaction, false otherwise. + */ + public boolean isManualCompaction() { + return isManualCompaction(nativeHandle_); + } + + /** + * Get the size of the compaction input in bytes. + * + * @return the size of the compaction input in bytes. + */ + public long totalInputBytes() { + return totalInputBytes(nativeHandle_); + } + + /** + * Get the size of the compaction output in bytes. + * + * @return the size of the compaction output in bytes. + */ + public long totalOutputBytes() { + return totalOutputBytes(nativeHandle_); + } + + /** + * Get the number of records being replaced by newer record associated + * with same key. + * + * This could be a new value or a deletion entry for that key so this field + * sums up all updated and deleted keys. + * + * @return the number of records being replaced by newer record associated + * with same key. + */ + public long numRecordsReplaced() { + return numRecordsReplaced(nativeHandle_); + } + + /** + * Get the sum of the uncompressed input keys in bytes. + * + * @return the sum of the uncompressed input keys in bytes. + */ + public long totalInputRawKeyBytes() { + return totalInputRawKeyBytes(nativeHandle_); + } + + /** + * Get the sum of the uncompressed input values in bytes. + * + * @return the sum of the uncompressed input values in bytes. + */ + public long totalInputRawValueBytes() { + return totalInputRawValueBytes(nativeHandle_); + } + + /** + * Get the number of deletion entries before compaction. + * + * Deletion entries can disappear after compaction because they expired. + * + * @return the number of deletion entries before compaction. + */ + public long numInputDeletionRecords() { + return numInputDeletionRecords(nativeHandle_); + } + + /** + * Get the number of deletion records that were found obsolete and discarded + * because it is not possible to delete any more keys with this entry. + * (i.e. all possible deletions resulting from it have been completed) + * + * @return the number of deletion records that were found obsolete and + * discarded. + */ + public long numExpiredDeletionRecords() { + return numExpiredDeletionRecords(nativeHandle_); + } + + /** + * Get the number of corrupt keys (ParseInternalKey returned false when + * applied to the key) encountered and written out. + * + * @return the number of corrupt keys. + */ + public long numCorruptKeys() { + return numCorruptKeys(nativeHandle_); + } + + /** + * Get the Time spent on file's Append() call. + * + * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set. + * + * @return the Time spent on file's Append() call. + */ + public long fileWriteNanos() { + return fileWriteNanos(nativeHandle_); + } + + /** + * Get the Time spent on sync file range. + * + * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set. + * + * @return the Time spent on sync file range. + */ + public long fileRangeSyncNanos() { + return fileRangeSyncNanos(nativeHandle_); + } + + /** + * Get the Time spent on file fsync. + * + * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set. + * + * @return the Time spent on file fsync. + */ + public long fileFsyncNanos() { + return fileFsyncNanos(nativeHandle_); + } + + /** + * Get the Time spent on preparing file write (falocate, etc) + * + * Only populated if {@link ColumnFamilyOptions#reportBgIoStats()} is set. 
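   * <p>Illustrative sketch, not part of this patch: the file-timing counters in
   * this class are only populated when background I/O stats are enabled:</p>
   * <pre>{@code
   *   final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
   *       .setReportBgIoStats(true);
   * }</pre>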
+ * + * @return the Time spent on preparing file write (falocate, etc). + */ + public long filePrepareWriteNanos() { + return filePrepareWriteNanos(nativeHandle_); + } + + /** + * Get the smallest output key prefix. + * + * @return the smallest output key prefix. + */ + public byte[] smallestOutputKeyPrefix() { + return smallestOutputKeyPrefix(nativeHandle_); + } + + /** + * Get the largest output key prefix. + * + * @return the smallest output key prefix. + */ + public byte[] largestOutputKeyPrefix() { + return largestOutputKeyPrefix(nativeHandle_); + } + + /** + * Get the number of single-deletes which do not meet a put. + * + * @return number of single-deletes which do not meet a put. + */ + @Experimental("Performance optimization for a very specific workload") + public long numSingleDelFallthru() { + return numSingleDelFallthru(nativeHandle_); + } + + /** + * Get the number of single-deletes which meet something other than a put. + * + * @return the number of single-deletes which meet something other than a put. + */ + @Experimental("Performance optimization for a very specific workload") + public long numSingleDelMismatch() { + return numSingleDelMismatch(nativeHandle_); + } + + private static native long newCompactionJobStats(); + @Override protected native void disposeInternal(final long handle); + + + private static native void reset(final long handle); + private static native void add(final long handle, + final long compactionJobStatsHandle); + private static native long elapsedMicros(final long handle); + private static native long numInputRecords(final long handle); + private static native long numInputFiles(final long handle); + private static native long numInputFilesAtOutputLevel(final long handle); + private static native long numOutputRecords(final long handle); + private static native long numOutputFiles(final long handle); + private static native boolean isManualCompaction(final long handle); + private static native long totalInputBytes(final long handle); + private static native long totalOutputBytes(final long handle); + private static native long numRecordsReplaced(final long handle); + private static native long totalInputRawKeyBytes(final long handle); + private static native long totalInputRawValueBytes(final long handle); + private static native long numInputDeletionRecords(final long handle); + private static native long numExpiredDeletionRecords(final long handle); + private static native long numCorruptKeys(final long handle); + private static native long fileWriteNanos(final long handle); + private static native long fileRangeSyncNanos(final long handle); + private static native long fileFsyncNanos(final long handle); + private static native long filePrepareWriteNanos(final long handle); + private static native byte[] smallestOutputKeyPrefix(final long handle); + private static native byte[] largestOutputKeyPrefix(final long handle); + private static native long numSingleDelFallthru(final long handle); + private static native long numSingleDelMismatch(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java new file mode 100644 index 0000000000..2c7e391fbf --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptions.java @@ -0,0 +1,121 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * CompactionOptions are used in + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo)} + * calls. + */ +public class CompactionOptions extends RocksObject { + + public CompactionOptions() { + super(newCompactionOptions()); + } + + /** + * Get the compaction output compression type. + * + * See {@link #setCompression(CompressionType)}. + * + * @return the compression type. + */ + public CompressionType compression() { + return CompressionType.getCompressionType( + compression(nativeHandle_)); + } + + /** + * Set the compaction output compression type. + * + * Default: snappy + * + * If set to {@link CompressionType#DISABLE_COMPRESSION_OPTION}, + * RocksDB will choose compression type according to the + * {@link ColumnFamilyOptions#compressionType()}, taking into account + * the output level if {@link ColumnFamilyOptions#compressionPerLevel()} + * is specified. + * + * @param compression the compression type to use for compaction output. + * + * @return the instance of the current Options. + */ + public CompactionOptions setCompression(final CompressionType compression) { + setCompression(nativeHandle_, compression.getValue()); + return this; + } + + /** + * Get the compaction output file size limit. + * + * See {@link #setOutputFileSizeLimit(long)}. + * + * @return the file size limit. + */ + public long outputFileSizeLimit() { + return outputFileSizeLimit(nativeHandle_); + } + + /** + * Compaction will create files of size {@link #outputFileSizeLimit()}. + * + * Default: 2^64-1, which means that compaction will create a single file + * + * @param outputFileSizeLimit the size limit + * + * @return the instance of the current Options. + */ + public CompactionOptions setOutputFileSizeLimit( + final long outputFileSizeLimit) { + setOutputFileSizeLimit(nativeHandle_, outputFileSizeLimit); + return this; + } + + /** + * Get the maximum number of threads that will concurrently perform a + * compaction job. + * + * @return the maximum number of threads. + */ + public int maxSubcompactions() { + return maxSubcompactions(nativeHandle_); + } + + /** + * This value represents the maximum number of threads that will + * concurrently perform a compaction job by breaking it into multiple, + * smaller ones that are run simultaneously. + * + * Default: 0 (i.e. no subcompactions) + * + * If > 0, it will replace the option in + * {@link DBOptions#maxSubcompactions()} for this compaction. + * + * @param maxSubcompactions The maximum number of threads that will + * concurrently perform a compaction job + * + * @return the instance of the current Options. 
+ */ + public CompactionOptions setMaxSubcompactions(final int maxSubcompactions) { + setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; + } + + private static native long newCompactionOptions(); + @Override protected final native void disposeInternal(final long handle); + + private static native byte compression(final long handle); + private static native void setCompression(final long handle, + final byte compressionTypeValue); + private static native long outputFileSizeLimit(final long handle); + private static native void setOutputFileSizeLimit(final long handle, + final long outputFileSizeLimit); + private static native int maxSubcompactions(final long handle); + private static native void setMaxSubcompactions(final long handle, + final int maxSubcompactions); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java index f795807804..4c8d6545cb 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java @@ -42,9 +42,48 @@ public long maxTableFilesSize() { return maxTableFilesSize(nativeHandle_); } - private native void setMaxTableFilesSize(long handle, long maxTableFilesSize); - private native long maxTableFilesSize(long handle); + /** + * If true, try to do compaction to compact smaller files into larger ones. + * Minimum files to compact follows options.level0_file_num_compaction_trigger + * and compaction won't trigger if average compact bytes per del file is + * larger than options.write_buffer_size. This is to protect large files + * from being compacted again. + * + * Default: false + * + * @param allowCompaction true to allow intra-L0 compaction + * + * @return the reference to the current options. + */ + public CompactionOptionsFIFO setAllowCompaction( + final boolean allowCompaction) { + setAllowCompaction(nativeHandle_, allowCompaction); + return this; + } + + + /** + * Check if intra-L0 compaction is enabled. + * When enabled, we try to compact smaller files into larger ones. + * + * See {@link #setAllowCompaction(boolean)}. + * + * Default: false + * + * @return true if intra-L0 compaction is enabled, false otherwise. + */ + public boolean allowCompaction() { + return allowCompaction(nativeHandle_); + } + private native static long newCompactionOptionsFIFO(); @Override protected final native void disposeInternal(final long handle); + + private native void setMaxTableFilesSize(final long handle, + final long maxTableFilesSize); + private native long maxTableFilesSize(final long handle); + private native void setAllowCompaction(final long handle, + final boolean allowCompaction); + private native boolean allowCompaction(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java new file mode 100644 index 0000000000..f18c481222 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionReason.java @@ -0,0 +1,115 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
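// Illustrative sketch, not part of this patch: CompactionReason constants map to
// the byte values declared below and can be recovered via fromValue(byte), e.g.
//
//   assert CompactionReason.fromValue((byte) 0x9) == CompactionReason.kManualCompaction;
//
// fromValue throws IllegalArgumentException when no constant matches the byte.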
+ +package org.rocksdb; + +public enum CompactionReason { + kUnknown((byte)0x0), + + /** + * [Level] number of L0 files > level0_file_num_compaction_trigger + */ + kLevelL0FilesNum((byte)0x1), + + /** + * [Level] total size of level > MaxBytesForLevel() + */ + kLevelMaxLevelSize((byte)0x2), + + /** + * [Universal] Compacting for size amplification + */ + kUniversalSizeAmplification((byte)0x3), + + /** + * [Universal] Compacting for size ratio + */ + kUniversalSizeRatio((byte)0x4), + + /** + * [Universal] number of sorted runs > level0_file_num_compaction_trigger + */ + kUniversalSortedRunNum((byte)0x5), + + /** + * [FIFO] total size > max_table_files_size + */ + kFIFOMaxSize((byte)0x6), + + /** + * [FIFO] reduce number of files. + */ + kFIFOReduceNumFiles((byte)0x7), + + /** + * [FIFO] files with creation time < (current_time - interval) + */ + kFIFOTtl((byte)0x8), + + /** + * Manual compaction + */ + kManualCompaction((byte)0x9), + + /** + * DB::SuggestCompactRange() marked files for compaction + */ + kFilesMarkedForCompaction((byte)0x10), + + /** + * [Level] Automatic compaction within bottommost level to cleanup duplicate + * versions of same user key, usually due to a released snapshot. + */ + kBottommostFiles((byte)0x0A), + + /** + * Compaction based on TTL + */ + kTtl((byte)0x0B), + + /** + * According to the comments in flush_job.cc, RocksDB treats flush as + * a level 0 compaction in internal stats. + */ + kFlush((byte)0x0C), + + /** + * Compaction caused by external sst file ingestion + */ + kExternalSstIngestion((byte)0x0D); + + private final byte value; + + CompactionReason(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the CompactionReason from the internal representation value. + * + * @return the compaction reason. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static CompactionReason fromValue(final byte value) { + for (final CompactionReason compactionReason : CompactionReason.values()) { + if(compactionReason.value == value) { + return compactionReason; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for CompactionReason: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java index 5e13363c44..b24bbf8509 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompactionStyle.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.List; + /** * Enum CompactionStyle * @@ -21,6 +23,9 @@ * compaction strategy. It is suited for keeping event log data with * very low overhead (query log for example). It periodically deletes * the old data, so it's basically a TTL compaction style. + *
+ * <li>NONE - Disable background compaction. Compaction jobs are submitted via + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo)}.</li>
+ * </ol> * * @see RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo) */ public enum CompactionStyle { - LEVEL((byte) 0), - UNIVERSAL((byte) 1), - FIFO((byte) 2); + LEVEL((byte) 0x0), + UNIVERSAL((byte) 0x1), + FIFO((byte) 0x2), + NONE((byte) 0x3); - private final byte value_; + private final byte value; - private CompactionStyle(byte value) { - value_ = value; + CompactionStyle(final byte value) { + this.value = value; } /** - * Returns the byte value of the enumerations value + * Get the internal representation value. * - * @return byte representation + * @return the internal representation value. */ + //TODO(AR) should be made package-private public byte getValue() { - return value_; + return value; + } + + /** + * Get the Compaction style from the internal representation value. + * + * @param value the internal representation value. + * + * @return the Compaction style + * + * @throws IllegalArgumentException if the value does not match a + * CompactionStyle + */ + static CompactionStyle fromValue(final byte value) + throws IllegalArgumentException { + for (final CompactionStyle compactionStyle : CompactionStyle.values()) { + if (compactionStyle.value == value) { + return compactionStyle; + } + } + throw new IllegalArgumentException("Unknown value for CompactionStyle: " + + value); } } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Comparator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Comparator.java index 817e00fd27..4d06073f26 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Comparator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Comparator.java @@ -16,16 +16,18 @@ */ public abstract class Comparator extends AbstractComparator { - private final long nativeHandle_; - public Comparator(final ComparatorOptions copt) { - super(); - this.nativeHandle_ = createNewComparator0(copt.nativeHandle_); + super(copt); + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewComparator0(nativeParameterHandles[0]); } @Override - protected final long getNativeHandle() { - return nativeHandle_; + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_COMPARATOR; } private native long createNewComparator0(final long comparatorOptionsHandle); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java new file mode 100644 index 0000000000..df8b475907 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ComparatorType.java @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +enum ComparatorType { + JAVA_COMPARATOR((byte)0x0), + JAVA_DIRECT_COMPARATOR((byte)0x1), + JAVA_NATIVE_COMPARATOR_WRAPPER((byte)0x2); + + private final byte value; + + ComparatorType(final byte value) { + this.value = value; + } + + /** + *

<p>Returns the byte value of the enumeration's value.</p>

    + * + * @return byte representation + */ + byte getValue() { + return value; + } + + /** + *

<p>Get the ComparatorType enumeration value by + * passing the byte identifier to this method.</p>

    + * + * @param byteIdentifier of ComparatorType. + * + * @return ComparatorType instance. + * + * @throws IllegalArgumentException if the comparator type for the byteIdentifier + * cannot be found + */ + static ComparatorType getComparatorType(final byte byteIdentifier) { + for (final ComparatorType comparatorType : ComparatorType.values()) { + if (comparatorType.getValue() == byteIdentifier) { + return comparatorType; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for ComparatorType."); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java index 4927770e52..a9072bbb97 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/CompressionOptions.java @@ -71,6 +71,67 @@ public int maxDictBytes() { return maxDictBytes(nativeHandle_); } + /** + * Maximum size of training data passed to zstd's dictionary trainer. Using + * zstd's dictionary trainer can achieve even better compression ratio + * improvements than using {@link #setMaxDictBytes(int)} alone. + * + * The training data will be used to generate a dictionary + * of {@link #maxDictBytes()}. + * + * Default: 0. + * + * @param zstdMaxTrainBytes Maximum bytes to use for training ZStd. + * + * @return the reference to the current options + */ + public CompressionOptions setZStdMaxTrainBytes(final int zstdMaxTrainBytes) { + setZstdMaxTrainBytes(nativeHandle_, zstdMaxTrainBytes); + return this; + } + + /** + * Maximum size of training data passed to zstd's dictionary trainer. + * + * @return Maximum bytes to use for training ZStd + */ + public int zstdMaxTrainBytes() { + return zstdMaxTrainBytes(nativeHandle_); + } + + /** + * When the compression options are set by the user, it will be set to "true". + * For bottommost_compression_opts, to enable it, user must set enabled=true. + * Otherwise, bottommost compression will use compression_opts as default + * compression options. + * + * For compression_opts, if compression_opts.enabled=false, it is still + * used as compression options for compression process. + * + * Default: false. + * + * @param enabled true to use these compression options + * for the bottommost_compression_opts, false otherwise + * + * @return the reference to the current options + */ + public CompressionOptions setEnabled(final boolean enabled) { + setEnabled(nativeHandle_, enabled); + return this; + } + + /** + * Determine whether these compression options + * are used for the bottommost_compression_opts. 
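To make the interplay of setMaxDictBytes, setZStdMaxTrainBytes and setEnabled above concrete — a hedged sketch, assuming ColumnFamilyOptions#setBottommostCompressionOptions is available in this RocksJava version; the sizes are illustrative, not tuning advice:

```java
import org.rocksdb.ColumnFamilyOptions;
import org.rocksdb.CompressionOptions;
import org.rocksdb.CompressionType;
import org.rocksdb.RocksDB;

public class ZstdDictionarySketch {
  public static void main(final String[] args) {
    RocksDB.loadLibrary();
    try (final CompressionOptions bottommostOpts = new CompressionOptions()
             .setMaxDictBytes(16 * 1024)             // dictionary size produced by the trainer
             .setZStdMaxTrainBytes(100 * 16 * 1024)  // training-sample budget for zstd
             .setEnabled(true);                      // opt in for bottommost_compression_opts
         final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
             .setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION)
             .setBottommostCompressionOptions(bottommostOpts)) {
      // cfOpts would now be handed to a ColumnFamilyDescriptor for RocksDB.open(...).
    }
  }
}
```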
+ * + * @return true if these compression options are used + * for the bottommost_compression_opts, false otherwise + */ + public boolean enabled() { + return enabled(nativeHandle_); + } + + private native static long newCompressionOptions(); @Override protected final native void disposeInternal(final long handle); @@ -82,4 +143,9 @@ public int maxDictBytes() { private native int strategy(final long handle); private native void setMaxDictBytes(final long handle, final int maxDictBytes); private native int maxDictBytes(final long handle); + private native void setZstdMaxTrainBytes(final long handle, + final int zstdMaxTrainBytes); + private native int zstdMaxTrainBytes(final long handle); + private native void setEnabled(final long handle, final boolean enabled); + private native boolean enabled(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java index 14f0c6c7c9..e2c4c02b32 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptions.java @@ -15,8 +15,9 @@ * If {@link #dispose()} function is not called, then it will be GC'd * automatically and native resources will be released as part of the process. */ -public class DBOptions - extends RocksObject implements DBOptionsInterface { +public class DBOptions extends RocksObject + implements DBOptionsInterface, + MutableDBOptionsInterface { static { RocksDB.loadLibrary(); } @@ -32,6 +33,33 @@ public DBOptions() { numShardBits_ = DEFAULT_NUM_SHARD_BITS; } + /** + * Copy constructor for DBOptions. + * + * NOTE: This does a shallow copy, which means env, rate_limiter, sst_file_manager, + * info_log and other pointers will be cloned! + * + * @param other The DBOptions to copy. + */ + public DBOptions(DBOptions other) { + super(copyDBOptions(other.nativeHandle_)); + this.env_ = other.env_; + this.numShardBits_ = other.numShardBits_; + this.rateLimiter_ = other.rateLimiter_; + this.rowCache_ = other.rowCache_; + this.walFilter_ = other.walFilter_; + this.writeBufferManager_ = other.writeBufferManager_; + } + + /** + * Constructor from Options + * + * @param options The options. + */ + public DBOptions(final Options options) { + super(newDBOptionsFromOptions(options.nativeHandle_)); + } + /** *

    Method to get a options instance by using pre-configured * property values. If one or many values are undefined in @@ -114,18 +142,6 @@ public boolean createMissingColumnFamilies() { return createMissingColumnFamilies(nativeHandle_); } - @Override - public DBOptions setEnv(final Env env) { - setEnv(nativeHandle_, env.nativeHandle_); - this.env_ = env; - return this; - } - - @Override - public Env getEnv() { - return env_; - } - @Override public DBOptions setErrorIfExists( final boolean errorIfExists) { @@ -154,6 +170,18 @@ public boolean paranoidChecks() { return paranoidChecks(nativeHandle_); } + @Override + public DBOptions setEnv(final Env env) { + setEnv(nativeHandle_, env.nativeHandle_); + this.env_ = env; + return this; + } + + @Override + public Env getEnv() { + return env_; + } + @Override public DBOptions setRateLimiter(final RateLimiter rateLimiter) { assert(isOwningHandle()); @@ -162,6 +190,13 @@ public DBOptions setRateLimiter(final RateLimiter rateLimiter) { return this; } + @Override + public DBOptions setSstFileManager(final SstFileManager sstFileManager) { + assert(isOwningHandle()); + setSstFileManager(nativeHandle_, sstFileManager.nativeHandle_); + return this; + } + @Override public DBOptions setLogger(final Logger logger) { assert(isOwningHandle()); @@ -262,8 +297,8 @@ public DBOptions setDbPaths(final Collection dbPaths) { assert(isOwningHandle()); final int len = dbPaths.size(); - final String paths[] = new String[len]; - final long targetSizes[] = new long[len]; + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; int i = 0; for(final DbPath dbPath : dbPaths) { @@ -281,8 +316,8 @@ public List dbPaths() { if(len == 0) { return Collections.emptyList(); } else { - final String paths[] = new String[len]; - final long targetSizes[] = new long[len]; + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; dbPaths(nativeHandle_, paths, targetSizes); @@ -336,6 +371,19 @@ public long deleteObsoleteFilesPeriodMicros() { return deleteObsoleteFilesPeriodMicros(nativeHandle_); } + @Override + public DBOptions setMaxBackgroundJobs(final int maxBackgroundJobs) { + assert(isOwningHandle()); + setMaxBackgroundJobs(nativeHandle_, maxBackgroundJobs); + return this; + } + + @Override + public int maxBackgroundJobs() { + assert(isOwningHandle()); + return maxBackgroundJobs(nativeHandle_); + } + @Override public void setBaseBackgroundCompactions( final int baseBackgroundCompactions) { @@ -364,9 +412,10 @@ public int maxBackgroundCompactions() { } @Override - public void setMaxSubcompactions(final int maxSubcompactions) { + public DBOptions setMaxSubcompactions(final int maxSubcompactions) { assert(isOwningHandle()); setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; } @Override @@ -390,8 +439,7 @@ public int maxBackgroundFlushes() { } @Override - public DBOptions setMaxLogFileSize( - final long maxLogFileSize) { + public DBOptions setMaxLogFileSize(final long maxLogFileSize) { assert(isOwningHandle()); setMaxLogFileSize(nativeHandle_, maxLogFileSize); return this; @@ -515,73 +563,73 @@ public long manifestPreallocationSize() { } @Override - public DBOptions setUseDirectReads( - final boolean useDirectReads) { + public DBOptions setAllowMmapReads( + final boolean allowMmapReads) { assert(isOwningHandle()); - setUseDirectReads(nativeHandle_, useDirectReads); + setAllowMmapReads(nativeHandle_, allowMmapReads); return this; } @Override - public boolean useDirectReads() { + public boolean allowMmapReads() { 
assert(isOwningHandle()); - return useDirectReads(nativeHandle_); + return allowMmapReads(nativeHandle_); } @Override - public DBOptions setUseDirectIoForFlushAndCompaction( - final boolean useDirectIoForFlushAndCompaction) { + public DBOptions setAllowMmapWrites( + final boolean allowMmapWrites) { assert(isOwningHandle()); - setUseDirectIoForFlushAndCompaction(nativeHandle_, - useDirectIoForFlushAndCompaction); + setAllowMmapWrites(nativeHandle_, allowMmapWrites); return this; } @Override - public boolean useDirectIoForFlushAndCompaction() { + public boolean allowMmapWrites() { assert(isOwningHandle()); - return useDirectIoForFlushAndCompaction(nativeHandle_); + return allowMmapWrites(nativeHandle_); } @Override - public DBOptions setAllowFAllocate(final boolean allowFAllocate) { + public DBOptions setUseDirectReads( + final boolean useDirectReads) { assert(isOwningHandle()); - setAllowFAllocate(nativeHandle_, allowFAllocate); + setUseDirectReads(nativeHandle_, useDirectReads); return this; } @Override - public boolean allowFAllocate() { + public boolean useDirectReads() { assert(isOwningHandle()); - return allowFAllocate(nativeHandle_); + return useDirectReads(nativeHandle_); } @Override - public DBOptions setAllowMmapReads( - final boolean allowMmapReads) { + public DBOptions setUseDirectIoForFlushAndCompaction( + final boolean useDirectIoForFlushAndCompaction) { assert(isOwningHandle()); - setAllowMmapReads(nativeHandle_, allowMmapReads); + setUseDirectIoForFlushAndCompaction(nativeHandle_, + useDirectIoForFlushAndCompaction); return this; } @Override - public boolean allowMmapReads() { + public boolean useDirectIoForFlushAndCompaction() { assert(isOwningHandle()); - return allowMmapReads(nativeHandle_); + return useDirectIoForFlushAndCompaction(nativeHandle_); } @Override - public DBOptions setAllowMmapWrites( - final boolean allowMmapWrites) { + public DBOptions setAllowFAllocate(final boolean allowFAllocate) { assert(isOwningHandle()); - setAllowMmapWrites(nativeHandle_, allowMmapWrites); + setAllowFAllocate(nativeHandle_, allowFAllocate); return this; } @Override - public boolean allowMmapWrites() { + public boolean allowFAllocate() { assert(isOwningHandle()); - return allowMmapWrites(nativeHandle_); + return allowFAllocate(nativeHandle_); } @Override @@ -632,6 +680,20 @@ public DBOptions setDbWriteBufferSize(final long dbWriteBufferSize) { return this; } + @Override + public DBOptions setWriteBufferManager(final WriteBufferManager writeBufferManager) { + assert(isOwningHandle()); + setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_); + this.writeBufferManager_ = writeBufferManager; + return this; + } + + @Override + public WriteBufferManager writeBufferManager() { + assert(isOwningHandle()); + return this.writeBufferManager_; + } + @Override public long dbWriteBufferSize() { assert(isOwningHandle()); @@ -745,6 +807,33 @@ public long walBytesPerSync() { return walBytesPerSync(nativeHandle_); } + //TODO(AR) NOW +// @Override +// public DBOptions setListeners(final List listeners) { +// assert(isOwningHandle()); +// final long[] eventListenerHandlers = new long[listeners.size()]; +// for (int i = 0; i < eventListenerHandlers.length; i++) { +// eventListenerHandlers[i] = listeners.get(i).nativeHandle_; +// } +// setEventListeners(nativeHandle_, eventListenerHandlers); +// return this; +// } +// +// @Override +// public Collection listeners() { +// assert(isOwningHandle()); +// final long[] eventListenerHandlers = listeners(nativeHandle_); +// if 
(eventListenerHandlers == null || eventListenerHandlers.length == 0) { +// return Collections.emptyList(); +// } +// +// final List eventListeners = new ArrayList<>(); +// for (final long eventListenerHandle : eventListenerHandlers) { +// eventListeners.add(new EventListener(eventListenerHandle)); //TODO(AR) check ownership is set to false! +// } +// return eventListeners; +// } + @Override public DBOptions setEnableThreadTracking(final boolean enableThreadTracking) { assert(isOwningHandle()); @@ -770,6 +859,19 @@ public long delayedWriteRate(){ return delayedWriteRate(nativeHandle_); } + @Override + public DBOptions setEnablePipelinedWrite(final boolean enablePipelinedWrite) { + assert(isOwningHandle()); + setEnablePipelinedWrite(nativeHandle_, enablePipelinedWrite); + return this; + } + + @Override + public boolean enablePipelinedWrite() { + assert(isOwningHandle()); + return enablePipelinedWrite(nativeHandle_); + } + @Override public DBOptions setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -871,6 +973,20 @@ public Cache rowCache() { return this.rowCache_; } + @Override + public DBOptions setWalFilter(final AbstractWalFilter walFilter) { + assert(isOwningHandle()); + setWalFilter(nativeHandle_, walFilter.nativeHandle_); + this.walFilter_ = walFilter; + return this; + } + + @Override + public WalFilter walFilter() { + assert(isOwningHandle()); + return this.walFilter_; + } + @Override public DBOptions setFailIfOptionsFileError(final boolean failIfOptionsFileError) { assert(isOwningHandle()); @@ -923,6 +1039,69 @@ public boolean avoidFlushDuringShutdown() { return avoidFlushDuringShutdown(nativeHandle_); } + @Override + public DBOptions setAllowIngestBehind(final boolean allowIngestBehind) { + assert(isOwningHandle()); + setAllowIngestBehind(nativeHandle_, allowIngestBehind); + return this; + } + + @Override + public boolean allowIngestBehind() { + assert(isOwningHandle()); + return allowIngestBehind(nativeHandle_); + } + + @Override + public DBOptions setPreserveDeletes(final boolean preserveDeletes) { + assert(isOwningHandle()); + setPreserveDeletes(nativeHandle_, preserveDeletes); + return this; + } + + @Override + public boolean preserveDeletes() { + assert(isOwningHandle()); + return preserveDeletes(nativeHandle_); + } + + @Override + public DBOptions setTwoWriteQueues(final boolean twoWriteQueues) { + assert(isOwningHandle()); + setTwoWriteQueues(nativeHandle_, twoWriteQueues); + return this; + } + + @Override + public boolean twoWriteQueues() { + assert(isOwningHandle()); + return twoWriteQueues(nativeHandle_); + } + + @Override + public DBOptions setManualWalFlush(final boolean manualWalFlush) { + assert(isOwningHandle()); + setManualWalFlush(nativeHandle_, manualWalFlush); + return this; + } + + @Override + public boolean manualWalFlush() { + assert(isOwningHandle()); + return manualWalFlush(nativeHandle_); + } + + @Override + public DBOptions setAtomicFlush(final boolean atomicFlush) { + setAtomicFlush(nativeHandle_, atomicFlush); + return this; + } + + @Override + public boolean atomicFlush() { + return atomicFlush(nativeHandle_); + } + static final int DEFAULT_NUM_SHARD_BITS = -1; @@ -941,7 +1120,9 @@ private DBOptions(final long nativeHandle) { private static native long getDBOptionsFromProps( String optString); - private native static long newDBOptions(); + private static native long newDBOptions(); + private static native long copyDBOptions(final long handle); + private static native long newDBOptionsFromOptions(final long optionsHandle); 
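A short sketch of the DBOptions constructors and setters introduced above; note the shallow-copy caveat from the copy constructor's javadoc (all values illustrative):

```java
import org.rocksdb.DBOptions;
import org.rocksdb.Env;
import org.rocksdb.Options;
import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;
import org.rocksdb.SstFileManager;

public class DbOptionsSketch {
  public static void main(final String[] args) throws RocksDBException {
    RocksDB.loadLibrary();
    try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault());
         final DBOptions dbOptions = new DBOptions()
             .setCreateIfMissing(true)
             .setMaxBackgroundJobs(4)          // replaces the deprecated flush/compaction split
             .setSstFileManager(sstFileManager);
         // Shallow copy: env, rate limiter, SST file manager etc. are shared, not duplicated.
         final DBOptions copy = new DBOptions(dbOptions);
         // Extract the DBOptions subset of a monolithic Options instance.
         final Options options = new Options();
         final DBOptions fromOptions = new DBOptions(options)) {
      // dbOptions / copy / fromOptions can now be passed to RocksDB.open(...).
    }
  }
}
```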
@Override protected final native void disposeInternal(final long handle); private native void optimizeForSmallDb(final long handle); @@ -959,6 +1140,8 @@ private native void setParanoidChecks( private native boolean paranoidChecks(long handle); private native void setRateLimiter(long handle, long rateLimiterHandle); + private native void setSstFileManager(final long handle, + final long sstFileManagerHandle); private native void setLogger(long handle, long loggerHandle); private native void setInfoLogLevel(long handle, byte logLevel); @@ -998,6 +1181,8 @@ private native void setMaxBackgroundCompactions( private native void setMaxBackgroundFlushes( long handle, int maxBackgroundFlushes); private native int maxBackgroundFlushes(long handle); + private native void setMaxBackgroundJobs(long handle, int maxBackgroundJobs); + private native int maxBackgroundJobs(long handle); private native void setMaxLogFileSize(long handle, long maxLogFileSize) throws IllegalArgumentException; private native long maxLogFileSize(long handle); @@ -1047,6 +1232,8 @@ private native void setAdviseRandomOnOpen( private native boolean adviseRandomOnOpen(long handle); private native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private native void setWriteBufferManager(final long dbOptionsHandle, + final long writeBufferManagerHandle); private native long dbWriteBufferSize(final long handle); private native void setAccessHintOnCompactionStart(final long handle, final byte accessHintOnCompactionStart); @@ -1076,6 +1263,9 @@ private native void setEnableThreadTracking(long handle, private native boolean enableThreadTracking(long handle); private native void setDelayedWriteRate(long handle, long delayedWriteRate); private native long delayedWriteRate(long handle); + private native void setEnablePipelinedWrite(final long handle, + final boolean enablePipelinedWrite); + private native boolean enablePipelinedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); @@ -1098,7 +1288,9 @@ private native void setAllow2pc(final long handle, final boolean allow2pc); private native boolean allow2pc(final long handle); private native void setRowCache(final long handle, - final long row_cache_handle); + final long rowCacheHandle); + private native void setWalFilter(final long handle, + final long walFilterHandle); private native void setFailIfOptionsFileError(final long handle, final boolean failIfOptionsFileError); private native boolean failIfOptionsFileError(final long handle); @@ -1111,10 +1303,28 @@ private native void setAvoidFlushDuringRecovery(final long handle, private native void setAvoidFlushDuringShutdown(final long handle, final boolean avoidFlushDuringShutdown); private native boolean avoidFlushDuringShutdown(final long handle); + private native void setAllowIngestBehind(final long handle, + final boolean allowIngestBehind); + private native boolean allowIngestBehind(final long handle); + private native void setPreserveDeletes(final long handle, + final boolean preserveDeletes); + private native boolean preserveDeletes(final long handle); + private native void setTwoWriteQueues(final long handle, + final boolean twoWriteQueues); + private native boolean twoWriteQueues(final long handle); + private native void setManualWalFlush(final long handle, + final boolean manualWalFlush); + private native boolean manualWalFlush(final long handle); + private 
native void setAtomicFlush(final long handle, + final boolean atomicFlush); + private native boolean atomicFlush(final long handle); // instance variables + // NOTE: If you add new member variables, please update the copy constructor above! private Env env_; private int numShardBits_; private RateLimiter rateLimiter_; private Cache rowCache_; + private WalFilter walFilter_; + private WriteBufferManager writeBufferManager_; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 50ca083d37..af9aa179bf 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -158,6 +158,26 @@ public interface DBOptionsInterface { */ T setRateLimiter(RateLimiter rateLimiter); + /** + * Use to track SST files and control their file deletion rate. + * + * Features: + * - Throttle the deletion rate of the SST files. + * - Keep track the total size of all SST files. + * - Set a maximum allowed space limit for SST files that when reached + * the DB wont do any further flushes or compactions and will set the + * background error. + * - Can be shared between multiple dbs. + * + * Limitations: + * - Only track and throttle deletes of SST files in + * first db_path (db_name if db_paths is empty). + * + * @param sstFileManager The SST File Manager for the db. + * @return the instance of the current object. + */ + T setSstFileManager(SstFileManager sstFileManager); + /** *

    Any internal progress/error information generated by * the db will be written to the Logger if it is non-nullptr, @@ -186,35 +206,9 @@ public interface DBOptionsInterface { InfoLogLevel infoLogLevel(); /** - * Number of open files that can be used by the DB. You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on {@code target_file_size_base} and {@code target_file_size_multiplier} - * for level-based compaction. For universal-style compaction, you can usually - * set it to -1. - * Default: 5000 - * - * @param maxOpenFiles the maximum number of open files. - * @return the instance of the current object. - */ - T setMaxOpenFiles(int maxOpenFiles); - - /** - * Number of open files that can be used by the DB. You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on {@code target_file_size_base} and {@code target_file_size_multiplier} - * for level-based compaction. For universal-style compaction, you can usually - * set it to -1. - * - * @return the maximum number of open files. - */ - int maxOpenFiles(); - - /** - * If {@link #maxOpenFiles()} is -1, DB will open all files on DB::Open(). You - * can use this option to increase the number of threads used to open the - * files. + * If {@link MutableDBOptionsInterface#maxOpenFiles()} is -1, DB will open + * all files on DB::Open(). You can use this option to increase the number + * of threads used to open the files. * * Default: 16 * @@ -226,9 +220,9 @@ public interface DBOptionsInterface { T setMaxFileOpeningThreads(int maxFileOpeningThreads); /** - * If {@link #maxOpenFiles()} is -1, DB will open all files on DB::Open(). You - * can use this option to increase the number of threads used to open the - * files. + * If {@link MutableDBOptionsInterface#maxOpenFiles()} is -1, DB will open all + * files on DB::Open(). You can use this option to increase the number of + * threads used to open the files. * * Default: 16 * @@ -236,40 +230,15 @@ public interface DBOptionsInterface { */ int maxFileOpeningThreads(); - /** - *

<p>Once write-ahead logs exceed this size, we will start forcing the - * flush of column families whose memtables are backed by the oldest live - * WAL file (i.e. the ones that are causing all the space amplification).</p> - * - * <p>If set to 0 (default), we will dynamically choose the WAL size limit to - * be [sum of all write_buffer_size * max_write_buffer_number] * 2</p> - * - * <p>Default: 0</p> - * - * @param maxTotalWalSize max total wal size. - * @return the instance of the current object. - */ - T setMaxTotalWalSize(long maxTotalWalSize); - - /** - * <p>Returns the max total wal size. Once write-ahead logs exceed this size, - * we will start forcing the flush of column families whose memtables are - * backed by the oldest live WAL file (i.e. the ones that are causing all - * the space amplification).</p> - * - * <p>If set to 0 (default), we will dynamically choose the WAL size limit - * to be [sum of all write_buffer_size * max_write_buffer_number] * 2</p> - * - * @return max total wal size - */ - long maxTotalWalSize(); - /** *

<p>Sets the statistics object which collects metrics about database operations. * Statistics objects should not be shared between DB instances as * it does not use any locks to prevent concurrent updates.</p>

    * + * @param statistics The statistics to set + * * @return the instance of the current object. + * * @see RocksDB#open(org.rocksdb.Options, String) */ T setStatistics(final Statistics statistics); @@ -277,7 +246,9 @@ public interface DBOptionsInterface { /** *

<p>Returns statistics object.</p>

    * - * @return the instance of the statistics object or null if there is no statistics object. + * @return the instance of the statistics object or null if there is no + * statistics object. + * * @see #setStatistics(Statistics) */ Statistics statistics(); @@ -439,55 +410,6 @@ public interface DBOptionsInterface { */ long deleteObsoleteFilesPeriodMicros(); - /** - * Suggested number of concurrent background compaction jobs, submitted to - * the default LOW priority thread pool. - * Default: 1 - * - * @param baseBackgroundCompactions Suggested number of background compaction - * jobs - */ - void setBaseBackgroundCompactions(int baseBackgroundCompactions); - - /** - * Suggested number of concurrent background compaction jobs, submitted to - * the default LOW priority thread pool. - * Default: 1 - * - * @return Suggested number of background compaction jobs - */ - int baseBackgroundCompactions(); - - /** - * Specifies the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * If you're increasing this, also consider increasing number of threads in - * LOW priority thread pool. For more information, see - * Default: 1 - * - * @param maxBackgroundCompactions the maximum number of background - * compaction jobs. - * @return the instance of the current object. - * - * @see RocksEnv#setBackgroundThreads(int) - * @see RocksEnv#setBackgroundThreads(int, int) - * @see #maxBackgroundFlushes() - */ - T setMaxBackgroundCompactions(int maxBackgroundCompactions); - - /** - * Returns the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * When increasing this number, we may also want to consider increasing - * number of threads in LOW priority thread pool. - * Default: 1 - * - * @return the maximum number of concurrent background compaction jobs. - * @see RocksEnv#setBackgroundThreads(int) - * @see RocksEnv#setBackgroundThreads(int, int) - */ - int maxBackgroundCompactions(); - /** * This value represents the maximum number of threads that will * concurrently perform a compaction job by breaking it into multiple, @@ -496,8 +418,10 @@ public interface DBOptionsInterface { * * @param maxSubcompactions The maximum number of threads that will * concurrently perform a compaction job + * + * @return the instance of the current object. */ - void setMaxSubcompactions(int maxSubcompactions); + T setMaxSubcompactions(int maxSubcompactions); /** * This value represents the maximum number of threads that will @@ -520,9 +444,12 @@ public interface DBOptionsInterface { * @return the instance of the current object. * * @see RocksEnv#setBackgroundThreads(int) - * @see RocksEnv#setBackgroundThreads(int, int) - * @see #maxBackgroundCompactions() + * @see RocksEnv#setBackgroundThreads(int, Priority) + * @see MutableDBOptionsInterface#maxBackgroundCompactions() + * + * @deprecated Use {@link MutableDBOptionsInterface#setMaxBackgroundJobs(int)} */ + @Deprecated T setMaxBackgroundFlushes(int maxBackgroundFlushes); /** @@ -533,8 +460,9 @@ public interface DBOptionsInterface { * * @return the maximum number of concurrent background flush jobs. 
* @see RocksEnv#setBackgroundThreads(int) - * @see RocksEnv#setBackgroundThreads(int, int) + * @see RocksEnv#setBackgroundThreads(int, Priority) */ + @Deprecated int maxBackgroundFlushes(); /** @@ -886,23 +814,6 @@ public interface DBOptionsInterface { */ boolean isFdCloseOnExec(); - /** - * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 600 (10 minutes) - * - * @param statsDumpPeriodSec time interval in seconds. - * @return the instance of the current object. - */ - T setStatsDumpPeriodSec(int statsDumpPeriodSec); - - /** - * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 600 (10 minutes) - * - * @return time interval in seconds. - */ - int statsDumpPeriodSec(); - /** * If set true, will hint the underlying file system that the file * access pattern is random, when a sst file is opened. @@ -940,6 +851,28 @@ public interface DBOptionsInterface { */ T setDbWriteBufferSize(long dbWriteBufferSize); + /** + * Use passed {@link WriteBufferManager} to control memory usage across + * multiple column families and/or DB instances. + * + * Check
    + * https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager + * for more details on when to use it + * + * @param writeBufferManager The WriteBufferManager to use + * @return the reference of the current options. + */ + T setWriteBufferManager(final WriteBufferManager writeBufferManager); + + /** + * Reference to {@link WriteBufferManager} used by it.
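A sketch of sharing one WriteBufferManager (per the wiki link above) across options instances, assuming RocksJava's WriteBufferManager(long, Cache) constructor; capacities are illustrative:

```java
import org.rocksdb.Cache;
import org.rocksdb.DBOptions;
import org.rocksdb.LRUCache;
import org.rocksdb.RocksDB;
import org.rocksdb.WriteBufferManager;

public class WriteBufferManagerSketch {
  public static void main(final String[] args) {
    RocksDB.loadLibrary();
    try (final Cache cache = new LRUCache(512L * 1024 * 1024);  // shared block cache
         // Cap memtable memory at 128 MiB, charged against the shared cache.
         final WriteBufferManager writeBufferManager =
             new WriteBufferManager(128L * 1024 * 1024, cache);
         final DBOptions dbOptions = new DBOptions()
             .setWriteBufferManager(writeBufferManager)) {
      // Setting the same manager on several DBOptions instances bounds total
      // memtable memory across multiple databases.
    }
  }
}
```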
    + * + * Default: null (Disabled) + * + * @return a reference to WriteBufferManager + */ + WriteBufferManager writeBufferManager(); + /** * Amount of data to build up in memtables across all column * families before writing to disk. @@ -1015,36 +948,6 @@ T setNewTableReaderForCompactionInputs( */ boolean newTableReaderForCompactionInputs(); - /** - * If non-zero, we perform bigger reads when doing compaction. If you're - * running RocksDB on spinning disks, you should set this to at least 2MB. - * - * That way RocksDB's compaction is doing sequential instead of random reads. - * When non-zero, we also force {@link #newTableReaderForCompactionInputs()} - * to true. - * - * Default: 0 - * - * @param compactionReadaheadSize The compaction read-ahead size - * - * @return the reference to the current options. - */ - T setCompactionReadaheadSize(final long compactionReadaheadSize); - - /** - * If non-zero, we perform bigger reads when doing compaction. If you're - * running RocksDB on spinning disks, you should set this to at least 2MB. - * - * That way RocksDB's compaction is doing sequential instead of random reads. - * When non-zero, we also force {@link #newTableReaderForCompactionInputs()} - * to true. - * - * Default: 0 - * - * @return The compaction read-ahead size - */ - long compactionReadaheadSize(); - /** * This is a maximum buffer size that is used by WinMmapReadableFile in * unbuffered disk I/O mode. We need to maintain an aligned buffer for @@ -1052,7 +955,8 @@ T setNewTableReaderForCompactionInputs( * for bigger requests allocate one shot buffers. In unbuffered mode we * always bypass read-ahead buffer at ReadaheadRandomAccessFile * When read-ahead is required we then make use of - * {@link #compactionReadaheadSize()} value and always try to read ahead. + * {@link MutableDBOptionsInterface#compactionReadaheadSize()} value and + * always try to read ahead. * With read-ahead we always pre-allocate buffer to the size instead of * growing it up to a limit. * @@ -1077,9 +981,9 @@ T setNewTableReaderForCompactionInputs( * for bigger requests allocate one shot buffers. In unbuffered mode we * always bypass read-ahead buffer at ReadaheadRandomAccessFile * When read-ahead is required we then make use of - * {@link #compactionReadaheadSize()} value and always try to read ahead. - * With read-ahead we always pre-allocate buffer to the size instead of - * growing it up to a limit. + * {@link MutableDBOptionsInterface#compactionReadaheadSize()} value and + * always try to read ahead. With read-ahead we always pre-allocate buffer + * to the size instead of growing it up to a limit. * * This option is currently honored only on Windows * @@ -1092,30 +996,6 @@ T setNewTableReaderForCompactionInputs( */ long randomAccessMaxBufferSize(); - /** - * This is the maximum buffer size that is used by WritableFileWriter. - * On Windows, we need to maintain an aligned buffer for writes. - * We allow the buffer to grow until it's size hits the limit. - * - * Default: 1024 * 1024 (1 MB) - * - * @param writableFileMaxBufferSize the maximum buffer size - * - * @return the reference to the current options. - */ - T setWritableFileMaxBufferSize(long writableFileMaxBufferSize); - - /** - * This is the maximum buffer size that is used by WritableFileWriter. - * On Windows, we need to maintain an aligned buffer for writes. - * We allow the buffer to grow until it's size hits the limit. 
- * - * Default: 1024 * 1024 (1 MB) - * - * @return the maximum buffer size - */ - long writableFileMaxBufferSize(); - /** * Use adaptive mutex, which spins in the user space before resorting * to kernel. This could reduce context switch when the mutex is not @@ -1139,45 +1019,24 @@ T setNewTableReaderForCompactionInputs( */ boolean useAdaptiveMutex(); - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. - * Default: 0 - * - * @param bytesPerSync size in bytes - * @return the instance of the current object. - */ - T setBytesPerSync(long bytesPerSync); - - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. - * Default: 0 - * - * @return size in bytes - */ - long bytesPerSync(); - - /** - * Same as {@link #setBytesPerSync(long)} , but applies to WAL files - * - * Default: 0, turned off - * - * @param walBytesPerSync size in bytes - * @return the instance of the current object. - */ - T setWalBytesPerSync(long walBytesPerSync); - - /** - * Same as {@link #bytesPerSync()} , but applies to WAL files - * - * Default: 0, turned off - * - * @return size in bytes - */ - long walBytesPerSync(); + //TODO(AR) NOW +// /** +// * Sets the {@link EventListener}s whose callback functions +// * will be called when specific RocksDB event happens. +// * +// * @param listeners the listeners who should be notified on various events. +// * +// * @return the instance of the current object. +// */ +// T setListeners(final List listeners); +// +// /** +// * Gets the {@link EventListener}s whose callback functions +// * will be called when specific RocksDB event happens. +// * +// * @return a collection of Event listeners. +// */ +// Collection listeners(); /** * If true, then the status of the threads involved in this DB will @@ -1202,40 +1061,33 @@ T setNewTableReaderForCompactionInputs( boolean enableThreadTracking(); /** - * The limited write rate to DB if - * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or - * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered, - * or we are writing to the last mem table allowed and we allow more than 3 - * mem tables. It is calculated using size of user write requests before - * compression. RocksDB may decide to slow down more if the compaction still - * gets behind further. + * By default, a single write thread queue is maintained. The thread gets + * to the head of the queue becomes write batch group leader and responsible + * for writing to WAL and memtable for the batch group. * - * Unit: bytes per second. + * If {@link #enablePipelinedWrite()} is true, separate write thread queue is + * maintained for WAL write and memtable write. A write thread first enter WAL + * writer queue and then memtable writer queue. Pending thread on the WAL + * writer queue thus only have to wait for previous writers to finish their + * WAL writing but not the memtable writing. Enabling the feature may improve + * write throughput and reduce latency of the prepare phase of two-phase + * commit. * - * Default: 16MB/s + * Default: false * - * @param delayedWriteRate the rate in bytes per second + * @param enablePipelinedWrite true to enabled pipelined writes * * @return the reference to the current options. 
*/ - T setDelayedWriteRate(long delayedWriteRate); + T setEnablePipelinedWrite(final boolean enablePipelinedWrite); /** - * The limited write rate to DB if - * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or - * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered, - * or we are writing to the last mem table allowed and we allow more than 3 - * mem tables. It is calculated using size of user write requests before - * compression. RocksDB may decide to slow down more if the compaction still - * gets behind further. - * - * Unit: bytes per second. - * - * Default: 16MB/s + * Returns true if pipelined writes are enabled. + * See {@link #setEnablePipelinedWrite(boolean)}. * - * @return the rate in bytes per second + * @return true if pipelined writes are enabled, false otherwise. */ - long delayedWriteRate(); + boolean enablePipelinedWrite(); /** * If true, allow multi-writers to update mem tables in parallel. @@ -1437,6 +1289,27 @@ T setEnableWriteThreadAdaptiveYield( */ Cache rowCache(); + /** + * A filter object supplied to be invoked while processing write-ahead-logs + * (WALs) during recovery. The filter provides a way to inspect log + * records, ignoring a particular record or skipping replay. + * The filter is invoked at startup and is invoked from a single-thread + * currently. + * + * @param walFilter the filter for processing WALs during recovery. + * + * @return the reference to the current options. + */ + T setWalFilter(final AbstractWalFilter walFilter); + + /** + * Get's the filter for processing WALs during recovery. + * See {@link #setWalFilter(AbstractWalFilter)}. + * + * @return the filter used for processing WALs during recovery. + */ + WalFilter walFilter(); + /** * If true, then DB::Open / CreateColumnFamily / DropColumnFamily * / SetOptions will fail if options file is not detected or properly @@ -1515,35 +1388,126 @@ T setEnableWriteThreadAdaptiveYield( boolean avoidFlushDuringRecovery(); /** - * By default RocksDB will flush all memtables on DB close if there are - * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup - * DB close. Unpersisted data WILL BE LOST. + * Set this option to true during creation of database if you want + * to be able to ingest behind (call IngestExternalFile() skipping keys + * that already exist, rather than overwriting matching keys). + * Setting this option to true will affect 2 things: + * 1) Disable some internal optimizations around SST file compression + * 2) Reserve bottom-most level for ingested files only. + * 3) Note that num_levels should be >= 3 if this option is turned on. * * DEFAULT: false * - * Dynamically changeable through - * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} - * API. + * @param allowIngestBehind true to allow ingest behind, false to disallow. + * + * @return the reference to the current options. + */ + T setAllowIngestBehind(final boolean allowIngestBehind); + + /** + * Returns true if ingest behind is allowed. + * See {@link #setAllowIngestBehind(boolean)}. + * + * @return true if ingest behind is allowed, false otherwise. + */ + boolean allowIngestBehind(); + + /** + * Needed to support differential snapshots. + * If set to true then DB will only process deletes with sequence number + * less than what was set by SetPreserveDeletesSequenceNumber(uint64_t ts). + * Clients are responsible to periodically call this method to advance + * the cutoff time. 
If this method is never called and preserve_deletes + * is set to true NO deletes will ever be processed. + * At the moment this only keeps normal deletes, SingleDeletes will + * not be preserved. + * + * DEFAULT: false + * + * @param preserveDeletes true to preserve deletes. + * + * @return the reference to the current options. + */ + T setPreserveDeletes(final boolean preserveDeletes); + + /** + * Returns true if deletes are preserved. + * See {@link #setPreserveDeletes(boolean)}. + * + * @return true if deletes are preserved, false otherwise. + */ + boolean preserveDeletes(); + + /** + * If enabled it uses two queues for writes, one for the ones with + * disable_memtable and one for the ones that also write to memtable. This + * allows the memtable writes not to lag behind other writes. It can be used + * to optimize MySQL 2PC in which only the commits, which are serial, write to + * memtable. + * + * DEFAULT: false * - * @param avoidFlushDuringShutdown true if we should avoid flush during - * shutdown + * @param twoWriteQueues true to enable two write queues, false otherwise. * * @return the reference to the current options. */ - T setAvoidFlushDuringShutdown(boolean avoidFlushDuringShutdown); + T setTwoWriteQueues(final boolean twoWriteQueues); + + /** + * Returns true if two write queues are enabled. + * + * @return true if two write queues are enabled, false otherwise. + */ + boolean twoWriteQueues(); /** - * By default RocksDB will flush all memtables on DB close if there are - * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup - * DB close. Unpersisted data WILL BE LOST. + * If true WAL is not flushed automatically after each write. Instead it + * relies on manual invocation of FlushWAL to write the WAL buffer to its + * file. * * DEFAULT: false * - * Dynamically changeable through - * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} - * API. + * @param manualWalFlush true to set disable automatic WAL flushing, + * false otherwise. + * + * @return the reference to the current options. + */ + T setManualWalFlush(final boolean manualWalFlush); + + /** + * Returns true if automatic WAL flushing is disabled. + * See {@link #setManualWalFlush(boolean)}. + * + * @return true if automatic WAL flushing is disabled, false otherwise. + */ + boolean manualWalFlush(); + + /** + * If true, RocksDB supports flushing multiple column families and committing + * their results atomically to MANIFEST. Note that it is not + * necessary to set atomic_flush to true if WAL is always enabled since WAL + * allows the database to be restored to the last persistent state in WAL. + * This option is useful when there are column families with writes NOT + * protected by WAL. + * For manual flush, application has to specify which column families to + * flush atomically in {@link RocksDB#flush(FlushOptions, List)}. + * For auto-triggered flush, RocksDB atomically flushes ALL column families. + * + * Currently, any WAL-enabled writes after atomic flush may be replayed + * independently if the process crashes later and tries to recover. + * + * @param atomicFlush true to enable atomic flush of multiple column families. + * + * @return the reference to the current options. + */ + T setAtomicFlush(final boolean atomicFlush); + + /** + * Determine if atomic flush of multiple column families is enabled. + * + * See {@link #setAtomicFlush(boolean)}. * - * @return true if we should avoid flush during shutdown + * @return true if atomic flush is enabled. 
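The write-path toggles described above compose as follows — a minimal sketch (enable_pipelined_write and two_write_queues target different queueing models, so only one is enabled here; whether either helps is workload-dependent):

```java
import org.rocksdb.DBOptions;
import org.rocksdb.RocksDB;

public class WritePathTogglesSketch {
  public static void main(final String[] args) {
    RocksDB.loadLibrary();
    try (final DBOptions dbOptions = new DBOptions()
             .setCreateIfMissing(true)
             // separate WAL and memtable writer queues
             .setEnablePipelinedWrite(true)
             // WAL is only written out on an explicit FlushWAL request
             .setManualWalFlush(true)
             // flush column families atomically with respect to the MANIFEST
             .setAtomicFlush(true)) {
      // dbOptions would now be passed to RocksDB.open(...); an alternative
      // configuration could use setTwoWriteQueues(true) instead of pipelined
      // writes, e.g. for MySQL-style 2PC commit traffic.
    }
  }
}
```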
*/ - boolean avoidFlushDuringShutdown(); + boolean atomicFlush(); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java new file mode 100644 index 0000000000..513e5b4294 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DataBlockIndexType.java @@ -0,0 +1,32 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + + +/** + * DataBlockIndexType used in conjunction with BlockBasedTable. + */ +public enum DataBlockIndexType { + /** + * traditional block type + */ + kDataBlockBinarySearch((byte)0x0), + + /** + * additional hash index + */ + kDataBlockBinaryAndHash((byte)0x1); + + private final byte value; + + DataBlockIndexType(final byte value) { + this.value = value; + } + + byte getValue() { + return value; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java index 4c37dfd56b..e33004f5d8 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/DirectComparator.java @@ -16,16 +16,18 @@ */ public abstract class DirectComparator extends AbstractComparator { - private final long nativeHandle_; - public DirectComparator(final ComparatorOptions copt) { - super(); - this.nativeHandle_ = createNewDirectComparator0(copt.nativeHandle_); + super(copt); + } + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return createNewDirectComparator0(nativeParameterHandles[0]); } @Override - protected final long getNativeHandle() { - return nativeHandle_; + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_DIRECT_COMPARATOR; } private native long createNewDirectComparator0( diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Env.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Env.java index a46f06178d..d7658f2394 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Env.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Env.java @@ -5,12 +5,23 @@ package org.rocksdb; +import java.util.Arrays; +import java.util.List; + /** * Base class for all Env implementations in RocksDB. */ public abstract class Env extends RocksObject { - public static final int FLUSH_POOL = 0; - public static final int COMPACTION_POOL = 1; + + private static final Env DEFAULT_ENV = new RocksEnv(getDefaultEnvInternal()); + static { + /** + * The Ownership of the Default Env belongs to C++ + * and so we disown the native handle here so that + * we cannot accidentally free it from Java. + */ + DEFAULT_ENV.disOwnNativeHandle(); + } /** *

<p>Returns the default environment suitable for the current operating @@ -18,13 +29,13 @@ public abstract class Env extends RocksObject { * system.</p> * *

<p>The result of {@code getDefault()} is a singleton whose ownership * belongs to rocksdb c++. As a result, the returned RocksEnv will not - have the ownership of its c++ resource, and calling its dispose() + have the ownership of its c++ resource, and calling its dispose()/close() + will be no-op.</p>

    * * @return the default {@link org.rocksdb.RocksEnv} instance. */ public static Env getDefault() { - return default_env_; + return DEFAULT_ENV; } /** @@ -32,27 +43,36 @@ public static Env getDefault() { * for this environment.

    *

<p>Default number: 1</p>

    * - * @param num the number of threads + * @param number the number of threads * * @return current {@link RocksEnv} instance. */ - public Env setBackgroundThreads(final int num) { - return setBackgroundThreads(num, FLUSH_POOL); + public Env setBackgroundThreads(final int number) { + return setBackgroundThreads(number, Priority.LOW); + } + + /** + *

<p>Gets the number of background worker threads of the pool + * for this environment.</p>

    + * + * @return the number of threads. + */ + public int getBackgroundThreads(final Priority priority) { + return getBackgroundThreads(nativeHandle_, priority.getValue()); } /** *

<p>Sets the number of background worker threads of the specified thread * pool for this environment.</p>

    * - * @param num the number of threads - * @param poolID the id to specified a thread pool. Should be either - * FLUSH_POOL or COMPACTION_POOL. + * @param number the number of threads + * @param priority the priority id of a specified thread pool. * *

<p>Default number: 1</p>

    * @return current {@link RocksEnv} instance. */ - public Env setBackgroundThreads(final int num, final int poolID) { - setBackgroundThreads(nativeHandle_, num, poolID); + public Env setBackgroundThreads(final int number, final Priority priority) { + setBackgroundThreads(nativeHandle_, number, priority.getValue()); return this; } @@ -60,33 +80,75 @@ public Env setBackgroundThreads(final int num, final int poolID) { *

<p>Returns the length of the queue associated with the specified * thread pool.</p>

    * - * @param poolID the id to specified a thread pool. Should be either - * FLUSH_POOL or COMPACTION_POOL. + * @param priority the priority id of a specified thread pool. * * @return the thread pool queue length. */ - public int getThreadPoolQueueLen(final int poolID) { - return getThreadPoolQueueLen(nativeHandle_, poolID); + public int getThreadPoolQueueLen(final Priority priority) { + return getThreadPoolQueueLen(nativeHandle_, priority.getValue()); } + /** + * Enlarge number of background worker threads of a specific thread pool + * for this environment if it is smaller than specified. 'LOW' is the default + * pool. + * + * @param number the number of threads. + * + * @return current {@link RocksEnv} instance. + */ + public Env incBackgroundThreadsIfNeeded(final int number, + final Priority priority) { + incBackgroundThreadsIfNeeded(nativeHandle_, number, priority.getValue()); + return this; + } - protected Env(final long nativeHandle) { - super(nativeHandle); + /** + * Lower IO priority for threads from the specified pool. + * + * @param priority the priority id of a specified thread pool. + */ + public Env lowerThreadPoolIOPriority(final Priority priority) { + lowerThreadPoolIOPriority(nativeHandle_, priority.getValue()); + return this; } - static { - default_env_ = new RocksEnv(getDefaultEnvInternal()); + /** + * Lower CPU priority for threads from the specified pool. + * + * @param priority the priority id of a specified thread pool. + */ + public Env lowerThreadPoolCPUPriority(final Priority priority) { + lowerThreadPoolCPUPriority(nativeHandle_, priority.getValue()); + return this; } /** - *

<p>The static default Env. The ownership of its native handle - * belongs to rocksdb c++ and is not able to be released on the Java - * side.</p>

    + * Returns the status of all threads that belong to the current Env. + * + * @return the status of all threads belong to this env. */ - static Env default_env_; + public List getThreadList() throws RocksDBException { + return Arrays.asList(getThreadList(nativeHandle_)); + } + + Env(final long nativeHandle) { + super(nativeHandle); + } private static native long getDefaultEnvInternal(); private native void setBackgroundThreads( - long handle, int num, int priority); - private native int getThreadPoolQueueLen(long handle, int poolID); + final long handle, final int number, final byte priority); + private native int getBackgroundThreads(final long handle, + final byte priority); + private native int getThreadPoolQueueLen(final long handle, + final byte priority); + private native void incBackgroundThreadsIfNeeded(final long handle, + final int number, final byte priority); + private native void lowerThreadPoolIOPriority(final long handle, + final byte priority); + private native void lowerThreadPoolCPUPriority(final long handle, + final byte priority); + private native ThreadStatus[] getThreadList(final long handle) + throws RocksDBException; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java index 2bca0355e4..6baddb3102 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/EnvOptions.java @@ -5,203 +5,362 @@ package org.rocksdb; +/** + * Options while opening a file to read/write + */ public class EnvOptions extends RocksObject { static { RocksDB.loadLibrary(); } + /** + * Construct with default Options + */ public EnvOptions() { super(newEnvOptions()); } - public EnvOptions setUseOsBuffer(final boolean useOsBuffer) { - setUseOsBuffer(nativeHandle_, useOsBuffer); - return this; - } - - public boolean useOsBuffer() { - assert(isOwningHandle()); - return useOsBuffer(nativeHandle_); + /** + * Construct from {@link DBOptions}. + * + * @param dbOptions the database options. + */ + public EnvOptions(final DBOptions dbOptions) { + super(newEnvOptions(dbOptions.nativeHandle_)); } + /** + * Enable/Disable memory mapped reads. + * + * Default: false + * + * @param useMmapReads true to enable memory mapped reads, false to disable. + * + * @return the reference to these options. + */ public EnvOptions setUseMmapReads(final boolean useMmapReads) { setUseMmapReads(nativeHandle_, useMmapReads); return this; } + /** + * Determine if memory mapped reads are in-use. + * + * @return true if memory mapped reads are in-use, false otherwise. + */ public boolean useMmapReads() { assert(isOwningHandle()); return useMmapReads(nativeHandle_); } + /** + * Enable/Disable memory mapped Writes. + * + * Default: true + * + * @param useMmapWrites true to enable memory mapped writes, false to disable. + * + * @return the reference to these options. + */ public EnvOptions setUseMmapWrites(final boolean useMmapWrites) { setUseMmapWrites(nativeHandle_, useMmapWrites); return this; } + /** + * Determine if memory mapped writes are in-use. + * + * @return true if memory mapped writes are in-use, false otherwise. + */ public boolean useMmapWrites() { assert(isOwningHandle()); return useMmapWrites(nativeHandle_); } + /** + * Enable/Disable direct reads, i.e. {@code O_DIRECT}. + * + * Default: false + * + * @param useDirectReads true to enable direct reads, false to disable. + * + * @return the reference to these options. 
+ */ public EnvOptions setUseDirectReads(final boolean useDirectReads) { setUseDirectReads(nativeHandle_, useDirectReads); return this; } + /** + * Determine if direct reads are in-use. + * + * @return true if direct reads are in-use, false otherwise. + */ public boolean useDirectReads() { assert(isOwningHandle()); return useDirectReads(nativeHandle_); } + /** + * Enable/Disable direct writes, i.e. {@code O_DIRECT}. + * + * Default: false + * + * @param useDirectWrites true to enable direct writes, false to disable. + * + * @return the reference to these options. + */ public EnvOptions setUseDirectWrites(final boolean useDirectWrites) { setUseDirectWrites(nativeHandle_, useDirectWrites); return this; } + /** + * Determine if direct writes are in-use. + * + * @return true if direct writes are in-use, false otherwise. + */ public boolean useDirectWrites() { assert(isOwningHandle()); return useDirectWrites(nativeHandle_); } + /** + * Enable/Disable fallocate calls. + * + * Default: true + * + * If false, {@code fallocate()} calls are bypassed. + * + * @param allowFallocate true to enable fallocate calls, false to disable. + * + * @return the reference to these options. + */ public EnvOptions setAllowFallocate(final boolean allowFallocate) { setAllowFallocate(nativeHandle_, allowFallocate); return this; } + /** + * Determine if fallocate calls are used. + * + * @return true if fallocate calls are used, false otherwise. + */ public boolean allowFallocate() { assert(isOwningHandle()); return allowFallocate(nativeHandle_); } + /** + * Enable/Disable the {@code FD_CLOEXEC} bit when opening file descriptors. + * + * Default: true + * + * @param setFdCloexec true to enable the {@code FD_CLOEXEC} bit, + * false to disable. + * + * @return the reference to these options. + */ public EnvOptions setSetFdCloexec(final boolean setFdCloexec) { setSetFdCloexec(nativeHandle_, setFdCloexec); return this; } + /** + * Determine if the {@code FD_CLOEXEC} bit is set when opening file + * descriptors. + * + * @return true if the {@code FD_CLOEXEC} bit is enabled, false otherwise. + */ public boolean setFdCloexec() { assert(isOwningHandle()); return setFdCloexec(nativeHandle_); } + /** + * Allows OS to incrementally sync files to disk while they are being + * written, in the background. Issue one request for every + * {@code bytesPerSync} written. + * + * Default: 0 + * + * @param bytesPerSync 0 to disable, otherwise the number of bytes. + * + * @return the reference to these options. + */ public EnvOptions setBytesPerSync(final long bytesPerSync) { setBytesPerSync(nativeHandle_, bytesPerSync); return this; } + /** + * Get the number of incremental bytes per sync written in the background. + * + * @return 0 if disabled, otherwise the number of bytes. + */ public long bytesPerSync() { assert(isOwningHandle()); return bytesPerSync(nativeHandle_); } - public EnvOptions setFallocateWithKeepSize(final boolean fallocateWithKeepSize) { + /** + * If true, we will preallocate the file with {@code FALLOC_FL_KEEP_SIZE} + * flag, which means that file size won't change as part of preallocation. + * If false, preallocation will also change the file size. This option will + * improve the performance in workloads where you sync the data on every + * write. By default, we set it to true for MANIFEST writes and false for + * WAL writes. + * + * @param fallocateWithKeepSize true to preallocate, false otherwise. + * + * @return the reference to these options.
+ */ + public EnvOptions setFallocateWithKeepSize( + final boolean fallocateWithKeepSize) { setFallocateWithKeepSize(nativeHandle_, fallocateWithKeepSize); return this; } + /** + * Determine if file is preallocated. + * + * @return true if the file is preallocated, false otherwise. + */ public boolean fallocateWithKeepSize() { assert(isOwningHandle()); return fallocateWithKeepSize(nativeHandle_); } - public EnvOptions setCompactionReadaheadSize(final long compactionReadaheadSize) { + /** + * See {@link DBOptions#setCompactionReadaheadSize(long)}. + * + * @param compactionReadaheadSize the compaction read-ahead size. + * + * @return the reference to these options. + */ + public EnvOptions setCompactionReadaheadSize( + final long compactionReadaheadSize) { setCompactionReadaheadSize(nativeHandle_, compactionReadaheadSize); return this; } + /** + * See {@link DBOptions#compactionReadaheadSize()}. + * + * @return the compaction read-ahead size. + */ public long compactionReadaheadSize() { assert(isOwningHandle()); return compactionReadaheadSize(nativeHandle_); } - public EnvOptions setRandomAccessMaxBufferSize(final long randomAccessMaxBufferSize) { + /** + * See {@link DBOptions#setRandomAccessMaxBufferSize(long)}. + * + * @param randomAccessMaxBufferSize the max buffer size for random access. + * + * @return the reference to these options. + */ + public EnvOptions setRandomAccessMaxBufferSize( + final long randomAccessMaxBufferSize) { setRandomAccessMaxBufferSize(nativeHandle_, randomAccessMaxBufferSize); return this; } + /** + * See {@link DBOptions#randomAccessMaxBufferSize()}. + * + * @return the max buffer size for random access. + */ public long randomAccessMaxBufferSize() { assert(isOwningHandle()); return randomAccessMaxBufferSize(nativeHandle_); } - public EnvOptions setWritableFileMaxBufferSize(final long writableFileMaxBufferSize) { + /** + * See {@link DBOptions#setWritableFileMaxBufferSize(long)}. + * + * @param writableFileMaxBufferSize the max buffer size. + * + * @return the reference to these options. + */ + public EnvOptions setWritableFileMaxBufferSize( + final long writableFileMaxBufferSize) { setWritableFileMaxBufferSize(nativeHandle_, writableFileMaxBufferSize); return this; } + /** + * See {@link DBOptions#writableFileMaxBufferSize()}. + * + * @return the max buffer size. + */ public long writableFileMaxBufferSize() { assert(isOwningHandle()); return writableFileMaxBufferSize(nativeHandle_); } + /** + * Set the write rate limiter for flush and compaction. + * + * @param rateLimiter the rate limiter. + * + * @return the reference to these options. + */ public EnvOptions setRateLimiter(final RateLimiter rateLimiter) { this.rateLimiter = rateLimiter; setRateLimiter(nativeHandle_, rateLimiter.nativeHandle_); return this; } + /** + * Get the write rate limiter for flush and compaction. + * + * @return the rate limiter. 
+ */ public RateLimiter rateLimiter() { assert(isOwningHandle()); return rateLimiter; } private native static long newEnvOptions(); - + private native static long newEnvOptions(final long dboptions_handle); @Override protected final native void disposeInternal(final long handle); - private native void setUseOsBuffer(final long handle, final boolean useOsBuffer); - - private native boolean useOsBuffer(final long handle); - - private native void setUseMmapReads(final long handle, final boolean useMmapReads); - + private native void setUseMmapReads(final long handle, + final boolean useMmapReads); private native boolean useMmapReads(final long handle); - - private native void setUseMmapWrites(final long handle, final boolean useMmapWrites); - + private native void setUseMmapWrites(final long handle, + final boolean useMmapWrites); private native boolean useMmapWrites(final long handle); - - private native void setUseDirectReads(final long handle, final boolean useDirectReads); - + private native void setUseDirectReads(final long handle, + final boolean useDirectReads); private native boolean useDirectReads(final long handle); - - private native void setUseDirectWrites(final long handle, final boolean useDirectWrites); - + private native void setUseDirectWrites(final long handle, + final boolean useDirectWrites); private native boolean useDirectWrites(final long handle); - - private native void setAllowFallocate(final long handle, final boolean allowFallocate); - + private native void setAllowFallocate(final long handle, + final boolean allowFallocate); private native boolean allowFallocate(final long handle); - - private native void setSetFdCloexec(final long handle, final boolean setFdCloexec); - + private native void setSetFdCloexec(final long handle, + final boolean setFdCloexec); private native boolean setFdCloexec(final long handle); - - private native void setBytesPerSync(final long handle, final long bytesPerSync); - + private native void setBytesPerSync(final long handle, + final long bytesPerSync); private native long bytesPerSync(final long handle); - private native void setFallocateWithKeepSize( final long handle, final boolean fallocateWithKeepSize); - private native boolean fallocateWithKeepSize(final long handle); - private native void setCompactionReadaheadSize( final long handle, final long compactionReadaheadSize); - private native long compactionReadaheadSize(final long handle); - private native void setRandomAccessMaxBufferSize( final long handle, final long randomAccessMaxBufferSize); - private native long randomAccessMaxBufferSize(final long handle); - private native void setWritableFileMaxBufferSize( final long handle, final long writableFileMaxBufferSize); - private native long writableFileMaxBufferSize(final long handle); - - private native void setRateLimiter(final long handle, final long rateLimiterHandle); - + private native void setRateLimiter(final long handle, + final long rateLimiterHandle); private RateLimiter rateLimiter; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Filter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Filter.java index 011be20856..7f490cf594 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Filter.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Filter.java @@ -12,6 +12,7 @@ * number of disk seeks from a handful to a single disk seek per * DB::Get() call.
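As a hedged illustration of the fluent EnvOptions API documented above (not part of the diff; the 10 MiB/s rate and the SstFileWriter hand-off are illustrative assumptions):

// Sketch only: an EnvOptions instance tuned for direct, rate-limited writes.
try (final RateLimiter rateLimiter = new RateLimiter(10 * 1024 * 1024);
     final EnvOptions envOptions = new EnvOptions()
         .setUseDirectWrites(true)        // O_DIRECT, bypass the page cache
         .setBytesPerSync(1024 * 1024)    // incremental background syncs
         .setRateLimiter(rateLimiter)) {
  // e.g. pass to an SstFileWriter: new SstFileWriter(envOptions, options)
}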
*/ +//TODO(AR) should be renamed FilterPolicy public abstract class Filter extends RocksObject { protected Filter(final long nativeHandle) { diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java index ce54a528bf..760b515fdf 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/FlushOptions.java @@ -1,3 +1,8 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + package org.rocksdb; /** @@ -41,9 +46,45 @@ public boolean waitForFlush() { return waitForFlush(nativeHandle_); } + /** + * Set to true so that the flush proceeds immediately even if it means + * writes will stall for the duration of the flush. + * + * Set to false so that the operation will wait until it's possible to do + * the flush without causing a stall, or until the required flush is + * performed by someone else (foreground call or background thread). + * + * Default: false + * + * @param allowWriteStall true to allow writes to stall for flush, false + * otherwise. + * + * @return instance of current FlushOptions. + */ + public FlushOptions setAllowWriteStall(final boolean allowWriteStall) { + assert(isOwningHandle()); + setAllowWriteStall(nativeHandle_, allowWriteStall); + return this; + } + + /** + * Returns true if writes are allowed to stall for flushes to complete, false + * otherwise. + * + * @return true if writes are allowed to stall for flushes + */ + public boolean allowWriteStall() { + assert(isOwningHandle()); + return allowWriteStall(nativeHandle_); + } + private native static long newFlushOptions(); @Override protected final native void disposeInternal(final long handle); + + private native void setWaitForFlush(final long handle, + final boolean wait); + private native boolean waitForFlush(final long handle); + private native void setAllowWriteStall(final long handle, + final boolean allowWriteStall); + private native boolean allowWriteStall(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HdfsEnv.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HdfsEnv.java new file mode 100644 index 0000000000..4d8d3bff6f --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HdfsEnv.java @@ -0,0 +1,27 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * HDFS environment. + */ +public class HdfsEnv extends Env { + + /** +
* Creates a new environment that is used for HDFS environment. + * + * The caller must delete the result when it is + * no longer needed.
+ * + * @param fsName the HDFS as a string in the form "hdfs://hostname:port/" + */ + public HdfsEnv(final String fsName) { + super(createHdfsEnv(fsName)); + } + + private static native long createHdfsEnv(final String fsName); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java index 11798eb59f..81d8908834 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramData.java @@ -11,15 +11,30 @@ public class HistogramData { private final double percentile99_; private final double average_; private final double standardDeviation_; + private final double max_; + private final long count_; + private final long sum_; + private final double min_; + + public HistogramData(final double median, final double percentile95, + final double percentile99, final double average, + final double standardDeviation) { + this(median, percentile95, percentile99, average, standardDeviation, 0.0, 0, 0, 0.0); + } public HistogramData(final double median, final double percentile95, final double percentile99, final double average, - final double standardDeviation) { + final double standardDeviation, final double max, final long count, + final long sum, final double min) { median_ = median; percentile95_ = percentile95; percentile99_ = percentile99; average_ = average; standardDeviation_ = standardDeviation; + min_ = min; + max_ = max; + count_ = count; + sum_ = sum; } public double getMedian() { @@ -41,4 +56,20 @@ public double getAverage() { public double getStandardDeviation() { return standardDeviation_; } + + public double getMax() { + return max_; + } + + public long getCount() { + return count_; + } + + public long getSum() { + return sum_; + } + + public double getMin() { + return min_; + } } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java index 2d95f5149f..ab97a4d257 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/HistogramType.java @@ -84,6 +84,82 @@ public enum HistogramType { READ_NUM_MERGE_OPERANDS((byte) 0x1E), + /** + * Time spent flushing memtable to disk. + */ + FLUSH_TIME((byte) 0x20), + + /** + * Size of keys written to BlobDB. + */ + BLOB_DB_KEY_SIZE((byte) 0x21), + + /** + * Size of values written to BlobDB. + */ + BLOB_DB_VALUE_SIZE((byte) 0x22), + + /** + * BlobDB Put/PutWithTTL/PutUntil/Write latency. + */ + BLOB_DB_WRITE_MICROS((byte) 0x23), + + /** + * BlobDB Get latency. + */ + BLOB_DB_GET_MICROS((byte) 0x24), + + /** + * BlobDB MultiGet latency. + */ + BLOB_DB_MULTIGET_MICROS((byte) 0x25), + + /** + * BlobDB Seek/SeekToFirst/SeekToLast/SeekForPrev latency. + */ + BLOB_DB_SEEK_MICROS((byte) 0x26), + + /** + * BlobDB Next latency. + */ + BLOB_DB_NEXT_MICROS((byte) 0x27), + + /** + * BlobDB Prev latency. + */ + BLOB_DB_PREV_MICROS((byte) 0x28), + + /** + * Blob file write latency. + */ + BLOB_DB_BLOB_FILE_WRITE_MICROS((byte) 0x29), + + /** + * Blob file read latency. + */ + BLOB_DB_BLOB_FILE_READ_MICROS((byte) 0x2A), + + /** + * Blob file sync latency. + */ + BLOB_DB_BLOB_FILE_SYNC_MICROS((byte) 0x2B), + + /** + * BlobDB garbage collection time. + */ + BLOB_DB_GC_MICROS((byte) 0x2C), + + /** + * BlobDB compression time.
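The new HistogramData fields can be read back through Statistics; a minimal sketch, assuming statistics are enabled on the Options and that DB_GET is the histogram of interest:

// Sketch only: surface the new min/max/count/sum histogram fields.
static void printGetHistogram() throws RocksDBException {
  try (final Statistics statistics = new Statistics();
       final Options options = new Options()
           .setCreateIfMissing(true)
           .setStatistics(statistics);
       final RocksDB db = RocksDB.open(options, "/tmp/histogram-demo")) {
    db.put("k".getBytes(), "v".getBytes());
    db.get("k".getBytes());
    final HistogramData hist = statistics.getHistogramData(HistogramType.DB_GET);
    System.out.printf("get: avg=%.1f max=%.1f count=%d sum=%d%n",
        hist.getAverage(), hist.getMax(), hist.getCount(), hist.getSum());
  }
}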
+ */ + BLOB_DB_COMPRESSION_MICROS((byte) 0x2D), + + /** + * BlobDB decompression time. + */ + BLOB_DB_DECOMPRESSION_MICROS((byte) 0x2E), + + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); private final byte value; @@ -92,6 +168,12 @@ public enum HistogramType { this.value = value; } + /** + * @deprecated + * Exposes internal value of native enum mappings. This method will be marked private in the + * next major release. + */ + @Deprecated public byte getValue() { return value; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IndexType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IndexType.java index e0c113d39a..04e4814658 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IndexType.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IndexType.java @@ -33,7 +33,7 @@ public byte getValue() { return value_; } - private IndexType(byte value) { + IndexType(byte value) { value_ = value; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java index 7343691817..a6a308daa3 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java @@ -7,7 +7,8 @@ import java.util.List; /** - * IngestExternalFileOptions is used by {@link RocksDB#ingestExternalFile(ColumnFamilyHandle, List, IngestExternalFileOptions)} + * IngestExternalFileOptions is used by + * {@link RocksDB#ingestExternalFile(ColumnFamilyHandle, List, IngestExternalFileOptions)}. */ public class IngestExternalFileOptions extends RocksObject { @@ -41,9 +42,12 @@ public boolean moveFiles() { * Can be set to true to move the files instead of copying them. * * @param moveFiles true if files should be moved instead of copied + * + * @return the reference to the current IngestExternalFileOptions. */ - public void setMoveFiles(final boolean moveFiles) { + public IngestExternalFileOptions setMoveFiles(final boolean moveFiles) { setMoveFiles(nativeHandle_, moveFiles); + return this; } /** @@ -61,9 +65,13 @@ public boolean snapshotConsistency() { * that were created before the file was ingested. * * @param snapshotConsistency true if snapshot consistency is required + * + * @return the reference to the current IngestExternalFileOptions. */ - public void setSnapshotConsistency(final boolean snapshotConsistency) { + public IngestExternalFileOptions setSnapshotConsistency( + final boolean snapshotConsistency) { setSnapshotConsistency(nativeHandle_, snapshotConsistency); + return this; } /** @@ -81,9 +89,13 @@ public boolean allowGlobalSeqNo() { * will fail if the file key range overlaps with existing keys or tombstones in the DB. * * @param allowGlobalSeqNo true if global seq numbers are required + * + * @return the reference to the current IngestExternalFileOptions. */ - public void setAllowGlobalSeqNo(final boolean allowGlobalSeqNo) { + public IngestExternalFileOptions setAllowGlobalSeqNo( + final boolean allowGlobalSeqNo) { setAllowGlobalSeqNo(nativeHandle_, allowGlobalSeqNo); + return this; } /** @@ -101,15 +113,100 @@ public boolean allowBlockingFlush() { * (memtable flush required), IngestExternalFile will fail. * * @param allowBlockingFlush true if blocking flushes are allowed + * + * @return the reference to the current IngestExternalFileOptions.
*/ - public void setAllowBlockingFlush(final boolean allowBlockingFlush) { + public IngestExternalFileOptions setAllowBlockingFlush( + final boolean allowBlockingFlush) { setAllowBlockingFlush(nativeHandle_, allowBlockingFlush); + return this; + } + + /** + * Returns true if duplicate keys in the file being ingested are + * to be skipped rather than overwriting existing data under that key. + * + * @return true if duplicate keys in the file being ingested are to be + * skipped, false otherwise. + */ + public boolean ingestBehind() { + return ingestBehind(nativeHandle_); + } + + /** + * Set to true if you would like duplicate keys in the file being ingested + * to be skipped rather than overwriting existing data under that key. + * + * Use case: back-fill of some historical data in the database without + * over-writing existing newer version of data. + * + * This option can only be used if the DB has been running + * with DBOptions#allowIngestBehind() == true since the dawn of time. + * + * All files will be ingested at the bottommost level with seqno=0. + * + * Default: false + * + * @param ingestBehind true if you would like duplicate keys in the file being + * ingested to be skipped. + * + * @return the reference to the current IngestExternalFileOptions. + */ + public IngestExternalFileOptions setIngestBehind(final boolean ingestBehind) { + setIngestBehind(nativeHandle_, ingestBehind); + return this; + } + + /** + * Returns true if the global_seqno is written to a given offset + * in the external SST file for backward compatibility. + * + * See {@link #setWriteGlobalSeqno(boolean)}. + * + * @return true if the global_seqno is written to a given offset, + * false otherwise. + */ + public boolean writeGlobalSeqno() { + return writeGlobalSeqno(nativeHandle_); + } + + /** + * Set to true if you would like to write the global_seqno to a given offset + * in the external SST file for backward compatibility. + * + * Older versions of RocksDB write the global_seqno to a given offset within + * the ingested SST files, and new versions of RocksDB do not. + * + * If you ingest an external SST using a new version of RocksDB and would like + * to be able to downgrade to an older version of RocksDB, you should set + * {@link #writeGlobalSeqno()} to true. + * + * If your service is just starting to use the new RocksDB, we recommend that + * you set this option to false, which brings two benefits: + * 1. No extra random write for global_seqno during ingestion. + * 2. Without writing to the external SST file, it's possible to do checksum + * verification. + * + * We have a plan to set this option to false by default in the future. + * + * Default: true + * + * @param writeGlobalSeqno true to write the global_seqno to a given offset, + * false otherwise + * + * @return the reference to the current IngestExternalFileOptions.
+ */ + public IngestExternalFileOptions setWriteGlobalSeqno( + final boolean writeGlobalSeqno) { + setWriteGlobalSeqno(nativeHandle_, writeGlobalSeqno); + return this; } private native static long newIngestExternalFileOptions(); private native static long newIngestExternalFileOptions( final boolean moveFiles, final boolean snapshotConsistency, final boolean allowGlobalSeqNo, final boolean allowBlockingFlush); + @Override protected final native void disposeInternal(final long handle); + private native boolean moveFiles(final long handle); private native void setMoveFiles(final long handle, final boolean move_files); private native boolean snapshotConsistency(final long handle); @@ -121,5 +218,10 @@ private native void setAllowGlobalSeqNo(final long handle, private native boolean allowBlockingFlush(final long handle); private native void setAllowBlockingFlush(final long handle, final boolean allowBlockingFlush); - @Override protected final native void disposeInternal(final long handle); + private native boolean ingestBehind(final long handle); + private native void setIngestBehind(final long handle, + final boolean ingestBehind); + private native boolean writeGlobalSeqno(final long handle); + private native void setWriteGlobalSeqno(final long handle, + final boolean writeGlobalSeqNo); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java new file mode 100644 index 0000000000..c5685098be --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LevelMetaData.java @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Arrays; +import java.util.List; + +/** + * The metadata that describes a level. + */ +public class LevelMetaData { + private final int level; + private final long size; + private final SstFileMetaData[] files; + + /** + * Called from JNI C++ + */ + private LevelMetaData(final int level, final long size, + final SstFileMetaData[] files) { + this.level = level; + this.size = size; + this.files = files; + } + + /** + * The level which this meta data describes. + * + * @return the level + */ + public int level() { + return level; + } + + /** + * The size of this level in bytes, which is equal to the sum of + * the file size of its {@link #files()}. + * + * @return the size + */ + public long size() { + return size; + } + + /** + * The metadata of all sst files in this level. + * + * @return the metadata of the files + */ + public List<SstFileMetaData> files() { + return Arrays.asList(files); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java new file mode 100644 index 0000000000..35d883e180 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LiveFileMetaData.java @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The full set of metadata associated with each SST file.
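Because the setters above now return this, ingestion can be configured fluently; a short sketch (the SST path and the open db handle are placeholders):

// Sketch only: ingest a pre-built SST file with chained options.
try (final IngestExternalFileOptions ingestOptions =
         new IngestExternalFileOptions()
             .setMoveFiles(true)             // move instead of copy
             .setSnapshotConsistency(true)
             .setWriteGlobalSeqno(false)) {  // new format, no in-file write
  db.ingestExternalFile(Arrays.asList("/tmp/bulk-load.sst"), ingestOptions);
}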
+ */ +public class LiveFileMetaData extends SstFileMetaData { + private final byte[] columnFamilyName; + private final int level; + + /** + * Called from JNI C++ + */ + private LiveFileMetaData( + final byte[] columnFamilyName, + final int level, + final String fileName, + final String path, + final long size, + final long smallestSeqno, + final long largestSeqno, + final byte[] smallestKey, + final byte[] largestKey, + final long numReadsSampled, + final boolean beingCompacted, + final long numEntries, + final long numDeletions) { + super(fileName, path, size, smallestSeqno, largestSeqno, smallestKey, + largestKey, numReadsSampled, beingCompacted, numEntries, numDeletions); + this.columnFamilyName = columnFamilyName; + this.level = level; + } + + /** + * Get the name of the column family. + * + * @return the name of the column family + */ + public byte[] columnFamilyName() { + return columnFamilyName; + } + + /** + * Get the level at which this file resides. + * + * @return the level at which the file resides. + */ + public int level() { + return level; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LogFile.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LogFile.java new file mode 100644 index 0000000000..ef24a6427c --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/LogFile.java @@ -0,0 +1,75 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class LogFile { + private final String pathName; + private final long logNumber; + private final WalFileType type; + private final long startSequence; + private final long sizeFileBytes; + + /** + * Called from JNI C++ + */ + private LogFile(final String pathName, final long logNumber, + final byte walFileTypeValue, final long startSequence, + final long sizeFileBytes) { + this.pathName = pathName; + this.logNumber = logNumber; + this.type = WalFileType.fromValue(walFileTypeValue); + this.startSequence = startSequence; + this.sizeFileBytes = sizeFileBytes; + } + + /** + * Returns log file's pathname relative to the main db dir + * Eg. For a live-log-file = /000003.log + * For an archived-log-file = /archive/000003.log + * + * @return log file's pathname + */ + public String pathName() { + return pathName; + } + + /** + * Primary identifier for log file. + * This is directly proportional to creation time of the log file + * + * @return the log number + */ + public long logNumber() { + return logNumber; + } + + /** + * Log file can be either alive or archived. + * + * @return the type of the log file. + */ + public WalFileType type() { + return type; + } + + /** + * Starting sequence number of writebatch written in this log file. + * + * @return the starting sequence number + */ + public long startSequence() { + return startSequence; + } + + /** + * Size of log file on disk in bytes. + * + * @return size of log file + */ + public long sizeFileBytes() { + return sizeFileBytes; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Logger.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Logger.java index 9021259290..00a5d56745 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Logger.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Logger.java @@ -35,9 +35,10 @@ * {@link org.rocksdb.InfoLogLevel#FATAL_LEVEL}. *
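A brief sketch of consuming the new LogFile class; it assumes the RocksDB#getSortedWalFiles() binding from this RocksDB version, an open db instance, and that RocksDBException handling is elided:

// Sketch only: enumerate live and archived WAL files.
for (final LogFile walFile : db.getSortedWalFiles()) {
  System.out.printf("%s number=%d type=%s start=%d bytes=%d%n",
      walFile.pathName(), walFile.logNumber(), walFile.type(),
      walFile.startSequence(), walFile.sizeFileBytes());
}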
*/ -public abstract class Logger extends AbstractImmutableNativeReference { +public abstract class Logger extends RocksCallbackObject { - final long nativeHandle_; + private final static long WITH_OPTIONS = 0; + private final static long WITH_DBOPTIONS = 1; /** * AbstractLogger constructor.
    @@ -49,8 +50,8 @@ public abstract class Logger extends AbstractImmutableNativeReference { * @param options {@link org.rocksdb.Options} instance. */ public Logger(final Options options) { - super(true); - this.nativeHandle_ = createNewLoggerOptions(options.nativeHandle_); + super(options.nativeHandle_, WITH_OPTIONS); + } /** @@ -63,8 +64,18 @@ public Logger(final Options options) { * @param dboptions {@link org.rocksdb.DBOptions} instance. */ public Logger(final DBOptions dboptions) { - super(true); - this.nativeHandle_ = createNewLoggerDbOptions(dboptions.nativeHandle_); + super(dboptions.nativeHandle_, WITH_DBOPTIONS); + } + + @Override + protected long initializeNative(long... nativeParameterHandles) { + if(nativeParameterHandles[1] == WITH_OPTIONS) { + return createNewLoggerOptions(nativeParameterHandles[0]); + } else if(nativeParameterHandles[1] == WITH_DBOPTIONS) { + return createNewLoggerDbOptions(nativeParameterHandles[0]); + } else { + throw new IllegalArgumentException(); + } } /** @@ -89,17 +100,6 @@ public InfoLogLevel infoLogLevel() { protected abstract void log(InfoLogLevel infoLogLevel, String logMsg); - /** - * Deletes underlying C++ slice pointer. - * Note that this function should be called only after all - * RocksDB instances referencing the slice are closed. - * Otherwise an undefined behavior will occur. - */ - @Override - protected void disposeInternal() { - disposeInternal(nativeHandle_); - } - protected native long createNewLoggerOptions( long options); protected native long createNewLoggerDbOptions( @@ -107,5 +107,16 @@ protected native long createNewLoggerDbOptions( protected native void setInfoLogLevel(long handle, byte infoLogLevel); protected native byte infoLogLevel(long handle); + + /** + * We override {@link RocksCallbackObject#disposeInternal()} + * as disposing of a rocksdb::LoggerJniCallback requires + * a slightly different approach as it is a std::shared_ptr + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + private native void disposeInternal(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java new file mode 100644 index 0000000000..6010ce7af5 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUsageType.java @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * MemoryUsageType + * + *
The value will be used as a key to indicate the type of memory usage + * described
+ */ +public enum MemoryUsageType { + /** + * Memory usage of all the mem-tables. + */ + kMemTableTotal((byte) 0), + /** + * Memory usage of those un-flushed mem-tables. + */ + kMemTableUnFlushed((byte) 1), + /** + * Memory usage of all the table readers. + */ + kTableReadersTotal((byte) 2), + /** + * Memory usage by Cache. + */ + kCacheTotal((byte) 3), + /** + * Max usage types - copied to keep 1:1 with native. + */ + kNumUsageTypes((byte) 4); + + /** + * Returns the byte value of the enumeration value. + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + /** + *
Get the MemoryUsageType enumeration value by + * passing the byte identifier to this method.
    + * + * @param byteIdentifier of MemoryUsageType. + * + * @return MemoryUsageType instance. + * + * @throws IllegalArgumentException if the usage type for the byteIdentifier + * cannot be found + */ + public static MemoryUsageType getMemoryUsageType(final byte byteIdentifier) { + for (final MemoryUsageType memoryUsageType : MemoryUsageType.values()) { + if (memoryUsageType.getValue() == byteIdentifier) { + return memoryUsageType; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for MemoryUsageType."); + } + + MemoryUsageType(byte value) { + value_ = value; + } + + private final byte value_; +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java new file mode 100644 index 0000000000..52b2175e6b --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MemoryUtil.java @@ -0,0 +1,60 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.*; + +/** + * JNI passthrough for MemoryUtil. + */ +public class MemoryUtil { + + /** + *
Returns the approximate memory usage of different types in the input + * list of DBs and Cache set. For instance, in the output map the key + * kMemTableTotal will be associated with the memory + * usage of all the mem-tables from all the input rocksdb instances. + * + * Note that for memory usage inside Cache class, we will + * only report the usage of the input "cache_set" without + * including the Cache usage inside the input list "dbs" + * of DBs.
+ * + * @param dbs List of dbs to collect memory usage for. + * @param caches Set of caches to collect memory usage for. + * @return Map from {@link MemoryUsageType} to memory usage as a {@link Long}. + */ + public static Map<MemoryUsageType, Long> getApproximateMemoryUsageByType(final List<RocksDB> dbs, final Set<Cache> caches) { + int dbCount = (dbs == null) ? 0 : dbs.size(); + int cacheCount = (caches == null) ? 0 : caches.size(); + long[] dbHandles = new long[dbCount]; + long[] cacheHandles = new long[cacheCount]; + if (dbCount > 0) { + ListIterator<RocksDB> dbIter = dbs.listIterator(); + while (dbIter.hasNext()) { + dbHandles[dbIter.nextIndex()] = dbIter.next().nativeHandle_; + } + } + if (cacheCount > 0) { + // NOTE: This index handling is super ugly but I couldn't get a clean way to track both the + // index and the iterator simultaneously within a Set. + int i = 0; + for (Cache cache : caches) { + cacheHandles[i] = cache.nativeHandle_; + i++; + } + } + Map<Byte, Long> byteOutput = getApproximateMemoryUsageByType(dbHandles, cacheHandles); + Map<MemoryUsageType, Long> output = new HashMap<>(); + for(Map.Entry<Byte, Long> longEntry : byteOutput.entrySet()) { + output.put(MemoryUsageType.getMemoryUsageType(longEntry.getKey()), longEntry.getValue()); + } + return output; + } + + private native static Map<Byte, Long> getApproximateMemoryUsageByType(final long[] dbHandles, + final long[] cacheHandles); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java index 3585318dbd..1d9ca08174 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java @@ -7,27 +7,20 @@ import java.util.*; -public class MutableColumnFamilyOptions { - private final static String KEY_VALUE_PAIR_SEPARATOR = ";"; - private final static char KEY_VALUE_SEPARATOR = '='; - private final static String INT_ARRAY_INT_SEPARATOR = ","; - - private final String[] keys; - private final String[] values; - - // user must use builder pattern, or parser - private MutableColumnFamilyOptions(final String keys[], - final String values[]) { - this.keys = keys; - this.values = values; - } - - String[] getKeys() { - return keys; - } +public class MutableColumnFamilyOptions + extends AbstractMutableOptions { - String[] getValues() { - return values; + /** + * User must use builder pattern, or parser. + * + * @param keys the keys + * @param values the values + * + * See {@link #builder()} and {@link #parse(String)}.
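A hedged sketch of calling MemoryUtil as documented above; db and cache stand in for instances created elsewhere, and java.util.Collections is assumed to be imported:

// Sketch only: approximate memory usage across open DBs and caches.
final Map<MemoryUsageType, Long> usage =
    MemoryUtil.getApproximateMemoryUsageByType(
        Collections.singletonList(db), Collections.singleton(cache));
System.out.println("memtables: " + usage.get(MemoryUsageType.kMemTableTotal));
System.out.println("block cache: " + usage.get(MemoryUsageType.kCacheTotal));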
+ */ + private MutableColumnFamilyOptions(final String[] keys, + final String[] values) { + super(keys, values); } /** @@ -60,7 +53,7 @@ public static MutableColumnFamilyOptionsBuilder parse(final String str) { final MutableColumnFamilyOptionsBuilder builder = new MutableColumnFamilyOptionsBuilder(); - final String options[] = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); + final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); for(final String option : options) { final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); if(equalsOffset <= 0) { @@ -69,12 +62,12 @@ public static MutableColumnFamilyOptionsBuilder parse(final String str) { } final String key = option.substring(0, equalsOffset); - if(key == null || key.isEmpty()) { + if(key.isEmpty()) { throw new IllegalArgumentException("options string is invalid"); } final String value = option.substring(equalsOffset + 1); - if(value == null || value.isEmpty()) { + if(value.isEmpty()) { throw new IllegalArgumentException("options string is invalid"); } @@ -84,37 +77,7 @@ public static MutableColumnFamilyOptionsBuilder parse(final String str) { return builder; } - /** - * Returns a string representation - * of MutableColumnFamilyOptions which is - * suitable for consumption by {@link #parse(String)} - * - * @return String representation of MutableColumnFamilyOptions - */ - @Override - public String toString() { - final StringBuilder buffer = new StringBuilder(); - for(int i = 0; i < keys.length; i++) { - buffer - .append(keys[i]) - .append(KEY_VALUE_SEPARATOR) - .append(values[i]); - - if(i + 1 < keys.length) { - buffer.append(KEY_VALUE_PAIR_SEPARATOR); - } - } - return buffer.toString(); - } - - public enum ValueType { - DOUBLE, - LONG, - INT, - BOOLEAN, - INT_ARRAY, - ENUM - } + private interface MutableColumnFamilyOptionKey extends MutableOptionKey {} public enum MemtableOption implements MutableColumnFamilyOptionKey { write_buffer_size(ValueType.LONG), @@ -153,7 +116,8 @@ public enum CompactionOption implements MutableColumnFamilyOptionKey { target_file_size_multiplier(ValueType.INT), max_bytes_for_level_base(ValueType.LONG), max_bytes_for_level_multiplier(ValueType.INT), - max_bytes_for_level_multiplier_additional(ValueType.INT_ARRAY); + max_bytes_for_level_multiplier_additional(ValueType.INT_ARRAY), + ttl(ValueType.LONG); private final ValueType valueType; CompactionOption(final ValueType valueType) { @@ -183,356 +147,9 @@ public ValueType getValueType() { } } - private interface MutableColumnFamilyOptionKey { - String name(); - ValueType getValueType(); - } - - private static abstract class MutableColumnFamilyOptionValue { - protected final T value; - - MutableColumnFamilyOptionValue(final T value) { - this.value = value; - } - - abstract double asDouble() throws NumberFormatException; - abstract long asLong() throws NumberFormatException; - abstract int asInt() throws NumberFormatException; - abstract boolean asBoolean() throws IllegalStateException; - abstract int[] asIntArray() throws IllegalStateException; - abstract String asString(); - abstract T asObject(); - } - - private static class MutableColumnFamilyOptionStringValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionStringValue(final String value) { - super(value); - } - - @Override - double asDouble() throws NumberFormatException { - return Double.parseDouble(value); - } - - @Override - long asLong() throws NumberFormatException { - return Long.parseLong(value); - } - - @Override - int asInt() throws NumberFormatException { - return 
Integer.parseInt(value); - } - - @Override - boolean asBoolean() throws IllegalStateException { - return Boolean.parseBoolean(value); - } - - @Override - int[] asIntArray() throws IllegalStateException { - throw new IllegalStateException("String is not applicable as int[]"); - } - - @Override - String asString() { - return value; - } - - @Override - String asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionDoubleValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionDoubleValue(final double value) { - super(value); - } - - @Override - double asDouble() { - return value; - } - - @Override - long asLong() throws NumberFormatException { - return value.longValue(); - } - - @Override - int asInt() throws NumberFormatException { - if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { - throw new NumberFormatException( - "double value lies outside the bounds of int"); - } - return value.intValue(); - } - - @Override - boolean asBoolean() throws IllegalStateException { - throw new IllegalStateException( - "double is not applicable as boolean"); - } - - @Override - int[] asIntArray() throws IllegalStateException { - if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { - throw new NumberFormatException( - "double value lies outside the bounds of int"); - } - return new int[] { value.intValue() }; - } - - @Override - String asString() { - return Double.toString(value); - } - - @Override - Double asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionLongValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionLongValue(final long value) { - super(value); - } - - @Override - double asDouble() { - if(value > Double.MAX_VALUE || value < Double.MIN_VALUE) { - throw new NumberFormatException( - "long value lies outside the bounds of int"); - } - return value.doubleValue(); - } - - @Override - long asLong() throws NumberFormatException { - return value; - } - - @Override - int asInt() throws NumberFormatException { - if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { - throw new NumberFormatException( - "long value lies outside the bounds of int"); - } - return value.intValue(); - } - - @Override - boolean asBoolean() throws IllegalStateException { - throw new IllegalStateException( - "long is not applicable as boolean"); - } - - @Override - int[] asIntArray() throws IllegalStateException { - if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { - throw new NumberFormatException( - "long value lies outside the bounds of int"); - } - return new int[] { value.intValue() }; - } - - @Override - String asString() { - return Long.toString(value); - } - - @Override - Long asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionIntValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionIntValue(final int value) { - super(value); - } - - @Override - double asDouble() { - if(value > Double.MAX_VALUE || value < Double.MIN_VALUE) { - throw new NumberFormatException("int value lies outside the bounds of int"); - } - return value.doubleValue(); - } - - @Override - long asLong() throws NumberFormatException { - return value; - } - - @Override - int asInt() throws NumberFormatException { - return value; - } - - @Override - boolean asBoolean() throws IllegalStateException { - throw new IllegalStateException("int is not applicable as boolean"); - } - - @Override - int[] asIntArray() throws IllegalStateException { - return new int[] { 
value }; - } - - @Override - String asString() { - return Integer.toString(value); - } - - @Override - Integer asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionBooleanValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionBooleanValue(final boolean value) { - super(value); - } - - @Override - double asDouble() { - throw new NumberFormatException("boolean is not applicable as double"); - } - - @Override - long asLong() throws NumberFormatException { - throw new NumberFormatException("boolean is not applicable as Long"); - } - - @Override - int asInt() throws NumberFormatException { - throw new NumberFormatException("boolean is not applicable as int"); - } - - @Override - boolean asBoolean() { - return value; - } - - @Override - int[] asIntArray() throws IllegalStateException { - throw new IllegalStateException("boolean is not applicable as int[]"); - } - - @Override - String asString() { - return Boolean.toString(value); - } - - @Override - Boolean asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionIntArrayValue - extends MutableColumnFamilyOptionValue { - MutableColumnFamilyOptionIntArrayValue(final int[] value) { - super(value); - } - - @Override - double asDouble() { - throw new NumberFormatException("int[] is not applicable as double"); - } - - @Override - long asLong() throws NumberFormatException { - throw new NumberFormatException("int[] is not applicable as Long"); - } - - @Override - int asInt() throws NumberFormatException { - throw new NumberFormatException("int[] is not applicable as int"); - } - - @Override - boolean asBoolean() { - throw new NumberFormatException("int[] is not applicable as boolean"); - } - - @Override - int[] asIntArray() throws IllegalStateException { - return value; - } - - @Override - String asString() { - final StringBuilder builder = new StringBuilder(); - for(int i = 0; i < value.length; i++) { - builder.append(Integer.toString(i)); - if(i + 1 < value.length) { - builder.append(INT_ARRAY_INT_SEPARATOR); - } - } - return builder.toString(); - } - - @Override - int[] asObject() { - return value; - } - } - - private static class MutableColumnFamilyOptionEnumValue> - extends MutableColumnFamilyOptionValue { - - MutableColumnFamilyOptionEnumValue(final T value) { - super(value); - } - - @Override - double asDouble() throws NumberFormatException { - throw new NumberFormatException("Enum is not applicable as double"); - } - - @Override - long asLong() throws NumberFormatException { - throw new NumberFormatException("Enum is not applicable as long"); - } - - @Override - int asInt() throws NumberFormatException { - throw new NumberFormatException("Enum is not applicable as int"); - } - - @Override - boolean asBoolean() throws IllegalStateException { - throw new NumberFormatException("Enum is not applicable as boolean"); - } - - @Override - int[] asIntArray() throws IllegalStateException { - throw new NumberFormatException("Enum is not applicable as int[]"); - } - - @Override - String asString() { - return value.name(); - } - - @Override - T asObject() { - return value; - } - } - public static class MutableColumnFamilyOptionsBuilder - implements MutableColumnFamilyOptionsInterface { + extends AbstractMutableOptionsBuilder + implements MutableColumnFamilyOptionsInterface { private final static Map ALL_KEYS_LOOKUP = new HashMap<>(); static { @@ -549,179 +166,24 @@ public static class MutableColumnFamilyOptionsBuilder } } - private final Map> options = new 
LinkedHashMap<>(); - - public MutableColumnFamilyOptions build() { - final String keys[] = new String[options.size()]; - final String values[] = new String[options.size()]; - - int i = 0; - for(final Map.Entry> option : options.entrySet()) { - keys[i] = option.getKey().name(); - values[i] = option.getValue().asString(); - i++; - } - - return new MutableColumnFamilyOptions(keys, values); - } - - private MutableColumnFamilyOptionsBuilder setDouble( - final MutableColumnFamilyOptionKey key, final double value) { - if(key.getValueType() != ValueType.DOUBLE) { - throw new IllegalArgumentException( - key + " does not accept a double value"); - } - options.put(key, new MutableColumnFamilyOptionDoubleValue(value)); - return this; - } - - private double getDouble(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - return value.asDouble(); - } - - private MutableColumnFamilyOptionsBuilder setLong( - final MutableColumnFamilyOptionKey key, final long value) { - if(key.getValueType() != ValueType.LONG) { - throw new IllegalArgumentException( - key + " does not accept a long value"); - } - options.put(key, new MutableColumnFamilyOptionLongValue(value)); - return this; - } - - private long getLong(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - return value.asLong(); - } - - private MutableColumnFamilyOptionsBuilder setInt( - final MutableColumnFamilyOptionKey key, final int value) { - if(key.getValueType() != ValueType.INT) { - throw new IllegalArgumentException( - key + " does not accept an integer value"); - } - options.put(key, new MutableColumnFamilyOptionIntValue(value)); - return this; - } - - private int getInt(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - return value.asInt(); - } - - private MutableColumnFamilyOptionsBuilder setBoolean( - final MutableColumnFamilyOptionKey key, final boolean value) { - if(key.getValueType() != ValueType.BOOLEAN) { - throw new IllegalArgumentException( - key + " does not accept a boolean value"); - } - options.put(key, new MutableColumnFamilyOptionBooleanValue(value)); - return this; - } - - private boolean getBoolean(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - return value.asBoolean(); + private MutableColumnFamilyOptionsBuilder() { + super(); } - private MutableColumnFamilyOptionsBuilder setIntArray( - final MutableColumnFamilyOptionKey key, final int[] value) { - if(key.getValueType() != ValueType.INT_ARRAY) { - throw new IllegalArgumentException( - key + " does not accept an int array value"); - } - options.put(key, new MutableColumnFamilyOptionIntArrayValue(value)); - return this; - } - - private int[] getIntArray(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final 
MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - return value.asIntArray(); - } - - private > MutableColumnFamilyOptionsBuilder setEnum( - final MutableColumnFamilyOptionKey key, final T value) { - if(key.getValueType() != ValueType.ENUM) { - throw new IllegalArgumentException( - key + " does not accept a Enum value"); - } - options.put(key, new MutableColumnFamilyOptionEnumValue(value)); + @Override + protected MutableColumnFamilyOptionsBuilder self() { return this; - } - private > T getEnum(final MutableColumnFamilyOptionKey key) - throws NoSuchElementException, NumberFormatException { - final MutableColumnFamilyOptionValue value = options.get(key); - if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); - } - - if(!(value instanceof MutableColumnFamilyOptionEnumValue)) { - throw new NoSuchElementException(key.name() + " is not of Enum type"); - } - - return ((MutableColumnFamilyOptionEnumValue)value).asObject(); + @Override + protected Map allKeys() { + return ALL_KEYS_LOOKUP; } - public MutableColumnFamilyOptionsBuilder fromString(final String keyStr, - final String valueStr) throws IllegalArgumentException { - Objects.requireNonNull(keyStr); - Objects.requireNonNull(valueStr); - - final MutableColumnFamilyOptionKey key = ALL_KEYS_LOOKUP.get(keyStr); - switch(key.getValueType()) { - case DOUBLE: - return setDouble(key, Double.parseDouble(valueStr)); - - case LONG: - return setLong(key, Long.parseLong(valueStr)); - - case INT: - return setInt(key, Integer.parseInt(valueStr)); - - case BOOLEAN: - return setBoolean(key, Boolean.parseBoolean(valueStr)); - - case INT_ARRAY: - final String[] strInts = valueStr - .trim().split(INT_ARRAY_INT_SEPARATOR); - if(strInts == null || strInts.length == 0) { - throw new IllegalArgumentException( - "int array value is not correctly formatted"); - } - - final int value[] = new int[strInts.length]; - int i = 0; - for(final String strInt : strInts) { - value[i++] = Integer.parseInt(strInt); - } - return setIntArray(key, value); - } - - throw new IllegalStateException( - key + " has unknown value type: " + key.getValueType()); + @Override + protected MutableColumnFamilyOptions build(final String[] keys, + final String[] values) { + return new MutableColumnFamilyOptions(keys, values); } @Override @@ -993,5 +455,15 @@ public MutableColumnFamilyOptionsBuilder setReportBgIoStats( public boolean reportBgIoStats() { return getBoolean(MiscOption.report_bg_io_stats); } + + @Override + public MutableColumnFamilyOptionsBuilder setTtl(final long ttl) { + return setLong(CompactionOption.ttl, ttl); + } + + @Override + public long ttl() { + return getLong(CompactionOption.ttl); + } } } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java new file mode 100644 index 0000000000..328f7f9795 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptions.java @@ -0,0 +1,286 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; + +public class MutableDBOptions extends AbstractMutableOptions { + + /** + * User must use builder pattern, or parser. + * + * @param keys the keys + * @param values the values + * + * See {@link #builder()} and {@link #parse(String)}. + */ + private MutableDBOptions(final String[] keys, final String[] values) { + super(keys, values); + } + + /** + * Creates a builder which allows you + * to set MutableDBOptions in a fluent + * manner + * + * @return A builder for MutableDBOptions + */ + public static MutableDBOptionsBuilder builder() { + return new MutableDBOptionsBuilder(); + } + + /** + * Parses a String representation of MutableDBOptions + * + * The format is: key1=value1;key2=value2;key3=value3 etc + * + * For int[] values, each int should be separated by a comma, e.g. + * + * key1=value1;intArrayKey1=1,2,3 + * + * @param str The string representation of the mutable db options + * + * @return A builder for the mutable db options + */ + public static MutableDBOptionsBuilder parse(final String str) { + Objects.requireNonNull(str); + + final MutableDBOptionsBuilder builder = + new MutableDBOptionsBuilder(); + + final String[] options = str.trim().split(KEY_VALUE_PAIR_SEPARATOR); + for(final String option : options) { + final int equalsOffset = option.indexOf(KEY_VALUE_SEPARATOR); + if(equalsOffset <= 0) { + throw new IllegalArgumentException( + "options string has an invalid key=value pair"); + } + + final String key = option.substring(0, equalsOffset); + if(key.isEmpty()) { + throw new IllegalArgumentException("options string is invalid"); + } + + final String value = option.substring(equalsOffset + 1); + if(value.isEmpty()) { + throw new IllegalArgumentException("options string is invalid"); + } + + builder.fromString(key, value); + } + + return builder; + } + + private interface MutableDBOptionKey extends MutableOptionKey {} + + public enum DBOption implements MutableDBOptionKey { + max_background_jobs(ValueType.INT), + base_background_compactions(ValueType.INT), + max_background_compactions(ValueType.INT), + avoid_flush_during_shutdown(ValueType.BOOLEAN), + writable_file_max_buffer_size(ValueType.LONG), + delayed_write_rate(ValueType.LONG), + max_total_wal_size(ValueType.LONG), + delete_obsolete_files_period_micros(ValueType.LONG), + stats_dump_period_sec(ValueType.INT), + max_open_files(ValueType.INT), + bytes_per_sync(ValueType.LONG), + wal_bytes_per_sync(ValueType.LONG), + compaction_readahead_size(ValueType.LONG); + + private final ValueType valueType; + DBOption(final ValueType valueType) { + this.valueType = valueType; + } + + @Override + public ValueType getValueType() { + return valueType; + } + } + + public static class MutableDBOptionsBuilder + extends AbstractMutableOptionsBuilder + implements MutableDBOptionsInterface { + + private final static Map ALL_KEYS_LOOKUP = new HashMap<>(); + static { + for(final MutableDBOptionKey key : DBOption.values()) { + ALL_KEYS_LOOKUP.put(key.name(), key); + } + } + + private MutableDBOptionsBuilder() { + super(); + } + + @Override + protected MutableDBOptionsBuilder self() { + return this; + } + + @Override + protected Map allKeys() { + return ALL_KEYS_LOOKUP; + } + + @Override + protected MutableDBOptions build(final String[] keys, + final String[] values) { + return new MutableDBOptions(keys, values); + } + + @Override + public MutableDBOptionsBuilder setMaxBackgroundJobs( + final int maxBackgroundJobs) { + return 
setInt(DBOption.max_background_jobs, maxBackgroundJobs); + } + + @Override + public int maxBackgroundJobs() { + return getInt(DBOption.max_background_jobs); + } + + @Override + public void setBaseBackgroundCompactions( + final int baseBackgroundCompactions) { + setInt(DBOption.base_background_compactions, + baseBackgroundCompactions); + } + + @Override + public int baseBackgroundCompactions() { + return getInt(DBOption.base_background_compactions); + } + + @Override + public MutableDBOptionsBuilder setMaxBackgroundCompactions( + final int maxBackgroundCompactions) { + return setInt(DBOption.max_background_compactions, + maxBackgroundCompactions); + } + + @Override + public int maxBackgroundCompactions() { + return getInt(DBOption.max_background_compactions); + } + + @Override + public MutableDBOptionsBuilder setAvoidFlushDuringShutdown( + final boolean avoidFlushDuringShutdown) { + return setBoolean(DBOption.avoid_flush_during_shutdown, + avoidFlushDuringShutdown); + } + + @Override + public boolean avoidFlushDuringShutdown() { + return getBoolean(DBOption.avoid_flush_during_shutdown); + } + + @Override + public MutableDBOptionsBuilder setWritableFileMaxBufferSize( + final long writableFileMaxBufferSize) { + return setLong(DBOption.writable_file_max_buffer_size, + writableFileMaxBufferSize); + } + + @Override + public long writableFileMaxBufferSize() { + return getLong(DBOption.writable_file_max_buffer_size); + } + + @Override + public MutableDBOptionsBuilder setDelayedWriteRate( + final long delayedWriteRate) { + return setLong(DBOption.delayed_write_rate, + delayedWriteRate); + } + + @Override + public long delayedWriteRate() { + return getLong(DBOption.delayed_write_rate); + } + + @Override + public MutableDBOptionsBuilder setMaxTotalWalSize( + final long maxTotalWalSize) { + return setLong(DBOption.max_total_wal_size, maxTotalWalSize); + } + + @Override + public long maxTotalWalSize() { + return getLong(DBOption.max_total_wal_size); + } + + @Override + public MutableDBOptionsBuilder setDeleteObsoleteFilesPeriodMicros( + final long micros) { + return setLong(DBOption.delete_obsolete_files_period_micros, micros); + } + + @Override + public long deleteObsoleteFilesPeriodMicros() { + return getLong(DBOption.delete_obsolete_files_period_micros); + } + + @Override + public MutableDBOptionsBuilder setStatsDumpPeriodSec( + final int statsDumpPeriodSec) { + return setInt(DBOption.stats_dump_period_sec, statsDumpPeriodSec); + } + + @Override + public int statsDumpPeriodSec() { + return getInt(DBOption.stats_dump_period_sec); + } + + @Override + public MutableDBOptionsBuilder setMaxOpenFiles(final int maxOpenFiles) { + return setInt(DBOption.max_open_files, maxOpenFiles); + } + + @Override + public int maxOpenFiles() { + return getInt(DBOption.max_open_files); + } + + @Override + public MutableDBOptionsBuilder setBytesPerSync(final long bytesPerSync) { + return setLong(DBOption.bytes_per_sync, bytesPerSync); + } + + @Override + public long bytesPerSync() { + return getLong(DBOption.bytes_per_sync); + } + + @Override + public MutableDBOptionsBuilder setWalBytesPerSync( + final long walBytesPerSync) { + return setLong(DBOption.wal_bytes_per_sync, walBytesPerSync); + } + + @Override + public long walBytesPerSync() { + return getLong(DBOption.wal_bytes_per_sync); + } + + @Override + public MutableDBOptionsBuilder setCompactionReadaheadSize( + final long compactionReadaheadSize) { + return setLong(DBOption.compaction_readahead_size, + compactionReadaheadSize); + } + + @Override + public long 
compactionReadaheadSize() { + return getLong(DBOption.compaction_readahead_size); + } + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java new file mode 100644 index 0000000000..5fe3215b39 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java @@ -0,0 +1,336 @@ +package org.rocksdb; + +public interface MutableDBOptionsInterface { + + /** + * Specifies the maximum number of concurrent background jobs (both flushes + * and compactions combined). + * Default: 2 + * + * @param maxBackgroundJobs number of max concurrent background jobs + * @return the instance of the current object. + */ + T setMaxBackgroundJobs(int maxBackgroundJobs); + + /** + * Returns the maximum number of concurrent background jobs (both flushes + * and compactions combined). + * Default: 2 + * + * @return the maximum number of concurrent background jobs. + */ + int maxBackgroundJobs(); + + /** + * Suggested number of concurrent background compaction jobs, submitted to + * the default LOW priority thread pool. + * Default: 1 + * + * @param baseBackgroundCompactions Suggested number of background compaction + * jobs + * + * @deprecated Use {@link #setMaxBackgroundJobs(int)} + */ + @Deprecated + void setBaseBackgroundCompactions(int baseBackgroundCompactions); + + /** + * Suggested number of concurrent background compaction jobs, submitted to + * the default LOW priority thread pool. + * Default: 1 + * + * @return Suggested number of background compaction jobs + */ + int baseBackgroundCompactions(); + + /** + * Specifies the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * If you're increasing this, also consider increasing number of threads in + * LOW priority thread pool. For more information, see + * Default: 1 + * + * @param maxBackgroundCompactions the maximum number of background + * compaction jobs. + * @return the instance of the current object. + * + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, Priority) + * @see DBOptionsInterface#maxBackgroundFlushes() + */ + T setMaxBackgroundCompactions(int maxBackgroundCompactions); + + /** + * Returns the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * When increasing this number, we may also want to consider increasing + * number of threads in LOW priority thread pool. + * Default: 1 + * + * @return the maximum number of concurrent background compaction jobs. + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, Priority) + * + * @deprecated Use {@link #setMaxBackgroundJobs(int)} + */ + @Deprecated + int maxBackgroundCompactions(); + + /** + * By default RocksDB will flush all memtables on DB close if there are + * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup + * DB close. Unpersisted data WILL BE LOST. + * + * DEFAULT: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} + * API. + * + * @param avoidFlushDuringShutdown true if we should avoid flush during + * shutdown + * + * @return the reference to the current options. 
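A short sketch of how the new MutableDBOptions type is meant to be used (illustration only; assumes the public build() from the AbstractMutableOptionsBuilder base class, with arbitrary values):

    // Fluent builder form:
    MutableDBOptions viaBuilder = MutableDBOptions.builder()
        .setMaxBackgroundJobs(4)
        .setBytesPerSync(1024 * 1024)
        .build();

    // Equivalent string form, in the key1=value1;key2=value2 format accepted by parse():
    MutableDBOptions viaParse =
        MutableDBOptions.parse("max_background_jobs=4;bytes_per_sync=1048576").build();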
+ */ + T setAvoidFlushDuringShutdown(boolean avoidFlushDuringShutdown); + + /** + * By default RocksDB will flush all memtables on DB close if there are + * unpersisted data (i.e. with WAL disabled) The flush can be skip to speedup + * DB close. Unpersisted data WILL BE LOST. + * + * DEFAULT: false + * + * Dynamically changeable through + * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} + * API. + * + * @return true if we should avoid flush during shutdown + */ + boolean avoidFlushDuringShutdown(); + + /** + * This is the maximum buffer size that is used by WritableFileWriter. + * On Windows, we need to maintain an aligned buffer for writes. + * We allow the buffer to grow until it's size hits the limit. + * + * Default: 1024 * 1024 (1 MB) + * + * @param writableFileMaxBufferSize the maximum buffer size + * + * @return the reference to the current options. + */ + T setWritableFileMaxBufferSize(long writableFileMaxBufferSize); + + /** + * This is the maximum buffer size that is used by WritableFileWriter. + * On Windows, we need to maintain an aligned buffer for writes. + * We allow the buffer to grow until it's size hits the limit. + * + * Default: 1024 * 1024 (1 MB) + * + * @return the maximum buffer size + */ + long writableFileMaxBufferSize(); + + /** + * The limited write rate to DB if + * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or + * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered, + * or we are writing to the last mem table allowed and we allow more than 3 + * mem tables. It is calculated using size of user write requests before + * compression. RocksDB may decide to slow down more if the compaction still + * gets behind further. + * + * Unit: bytes per second. + * + * Default: 16MB/s + * + * @param delayedWriteRate the rate in bytes per second + * + * @return the reference to the current options. + */ + T setDelayedWriteRate(long delayedWriteRate); + + /** + * The limited write rate to DB if + * {@link ColumnFamilyOptions#softPendingCompactionBytesLimit()} or + * {@link ColumnFamilyOptions#level0SlowdownWritesTrigger()} is triggered, + * or we are writing to the last mem table allowed and we allow more than 3 + * mem tables. It is calculated using size of user write requests before + * compression. RocksDB may decide to slow down more if the compaction still + * gets behind further. + * + * Unit: bytes per second. + * + * Default: 16MB/s + * + * @return the rate in bytes per second + */ + long delayedWriteRate(); + + /** + *
Once write-ahead logs exceed this size, we will start forcing the + * flush of column families whose memtables are backed by the oldest live + * WAL file (i.e. the ones that are causing all the space amplification). + * + * If set to 0 (default), we will dynamically choose the WAL size limit to + * be [sum of all write_buffer_size * max_write_buffer_number] * 2. + * + * This option takes effect only when there is more than one column family, as + * otherwise the WAL size is dictated by the write_buffer_size. + * + * Default: 0 + * + * @param maxTotalWalSize max total wal size. + * @return the instance of the current object. + */ + T setMaxTotalWalSize(long maxTotalWalSize); + + /** + * Returns the max total wal size. Once write-ahead logs exceed this size, + * we will start forcing the flush of column families whose memtables are + * backed by the oldest live WAL file (i.e. the ones that are causing all + * the space amplification). + * + * If set to 0 (default), we will dynamically choose the WAL size limit + * to be [sum of all write_buffer_size * max_write_buffer_number] * 2
    + * + * @return max total wal size + */ + long maxTotalWalSize(); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @param micros the time interval in micros + * @return the instance of the current object. + */ + T setDeleteObsoleteFilesPeriodMicros(long micros); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @return the time interval in micros when obsolete files will be deleted. + */ + long deleteObsoleteFilesPeriodMicros(); + + /** + * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 600 (10 minutes) + * + * @param statsDumpPeriodSec time interval in seconds. + * @return the instance of the current object. + */ + T setStatsDumpPeriodSec(int statsDumpPeriodSec); + + /** + * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 600 (10 minutes) + * + * @return time interval in seconds. + */ + int statsDumpPeriodSec(); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * Default: 5000 + * + * @param maxOpenFiles the maximum number of open files. + * @return the instance of the current object. + */ + T setMaxOpenFiles(int maxOpenFiles); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * + * @return the maximum number of open files. + */ + int maxOpenFiles(); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @param bytesPerSync size in bytes + * @return the instance of the current object. + */ + T setBytesPerSync(long bytesPerSync); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @return size in bytes + */ + long bytesPerSync(); + + /** + * Same as {@link #setBytesPerSync(long)} , but applies to WAL files + * + * Default: 0, turned off + * + * @param walBytesPerSync size in bytes + * @return the instance of the current object. + */ + T setWalBytesPerSync(long walBytesPerSync); + + /** + * Same as {@link #bytesPerSync()} , but applies to WAL files + * + * Default: 0, turned off + * + * @return size in bytes + */ + long walBytesPerSync(); + + + /** + * If non-zero, we perform bigger reads when doing compaction. If you're + * running RocksDB on spinning disks, you should set this to at least 2MB. 
+ * + * That way RocksDB's compaction is doing sequential instead of random reads. + * When non-zero, we also force + * {@link DBOptionsInterface#newTableReaderForCompactionInputs()} to true. + * + * Default: 0 + * + * @param compactionReadaheadSize The compaction read-ahead size + * + * @return the reference to the current options. + */ + T setCompactionReadaheadSize(final long compactionReadaheadSize); + + /** + * If non-zero, we perform bigger reads when doing compaction. If you're + * running RocksDB on spinning disks, you should set this to at least 2MB. + * + * That way RocksDB's compaction is doing sequential instead of random reads. + * When non-zero, we also force + * {@link DBOptionsInterface#newTableReaderForCompactionInputs()} to true. + * + * Default: 0 + * + * @return The compaction read-ahead size + */ + long compactionReadaheadSize(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java new file mode 100644 index 0000000000..7402471ff2 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionKey.java @@ -0,0 +1,15 @@ +package org.rocksdb; + +public interface MutableOptionKey { + enum ValueType { + DOUBLE, + LONG, + INT, + BOOLEAN, + INT_ARRAY, + ENUM + } + + String name(); + ValueType getValueType(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java new file mode 100644 index 0000000000..3727f7c1f2 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/MutableOptionValue.java @@ -0,0 +1,375 @@ +package org.rocksdb; + +import static org.rocksdb.AbstractMutableOptions.INT_ARRAY_INT_SEPARATOR; + +public abstract class MutableOptionValue { + + abstract double asDouble() throws NumberFormatException; + abstract long asLong() throws NumberFormatException; + abstract int asInt() throws NumberFormatException; + abstract boolean asBoolean() throws IllegalStateException; + abstract int[] asIntArray() throws IllegalStateException; + abstract String asString(); + abstract T asObject(); + + private static abstract class MutableOptionValueObject + extends MutableOptionValue { + protected final T value; + + private MutableOptionValueObject(final T value) { + this.value = value; + } + + @Override T asObject() { + return value; + } + } + + static MutableOptionValue fromString(final String s) { + return new MutableOptionStringValue(s); + } + + static MutableOptionValue fromDouble(final double d) { + return new MutableOptionDoubleValue(d); + } + + static MutableOptionValue fromLong(final long d) { + return new MutableOptionLongValue(d); + } + + static MutableOptionValue fromInt(final int i) { + return new MutableOptionIntValue(i); + } + + static MutableOptionValue fromBoolean(final boolean b) { + return new MutableOptionBooleanValue(b); + } + + static MutableOptionValue fromIntArray(final int[] ix) { + return new MutableOptionIntArrayValue(ix); + } + + static > MutableOptionValue fromEnum(final N value) { + return new MutableOptionEnumValue<>(value); + } + + static class MutableOptionStringValue + extends MutableOptionValueObject { + MutableOptionStringValue(final String value) { + super(value); + } + + @Override + double asDouble() throws NumberFormatException { + return Double.parseDouble(value); + } + + @Override + long asLong() throws NumberFormatException { + return Long.parseLong(value); + } + + @Override + int asInt() throws 
NumberFormatException { + return Integer.parseInt(value); + } + + @Override + boolean asBoolean() throws IllegalStateException { + return Boolean.parseBoolean(value); + } + + @Override + int[] asIntArray() throws IllegalStateException { + throw new IllegalStateException("String is not applicable as int[]"); + } + + @Override + String asString() { + return value; + } + } + + static class MutableOptionDoubleValue + extends MutableOptionValue { + private final double value; + MutableOptionDoubleValue(final double value) { + this.value = value; + } + + @Override + double asDouble() { + return value; + } + + @Override + long asLong() throws NumberFormatException { + return Double.valueOf(value).longValue(); + } + + @Override + int asInt() throws NumberFormatException { + if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { + throw new NumberFormatException( + "double value lies outside the bounds of int"); + } + return Double.valueOf(value).intValue(); + } + + @Override + boolean asBoolean() throws IllegalStateException { + throw new IllegalStateException( + "double is not applicable as boolean"); + } + + @Override + int[] asIntArray() throws IllegalStateException { + if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { + throw new NumberFormatException( + "double value lies outside the bounds of int"); + } + return new int[] { Double.valueOf(value).intValue() }; + } + + @Override + String asString() { + return String.valueOf(value); + } + + @Override + Double asObject() { + return value; + } + } + + static class MutableOptionLongValue + extends MutableOptionValue { + private final long value; + + MutableOptionLongValue(final long value) { + this.value = value; + } + + @Override + double asDouble() { + if(value > Double.MAX_VALUE || value < Double.MIN_VALUE) { + throw new NumberFormatException( + "long value lies outside the bounds of int"); + } + return Long.valueOf(value).doubleValue(); + } + + @Override + long asLong() throws NumberFormatException { + return value; + } + + @Override + int asInt() throws NumberFormatException { + if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { + throw new NumberFormatException( + "long value lies outside the bounds of int"); + } + return Long.valueOf(value).intValue(); + } + + @Override + boolean asBoolean() throws IllegalStateException { + throw new IllegalStateException( + "long is not applicable as boolean"); + } + + @Override + int[] asIntArray() throws IllegalStateException { + if(value > Integer.MAX_VALUE || value < Integer.MIN_VALUE) { + throw new NumberFormatException( + "long value lies outside the bounds of int"); + } + return new int[] { Long.valueOf(value).intValue() }; + } + + @Override + String asString() { + return String.valueOf(value); + } + + @Override + Long asObject() { + return value; + } + } + + static class MutableOptionIntValue + extends MutableOptionValue { + private final int value; + + MutableOptionIntValue(final int value) { + this.value = value; + } + + @Override + double asDouble() { + if(value > Double.MAX_VALUE || value < Double.MIN_VALUE) { + throw new NumberFormatException("int value lies outside the bounds of int"); + } + return Integer.valueOf(value).doubleValue(); + } + + @Override + long asLong() throws NumberFormatException { + return value; + } + + @Override + int asInt() throws NumberFormatException { + return value; + } + + @Override + boolean asBoolean() throws IllegalStateException { + throw new IllegalStateException("int is not applicable as boolean"); + } + + @Override + int[] 
asIntArray() throws IllegalStateException { + return new int[] { value }; + } + + @Override + String asString() { + return String.valueOf(value); + } + + @Override + Integer asObject() { + return value; + } + } + + static class MutableOptionBooleanValue + extends MutableOptionValue<Boolean> { + private final boolean value; + + MutableOptionBooleanValue(final boolean value) { + this.value = value; + } + + @Override + double asDouble() { + throw new NumberFormatException("boolean is not applicable as double"); + } + + @Override + long asLong() throws NumberFormatException { + throw new NumberFormatException("boolean is not applicable as Long"); + } + + @Override + int asInt() throws NumberFormatException { + throw new NumberFormatException("boolean is not applicable as int"); + } + + @Override + boolean asBoolean() { + return value; + } + + @Override + int[] asIntArray() throws IllegalStateException { + throw new IllegalStateException("boolean is not applicable as int[]"); + } + + @Override + String asString() { + return String.valueOf(value); + } + + @Override + Boolean asObject() { + return value; + } + } + + static class MutableOptionIntArrayValue + extends MutableOptionValueObject<int[]> { + MutableOptionIntArrayValue(final int[] value) { + super(value); + } + + @Override + double asDouble() { + throw new NumberFormatException("int[] is not applicable as double"); + } + + @Override + long asLong() throws NumberFormatException { + throw new NumberFormatException("int[] is not applicable as Long"); + } + + @Override + int asInt() throws NumberFormatException { + throw new NumberFormatException("int[] is not applicable as int"); + } + + @Override + boolean asBoolean() { + throw new NumberFormatException("int[] is not applicable as boolean"); + } + + @Override + int[] asIntArray() throws IllegalStateException { + return value; + } + + @Override + String asString() { + final StringBuilder builder = new StringBuilder(); + for(int i = 0; i < value.length; i++) { + builder.append(value[i]); + if(i + 1 < value.length) { + builder.append(INT_ARRAY_INT_SEPARATOR); + } + } + return builder.toString(); + } + } + + static class MutableOptionEnumValue<T extends Enum<T>> + extends MutableOptionValueObject<T> { + + MutableOptionEnumValue(final T value) { + super(value); + } + + @Override + double asDouble() throws NumberFormatException { + throw new NumberFormatException("Enum is not applicable as double"); + } + + @Override + long asLong() throws NumberFormatException { + throw new NumberFormatException("Enum is not applicable as long"); + } + + @Override + int asInt() throws NumberFormatException { + throw new NumberFormatException("Enum is not applicable as int"); + } + + @Override + boolean asBoolean() throws IllegalStateException { + throw new NumberFormatException("Enum is not applicable as boolean"); + } + + @Override + int[] asIntArray() throws IllegalStateException { + throw new NumberFormatException("Enum is not applicable as int[]"); + } + + @Override + String asString() { + return value.name(); + } + } + +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java new file mode 100644 index 0000000000..28a427aaa7 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java @@ -0,0 +1,57 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * A simple abstraction to allow a Java class to wrap a custom comparator + * implemented in C++. + * + * The native comparator must directly extend rocksdb::Comparator. + */ +public abstract class NativeComparatorWrapper + extends AbstractComparator { + + @Override + final ComparatorType getComparatorType() { + return ComparatorType.JAVA_NATIVE_COMPARATOR_WRAPPER; + } + + @Override + public final String name() { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final int compare(final Slice s1, final Slice s2) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final String findShortestSeparator(final String start, final Slice limit) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + @Override + public final String findShortSuccessor(final String key) { + throw new IllegalStateException("This should not be called. " + + "Implementation is in Native code"); + } + + /** + * We override {@link RocksCallbackObject#disposeInternal()} + * as disposing of a native rocksd::Comparator extension requires + * a slightly different approach as it is not really a RocksCallbackObject + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java new file mode 100644 index 0000000000..6ac0a15a24 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationStage.java @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The operation stage. + */ +public enum OperationStage { + STAGE_UNKNOWN((byte)0x0), + STAGE_FLUSH_RUN((byte)0x1), + STAGE_FLUSH_WRITE_L0((byte)0x2), + STAGE_COMPACTION_PREPARE((byte)0x3), + STAGE_COMPACTION_RUN((byte)0x4), + STAGE_COMPACTION_PROCESS_KV((byte)0x5), + STAGE_COMPACTION_INSTALL((byte)0x6), + STAGE_COMPACTION_SYNC_FILE((byte)0x7), + STAGE_PICK_MEMTABLES_TO_FLUSH((byte)0x8), + STAGE_MEMTABLE_ROLLBACK((byte)0x9), + STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS((byte)0xA); + + private final byte value; + + OperationStage(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value. + */ + byte getValue() { + return value; + } + + /** + * Get the Operation stage from the internal representation value. + * + * @param value the internal representation value. 
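The getValue()/fromValue() pair above round-trips the enum through its byte encoding for the native side; a small sketch (both methods are package-private, so this only works inside org.rocksdb, and OperationType follows the same pattern):

    byte raw = OperationStage.STAGE_COMPACTION_RUN.getValue();   // (byte)0x4
    OperationStage stage = OperationStage.fromValue(raw);        // back to the enum constant
    assert stage == OperationStage.STAGE_COMPACTION_RUN;         // unknown bytes throw IllegalArgumentException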
+ * + * @return the operation stage + * + * @throws IllegalArgumentException if the value does not match + * an OperationStage + */ + static OperationStage fromValue(final byte value) + throws IllegalArgumentException { + for (final OperationStage threadType : OperationStage.values()) { + if (threadType.value == value) { + return threadType; + } + } + throw new IllegalArgumentException( + "Unknown value for OperationStage: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationType.java new file mode 100644 index 0000000000..7cc9b65cdf --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OperationType.java @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The type used to refer to a thread operation. + * + * A thread operation describes high-level action of a thread, + * examples include compaction and flush. + */ +public enum OperationType { + OP_UNKNOWN((byte)0x0), + OP_COMPACTION((byte)0x1), + OP_FLUSH((byte)0x2); + + private final byte value; + + OperationType(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value. + */ + byte getValue() { + return value; + } + + /** + * Get the Operation type from the internal representation value. + * + * @param value the internal representation value. + * + * @return the operation type + * + * @throws IllegalArgumentException if the value does not match + * an OperationType + */ + static OperationType fromValue(final byte value) + throws IllegalArgumentException { + for (final OperationType threadType : OperationType.values()) { + if (threadType.value == value) { + return threadType; + } + } + throw new IllegalArgumentException( + "Unknown value for OperationType: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java new file mode 100644 index 0000000000..267cab1dec --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java @@ -0,0 +1,226 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.List; + +/** + * Database with Transaction support. + */ +public class OptimisticTransactionDB extends RocksDB + implements TransactionalDB { + + /** + * Private constructor. + * + * @param nativeHandle The native handle of the C++ OptimisticTransactionDB + * object + */ + private OptimisticTransactionDB(final long nativeHandle) { + super(nativeHandle); + } + + /** + * Open an OptimisticTransactionDB similar to + * {@link RocksDB#open(Options, String)}. + * + * @param options {@link org.rocksdb.Options} instance. + * @param path the path to the rocksdb. + * + * @return a {@link OptimisticTransactionDB} instance on success, null if the + * specified {@link OptimisticTransactionDB} can not be opened. 
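An end-to-end sketch of the optimistic transaction API introduced here (illustration only; Transaction#put and Transaction#commit come from the wider RocksJava Transaction class, which is not part of this hunk, and the path and keys are placeholders):

    try (final Options options = new Options().setCreateIfMissing(true);
         final OptimisticTransactionDB otdb =
             OptimisticTransactionDB.open(options, "/tmp/otdb-example");
         final WriteOptions writeOptions = new WriteOptions()) {
      final Transaction txn = otdb.beginTransaction(writeOptions);
      txn.put("key".getBytes(), "value".getBytes());
      txn.commit();   // optimistic: write conflicts surface as a RocksDBException here
    }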
+ * + * @throws RocksDBException if an error occurs whilst opening the database. + */ + public static OptimisticTransactionDB open(final Options options, + final String path) throws RocksDBException { + final OptimisticTransactionDB otdb = new OptimisticTransactionDB(open( + options.nativeHandle_, path)); + + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + otdb.storeOptionsInstance(options); + + return otdb; + } + + /** + * Open an OptimisticTransactionDB similar to + * {@link RocksDB#open(DBOptions, String, List, List)}. + * + * @param dbOptions {@link org.rocksdb.DBOptions} instance. + * @param path the path to the rocksdb. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * + * @return a {@link OptimisticTransactionDB} instance on success, null if the + * specified {@link OptimisticTransactionDB} can not be opened. + * + * @throws RocksDBException if an error occurs whilst opening the database. + */ + public static OptimisticTransactionDB open(final DBOptions dbOptions, + final String path, + final List columnFamilyDescriptors, + final List columnFamilyHandles) + throws RocksDBException { + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors + .get(i); + cfNames[i] = cfDescriptor.columnFamilyName(); + cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; + } + + final long[] handles = open(dbOptions.nativeHandle_, path, cfNames, + cfOptionHandles); + final OptimisticTransactionDB otdb = + new OptimisticTransactionDB(handles[0]); + + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + otdb.storeOptionsInstance(dbOptions); + + for (int i = 1; i < handles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(otdb, handles[i])); + } + + return otdb; + } + + + /** + * This is similar to {@link #close()} except that it + * throws an exception if any error occurs. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + * + * @throws RocksDBException if an error occurs whilst closing. + */ + public void closeE() throws RocksDBException { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } finally { + disposeInternal(); + } + } + } + + /** + * This is similar to {@link #closeE()} except that it + * silently ignores any errors. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. 
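Where close() below swallows any error from the native side, closeE() propagates it; a minimal sketch assuming a database opened as above:

    final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options, path);
    try {
      // ... reads and writes ...
    } finally {
      otdb.closeE();   // unlike close(), throws RocksDBException if closing fails
    }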
+ */ + @Override + public void close() { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } catch (final RocksDBException e) { + // silently ignore the error report + } finally { + disposeInternal(); + } + } + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_)); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final OptimisticTransactionOptions optimisticTransactionOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_, + optimisticTransactionOptions.nativeHandle_)); + } + + // TODO(AR) consider having beingTransaction(... oldTransaction) set a + // reference count inside Transaction, so that we can always call + // Transaction#close but the object is only disposed when there are as many + // closes as beginTransaction. Makes the try-with-resources paradigm easier for + // java developers + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final Transaction oldTransaction) { + final long jtxn_handle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxn_handle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final OptimisticTransactionOptions optimisticTransactionOptions, + final Transaction oldTransaction) { + final long jtxn_handle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, optimisticTransactionOptions.nativeHandle_, + oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxn_handle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + /** + * Get the underlying database that was opened. + * + * @return The underlying database that was opened. 
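The oldTransaction overloads above hand back the same Java object rather than allocating a new Transaction; a brief sketch:

    Transaction txn = otdb.beginTransaction(writeOptions);
    // ... commit or roll back txn ...

    // Re-arm the same object for the next transaction instead of allocating a new one:
    txn = otdb.beginTransaction(writeOptions, txn);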
+ */ + public RocksDB getBaseDB() { + final RocksDB db = new RocksDB(getBaseDB(nativeHandle_)); + db.disOwnNativeHandle(); + return db; + } + + @Override protected final native void disposeInternal(final long handle); + + protected static native long open(final long optionsHandle, + final String path) throws RocksDBException; + protected static native long[] open(final long handle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions); + private native static void closeDatabase(final long handle) + throws RocksDBException; + private native long beginTransaction(final long handle, + final long writeOptionsHandle); + private native long beginTransaction(final long handle, + final long writeOptionsHandle, + final long optimisticTransactionOptionsHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, final long oldTransactionHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, + final long optimisticTransactionOptionsHandle, + final long oldTransactionHandle); + private native long getBaseDB(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java new file mode 100644 index 0000000000..650ee22550 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class OptimisticTransactionOptions extends RocksObject + implements TransactionalOptions { + + public OptimisticTransactionOptions() { + super(newOptimisticTransactionOptions()); + } + + @Override + public boolean isSetSnapshot() { + assert(isOwningHandle()); + return isSetSnapshot(nativeHandle_); + } + + @Override + public OptimisticTransactionOptions setSetSnapshot( + final boolean setSnapshot) { + assert(isOwningHandle()); + setSetSnapshot(nativeHandle_, setSnapshot); + return this; + } + + /** + * Should be set if the DB has a non-default comparator. + * See comment in + * {@link WriteBatchWithIndex#WriteBatchWithIndex(AbstractComparator, int, boolean)} + * constructor. + * + * @param comparator The comparator to use for the transaction. 
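A sketch tying the snapshot option above to transaction creation (illustration only; otdb and writeOptions as in the earlier sketches):

    try (final OptimisticTransactionOptions txnOptions =
             new OptimisticTransactionOptions().setSetSnapshot(true)) {
      final Transaction txn = otdb.beginTransaction(writeOptions, txnOptions);
      // the transaction now operates against the snapshot taken at creation
    }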
+ * + * @return this OptimisticTransactionOptions instance + */ + public OptimisticTransactionOptions setComparator( + final AbstractComparator> comparator) { + assert(isOwningHandle()); + setComparator(nativeHandle_, comparator.nativeHandle_); + return this; + } + + private native static long newOptimisticTransactionOptions(); + private native boolean isSetSnapshot(final long handle); + private native void setSetSnapshot(final long handle, + final boolean setSnapshot); + private native void setComparator(final long handle, + final long comparatorHandle); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Options.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Options.java index dcd1138a8a..5831b1e298 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Options.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Options.java @@ -19,7 +19,9 @@ * automaticallyand native resources will be released as part of the process. */ public class Options extends RocksObject - implements DBOptionsInterface, ColumnFamilyOptionsInterface, + implements DBOptionsInterface, + MutableDBOptionsInterface, + ColumnFamilyOptionsInterface, MutableColumnFamilyOptionsInterface { static { RocksDB.loadLibrary(); @@ -51,6 +53,30 @@ public Options(final DBOptions dbOptions, env_ = Env.getDefault(); } + /** + * Copy constructor for ColumnFamilyOptions. + * + * NOTE: This does a shallow copy, which means comparator, merge_operator + * and other pointers will be cloned! + * + * @param other The Options to copy. + */ + public Options(Options other) { + super(copyOptions(other.nativeHandle_)); + this.env_ = other.env_; + this.memTableConfig_ = other.memTableConfig_; + this.tableFormatConfig_ = other.tableFormatConfig_; + this.rateLimiter_ = other.rateLimiter_; + this.comparator_ = other.comparator_; + this.compactionFilter_ = other.compactionFilter_; + this.compactionFilterFactory_ = other.compactionFilterFactory_; + this.compactionOptionsUniversal_ = other.compactionOptionsUniversal_; + this.compactionOptionsFIFO_ = other.compactionOptionsFIFO_; + this.compressionOptions_ = other.compressionOptions_; + this.rowCache_ = other.rowCache_; + this.writeBufferManager_ = other.writeBufferManager_; + } + @Override public Options setIncreaseParallelism(final int totalThreads) { assert(isOwningHandle()); @@ -169,7 +195,8 @@ public Options setComparator(final BuiltinComparator builtinComparator) { public Options setComparator( final AbstractComparator> comparator) { assert(isOwningHandle()); - setComparatorHandle(nativeHandle_, comparator.getNativeHandle()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_, + comparator.getComparatorType().getValue()); comparator_ = comparator; return this; } @@ -191,6 +218,35 @@ public Options setMergeOperator(final MergeOperator mergeOperator) { return this; } + @Override + public Options setCompactionFilter( + final AbstractCompactionFilter> + compactionFilter) { + setCompactionFilterHandle(nativeHandle_, compactionFilter.nativeHandle_); + compactionFilter_ = compactionFilter; + return this; + } + + @Override + public AbstractCompactionFilter> compactionFilter() { + assert (isOwningHandle()); + return compactionFilter_; + } + + @Override + public Options setCompactionFilterFactory(final AbstractCompactionFilterFactory> compactionFilterFactory) { + assert (isOwningHandle()); + setCompactionFilterFactoryHandle(nativeHandle_, compactionFilterFactory.nativeHandle_); + 
compactionFilterFactory_ = compactionFilterFactory; + return this; + } + + @Override + public AbstractCompactionFilterFactory> compactionFilterFactory() { + assert (isOwningHandle()); + return compactionFilterFactory_; + } + @Override public Options setWriteBufferSize(final long writeBufferSize) { assert(isOwningHandle()); @@ -418,9 +474,10 @@ public Options setMaxBackgroundCompactions( } @Override - public void setMaxSubcompactions(final int maxSubcompactions) { + public Options setMaxSubcompactions(final int maxSubcompactions) { assert(isOwningHandle()); setMaxSubcompactions(nativeHandle_, maxSubcompactions); + return this; } @Override @@ -443,6 +500,19 @@ public Options setMaxBackgroundFlushes( return this; } + @Override + public int maxBackgroundJobs() { + assert(isOwningHandle()); + return maxBackgroundJobs(nativeHandle_); + } + + @Override + public Options setMaxBackgroundJobs(final int maxBackgroundJobs) { + assert(isOwningHandle()); + setMaxBackgroundJobs(nativeHandle_, maxBackgroundJobs); + return this; + } + @Override public long maxLogFileSize() { assert(isOwningHandle()); @@ -689,6 +759,20 @@ public Options setDbWriteBufferSize(final long dbWriteBufferSize) { } @Override + public Options setWriteBufferManager(final WriteBufferManager writeBufferManager) { + assert(isOwningHandle()); + setWriteBufferManager(nativeHandle_, writeBufferManager.nativeHandle_); + this.writeBufferManager_ = writeBufferManager; + return this; + } + + @Override + public WriteBufferManager writeBufferManager() { + assert(isOwningHandle()); + return this.writeBufferManager_; + } + + @Override public long dbWriteBufferSize() { assert(isOwningHandle()); return dbWriteBufferSize(nativeHandle_); @@ -824,6 +908,17 @@ public long delayedWriteRate(){ return delayedWriteRate(nativeHandle_); } + @Override + public Options setEnablePipelinedWrite(final boolean enablePipelinedWrite) { + setEnablePipelinedWrite(nativeHandle_, enablePipelinedWrite); + return this; + } + + @Override + public boolean enablePipelinedWrite() { + return enablePipelinedWrite(nativeHandle_); + } + @Override public Options setAllowConcurrentMemtableWrite( final boolean allowConcurrentMemtableWrite) { @@ -925,6 +1020,20 @@ public Cache rowCache() { return this.rowCache_; } + @Override + public Options setWalFilter(final AbstractWalFilter walFilter) { + assert(isOwningHandle()); + setWalFilter(nativeHandle_, walFilter.nativeHandle_); + this.walFilter_ = walFilter; + return this; + } + + @Override + public WalFilter walFilter() { + assert(isOwningHandle()); + return this.walFilter_; + } + @Override public Options setFailIfOptionsFileError(final boolean failIfOptionsFileError) { assert(isOwningHandle()); @@ -977,6 +1086,58 @@ public boolean avoidFlushDuringShutdown() { return avoidFlushDuringShutdown(nativeHandle_); } + @Override + public Options setAllowIngestBehind(final boolean allowIngestBehind) { + assert(isOwningHandle()); + setAllowIngestBehind(nativeHandle_, allowIngestBehind); + return this; + } + + @Override + public boolean allowIngestBehind() { + assert(isOwningHandle()); + return allowIngestBehind(nativeHandle_); + } + + @Override + public Options setPreserveDeletes(final boolean preserveDeletes) { + assert(isOwningHandle()); + setPreserveDeletes(nativeHandle_, preserveDeletes); + return this; + } + + @Override + public boolean preserveDeletes() { + assert(isOwningHandle()); + return preserveDeletes(nativeHandle_); + } + + @Override + public Options setTwoWriteQueues(final boolean twoWriteQueues) { + assert(isOwningHandle()); + 
setTwoWriteQueues(nativeHandle_, twoWriteQueues); + return this; + } + + @Override + public boolean twoWriteQueues() { + assert(isOwningHandle()); + return twoWriteQueues(nativeHandle_); + } + + @Override + public Options setManualWalFlush(final boolean manualWalFlush) { + assert(isOwningHandle()); + setManualWalFlush(nativeHandle_, manualWalFlush); + return this; + } + + @Override + public boolean manualWalFlush() { + assert(isOwningHandle()); + return manualWalFlush(nativeHandle_); + } + @Override public MemTableConfig memTableConfig() { return this.memTableConfig_; @@ -997,6 +1158,13 @@ public Options setRateLimiter(final RateLimiter rateLimiter) { return this; } + @Override + public Options setSstFileManager(final SstFileManager sstFileManager) { + assert(isOwningHandle()); + setSstFileManager(nativeHandle_, sstFileManager.nativeHandle_); + return this; + } + @Override public Options setLogger(final Logger logger) { assert(isOwningHandle()); @@ -1106,6 +1274,20 @@ public CompressionType bottommostCompressionType() { bottommostCompressionType(nativeHandle_)); } + @Override + public Options setBottommostCompressionOptions( + final CompressionOptions bottommostCompressionOptions) { + setBottommostCompressionOptions(nativeHandle_, + bottommostCompressionOptions.nativeHandle_); + this.bottommostCompressionOptions_ = bottommostCompressionOptions; + return this; + } + + @Override + public CompressionOptions bottommostCompressionOptions() { + return this.bottommostCompressionOptions_; + } + @Override public Options setCompressionOptions( final CompressionOptions compressionOptions) { @@ -1121,7 +1303,7 @@ public CompressionOptions compressionOptions() { @Override public CompactionStyle compactionStyle() { - return CompactionStyle.values()[compactionStyle(nativeHandle_)]; + return CompactionStyle.fromValue(compactionStyle(nativeHandle_)); } @Override @@ -1493,6 +1675,17 @@ public boolean reportBgIoStats() { return reportBgIoStats(nativeHandle_); } + @Override + public Options setTtl(final long ttl) { + setTtl(nativeHandle_, ttl); + return this; + } + + @Override + public long ttl() { + return ttl(nativeHandle_); + } + @Override public Options setCompactionOptionsUniversal( final CompactionOptionsUniversal compactionOptionsUniversal) { @@ -1531,9 +1724,21 @@ public boolean forceConsistencyChecks() { return forceConsistencyChecks(nativeHandle_); } + @Override + public Options setAtomicFlush(final boolean atomicFlush) { + setAtomicFlush(nativeHandle_, atomicFlush); + return this; + } + + @Override + public boolean atomicFlush() { + return atomicFlush(nativeHandle_); + } + private native static long newOptions(); private native static long newOptions(long dbOptHandle, long cfOptHandle); + private native static long copyOptions(long handle); @Override protected final native void disposeInternal(final long handle); private native void setEnv(long optHandle, long envHandle); private native void prepareForBulkLoad(long handle); @@ -1552,6 +1757,8 @@ private native void setParanoidChecks( private native boolean paranoidChecks(long handle); private native void setRateLimiter(long handle, long rateLimiterHandle); + private native void setSstFileManager(final long handle, + final long sstFileManagerHandle); private native void setLogger(long handle, long loggerHandle); private native void setInfoLogLevel(long handle, byte logLevel); @@ -1591,6 +1798,8 @@ private native void setMaxBackgroundCompactions( private native void setMaxBackgroundFlushes( long handle, int maxBackgroundFlushes); private native int 
maxBackgroundFlushes(long handle); + private native void setMaxBackgroundJobs(long handle, int maxMaxBackgroundJobs); + private native int maxBackgroundJobs(long handle); private native void setMaxLogFileSize(long handle, long maxLogFileSize) throws IllegalArgumentException; private native long maxLogFileSize(long handle); @@ -1643,6 +1852,8 @@ private native void setAdviseRandomOnOpen( private native boolean adviseRandomOnOpen(long handle); private native void setDbWriteBufferSize(final long handle, final long dbWriteBufferSize); + private native void setWriteBufferManager(final long handle, + final long writeBufferManagerHandle); private native long dbWriteBufferSize(final long handle); private native void setAccessHintOnCompactionStart(final long handle, final byte accessHintOnCompactionStart); @@ -1672,6 +1883,9 @@ private native void setEnableThreadTracking(long handle, private native boolean enableThreadTracking(long handle); private native void setDelayedWriteRate(long handle, long delayedWriteRate); private native long delayedWriteRate(long handle); + private native void setEnablePipelinedWrite(final long handle, + final boolean pipelinedWrite); + private native boolean enablePipelinedWrite(final long handle); private native void setAllowConcurrentMemtableWrite(long handle, boolean allowConcurrentMemtableWrite); private native boolean allowConcurrentMemtableWrite(long handle); @@ -1694,7 +1908,9 @@ private native void setAllow2pc(final long handle, final boolean allow2pc); private native boolean allow2pc(final long handle); private native void setRowCache(final long handle, - final long row_cache_handle); + final long rowCacheHandle); + private native void setWalFilter(final long handle, + final long walFilterHandle); private native void setFailIfOptionsFileError(final long handle, final boolean failIfOptionsFileError); private native boolean failIfOptionsFileError(final long handle); @@ -1707,6 +1923,19 @@ private native void setAvoidFlushDuringRecovery(final long handle, private native void setAvoidFlushDuringShutdown(final long handle, final boolean avoidFlushDuringShutdown); private native boolean avoidFlushDuringShutdown(final long handle); + private native void setAllowIngestBehind(final long handle, + final boolean allowIngestBehind); + private native boolean allowIngestBehind(final long handle); + private native void setPreserveDeletes(final long handle, + final boolean preserveDeletes); + private native boolean preserveDeletes(final long handle); + private native void setTwoWriteQueues(final long handle, + final boolean twoWriteQueues); + private native boolean twoWriteQueues(final long handle); + private native void setManualWalFlush(final long handle, + final boolean manualWalFlush); + private native boolean manualWalFlush(final long handle); + // CF native handles private native void optimizeForSmallDb(final long handle); @@ -1718,11 +1947,15 @@ private native void optimizeUniversalStyleCompaction(long handle, long memtableMemoryBudget); private native void setComparatorHandle(long handle, int builtinComparator); private native void setComparatorHandle(long optHandle, - long comparatorHandle); + long comparatorHandle, byte comparatorType); private native void setMergeOperatorName( long handle, String name); private native void setMergeOperator( long handle, long mergeOperatorHandle); + private native void setCompactionFilterHandle( + long handle, long compactionFilterHandle); + private native void setCompactionFilterFactoryHandle( + long handle, long 
compactionFilterFactoryHandle); private native void setWriteBufferSize(long handle, long writeBufferSize) throws IllegalArgumentException; private native long writeBufferSize(long handle); @@ -1740,6 +1973,8 @@ private native void setCompressionPerLevel(long handle, private native void setBottommostCompressionType(long handle, byte bottommostCompressionType); private native byte bottommostCompressionType(long handle); + private native void setBottommostCompressionOptions(final long handle, + final long bottommostCompressionOptionsHandle); private native void setCompressionOptions(long handle, long compressionOptionsHandle); private native void useFixedLengthPrefixExtractor( @@ -1843,6 +2078,8 @@ private native void setCompactionPriority(final long handle, private native void setReportBgIoStats(final long handle, final boolean reportBgIoStats); private native boolean reportBgIoStats(final long handle); + private native void setTtl(final long handle, final long ttl); + private native long ttl(final long handle); private native void setCompactionOptionsUniversal(final long handle, final long compactionOptionsUniversalHandle); private native void setCompactionOptionsFIFO(final long handle, @@ -1850,15 +2087,25 @@ private native void setCompactionOptionsFIFO(final long handle, private native void setForceConsistencyChecks(final long handle, final boolean forceConsistencyChecks); private native boolean forceConsistencyChecks(final long handle); + private native void setAtomicFlush(final long handle, + final boolean atomicFlush); + private native boolean atomicFlush(final long handle); // instance variables + // NOTE: If you add new member variables, please update the copy constructor above! private Env env_; private MemTableConfig memTableConfig_; private TableFormatConfig tableFormatConfig_; private RateLimiter rateLimiter_; private AbstractComparator> comparator_; + private AbstractCompactionFilter> compactionFilter_; + private AbstractCompactionFilterFactory> + compactionFilterFactory_; private CompactionOptionsUniversal compactionOptionsUniversal_; private CompactionOptionsFIFO compactionOptionsFIFO_; + private CompressionOptions bottommostCompressionOptions_; private CompressionOptions compressionOptions_; private Cache rowCache_; + private WalFilter walFilter_; + private WriteBufferManager writeBufferManager_; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java new file mode 100644 index 0000000000..f153556ba3 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/OptionsUtil.java @@ -0,0 +1,142 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.ArrayList; +import java.util.List; + +public class OptionsUtil { + /** + * A static method to construct the DBOptions and ColumnFamilyDescriptors by + * loading the latest RocksDB options file stored in the specified rocksdb + * database. + * + * Note that the all the pointer options (except table_factory, which will + * be described in more details below) will be initialized with the default + * values. Developers can further initialize them after this function call. + * Below is an example list of pointer options which will be initialized. 
+ * + * - env + * - memtable_factory + * - compaction_filter_factory + * - prefix_extractor + * - comparator + * - merge_operator + * - compaction_filter + * + * For table_factory, this function further supports deserializing + * BlockBasedTableFactory and its BlockBasedTableOptions except the + * pointer options of BlockBasedTableOptions (flush_block_policy_factory, + * block_cache, and block_cache_compressed), which will be initialized with + * default values. Developers can further specify these three options by + * casting the return value of TableFactoroy::GetOptions() to + * BlockBasedTableOptions and making necessary changes. + * + * @param dbPath the path to the RocksDB. + * @param env {@link org.rocksdb.Env} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + + public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions, + List cfDescs) throws RocksDBException { + loadLatestOptions(dbPath, env, dbOptions, cfDescs, false); + } + + /** + * @param dbPath the path to the RocksDB. + * @param env {@link org.rocksdb.Env} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @param ignoreUnknownOptions this flag can be set to true if you want to + * ignore options that are from a newer version of the db, esentially for + * forward compatibility. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions, + List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException { + loadLatestOptions( + dbPath, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions); + } + + /** + * Similar to LoadLatestOptions, this function constructs the DBOptions + * and ColumnFamilyDescriptors based on the specified RocksDB Options file. + * See LoadLatestOptions above. + * + * @param optionsFileName the RocksDB options file path. + * @param env {@link org.rocksdb.Env} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions, + List cfDescs) throws RocksDBException { + loadOptionsFromFile(optionsFileName, env, dbOptions, cfDescs, false); + } + + /** + * @param optionsFileName the RocksDB options file path. + * @param env {@link org.rocksdb.Env} instance. + * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be + * filled and returned. + * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be + * returned. + * @param ignoreUnknownOptions this flag can be set to true if you want to + * ignore options that are from a newer version of the db, esentially for + * forward compatibility. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions, + List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException { + loadOptionsFromFile( + optionsFileName, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions); + } + + /** + * Returns the latest options file name under the specified RocksDB path. + * + * @param dbPath the path to the RocksDB. + * @param env {@link org.rocksdb.Env} instance. + * @return the latest options file name under the db path. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static String getLatestOptionsFileName(String dbPath, Env env) throws RocksDBException { + return getLatestOptionsFileName(dbPath, env.nativeHandle_); + } + + /** + * Private constructor. + * This class has only static methods and shouldn't be instantiated. + */ + private OptionsUtil() {} + + // native methods + private native static void loadLatestOptions(String dbPath, long envHandle, long dbOptionsHandle, + List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; + private native static void loadOptionsFromFile(String optionsFileName, long envHandle, + long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) + throws RocksDBException; + private native static String getLatestOptionsFileName(String dbPath, long envHandle) + throws RocksDBException; +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java new file mode 100644 index 0000000000..aed5652973 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/PersistentCache.java @@ -0,0 +1,26 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Persistent cache for caching IO pages on a persistent medium. The + * cache is specifically designed for persistent read cache. + */ +public class PersistentCache extends RocksObject { + + public PersistentCache(final Env env, final String path, final long size, + final Logger logger, final boolean optimizedForNvm) + throws RocksDBException { + super(newPersistentCache(env.nativeHandle_, path, size, + logger.nativeHandle_, optimizedForNvm)); + } + + private native static long newPersistentCache(final long envHandle, + final String path, final long size, final long loggerHandle, + final boolean optimizedForNvm) throws RocksDBException; + + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Priority.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Priority.java new file mode 100644 index 0000000000..34a56edcbc --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Priority.java @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The Thread Pool priority. 
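A minimal usage sketch for the OptionsUtil loader above (before the Priority enum that follows). The path is hypothetical, and the setMaxBackgroundJobs call assumes the public DBOptions setter that fronts the native hook added earlier in this diff:

    // assumes: import org.rocksdb.*; import java.util.*;
    // called from a method that declares throws RocksDBException
    final String dbPath = "/tmp/testdb";  // hypothetical path
    final DBOptions dbOptions = new DBOptions();
    final List<ColumnFamilyDescriptor> cfDescs = new ArrayList<>();
    OptionsUtil.loadLatestOptions(dbPath, Env.getDefault(), dbOptions, cfDescs);
    // Pointer options (env, comparator, merge operator, ...) come back as
    // defaults and can be re-initialized here before reopening.
    dbOptions.setMaxBackgroundJobs(4);
    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();
    try (final RocksDB db = RocksDB.open(dbOptions, dbPath, cfDescs, cfHandles)) {
      // ...
    }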
+ */ +public enum Priority { + BOTTOM((byte) 0x0), + LOW((byte) 0x1), + HIGH((byte)0x2), + TOTAL((byte)0x3); + + private final byte value; + + Priority(final byte value) { + this.value = value; + } + + /** + *
<p>Returns the byte value of the enumerations value.</p>
    + * + * @return byte representation + */ + byte getValue() { + return value; + } + + /** + * Get Priority by byte value. + * + * @param value byte representation of Priority. + * + * @return {@link org.rocksdb.Priority} instance. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. + */ + static Priority getPriority(final byte value) { + for (final Priority priority : Priority.values()) { + if (priority.getValue() == value){ + return priority; + } + } + throw new IllegalArgumentException("Illegal value provided for Priority."); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Range.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Range.java new file mode 100644 index 0000000000..74c85e5f04 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Range.java @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Range from start to limit. + */ +public class Range { + final Slice start; + final Slice limit; + + public Range(final Slice start, final Slice limit) { + this.start = start; + this.limit = limit; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java index fc2388777e..c2b8a0fd92 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiter.java @@ -12,8 +12,11 @@ * @since 3.10.0 */ public class RateLimiter extends RocksObject { - private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); - private static final int DEFAULT_FAIRNESS = 10; + public static final long DEFAULT_REFILL_PERIOD_MICROS = 100 * 1000; + public static final int DEFAULT_FAIRNESS = 10; + public static final RateLimiterMode DEFAULT_MODE = + RateLimiterMode.WRITES_ONLY; + public static final boolean DEFAULT_AUTOTUNE = false; /** * RateLimiter constructor @@ -21,24 +24,62 @@ public class RateLimiter extends RocksObject { * @param rateBytesPerSecond this is the only parameter you want to set * most of the time. It controls the total write rate of compaction * and flush in bytes per second. Currently, RocksDB does not enforce - * rate limit for anything other than flush and compaction, e.g. write to WAL. - * @param refillPeriodMicros this controls how often tokens are refilled. For example, + * rate limit for anything other than flush and compaction, e.g. write to + * WAL. + */ + public RateLimiter(final long rateBytesPerSecond) { + this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS, + DEFAULT_MODE, DEFAULT_AUTOTUNE); + } + + /** + * RateLimiter constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to + * WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For + * example, + * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to + * 100ms, then 1MB is refilled every 100ms internally. Larger value can + * lead to burstier writes while smaller value introduces more CPU + * overhead. 
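Two small additions above merit a quick sketch: Priority round-trips through its byte encoding (its accessors are package-private, so this would live inside org.rocksdb), and Range simply pairs a start Slice with a limit Slice. Key values are illustrative:

    // assumes: code placed in the org.rocksdb package
    final byte raw = Priority.HIGH.getValue();        // 0x2
    assert Priority.getPriority(raw) == Priority.HIGH;

    try (final Slice start = new Slice("a");
         final Slice limit = new Slice("z")) {
      final Range range = new Range(start, limit);    // [start, limit)
      // pass to APIs that size or compact key ranges ...
    }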
The default of 100,000ms should work for most cases. + */ + public RateLimiter(final long rateBytesPerSecond, + final long refillPeriodMicros) { + this(rateBytesPerSecond, refillPeriodMicros, DEFAULT_FAIRNESS, DEFAULT_MODE, + DEFAULT_AUTOTUNE); + } + + /** + * RateLimiter constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to + * WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For + * example, * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to - * 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to - * burstier writes while smaller value introduces more CPU overhead. - * The default should work for most cases. + * 100ms, then 1MB is refilled every 100ms internally. Larger value can + * lead to burstier writes while smaller value introduces more CPU + * overhead. The default of 100,000ms should work for most cases. * @param fairness RateLimiter accepts high-pri requests and low-pri requests. - * A low-pri request is usually blocked in favor of hi-pri request. Currently, - * RocksDB assigns low-pri to request from compaction and high-pri to request - * from flush. Low-pri requests can get blocked if flush requests come in - * continuously. This fairness parameter grants low-pri requests permission by - * fairness chance even though high-pri requests exist to avoid starvation. + * A low-pri request is usually blocked in favor of hi-pri request. + * Currently, RocksDB assigns low-pri to request from compaction and + * high-pri to request from flush. Low-pri requests can get blocked if + * flush requests come in continuously. This fairness parameter grants + * low-pri requests permission by fairness chance even though high-pri + * requests exist to avoid starvation. * You should be good by leaving it at default 10. */ public RateLimiter(final long rateBytesPerSecond, final long refillPeriodMicros, final int fairness) { - super(newRateLimiterHandle(rateBytesPerSecond, - refillPeriodMicros, fairness)); + this(rateBytesPerSecond, refillPeriodMicros, fairness, DEFAULT_MODE, + DEFAULT_AUTOTUNE); } /** @@ -47,10 +88,65 @@ public RateLimiter(final long rateBytesPerSecond, * @param rateBytesPerSecond this is the only parameter you want to set * most of the time. It controls the total write rate of compaction * and flush in bytes per second. Currently, RocksDB does not enforce - * rate limit for anything other than flush and compaction, e.g. write to WAL. + * rate limit for anything other than flush and compaction, e.g. write to + * WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For + * example, + * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to + * 100ms, then 1MB is refilled every 100ms internally. Larger value can + * lead to burstier writes while smaller value introduces more CPU + * overhead. The default of 100,000ms should work for most cases. + * @param fairness RateLimiter accepts high-pri requests and low-pri requests. + * A low-pri request is usually blocked in favor of hi-pri request. + * Currently, RocksDB assigns low-pri to request from compaction and + * high-pri to request from flush. Low-pri requests can get blocked if + * flush requests come in continuously. 
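These parameters come together in the new five-argument constructor shown below; note that DEFAULT_REFILL_PERIOD_MICROS is 100 * 1000 microseconds, i.e. 100 ms. A minimal sketch of an auto-tuned, writes-only limiter wired into Options via the existing setRateLimiter hook (the rate is illustrative):

    // assumes: import org.rocksdb.*;
    try (final RateLimiter limiter = new RateLimiter(
             50L * 1024 * 1024,                         // 50 MiB/s for flush + compaction
             RateLimiter.DEFAULT_REFILL_PERIOD_MICROS,  // 100 ms refill period
             RateLimiter.DEFAULT_FAIRNESS,              // 10
             RateLimiterMode.WRITES_ONLY,
             true /* autoTune */);
         final Options options = new Options().setRateLimiter(limiter)) {
      // open the DB with these options ...
    }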
This fairness parameter grants + * low-pri requests permission by fairness chance even though high-pri + * requests exist to avoid starvation. + * You should be good by leaving it at default 10. + * @param rateLimiterMode indicates which types of operations count against + * the limit. */ - public RateLimiter(final long rateBytesPerSecond) { - this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); + public RateLimiter(final long rateBytesPerSecond, + final long refillPeriodMicros, final int fairness, + final RateLimiterMode rateLimiterMode) { + this(rateBytesPerSecond, refillPeriodMicros, fairness, rateLimiterMode, + DEFAULT_AUTOTUNE); + } + + /** + * RateLimiter constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to + * WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For + * example, + * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to + * 100ms, then 1MB is refilled every 100ms internally. Larger value can + * lead to burstier writes while smaller value introduces more CPU + * overhead. The default of 100,000ms should work for most cases. + * @param fairness RateLimiter accepts high-pri requests and low-pri requests. + * A low-pri request is usually blocked in favor of hi-pri request. + * Currently, RocksDB assigns low-pri to request from compaction and + * high-pri to request from flush. Low-pri requests can get blocked if + * flush requests come in continuously. This fairness parameter grants + * low-pri requests permission by fairness chance even though high-pri + * requests exist to avoid starvation. + * You should be good by leaving it at default 10. + * @param rateLimiterMode indicates which types of operations count against + * the limit. + * @param autoTune Enables dynamic adjustment of rate limit within the range + * {@code [rate_bytes_per_sec / 20, rate_bytes_per_sec]}, according to + * the recent demand for background I/O. + */ + public RateLimiter(final long rateBytesPerSecond, + final long refillPeriodMicros, final int fairness, + final RateLimiterMode rateLimiterMode, final boolean autoTune) { + super(newRateLimiterHandle(rateBytesPerSecond, + refillPeriodMicros, fairness, rateLimiterMode.getValue(), autoTune)); } /** @@ -64,6 +160,16 @@ public void setBytesPerSecond(final long bytesPerSecond) { setBytesPerSecond(nativeHandle_, bytesPerSecond); } + /** + * Returns the bytes per second. + * + * @return bytes per second. + */ + public long getBytesPerSecond() { + assert(isOwningHandle()); + return getBytesPerSecond(nativeHandle_); + } + /** *
    Request for token to write bytes. If this request can not be satisfied, * the call is blocked. Caller is responsible to make sure @@ -87,9 +193,9 @@ public long getSingleBurstBytes() { } /** - *
<p>Total bytes that go though rate limiter.</p> + * <p>Total bytes that go through rate limiter.</p>
    * - * @return total bytes that go though rate limiter. + * @return total bytes that go through rate limiter. */ public long getTotalBytesThrough() { assert(isOwningHandle()); @@ -97,9 +203,9 @@ public long getTotalBytesThrough() { } /** - *
<p>Total # of requests that go though rate limiter.</p> + * <p>Total # of requests that go through rate limiter.</p>
    * - * @return total # of requests that go though rate limiter. + * @return total # of requests that go through rate limiter. */ public long getTotalRequests() { assert(isOwningHandle()); @@ -107,11 +213,13 @@ public long getTotalRequests() { } private static native long newRateLimiterHandle(final long rateBytesPerSecond, - final long refillPeriodMicros, final int fairness); + final long refillPeriodMicros, final int fairness, + final byte rateLimiterMode, final boolean autoTune); @Override protected final native void disposeInternal(final long handle); private native void setBytesPerSecond(final long handle, final long bytesPerSecond); + private native long getBytesPerSecond(final long handle); private native void request(final long handle, final long bytes); private native long getSingleBurstBytes(final long handle); private native long getTotalBytesThrough(final long handle); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java new file mode 100644 index 0000000000..4b029d8165 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RateLimiterMode.java @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Mode for {@link RateLimiter#RateLimiter(long, long, int, RateLimiterMode)}. + */ +public enum RateLimiterMode { + READS_ONLY((byte)0x0), + WRITES_ONLY((byte)0x1), + ALL_IO((byte)0x2); + + private final byte value; + + RateLimiterMode(final byte value) { + this.value = value; + } + + /** + *
<p>Returns the byte value of the enumerations value.</p>
    + * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + *
<p>Get the RateLimiterMode enumeration value by + * passing the byte identifier to this method.</p>
    + * + * @param byteIdentifier of RateLimiterMode. + * + * @return AccessHint instance. + * + * @throws IllegalArgumentException if the access hint for the byteIdentifier + * cannot be found + */ + public static RateLimiterMode getRateLimiterMode(final byte byteIdentifier) { + for (final RateLimiterMode rateLimiterMode : RateLimiterMode.values()) { + if (rateLimiterMode.getValue() == byteIdentifier) { + return rateLimiterMode; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for RateLimiterMode."); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java index 9d7b999561..8353e0fe83 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadOptions.java @@ -16,6 +16,29 @@ public ReadOptions() { super(newReadOptions()); } + /** + * @param verifyChecksums verification will be performed on every read + * when set to true + * @param fillCache if true, then fill-cache behavior will be performed. + */ + public ReadOptions(final boolean verifyChecksums, final boolean fillCache) { + super(newReadOptions(verifyChecksums, fillCache)); + } + + /** + * Copy constructor. + * + * NOTE: This does a shallow copy, which means snapshot, iterate_upper_bound + * and other pointers will be cloned! + * + * @param other The ReadOptions to copy. + */ + public ReadOptions(ReadOptions other) { + super(copyReadOptions(other.nativeHandle_)); + this.iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_; + this.iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_; + } + /** * If true, all data read from underlying storage will be * verified against corresponding checksums. @@ -168,8 +191,12 @@ public ReadOptions setTailing(final boolean tailing) { /** * Returns whether managed iterators will be used. * - * @return the setting of whether managed iterators will be used, by default false + * @return the setting of whether managed iterators will be used, + * by default false + * + * @deprecated This options is not used anymore. */ + @Deprecated public boolean managed() { assert(isOwningHandle()); return managed(nativeHandle_); @@ -182,7 +209,10 @@ public boolean managed() { * * @param managed if true, then managed iterators will be enabled. * @return the reference to the current ReadOptions. + * + * @deprecated This options is not used anymore. */ + @Deprecated public ReadOptions setManaged(final boolean managed) { assert(isOwningHandle()); setManaged(nativeHandle_, managed); @@ -224,7 +254,6 @@ public boolean prefixSameAsStart() { return prefixSameAsStart(nativeHandle_); } - /** * Enforce that the iterator only iterates over the same prefix as the seek. * This option is effective only for prefix seeks, i.e. prefix_extractor is @@ -332,6 +361,37 @@ public ReadOptions setReadaheadSize(final long readaheadSize) { return this; } + /** + * A threshold for the number of keys that can be skipped before failing an + * iterator seek as incomplete. + * + * @return the number of keys that can be skipped + * before failing an iterator seek as incomplete. + */ + public long maxSkippableInternalKeys() { + assert(isOwningHandle()); + return maxSkippableInternalKeys(nativeHandle_); + } + + /** + * A threshold for the number of keys that can be skipped before failing an + * iterator seek as incomplete. The default value of 0 should be used to + * never fail a request as incomplete, even on skipping too many keys. 
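The matching setter follows below. A short sketch combining it with the new ReadOptions copy constructor; note the documented shallow-copy caveat, i.e. snapshot and iterate bounds are shared rather than cloned:

    // assumes: import org.rocksdb.*;
    try (final ReadOptions base =
             new ReadOptions(true /* verifyChecksums */, false /* fillCache */)) {
      base.setMaxSkippableInternalKeys(1_000_000);  // seeks fail as Incomplete past this
      try (final ReadOptions perScan = new ReadOptions(base)) {
        // use perScan for one iterator; bounds/snapshot are shared with base
      }
    }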
+ * + * Default: 0 + * + * @param maxSkippableInternalKeys the number of keys that can be skipped + * before failing an iterator seek as incomplete. + * + * @return the reference to the current ReadOptions. + */ + public ReadOptions setMaxSkippableInternalKeys( + final long maxSkippableInternalKeys) { + assert(isOwningHandle()); + setMaxSkippableInternalKeys(nativeHandle_, maxSkippableInternalKeys); + return this; + } + /** * If true, keys deleted using the DeleteRange() API will be visible to * readers until they are naturally deleted during compaction. This improves @@ -363,7 +423,162 @@ public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) { return this; } + /** + * Defines the smallest key at which the backward + * iterator can return an entry. Once the bound is passed, + * {@link RocksIterator#isValid()} will be false. + * + * The lower bound is inclusive i.e. the bound value is a valid + * entry. + * + * If prefix_extractor is not null, the Seek target and `iterate_lower_bound` + * need to have the same prefix. This is because ordering is not guaranteed + * outside of prefix domain. + * + * Default: null + * + * @param iterateLowerBound Slice representing the upper bound + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterateLowerBound(final Slice iterateLowerBound) { + assert(isOwningHandle()); + if (iterateLowerBound != null) { + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateLowerBoundSlice_ = iterateLowerBound; + setIterateLowerBound(nativeHandle_, iterateLowerBoundSlice_.getNativeHandle()); + } + return this; + } + + /** + * Returns the smallest key at which the backward + * iterator can return an entry. + * + * The lower bound is inclusive i.e. the bound value is a valid entry. + * + * @return the smallest key, or null if there is no lower bound defined. + */ + public Slice iterateLowerBound() { + assert(isOwningHandle()); + final long lowerBoundSliceHandle = iterateLowerBound(nativeHandle_); + if (lowerBoundSliceHandle != 0) { + // Disown the new slice - it's owned by the C++ side of the JNI boundary + // from the perspective of this method. + return new Slice(lowerBoundSliceHandle, false); + } + return null; + } + + /** + * Defines the extent up to which the forward iterator + * can returns entries. Once the bound is reached, + * {@link RocksIterator#isValid()} will be false. + * + * The upper bound is exclusive i.e. the bound value is not a valid entry. + * + * If iterator_extractor is not null, the Seek target and iterate_upper_bound + * need to have the same prefix. This is because ordering is not guaranteed + * outside of prefix domain. + * + * Default: null + * + * @param iterateUpperBound Slice representing the upper bound + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterateUpperBound(final Slice iterateUpperBound) { + assert(isOwningHandle()); + if (iterateUpperBound != null) { + // Hold onto a reference so it doesn't get garbage collected out from under us. + iterateUpperBoundSlice_ = iterateUpperBound; + setIterateUpperBound(nativeHandle_, iterateUpperBoundSlice_.getNativeHandle()); + } + return this; + } + + /** + * Returns the largest key at which the forward + * iterator can return an entry. + * + * The upper bound is exclusive i.e. the bound value is not a valid entry. + * + * @return the largest key, or null if there is no upper bound defined. 
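The getter follows below. A sketch of bounding an iterator with the new lower/upper bound setters; db is an already-open RocksDB instance (assumed), and ReadOptions keeps a Java reference to each Slice so the native side never sees it garbage-collected:

    // assumes: import org.rocksdb.*; an open RocksDB `db`
    try (final Slice lower = new Slice("user_0000");
         final Slice upper = new Slice("user_9999");
         final ReadOptions ro = new ReadOptions()
             .setIterateLowerBound(lower)     // inclusive
             .setIterateUpperBound(upper);    // exclusive
         final RocksIterator it = db.newIterator(ro)) {
      for (it.seekToFirst(); it.isValid(); it.next()) {
        // sees only keys in [user_0000, user_9999)
      }
    }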
+ */ + public Slice iterateUpperBound() { + assert(isOwningHandle()); + final long upperBoundSliceHandle = iterateUpperBound(nativeHandle_); + if (upperBoundSliceHandle != 0) { + // Disown the new slice - it's owned by the C++ side of the JNI boundary + // from the perspective of this method. + return new Slice(upperBoundSliceHandle, false); + } + return null; + } + + /** + * A callback to determine whether relevant keys for this scan exist in a + * given table based on the table's properties. The callback is passed the + * properties of each table during iteration. If the callback returns false, + * the table will not be scanned. This option only affects Iterators and has + * no impact on point lookups. + * + * Default: null (every table will be scanned) + * + * @param tableFilter the table filter for the callback. + * + * @return the reference to the current ReadOptions. + */ + public ReadOptions setTableFilter(final AbstractTableFilter tableFilter) { + assert(isOwningHandle()); + setTableFilter(nativeHandle_, tableFilter.nativeHandle_); + return this; + } + + /** + * Needed to support differential snapshots. Has 2 effects: + * 1) Iterator will skip all internal keys with seqnum < iter_start_seqnum + * 2) if this param > 0 iterator will return INTERNAL keys instead of user + * keys; e.g. return tombstones as well. + * + * Default: 0 (don't filter by seqnum, return user keys) + * + * @param startSeqnum the starting sequence number. + * + * @return the reference to the current ReadOptions. + */ + public ReadOptions setIterStartSeqnum(final long startSeqnum) { + assert(isOwningHandle()); + setIterStartSeqnum(nativeHandle_, startSeqnum); + return this; + } + + /** + * Returns the starting Sequence Number of any iterator. + * See {@link #setIterStartSeqnum(long)}. + * + * @return the starting sequence number of any iterator. + */ + public long iterStartSeqnum() { + assert(isOwningHandle()); + return iterStartSeqnum(nativeHandle_); + } + + // instance variables + // NOTE: If you add new member variables, please update the copy constructor above! + // + // Hold a reference to any iterate lower or upper bound that was set on this + // object until we're destroyed or it's overwritten. That way the caller can + // freely leave scope without us losing the Java Slice object, which during + // close() would also reap its associated rocksdb::Slice native object since + // it's possibly (likely) to be an owning handle. 
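A sketch of the differential-snapshot knob described above; the sequence number is hypothetical (e.g. recorded at a previous backup), and with a non-zero value the iterator returns internal keys, tombstones included:

    // assumes: import org.rocksdb.*; an open RocksDB `db`
    final long lastBackupSeqnum = 42L;  // hypothetical checkpoint
    try (final ReadOptions ro = new ReadOptions()
             .setIterStartSeqnum(lastBackupSeqnum);
         final RocksIterator it = db.newIterator(ro)) {
      // yields only entries with seqnum >= lastBackupSeqnum
    }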
+ private Slice iterateLowerBoundSlice_; + private Slice iterateUpperBoundSlice_; + private native static long newReadOptions(); + private native static long newReadOptions(final boolean verifyChecksums, + final boolean fillCache); + private native static long copyReadOptions(long handle); + @Override protected final native void disposeInternal(final long handle); + private native boolean verifyChecksums(long handle); private native void setVerifyChecksums(long handle, boolean verifyChecksums); private native boolean fillCache(long handle); @@ -388,10 +603,20 @@ private native void setBackgroundPurgeOnIteratorCleanup(final long handle, private native long readaheadSize(final long handle); private native void setReadaheadSize(final long handle, final long readaheadSize); + private native long maxSkippableInternalKeys(final long handle); + private native void setMaxSkippableInternalKeys(final long handle, + final long maxSkippableInternalKeys); private native boolean ignoreRangeDeletions(final long handle); private native void setIgnoreRangeDeletions(final long handle, final boolean ignoreRangeDeletions); - - @Override protected final native void disposeInternal(final long handle); - + private native void setIterateUpperBound(final long handle, + final long upperBoundSliceHandle); + private native long iterateUpperBound(final long handle); + private native void setIterateLowerBound(final long handle, + final long lowerBoundSliceHandle); + private native long iterateLowerBound(final long handle); + private native void setTableFilter(final long handle, + final long tableFilterHandle); + private native void setIterStartSeqnum(final long handle, final long seqNum); + private native long iterStartSeqnum(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java index 6dc76c52e5..78f83f6ad6 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ReadTier.java @@ -11,7 +11,8 @@ public enum ReadTier { READ_ALL_TIER((byte)0), BLOCK_CACHE_TIER((byte)1), - PERSISTED_TIER((byte)2); + PERSISTED_TIER((byte)2), + MEMTABLE_TIER((byte)3); private final byte value; diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java new file mode 100644 index 0000000000..a662f78fd7 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksCallbackObject.java @@ -0,0 +1,50 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * RocksCallbackObject is similar to {@link RocksObject} but varies + * in its construction as it is designed for Java objects which have functions + * which are called from C++ via JNI. + * + * RocksCallbackObject is the base-class any RocksDB classes that acts as a + * callback from some underlying underlying native C++ {@code rocksdb} object. + * + * The use of {@code RocksObject} should always be preferred over + * {@link RocksCallbackObject} if callbacks are not required. + */ +public abstract class RocksCallbackObject extends + AbstractImmutableNativeReference { + + protected final long nativeHandle_; + + protected RocksCallbackObject(final long... 
nativeParameterHandles) { + super(true); + this.nativeHandle_ = initializeNative(nativeParameterHandles); + } + + /** + * Construct the Native C++ object which will callback + * to our object methods + * + * @param nativeParameterHandles An array of native handles for any parameter + * objects that are needed during construction + * + * @return The native handle of the C++ object which will callback to us + */ + protected abstract long initializeNative( + final long... nativeParameterHandles); + + /** + * Deletes underlying C++ native callback object pointer + */ + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java index eda0950990..b93a51e28a 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksDB.java @@ -7,7 +7,6 @@ import java.util.*; import java.io.IOException; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import org.rocksdb.util.Environment; @@ -64,8 +63,8 @@ public static void loadLibrary() { NativeLibraryLoader.getInstance().loadLibrary(tmpDir); } catch (IOException e) { libraryLoaded.set(LibraryState.NOT_LOADED); - throw new RuntimeException("Unable to load the RocksDB shared library" - + e); + throw new RuntimeException("Unable to load the RocksDB shared library", + e); } libraryLoaded.set(LibraryState.LOADED); @@ -139,6 +138,15 @@ public static void loadLibrary(final List paths) { } } + /** + * Private constructor. + * + * @param nativeHandle The native handle of the C++ RocksDB object + */ + protected RocksDB(final long nativeHandle) { + super(nativeHandle); + } + /** * The factory constructor of RocksDB that opens a RocksDB instance given * the path to the database using the default options w/ createIfMissing @@ -153,9 +161,7 @@ public static void loadLibrary(final List paths) { * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final String path) throws RocksDBException { - // This allows to use the rocksjni default Options instead of - // the c++ one. - Options options = new Options(); + final Options options = new Options(); options.setCreateIfMissing(true); return open(options, path); } @@ -193,9 +199,7 @@ public static RocksDB open(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { - // This allows to use the rocksjni default Options instead of - // the c++ one. - DBOptions options = new DBOptions(); + final DBOptions options = new DBOptions(); return open(options, path, columnFamilyDescriptors, columnFamilyHandles); } @@ -418,6 +422,54 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, return db; } + + /** + * This is similar to {@link #close()} except that it + * throws an exception if any error occurs. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + * + * @throws RocksDBException if an error occurs whilst closing. 
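closeE, defined just below, surfaces the shutdown errors that close swallows. A sketch of the sync-then-close pattern its Javadoc prescribes (path hypothetical; syncWal is the method the Javadoc links to):

    // assumes: import org.rocksdb.*;
    final RocksDB db = RocksDB.open("/tmp/testdb");
    // ... reads and writes ...
    db.syncWal();  // closeE() does not fsync the WAL itself
    db.closeE();   // unlike close(), propagates RocksDBException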
+ */ + public void closeE() throws RocksDBException { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } finally { + disposeInternal(); + } + } + } + + /** + * This is similar to {@link #closeE()} except that it + * silently ignores any errors. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + */ + @Override + public void close() { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } catch (final RocksDBException e) { + // silently ignore the error report + } finally { + disposeInternal(); + } + } + } + /** * Static method to determine all available column families for a * rocksdb database identified by path @@ -435,10 +487,108 @@ public static List listColumnFamilies(final Options options, path)); } - private void storeOptionsInstance(DBOptionsInterface options) { - options_ = options; + /** + * Creates a new column family with the name columnFamilyName and + * allocates a ColumnFamilyHandle within an internal structure. + * The ColumnFamilyHandle is automatically disposed with DB disposal. + * + * @param columnFamilyDescriptor column family to be created. + * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public ColumnFamilyHandle createColumnFamily( + final ColumnFamilyDescriptor columnFamilyDescriptor) + throws RocksDBException { + return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, + columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_)); + } + + /** + * Bulk create column families with the same column family options. + * + * @param columnFamilyOptions the options for the column families. + * @param columnFamilyNames the names of the column families. + * + * @return the handles to the newly created column families. + */ + public List createColumnFamilies( + final ColumnFamilyOptions columnFamilyOptions, + final List columnFamilyNames) throws RocksDBException { + final byte[][] cfNames = columnFamilyNames.toArray( + new byte[0][]); + final long[] cfHandles = createColumnFamilies(nativeHandle_, + columnFamilyOptions.nativeHandle_, cfNames); + final List columnFamilyHandles = + new ArrayList<>(cfHandles.length); + for (int i = 0; i < cfHandles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + } + return columnFamilyHandles; + } + + /** + * Bulk create column families with the same column family options. + * + * @param columnFamilyDescriptors the descriptions of the column families. + * + * @return the handles to the newly created column families. 
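A sketch of the bulk-creation overloads documented above, sharing one ColumnFamilyOptions across several families; names are illustrative and db is an already-open RocksDB:

    // assumes: import org.rocksdb.*; import java.util.*;
    try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) {
      final List<ColumnFamilyHandle> handles = db.createColumnFamilies(
          cfOpts, Arrays.asList("users".getBytes(), "orders".getBytes()));
      // ... use the handles; close them before closing the db ...
    }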
+ */ + public List createColumnFamilies( + final List columnFamilyDescriptors) + throws RocksDBException { + final long[] cfOptsHandles = new long[columnFamilyDescriptors.size()]; + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor columnFamilyDescriptor + = columnFamilyDescriptors.get(i); + cfOptsHandles[i] = columnFamilyDescriptor.getOptions().nativeHandle_; + cfNames[i] = columnFamilyDescriptor.getName(); + } + final long[] cfHandles = createColumnFamilies(nativeHandle_, + cfOptsHandles, cfNames); + final List columnFamilyHandles = + new ArrayList<>(cfHandles.length); + for (int i = 0; i < cfHandles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(this, cfHandles[i])); + } + return columnFamilyHandles; + } + + /** + * Drops the column family specified by {@code columnFamilyHandle}. This call + * only records a drop record in the manifest and prevents the column + * family from flushing and compacting. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void dropColumnFamily(final ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_); + } + + // Bulk drop column families. This call only records drop records in the + // manifest and prevents the column families from flushing and compacting. + // In case of error, the request may succeed partially. User may call + // ListColumnFamilies to check the result. + public void dropColumnFamilies( + final List columnFamilies) throws RocksDBException { + final long[] cfHandles = new long[columnFamilies.size()]; + for (int i = 0; i < columnFamilies.size(); i++) { + cfHandles[i] = columnFamilies.get(i).nativeHandle_; + } + dropColumnFamilies(nativeHandle_, cfHandles); } + //TODO(AR) what about DestroyColumnFamilyHandle + /** * Set the database entry for "key" to "value". * @@ -453,6 +603,32 @@ public void put(final byte[] key, final byte[] value) put(nativeHandle_, key, 0, key.length, value, 0, value.length); } + /** + * Set the database entry for "key" to "value". + * + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no longer than "key".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if errors happens in underlying native + * library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds + */ + public void put(final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, key, offset, len, value, vOffset, vLen); + } + /** * Set the database entry for "key" to "value" in the specified * column family. 
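A sketch of the new offset/length put variants in this hunk; after the bounds checks, only the indicated sub-ranges of the arrays are written (buffers illustrative, db assumed open):

    // assumes: import org.rocksdb.*;
    final byte[] keyBuf = "xxkey-123xx".getBytes();  // key is bytes [2, 9)
    final byte[] valBuf = "..value..".getBytes();    // value is bytes [2, 7)
    db.put(keyBuf, 2, 7, valBuf, 2, 5);              // stores "key-123" -> "value"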
@@ -473,6 +649,37 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, columnFamilyHandle.nativeHandle_); } + /** + * Set the database entry for "key" to "value" in the specified + * column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must + * be non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no longer than "key".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if errors happens in underlying native + * library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds + */ + public void put(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + /** * Set the database entry for "key" to "value". * @@ -489,6 +696,35 @@ public void put(final WriteOptions writeOpts, final byte[] key, key, 0, key.length, value, 0, value.length); } + /** + * Set the database entry for "key" to "value". + * + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no longer than "key".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - offset) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds + */ + public void put(final WriteOptions writeOpts, + final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen); + } + /** * Set the database entry for "key" to "value" for the specified * column family. @@ -513,1009 +749,1611 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, } /** - * If the key definitely does not exist in the database, then this method - * returns false, else true. + * Set the database entry for "key" to "value" for the specified + * column family. * - * This check is potentially lighter-weight than invoking DB::Get(). One way - * to make this lighter weight is to avoid doing any IOs. 
+ * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. + * @param key The specified key to be inserted + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value associated with the specified key + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no longer than "key".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - offset) * - * @param key byte array of a key to search for - * @param value StringBuilder instance which is a out parameter if a value is - * found in block-cache. - * @return boolean value indicating if key does not exist or might exist. + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ - public boolean keyMayExist(final byte[] key, final StringBuilder value) { - return keyMayExist(nativeHandle_, key, 0, key.length, value); + public void put(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpts, + final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + put(nativeHandle_, writeOpts.nativeHandle_, key, offset, len, value, + vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** - * If the key definitely does not exist in the database, then this method - * returns false, else true. + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * This check is potentially lighter-weight than invoking DB::Get(). One way - * to make this lighter weight is to avoid doing any IOs. + * @param key Key to delete within database * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key byte array of a key to search for - * @param value StringBuilder instance which is a out parameter if a value is - * found in block-cache. - * @return boolean value indicating if key does not exist or might exist. + * @throws RocksDBException thrown if error happens in underlying + * native library. + * + * @deprecated Use {@link #delete(byte[])} */ - public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key, final StringBuilder value) { - return keyMayExist(nativeHandle_, key, 0, key.length, - columnFamilyHandle.nativeHandle_, value); + @Deprecated + public void remove(final byte[] key) throws RocksDBException { + delete(key); } /** - * If the key definitely does not exist in the database, then this method - * returns false, else true. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * This check is potentially lighter-weight than invoking DB::Get(). One way - * to make this lighter weight is to avoid doing any IOs. 
+ * @param key Key to delete within database * - * @param readOptions {@link ReadOptions} instance - * @param key byte array of a key to search for - * @param value StringBuilder instance which is a out parameter if a value is - * found in block-cache. - * @return boolean value indicating if key does not exist or might exist. + * @throws RocksDBException thrown if error happens in underlying + * native library. */ - public boolean keyMayExist(final ReadOptions readOptions, - final byte[] key, final StringBuilder value) { - return keyMayExist(nativeHandle_, readOptions.nativeHandle_, - key, 0, key.length, value); + public void delete(final byte[] key) throws RocksDBException { + delete(nativeHandle_, key, 0, key.length); } /** - * If the key definitely does not exist in the database, then this method - * returns false, else true. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * This check is potentially lighter-weight than invoking DB::Get(). One way - * to make this lighter weight is to avoid doing any IOs. + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be + * non-negative and no larger than ("key".length - offset) * - * @param readOptions {@link ReadOptions} instance - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key byte array of a key to search for - * @param value StringBuilder instance which is a out parameter if a value is - * found in block-cache. - * @return boolean value indicating if key does not exist or might exist. + * @throws RocksDBException thrown if error happens in underlying + * native library. */ - public boolean keyMayExist(final ReadOptions readOptions, - final ColumnFamilyHandle columnFamilyHandle, final byte[] key, - final StringBuilder value) { - return keyMayExist(nativeHandle_, readOptions.nativeHandle_, - key, 0, key.length, columnFamilyHandle.nativeHandle_, - value); + public void delete(final byte[] key, final int offset, final int len) + throws RocksDBException { + delete(nativeHandle_, key, offset, len); } /** - * Apply the specified updates to the database. + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * @param writeOpts WriteOptions instance - * @param updates WriteBatch instance + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} */ - public void write(final WriteOptions writeOpts, final WriteBatch updates) - throws RocksDBException { - write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); + @Deprecated + public void remove(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + delete(columnFamilyHandle, key); } /** - * Apply the specified updates to the database. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. 
* - * @param writeOpts WriteOptions instance - * @param updates WriteBatchWithIndex instance + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void write(final WriteOptions writeOpts, - final WriteBatchWithIndex updates) throws RocksDBException { - write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** - * Add merge operand for key/value pair. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, + * must be non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("value".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void merge(final byte[] key, final byte[] value) + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final int offset, final int len) throws RocksDBException { - merge(nativeHandle_, key, 0, key.length, value, 0, value.length); + delete(nativeHandle_, key, offset, len, columnFamilyHandle.nativeHandle_); } /** - * Add merge operand for key/value pair in a ColumnFamily. + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. + * + * @deprecated Use {@link #delete(WriteOptions, byte[])} */ - public void merge(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key, final byte[] value) throws RocksDBException { - merge(nativeHandle_, key, 0, key.length, value, 0, value.length, - columnFamilyHandle.nativeHandle_); + @Deprecated + public void remove(final WriteOptions writeOpt, final byte[] key) + throws RocksDBException { + delete(writeOpt, key); } /** - * Add merge operand for key/value pair. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * @param writeOpts {@link WriteOptions} for this write. - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. 
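The remove overloads in this hunk are deprecated in favour of delete; a sketch of the replacement calls, reusing keyBuf from the sketch above:

    // assumes: import org.rocksdb.*; an open RocksDB `db`
    try (final WriteOptions wo = new WriteOptions()) {
      db.delete(wo, keyBuf);    // replaces the deprecated db.remove(wo, keyBuf)
      db.delete(keyBuf, 2, 7);  // new offset/length variant
    }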
*/ - public void merge(final WriteOptions writeOpts, final byte[] key, - final byte[] value) throws RocksDBException { - merge(nativeHandle_, writeOpts.nativeHandle_, - key, 0, key.length, value, 0, value.length); + public void delete(final WriteOptions writeOpt, final byte[] key) + throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length); } /** - * Add merge operand for key/value pair. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * - * @param columnFamilyHandle {@link ColumnFamilyHandle} instance - * @param writeOpts {@link WriteOptions} for this write. - * @param key the specified key to be merged. - * @param value the value to be merged with the current value for - * the specified key. + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be + * non-negative and no larger than ("key".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void merge(final ColumnFamilyHandle columnFamilyHandle, - final WriteOptions writeOpts, final byte[] key, - final byte[] value) throws RocksDBException { - merge(nativeHandle_, writeOpts.nativeHandle_, - key, 0, key.length, value, 0, value.length, - columnFamilyHandle.nativeHandle_); + public void delete(final WriteOptions writeOpt, final byte[] key, + final int offset, final int len) throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len); } - // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice - // when we could communicate better status into, also the C++ code show that -2 could be returned - /** - * Get the value associated with the specified key within column family* - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, WriteOptions, byte[])} */ - public int get(final byte[] key, final byte[] value) throws RocksDBException { - return get(nativeHandle_, key, 0, key.length, value, 0, value.length); + @Deprecated + public void remove(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] key) throws RocksDBException { + delete(columnFamilyHandle, writeOpt, key); } /** - * Get the value associated with the specified key within column family. + * Delete the database entry (if any) for "key". 
Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, - final byte[] value) throws RocksDBException, IllegalArgumentException { - return get(nativeHandle_, key, 0, key.length, value, 0, value.length, + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] key) + throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); } /** - * Get the value associated with the specified key. - * - * @param opt {@link org.rocksdb.ReadOptions} instance. - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. - * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - public int get(final ReadOptions opt, final byte[] key, - final byte[] value) throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, - key, 0, key.length, value, 0, value.length); - } - /** - * Get the value associated with the specified key within column family. + * Delete the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} * instance - * @param opt {@link org.rocksdb.ReadOptions} instance. - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be + * non-negative and no larger than ("key".length - offset) * * @throws RocksDBException thrown if error happens in underlying * native library. 
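The SingleDelete contract spelled out in the Javadoc above, as a sketch (assumes an open RocksDB instance `db`; key and value are illustrative):

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static void singleDeleteExample(final RocksDB db) throws RocksDBException {
      final byte[] key = "session:7".getBytes(StandardCharsets.UTF_8);
      db.put(key, "token".getBytes(StandardCharsets.UTF_8)); // written exactly once
      db.singleDelete(key); // well-defined: one Put() since the last SingleDelete()
      // Overwriting the key (two put() calls) or mixing in merge()/delete()
      // on the same key would make the outcome undefined.
    }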
*/ - public int get(final ColumnFamilyHandle columnFamilyHandle, - final ReadOptions opt, final byte[] key, final byte[] value) - throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, - 0, value.length, columnFamilyHandle.nativeHandle_); + public void delete(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] key, final int offset, + final int len) throws RocksDBException { + delete(nativeHandle_, writeOpt.nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); } /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. * - * @param key the key retrieve the value. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - public byte[] get(final byte[] key) throws RocksDBException { - return get(nativeHandle_, key, 0, key.length); + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final byte[] key) throws RocksDBException { + singleDelete(nativeHandle_, key, key.length); } /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param key the key retrieve the value. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. 
+ * + * @param columnFamilyHandle The column family to delete the key from + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { - return get(nativeHandle_, key, 0, key.length, + singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. * - * @param key the key retrieve the value. - * @param opt Read options. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - public byte[] get(final ReadOptions opt, final byte[] key) - throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length); - } - - /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param key the key retrieve the value. - * @param opt Read options. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. + * Note: consider setting {@link WriteOptions#setSync(boolean)} true. + * + * @param writeOpt Write options for the delete + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - public byte[] get(final ColumnFamilyHandle columnFamilyHandle, - final ReadOptions opt, final byte[] key) throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, - columnFamilyHandle.nativeHandle_); + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final WriteOptions writeOpt, final byte[] key) + throws RocksDBException { + singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } /** - * Returns a map of keys for which values were found in DB. + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. 
It is not an error if the key did not exist + * in the database. * - * @param keys List of keys for which values need to be retrieved. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * Note: consider setting {@link WriteOptions#setSync(boolean)} true. + * + * @param columnFamilyHandle The column family to delete the key from + * @param writeOpt Write options for the delete + * @param key Key to delete within database * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - public Map<byte[], byte[]> multiGet(final List<byte[]> keys) - throws RocksDBException { - assert(keys.size() != 0); - - final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; - for(int i = 0; i < keyLengths.length; i++) { - keyLengths[i] = keysArray[i].length; - } - - final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, - keyLengths); - - final Map<byte[], byte[]> keyValueMap = - new HashMap<>(computeCapacityHint(values.length)); - for(int i = 0; i < values.length; i++) { - if(values[i] == null) { - continue; - } - - keyValueMap.put(keys.get(i), values[i]); - } - - return keyValueMap; + @Experimental("Performance optimization for a very specific workload") + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] key) throws RocksDBException { + singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); } - private static int computeCapacityHint(final int estimatedNumberOfItems) { - // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load - // limit. We add +1 for a buffer. - return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0); - } /** - * Returns a map of keys for which values were found in DB. - *
    - * Note: Every key needs to have a related column family name in - * {@code columnFamilyHandleList}. - *
+ * Removes the database entries in the range ["beginKey", "endKey"), i.e., + * including "beginKey" and excluding "endKey". Returns OK on success, and + * a non-OK status on error. It is not an error if no keys exist in the + * range ["beginKey", "endKey"). * - * @param columnFamilyHandleList {@link java.util.List} containing - * {@link org.rocksdb.ColumnFamilyHandle} instances. - * @param keys List of keys for which values need to be retrieved. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. * - * @throws RocksDBException thrown if error happens in underlying - * native library. - * @throws IllegalArgumentException thrown if the size of passed keys is not - * equal to the amount of passed column family handles. + * @param beginKey First key to delete within database (inclusive) + * @param endKey Last key to delete within database (exclusive) + * + * @throws RocksDBException thrown if error happens in underlying native + * library. */ - public Map<byte[], byte[]> multiGet( - final List<ColumnFamilyHandle> columnFamilyHandleList, - final List<byte[]> keys) throws RocksDBException, - IllegalArgumentException { - assert(keys.size() != 0); - // Check if key size equals cfList size. If not a exception must be - // thrown. If not a Segmentation fault happens. - if (keys.size() != columnFamilyHandleList.size()) { - throw new IllegalArgumentException( - "For each key there must be a ColumnFamilyHandle."); - } - final long[] cfHandles = new long[columnFamilyHandleList.size()]; - for (int i = 0; i < columnFamilyHandleList.size(); i++) { - cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; - } - - final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; - for(int i = 0; i < keyLengths.length; i++) { - keyLengths[i] = keysArray[i].length; - } - - final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, - keyLengths, cfHandles); - - final Map<byte[], byte[]> keyValueMap = - new HashMap<>(computeCapacityHint(values.length)); - for(int i = 0; i < values.length; i++) { - if (values[i] == null) { - continue; - } - keyValueMap.put(keys.get(i), values[i]); - } - return keyValueMap; + public void deleteRange(final byte[] beginKey, final byte[] endKey) + throws RocksDBException { + deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, + endKey.length); } /** - * Returns a map of keys for which values were found in DB. + * Removes the database entries in the range ["beginKey", "endKey"), i.e., + * including "beginKey" and excluding "endKey". Returns OK on success, and + * a non-OK status on error. It is not an error if no keys exist in the + * range ["beginKey", "endKey"). * - * @param opt Read options. - * @param keys of keys for which values need to be retrieved. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. * - * @throws RocksDBException thrown if error happens in underlying - * native library.
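The half-open range semantics documented above, in a short sketch (assumes an open RocksDB instance `db`; the key layout is illustrative):

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static void purgeDay(final RocksDB db) throws RocksDBException {
      // Deletes every key >= "log:2019-01-01" and < "log:2019-01-02":
      // all of January 1st is removed, January 2nd is untouched.
      db.deleteRange("log:2019-01-01".getBytes(StandardCharsets.UTF_8),
          "log:2019-01-02".getBytes(StandardCharsets.UTF_8));
    }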
+ * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param beginKey First key to delete within database (inclusive) + * @param endKey Last key to delete within database (exclusive) + * + * @throws RocksDBException thrown if error happens in underlying native + * library. */ - public Map<byte[], byte[]> multiGet(final ReadOptions opt, - final List<byte[]> keys) throws RocksDBException { - assert(keys.size() != 0); - - final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; - for(int i = 0; i < keyLengths.length; i++) { - keyLengths[i] = keysArray[i].length; - } - - final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, - keysArray, keyOffsets, keyLengths); - - final Map<byte[], byte[]> keyValueMap = - new HashMap<>(computeCapacityHint(values.length)); - for(int i = 0; i < values.length; i++) { - if(values[i] == null) { - continue; - } - - keyValueMap.put(keys.get(i), values[i]); - } - - return keyValueMap; + public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, + final byte[] beginKey, final byte[] endKey) throws RocksDBException { + deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, + endKey.length, columnFamilyHandle.nativeHandle_); } /** - * Returns a map of keys for which values were found in DB. - *
    - * Note: Every key needs to have a related column family name in - * {@code columnFamilyHandleList}. - *
+ * Removes the database entries in the range ["beginKey", "endKey"), i.e., + * including "beginKey" and excluding "endKey". Returns OK on success, and + * a non-OK status on error. It is not an error if no keys exist in the + * range ["beginKey", "endKey"). * - * @param opt Read options. - * @param columnFamilyHandleList {@link java.util.List} containing - * {@link org.rocksdb.ColumnFamilyHandle} instances. - * @param keys of keys for which values need to be retrieved. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. + * @param writeOpt WriteOptions to be used with delete operation + * @param beginKey First key to delete within database (inclusive) + * @param endKey Last key to delete within database (exclusive) * * @throws RocksDBException thrown if error happens in underlying - * native library. - * @throws IllegalArgumentException thrown if the size of passed keys is not - * equal to the amount of passed column family handles. + * native library. */ - public Map<byte[], byte[]> multiGet(final ReadOptions opt, - final List<ColumnFamilyHandle> columnFamilyHandleList, - final List<byte[]> keys) throws RocksDBException { - assert(keys.size() != 0); - // Check if key size equals cfList size. If not a exception must be - // thrown. If not a Segmentation fault happens. - if (keys.size()!=columnFamilyHandleList.size()){ - throw new IllegalArgumentException( - "For each key there must be a ColumnFamilyHandle."); - } - final long[] cfHandles = new long[columnFamilyHandleList.size()]; - for (int i = 0; i < columnFamilyHandleList.size(); i++) { - cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; - } - - final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); - final int keyOffsets[] = new int[keysArray.length]; - final int keyLengths[] = new int[keysArray.length]; - for(int i = 0; i < keyLengths.length; i++) { - keyLengths[i] = keysArray[i].length; - } - - final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, - keysArray, keyOffsets, keyLengths, cfHandles); - - final Map<byte[], byte[]> keyValueMap - = new HashMap<>(computeCapacityHint(values.length)); - for(int i = 0; i < values.length; i++) { - if(values[i] == null) { - continue; - } - keyValueMap.put(keys.get(i), values[i]); - } - - return keyValueMap; + public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey, + final byte[] endKey) throws RocksDBException { + deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, + beginKey.length, endKey, 0, endKey.length); } /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Removes the database entries in the range ["beginKey", "endKey"), i.e., + * including "beginKey" and excluding "endKey". Returns OK on success, and + * a non-OK status on error. It is not an error if no keys exist in the + * range ["beginKey", "endKey"). * - * @param key Key to delete within database * - * @throws RocksDBException thrown if error happens in underlying - * native library.
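The WriteOptions overload behaves the same way but lets the caller control durability; a sketch assuming an open RocksDB instance `db` (setSync(true) forces the range deletion to be persisted before the call returns):

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.WriteOptions;

    static void purgeRangeDurably(final RocksDB db) throws RocksDBException {
      try (final WriteOptions wo = new WriteOptions().setSync(true)) {
        db.deleteRange(wo, "a".getBytes(StandardCharsets.UTF_8),
            "b".getBytes(StandardCharsets.UTF_8));
      }
    }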
+ * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param writeOpt WriteOptions to be used with delete operation + * @param beginKey First key to delete within database (inclusive) + * @param endKey Last key to delete within database (exclusive) * - * @deprecated Use {@link #delete(byte[])} + * @throws RocksDBException thrown if error happens in underlying native + * library. */ - @Deprecated - public void remove(final byte[] key) throws RocksDBException { - delete(key); + public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) + throws RocksDBException { + deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, + beginKey.length, endKey, 0, endKey.length, + columnFamilyHandle.nativeHandle_); } + /** - * Delete the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair. * - * @param key Key to delete within database + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for the + * specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void delete(final byte[] key) throws RocksDBException { - delete(nativeHandle_, key, 0, key.length); + public void merge(final byte[] key, final byte[] value) + throws RocksDBException { + merge(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param key Key to delete within database + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value to be merged with the current value for the + * specified key. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than + * ("value".length - vOffset) * * @throws RocksDBException thrown if error happens in underlying * native library. - * - * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} + * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ - @Deprecated - public void remove(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException { - delete(columnFamilyHandle, key); + public void merge(final byte[] key, int offset, int len, final byte[] value, + final int vOffset, final int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, key, offset, len, value, vOffset, vLen); } /** - * Delete the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair in a ColumnFamily.
* - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param key Key to delete within database + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void delete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException { - delete(nativeHandle_, key, 0, key.length, columnFamilyHandle.nativeHandle_); + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final byte[] value) throws RocksDBException { + merge(nativeHandle_, key, 0, key.length, value, 0, value.length, + columnFamilyHandle.nativeHandle_); } /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair in a ColumnFamily. * - * @param writeOpt WriteOptions to be used with delete operation - * @param key Key to delete within database + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) * * @throws RocksDBException thrown if error happens in underlying * native library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds + */ + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final int offset, final int len, final byte[] value, + final int vOffset, final int vLen) throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + + /** + * Add merge operand for key/value pair. * - * @deprecated Use {@link #delete(WriteOptions, byte[])} + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ - @Deprecated - public void remove(final WriteOptions writeOpt, final byte[] key) - throws RocksDBException { - delete(writeOpt, key); + public void merge(final WriteOptions writeOpts, final byte[] key, + final byte[] value) throws RocksDBException { + merge(nativeHandle_, writeOpts.nativeHandle_, + key, 0, key.length, value, 0, value.length); } /** - * Delete the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair.
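merge() only has a useful meaning when a merge operator is configured on the options; a sketch assuming the built-in string-append operator (operator name, path, and keys are illustrative):

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static void mergeExample() throws RocksDBException {
      try (final Options opts = new Options()
              .setCreateIfMissing(true)
              .setMergeOperatorName("stringappend");
          final RocksDB db = RocksDB.open(opts, "/tmp/merge-demo")) {
        final byte[] key = "tags".getBytes(StandardCharsets.UTF_8);
        db.merge(key, "red".getBytes(StandardCharsets.UTF_8));
        db.merge(key, "blue".getBytes(StandardCharsets.UTF_8));
        // With "stringappend" the stored value now reads "red,blue".
      }
    }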
* - * @param writeOpt WriteOptions to be used with delete operation - * @param key Key to delete within database + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) * * @throws RocksDBException thrown if error happens in underlying * native library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ - public void delete(final WriteOptions writeOpt, final byte[] key) + public void merge(final WriteOptions writeOpts, + final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length); + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen); } /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param writeOpt WriteOptions to be used with delete operation - * @param key Key to delete within database + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for the + * specified key. * * @throws RocksDBException thrown if error happens in underlying * native library. - * - * @deprecated Use {@link #delete(ColumnFamilyHandle, WriteOptions, byte[])} */ - @Deprecated - public void remove(final ColumnFamilyHandle columnFamilyHandle, - final WriteOptions writeOpt, final byte[] key) + public void merge(final ColumnFamilyHandle columnFamilyHandle, + final WriteOptions writeOpts, final byte[] key, final byte[] value) throws RocksDBException { - delete(columnFamilyHandle, writeOpt, key); + merge(nativeHandle_, writeOpts.nativeHandle_, + key, 0, key.length, value, 0, value.length, + columnFamilyHandle.nativeHandle_); } /** - * Delete the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. + * Add merge operand for key/value pair.
+ * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the value to be merged with the current value for + * the specified key. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) * * @throws RocksDBException thrown if error happens in underlying * native library. + * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ - public void delete(final ColumnFamilyHandle columnFamilyHandle, - final WriteOptions writeOpt, final byte[] key) + public void merge( + final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpts, + final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) throws RocksDBException { - delete(nativeHandle_, writeOpt.nativeHandle_, key, 0, key.length, + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + merge(nativeHandle_, writeOpts.nativeHandle_, + key, offset, len, value, vOffset, vLen, columnFamilyHandle.nativeHandle_); } /** - * Remove the database entry for {@code key}. Requires that the key exists - * and was not overwritten. It is not an error if the key did not exist - * in the database. + * Apply the specified updates to the database. * - * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple - * times), then the result of calling SingleDelete() on this key is undefined. - * SingleDelete() only behaves correctly if there has been only one Put() - * for this key since the previous call to SingleDelete() for this key. + * @param writeOpts WriteOptions instance + * @param updates WriteBatch instance * - * This feature is currently an experimental performance optimization - * for a very specific workload. It is up to the caller to ensure that - * SingleDelete is only used for a key that is not deleted using Delete() or - * written using Merge(). Mixing SingleDelete operations with Deletes and - * Merges can result in undefined behavior. + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void write(final WriteOptions writeOpts, final WriteBatch updates) + throws RocksDBException { + write0(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); + } + + /** + * Apply the specified updates to the database. * - * @param key Key to delete within database + * @param writeOpts WriteOptions instance + * @param updates WriteBatchWithIndex instance * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final byte[] key) throws RocksDBException { - singleDelete(nativeHandle_, key, key.length); + public void write(final WriteOptions writeOpts, + final WriteBatchWithIndex updates) throws RocksDBException { + write1(nativeHandle_, writeOpts.nativeHandle_, updates.nativeHandle_); } + // TODO(AR) we should improve the #get() API, returning -1 (RocksDB.NOT_FOUND) is not very nice + // when we could communicate better status information; the C++ code shows that -2 could also be returned + /** - * Remove the database entry for {@code key}.
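The write() overloads above apply a whole batch atomically; a minimal sketch assuming an open RocksDB instance `db` and a RocksJava version (as in this diff) where WriteBatch#delete is available:

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;
    import org.rocksdb.WriteBatch;
    import org.rocksdb.WriteOptions;

    static void atomicUpdate(final RocksDB db) throws RocksDBException {
      try (final WriteBatch batch = new WriteBatch();
          final WriteOptions wo = new WriteOptions()) {
        batch.put("k1".getBytes(StandardCharsets.UTF_8),
            "v1".getBytes(StandardCharsets.UTF_8));
        batch.delete("k2".getBytes(StandardCharsets.UTF_8));
        db.write(wo, batch); // both changes land together, or neither does
      }
    }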
Requires that the key exists - * and was not overwritten. It is not an error if the key did not exist - * in the database. - * - * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple - * times), then the result of calling SingleDelete() on this key is undefined. - * SingleDelete() only behaves correctly if there has been only one Put() - * for this key since the previous call to SingleDelete() for this key. + * Get the value associated with the specified key within the default column family. * - * This feature is currently an experimental performance optimization - * for a very specific workload. It is up to the caller to ensure that - * SingleDelete is only used for a key that is not deleted using Delete() or - * written using Merge(). Mixing SingleDelete operations with Deletes and - * Merges can result in undefined behavior. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. * - * @param columnFamilyHandle The column family to delete the key from - * @param key Key to delete within database + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException { - singleDelete(nativeHandle_, key, key.length, - columnFamilyHandle.nativeHandle_); + public int get(final byte[] key, final byte[] value) throws RocksDBException { + return get(nativeHandle_, key, 0, key.length, value, 0, value.length); } /** - * Remove the database entry for {@code key}. Requires that the key exists - * and was not overwritten. It is not an error if the key did not exist - * in the database. + * Get the value associated with the specified key within the default column family. * - * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple - * times), then the result of calling SingleDelete() on this key is undefined. - * SingleDelete() only behaves correctly if there has been only one Put() - * for this key since the previous call to SingleDelete() for this key. + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) * - * This feature is currently an experimental performance optimization - * for a very specific workload.
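The return-code contract documented above (actual value size, partial results, RocksDB.NOT_FOUND) drives a retry pattern like the following sketch (assumes an open RocksDB instance `db`):

    import java.util.Arrays;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static byte[] readWithBuffer(final RocksDB db, final byte[] key)
        throws RocksDBException {
      byte[] buf = new byte[64];
      final int size = db.get(key, buf);
      if (size == RocksDB.NOT_FOUND) {
        return null;                  // key absent
      }
      if (size > buf.length) {        // buffer too small: only a partial
        buf = new byte[size];         // result was copied, so retry with a
        db.get(key, buf);             // buffer of the reported size
        return buf;
      }
      return Arrays.copyOf(buf, size); // trim to the actual value length
    }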
+ * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. * - * Note: consider setting {@link WriteOptions#setSync(boolean)} true. + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, key, offset, len, value, vOffset, vLen); + } + + /** + * Get the value associated with the specified key within column family. * - * @param writeOpt Write options for the delete - * @param key Key to delete within database + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final WriteOptions writeOpt, final byte[] key) - throws RocksDBException { - singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length); + public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final byte[] value) throws RocksDBException, IllegalArgumentException { + return get(nativeHandle_, key, 0, key.length, value, 0, value.length, + columnFamilyHandle.nativeHandle_); } /** - * Remove the database entry for {@code key}. Requires that the key exists - * and was not overwritten. It is not an error if the key did not exist - * in the database. + * Get the value associated with the specified key within column family. * - * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple - * times), then the result of calling SingleDelete() on this key is undefined. - * SingleDelete() only behaves correctly if there has been only one Put() - * for this key since the previous call to SingleDelete() for this key. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) * - * This feature is currently an experimental performance optimization - * for a very specific workload.
It is up to the caller to ensure that - * SingleDelete is only used for a key that is not deleted using Delete() or - * written using Merge(). Mixing SingleDelete operations with Deletes and - * Merges can result in undefined behavior. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. * - * Note: consider setting {@link WriteOptions#setSync(boolean)} true. + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final int offset, final int len, final byte[] value, final int vOffset, + final int vLen) throws RocksDBException, IllegalArgumentException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, key, offset, len, value, vOffset, vLen, + columnFamilyHandle.nativeHandle_); + } + + /** + * Get the value associated with the specified key. * - * @param columnFamilyHandle The column family to delete the key from - * @param writeOpt Write options for the delete - * @param key Key to delete within database + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ReadOptions opt, final byte[] key, + final byte[] value) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, + key, 0, key.length, value, 0, value.length); + } + + /** + * Get the value associated with the specified key. + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library.
+ */ + public int get(final ReadOptions opt, final byte[] key, final int offset, + final int len, final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, opt.nativeHandle_, + key, offset, len, value, vOffset, vLen); + } + + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key, final byte[] value) + throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, value, + 0, value.length, columnFamilyHandle.nativeHandle_); + } + + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be + * non-negative and no larger than ("key".length - offset) + * @param value the out-value to receive the retrieved value. + * @param vOffset the offset of the "value" array to be used, must be + * non-negative and no larger than "value".length + * @param vLen the length of the "value" array to be used, must be + * non-negative and no larger than ("value".length - vOffset) + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value is not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key, final int offset, final int len, + final byte[] value, final int vOffset, final int vLen) + throws RocksDBException { + checkBounds(offset, len, key.length); + checkBounds(vOffset, vLen, value.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len, value, + vOffset, vLen, columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key to retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key.
+ * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final byte[] key) throws RocksDBException { + return get(nativeHandle_, key, 0, key.length); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final byte[] key, final int offset, + final int len) throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, key, offset, len); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException { + return get(nativeHandle_, key, 0, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final int offset, final int len) + throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key to retrieve the value. + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library.
+ */ + public byte[] get(final ReadOptions opt, final byte[] key) + throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ReadOptions opt, final byte[] key, final int offset, + final int len) throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, 0, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than ("key".length - offset) + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions opt, final byte[] key, final int offset, final int len) + throws RocksDBException { + checkBounds(offset, len, key.length); + return get(nativeHandle_, opt.nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_); + } + + /** + * Returns a map of keys for which values were found in DB. + * + * @param keys List of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * + * @deprecated Consider {@link #multiGetAsList(List)} instead.
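The array-returning convenience form above signals a missing key with null rather than a status code; a short sketch assuming an open RocksDB instance `db`:

    import java.nio.charset.StandardCharsets;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static String readOrDefault(final RocksDB db, final String key)
        throws RocksDBException {
      final byte[] value = db.get(key.getBytes(StandardCharsets.UTF_8));
      return value == null ? "<absent>"
          : new String(value, StandardCharsets.UTF_8);
    }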
+ */ + @Deprecated + public Map<byte[], byte[]> multiGet(final List<byte[]> keys) + throws RocksDBException { + assert(keys.size() != 0); + + final byte[][] keysArray = keys.toArray(new byte[0][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, + keyLengths); + + final Map<byte[], byte[]> keyValueMap = + new HashMap<>(computeCapacityHint(values.length)); + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { + continue; + } + + keyValueMap.put(keys.get(i), values[i]); + } + + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + *
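Since the map-returning multiGet overloads here are deprecated in favour of the list-returning form added later in this diff, new code would look like this sketch (assumes an open RocksDB instance `db`):

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import java.util.List;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    static void multiGetExample(final RocksDB db) throws RocksDBException {
      final List<byte[]> keys = Arrays.asList(
          "k1".getBytes(StandardCharsets.UTF_8),
          "k2".getBytes(StandardCharsets.UTF_8));
      final List<byte[]> values = db.multiGetAsList(keys);
      for (int i = 0; i < keys.size(); i++) {
        // Positions line up key-for-value; a null entry means "not found".
        System.out.println(values.get(i) == null ? "miss" : "hit");
      }
    }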

+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>

    + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys List of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. + * + * @deprecated Consider {@link #multiGetAsList(List, List)} instead. + */ + @Deprecated + public Map multiGet( + final List columnFamilyHandleList, + final List keys) throws RocksDBException, + IllegalArgumentException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size() != columnFamilyHandleList.size()) { + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final byte[][] keysArray = keys.toArray(new byte[0][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + final byte[][] values = multiGet(nativeHandle_, keysArray, keyOffsets, + keyLengths, cfHandles); + + final Map keyValueMap = + new HashMap<>(computeCapacityHint(values.length)); + for(int i = 0; i < values.length; i++) { + if (values[i] == null) { + continue; + } + keyValueMap.put(keys.get(i), values[i]); + } + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + * + * @param opt Read options. + * @param keys of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * + * @deprecated Consider {@link #multiGetAsList(ReadOptions, List)} instead. + */ + @Deprecated + public Map multiGet(final ReadOptions opt, + final List keys) throws RocksDBException { + assert(keys.size() != 0); + + final byte[][] keysArray = keys.toArray(new byte[0][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, + keysArray, keyOffsets, keyLengths); + + final Map keyValueMap = + new HashMap<>(computeCapacityHint(values.length)); + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { + continue; + } + + keyValueMap.put(keys.get(i), values[i]); + } + + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + *

+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>

    + * + * @param opt Read options. + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. + * + * @deprecated Consider {@link #multiGetAsList(ReadOptions, List, List)} + * instead. + */ + @Deprecated + public Map multiGet(final ReadOptions opt, + final List columnFamilyHandleList, + final List keys) throws RocksDBException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()){ + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final byte[][] keysArray = keys.toArray(new byte[0][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + final byte[][] values = multiGet(nativeHandle_, opt.nativeHandle_, + keysArray, keyOffsets, keyLengths, cfHandles); + + final Map keyValueMap + = new HashMap<>(computeCapacityHint(values.length)); + for(int i = 0; i < values.length; i++) { + if(values[i] == null) { + continue; + } + keyValueMap.put(keys.get(i), values[i]); + } + + return keyValueMap; + } + + /** + * Takes a list of keys, and returns a list of values for the given list of + * keys. List will contain null for keys which could not be found. + * + * @param keys List of keys for which values need to be retrieved. + * @return List of values for the given list of keys. List will contain + * null for keys which could not be found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public List multiGetAsList(final List keys) + throws RocksDBException { + assert(keys.size() != 0); + + final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets, + keyLengths)); + } + + /** + * Returns a list of values for the given list of keys. List will contain + * null for keys which could not be found. + *
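[Editor's illustration, not part of the patch: the non-deprecated multiGetAsList above returns values in key order, which a short sketch makes clear; an open RocksDB handle `db` is assumed.]

    final List<byte[]> keys = Arrays.asList(
        "k1".getBytes(StandardCharsets.UTF_8),
        "k2".getBytes(StandardCharsets.UTF_8));
    final List<byte[]> values = db.multiGetAsList(keys);  // same order as keys
    for (int i = 0; i < keys.size(); i++) {
      if (values.get(i) == null) {
        // keys.get(i) was not found
      }
    }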

+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>

    + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys List of keys for which values need to be retrieved. + * @return List of values for the given list of keys. List will contain + * null for keys which could not be found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. + */ + public List multiGetAsList( + final List columnFamilyHandleList, + final List keys) throws RocksDBException, + IllegalArgumentException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size() != columnFamilyHandleList.size()) { + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + return Arrays.asList(multiGet(nativeHandle_, keysArray, keyOffsets, + keyLengths, cfHandles)); + } + + /** + * Returns a list of values for the given list of keys. List will contain + * null for keys which could not be found. + * + * @param opt Read options. + * @param keys of keys for which values need to be retrieved. + * @return List of values for the given list of keys. List will contain + * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying - * native library. + * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final WriteOptions writeOpt, final byte[] key) throws RocksDBException { - singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length, - columnFamilyHandle.nativeHandle_); + public List multiGetAsList(final ReadOptions opt, + final List keys) throws RocksDBException { + assert(keys.size() != 0); + + final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_, + keysArray, keyOffsets, keyLengths)); } /** - * DB implements can export properties about their state - * via this method on a per column family level. - * - *

-   * <p>If {@code property} is a valid property understood by this DB
-   * implementation, fills {@code value} with its current value and
-   * returns true. Otherwise returns false.</p>
-   *
-   * <p>Valid property names include:
-   * <ul>
-   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
-   *     level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
-   *     number (e.g. "0").</li>
-   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
-   *     about the internal operation of the DB.</li>
-   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
-   *     of the sstables that make up the db contents.</li>
-   * </ul>
    + * Returns a list of values for the given list of keys. List will contain + * null for keys which could not be found. + *

+   * <p>
+   * Note: Every key needs to have a related column family name in
+   * {@code columnFamilyHandleList}.
+   * </p>

    * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param property to be fetched. See above for examples - * @return property value + * @param opt Read options. + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys of keys for which values need to be retrieved. + * @return List of values for the given list of keys. List will contain + * null for keys which could not be found. * * @throws RocksDBException thrown if error happens in underlying * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. */ - public String getProperty(final ColumnFamilyHandle columnFamilyHandle, - final String property) throws RocksDBException { - return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, - property, property.length()); + public List multiGetAsList(final ReadOptions opt, + final List columnFamilyHandleList, + final List keys) throws RocksDBException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()){ + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + final long[] cfHandles = new long[columnFamilyHandleList.size()]; + for (int i = 0; i < columnFamilyHandleList.size(); i++) { + cfHandles[i] = columnFamilyHandleList.get(i).nativeHandle_; + } + + final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); + final int keyOffsets[] = new int[keysArray.length]; + final int keyLengths[] = new int[keysArray.length]; + for(int i = 0; i < keyLengths.length; i++) { + keyLengths[i] = keysArray[i].length; + } + + return Arrays.asList(multiGet(nativeHandle_, opt.nativeHandle_, + keysArray, keyOffsets, keyLengths, cfHandles)); } /** - * Removes the database entries in the range ["beginKey", "endKey"), i.e., - * including "beginKey" and excluding "endKey". a non-OK status on error. It - * is not an error if no keys exist in the range ["beginKey", "endKey"). - * - * Delete the database entry (if any) for "key". Returns OK on success, and a - * non-OK status on error. It is not an error if "key" did not exist in the - * database. + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @param beginKey - * First key to delete within database (included) - * @param endKey - * Last key to delete within database (excluded) + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException - * thrown if error happens in underlying native library. + * @param key byte array of a key to search for + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. */ - public void deleteRange(final byte[] beginKey, final byte[] endKey) throws RocksDBException { - deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length); + public boolean keyMayExist(final byte[] key, final StringBuilder value) { + return keyMayExist(nativeHandle_, key, 0, key.length, value); } /** - * Removes the database entries in the range ["beginKey", "endKey"), i.e., - * including "beginKey" and excluding "endKey". a non-OK status on error. 
It - * is not an error if no keys exist in the range ["beginKey", "endKey"). + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * Delete the database entry (if any) for "key". Returns OK on success, and a - * non-OK status on error. It is not an error if "key" did not exist in the - * database. + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @param columnFamilyHandle - * {@link org.rocksdb.ColumnFamilyHandle} instance - * @param beginKey - * First key to delete within database (included) - * @param endKey - * Last key to delete within database (excluded) + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. * - * @throws RocksDBException - * thrown if error happens in underlying native library. + * @return boolean value indicating if key does not exist or might exist. */ - public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] beginKey, - final byte[] endKey) throws RocksDBException { - deleteRange(nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, endKey.length, - columnFamilyHandle.nativeHandle_); + public boolean keyMayExist(final byte[] key, final int offset, final int len, + final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, key, offset, len, value); } /** - * Removes the database entries in the range ["beginKey", "endKey"), i.e., - * including "beginKey" and excluding "endKey". a non-OK status on error. It - * is not an error if no keys exist in the range ["beginKey", "endKey"). - * - * Delete the database entry (if any) for "key". Returns OK on success, and a - * non-OK status on error. It is not an error if "key" did not exist in the - * database. + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @param writeOpt - * WriteOptions to be used with delete operation - * @param beginKey - * First key to delete within database (included) - * @param endKey - * Last key to delete within database (excluded) + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException - * thrown if error happens in underlying native library. + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. 
*/ - public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey, final byte[] endKey) - throws RocksDBException { - deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, - endKey.length); + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, final StringBuilder value) { + return keyMayExist(nativeHandle_, key, 0, key.length, + columnFamilyHandle.nativeHandle_, value); } /** - * Removes the database entries in the range ["beginKey", "endKey"), i.e., - * including "beginKey" and excluding "endKey". a non-OK status on error. It - * is not an error if no keys exist in the range ["beginKey", "endKey"). - * - * Delete the database entry (if any) for "key". Returns OK on success, and a - * non-OK status on error. It is not an error if "key" did not exist in the - * database. + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param writeOpt - * WriteOptions to be used with delete operation - * @param beginKey - * First key to delete within database (included) - * @param endKey - * Last key to delete within database (excluded) + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException - * thrown if error happens in underlying native library. + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. */ - public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, - final byte[] beginKey, final byte[] endKey) throws RocksDBException { - deleteRange(nativeHandle_, writeOpt.nativeHandle_, beginKey, 0, beginKey.length, endKey, 0, - endKey.length, columnFamilyHandle.nativeHandle_); + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key, int offset, int len, final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, key, offset, len, + columnFamilyHandle.nativeHandle_, value); } /** - * DB implementations can export properties about their state - * via this method. If "property" is a valid property understood by this - * DB implementation, fills "*value" with its current value and returns - * true. Otherwise returns false. - * - *
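[Editor's illustration, not part of the patch: keyMayExist is a bloom-filter style negative check, so a false result is definitive while a true result still has to be confirmed with get(). Sketch, assuming an open RocksDB handle `db`.]

    final StringBuilder cachedValue = new StringBuilder();
    final byte[] key = "k1".getBytes(StandardCharsets.UTF_8);
    if (!db.keyMayExist(key, cachedValue)) {
      // definitely absent; no read I/O needed
    } else if (cachedValue.length() > 0) {
      // the value was already found in the block cache
    } else {
      final byte[] value = db.get(key);  // may still be null (false positive)
    }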

-   * <p>Valid property names include:
-   * <ul>
-   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
-   *     level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
-   *     number (e.g. "0").</li>
-   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
-   *     about the internal operation of the DB.</li>
-   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
-   *     of the sstables that make up the db contents.</li>
-   * </ul>
    + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @param property to be fetched. See above for examples - * @return property value + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException thrown if error happens in underlying - * native library. + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. */ - public String getProperty(final String property) throws RocksDBException { - return getProperty0(nativeHandle_, property, property.length()); + public boolean keyMayExist(final ReadOptions readOptions, + final byte[] key, final StringBuilder value) { + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, 0, key.length, value); } /** - *

-   * <p> Similar to GetProperty(), but only works for a subset of properties
-   * whose return value is a numerical value. Return the value as long.</p>
-   *
-   * <p><strong>Note</strong>: As the returned property is of type
-   * {@code uint64_t} on C++ side the returning value can be negative
-   * because Java supports in Java 7 only signed long values.</p>
-   *
-   * <p><strong>Java 7</strong>: To mitigate the problem of the non-existent
-   * unsigned long type, values should be encapsulated using
-   * {@link java.math.BigInteger} to reflect the correct value. The correct
-   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
-   *
-   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
-   * unsigned long using provided methods of type {@link Long}.</p>
    - * - * @param property to be fetched. + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @return numerical property value. + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException if an error happens in the underlying native code. + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. */ - public long getLongProperty(final String property) throws RocksDBException { - return getLongProperty(nativeHandle_, property, property.length()); + public boolean keyMayExist(final ReadOptions readOptions, + final byte[] key, final int offset, final int len, + final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, offset, len, value); } /** - *
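[Editor's illustration, not part of the patch: the Java 7/Java 8 remarks above amount to the following at a call site, assuming an open RocksDB handle `db` and an enclosing method that declares throws RocksDBException.]

    final long bits = db.getLongProperty("rocksdb.estimate-live-data-size");
    // Java 8+: interpret the long's bits as an unsigned value.
    System.out.println(Long.toUnsignedString(bits));
    // Java 7 mitigation described above: add 2^64 to negative values.
    BigInteger unsigned = BigInteger.valueOf(bits);
    if (unsigned.signum() < 0) {
      unsigned = unsigned.add(BigInteger.ONE.shiftLeft(64));
    }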

-   * <p> Similar to GetProperty(), but only works for a subset of properties
-   * whose return value is a numerical value. Return the value as long.</p>
-   *
-   * <p><strong>Note</strong>: As the returned property is of type
-   * {@code uint64_t} on C++ side the returning value can be negative
-   * because Java supports in Java 7 only signed long values.</p>
-   *
-   * <p><strong>Java 7</strong>: To mitigate the problem of the non-existent
-   * unsigned long type, values should be encapsulated using
-   * {@link java.math.BigInteger} to reflect the correct value. The correct
-   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   * If the key definitely does not exist in the database, then this method
+   * returns false, else true.
    *
-   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
-   * unsigned long using provided methods of type {@link Long}.</p>
    + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance - * @param property to be fetched. + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final StringBuilder value) { + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, 0, key.length, columnFamilyHandle.nativeHandle_, + value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. * - * @return numerical property value + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. * - * @throws RocksDBException if an error happens in the underlying native code. + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @param value StringBuilder instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. */ - public long getLongProperty(final ColumnFamilyHandle columnFamilyHandle, - final String property) throws RocksDBException { - return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, - property, property.length()); + public boolean keyMayExist(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final int offset, final int len, final StringBuilder value) { + checkBounds(offset, len, key.length); + return keyMayExist(nativeHandle_, readOptions.nativeHandle_, + key, offset, len, columnFamilyHandle.nativeHandle_, + value); } /** @@ -1552,37 +2390,6 @@ public RocksIterator newIterator(final ReadOptions readOptions) { readOptions.nativeHandle_)); } - /** - *

-   * <p>Return a handle to the current DB state. Iterators created with
-   * this handle will all observe a stable snapshot of the current DB
-   * state. The caller must call ReleaseSnapshot(result) when the
-   * snapshot is no longer needed.</p>
-   *
-   * <p>nullptr will be returned if the DB fails to take a snapshot or does
-   * not support snapshot.</p>
    - * - * @return Snapshot {@link Snapshot} instance - */ - public Snapshot getSnapshot() { - long snapshotHandle = getSnapshot(nativeHandle_); - if (snapshotHandle != 0) { - return new Snapshot(snapshotHandle); - } - return null; - } - - /** - * Release a previously acquired snapshot. The caller must not - * use "snapshot" after this call. - * - * @param snapshot {@link Snapshot} instance - */ - public void releaseSnapshot(final Snapshot snapshot) { - if (snapshot != null) { - releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); - } - } - /** *
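[Editor's illustration, not part of the patch: the snapshot contract described above in practice, assuming an open RocksDB handle `db`.]

    final Snapshot snapshot = db.getSnapshot();  // may be null, see above
    try (final ReadOptions readOptions = new ReadOptions()) {
      readOptions.setSnapshot(snapshot);
      final byte[] value =
          db.get(readOptions, "k1".getBytes(StandardCharsets.UTF_8));
    } finally {
      db.releaseSnapshot(snapshot);  // the caller must release it
    }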

    * <p>Return a heap-allocated iterator over the contents of the
    * database. The result of newIterator() is initially invalid
@@ -1677,88 +2484,331 @@ public List<RocksIterator> newIterators(
     return iterators;
   }
 
+
   /**
-   * Gets the handle for the default column family
+   * <p>Return a handle to the current DB state. Iterators created with
+   * this handle will all observe a stable snapshot of the current DB
+   * state. The caller must call ReleaseSnapshot(result) when the
+   * snapshot is no longer needed.</p>
    *
-   * @return The handle of the default column family
+   * <p>nullptr will be returned if the DB fails to take a snapshot or does
+   * not support snapshot.</p>
    + * + * @return Snapshot {@link Snapshot} instance */ - public ColumnFamilyHandle getDefaultColumnFamily() { - ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, - getDefaultColumnFamily(nativeHandle_)); - cfHandle.disOwnNativeHandle(); - return cfHandle; + public Snapshot getSnapshot() { + long snapshotHandle = getSnapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; } /** - * Creates a new column family with the name columnFamilyName and - * allocates a ColumnFamilyHandle within an internal structure. - * The ColumnFamilyHandle is automatically disposed with DB disposal. + * Release a previously acquired snapshot. * - * @param columnFamilyDescriptor column family to be created. - * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * The caller must not use "snapshot" after this call. + * + * @param snapshot {@link Snapshot} instance + */ + public void releaseSnapshot(final Snapshot snapshot) { + if (snapshot != null) { + releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); + } + } + + /** + * DB implements can export properties about their state + * via this method on a per column family level. + * + *

+   * <p>If {@code property} is a valid property understood by this DB
+   * implementation, fills {@code value} with its current value and
+   * returns true. Otherwise returns false.</p>
+   *
+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
+   *     level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
+   *     number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *     of the sstables that make up the db contents.</li>
+   * </ul>
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * @param property to be fetched. See above for examples + * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public ColumnFamilyHandle createColumnFamily( - final ColumnFamilyDescriptor columnFamilyDescriptor) - throws RocksDBException { - return new ColumnFamilyHandle(this, createColumnFamily(nativeHandle_, - columnFamilyDescriptor.columnFamilyName(), - columnFamilyDescriptor.columnFamilyOptions().nativeHandle_)); + public String getProperty( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final String property) throws RocksDBException { + return getProperty(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + property, property.length()); } /** - * Drops the column family identified by columnFamilyName. Internal - * handles to this column family will be disposed. If the column family - * is not known removal will fail. + * DB implementations can export properties about their state + * via this method. If "property" is a valid property understood by this + * DB implementation, fills "*value" with its current value and returns + * true. Otherwise returns false. * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance + *
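[Editor's illustration, not part of the patch: the new null-means-default-column-family convention at the call site; `db` (an open RocksDB) and `cfHandle` (a valid ColumnFamilyHandle) are assumed.]

    final String dbStats = db.getProperty("rocksdb.stats");           // default CF
    final String cfStats = db.getProperty(cfHandle, "rocksdb.stats"); // explicit CF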

+   * <p>Valid property names include:
+   * <ul>
+   * <li>"rocksdb.num-files-at-level&lt;N&gt;" - return the number of files at
+   *     level &lt;N&gt;, where &lt;N&gt; is an ASCII representation of a level
+   *     number (e.g. "0").</li>
+   * <li>"rocksdb.stats" - returns a multi-line string that describes statistics
+   *     about the internal operation of the DB.</li>
+   * <li>"rocksdb.sstables" - returns a multi-line string that describes all
+   *     of the sstables that make up the db contents.</li>
+   * </ul>
    + * + * @param property to be fetched. See above for examples + * @return property value * * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void dropColumnFamily(final ColumnFamilyHandle columnFamilyHandle) - throws RocksDBException, IllegalArgumentException { - // throws RocksDBException if something goes wrong - dropColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_); - // After the drop the native handle is not valid anymore - columnFamilyHandle.disOwnNativeHandle(); + public String getProperty(final String property) throws RocksDBException { + return getProperty(null, property); } + /** - *

-   * <p>Flush all memory table data.</p>
-   *
+   * Gets a property map.
    *
-   * <p>Note: it must be ensured that the FlushOptions instance
-   * is not GC'ed before this method finishes. If the wait parameter is
-   * set to false, flush processing is asynchronous.</p>
    + * @param property to be fetched. * - * @param flushOptions {@link org.rocksdb.FlushOptions} instance. - * @throws RocksDBException thrown if an error occurs within the native - * part of the library. + * @return the property map + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public Map getMapProperty(final String property) + throws RocksDBException { + return getMapProperty(null, property); + } + + /** + * Gets a property map. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * @param property to be fetched. + * + * @return the property map + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public Map getMapProperty( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final String property) throws RocksDBException { + return getMapProperty(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + property, property.length()); + } + + /** + *

+   * <p> Similar to GetProperty(), but only works for a subset of properties
+   * whose return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non-existent
+   * unsigned long type, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
    + * + * @param property to be fetched. + * + * @return numerical property value. + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty(final String property) throws RocksDBException { + return getLongProperty(null, property); + } + + /** + *
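[Editor's illustration, not part of the patch: a sketch of the map-valued property accessor added above, assuming an open RocksDB handle `db`.]

    final Map<String, String> cfstats = db.getMapProperty("rocksdb.cfstats");
    for (final Map.Entry<String, String> entry : cfstats.entrySet()) {
      System.out.println(entry.getKey() + " = " + entry.getValue());
    }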

+   * <p> Similar to GetProperty(), but only works for a subset of properties
+   * whose return value is a numerical value. Return the value as long.</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non-existent
+   * unsigned long type, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family + * @param property to be fetched. + * + * @return numerical property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final String property) throws RocksDBException { + return getLongProperty(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + property, property.length()); + } + + /** + * Reset internal stats for DB and all column families. + * + * Note this doesn't reset {@link Options#statistics()} as it is not + * owned by DB. + */ + public void resetStats() throws RocksDBException { + resetStats(nativeHandle_); + } + + /** + *

+   * <p> Return sum of the getLongProperty of all the column families</p>
+   *
+   * <p><strong>Note</strong>: As the returned property is of type
+   * {@code uint64_t} on C++ side the returning value can be negative
+   * because Java supports in Java 7 only signed long values.</p>
+   *
+   * <p><strong>Java 7</strong>: To mitigate the problem of the non-existent
+   * unsigned long type, values should be encapsulated using
+   * {@link java.math.BigInteger} to reflect the correct value. The correct
+   * behavior is guaranteed if {@code 2^64} is added to negative values.</p>
+   *
+   * <p><strong>Java 8</strong>: In Java 8 the value should be treated as
+   * unsigned long using provided methods of type {@link Long}.</p>
    + * + * @param property to be fetched. + * + * @return numerical property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getAggregatedLongProperty(final String property) + throws RocksDBException { + return getAggregatedLongProperty(nativeHandle_, property, + property.length()); + } + + /** + * Get the approximate file system space used by keys in each range. + * + * Note that the returned sizes measure file system space usage, so + * if the user data compresses by a factor of ten, the returned + * sizes will be one-tenth the size of the corresponding user data size. + * + * If {@code sizeApproximationFlags} defines whether the returned size + * should include the recently written data in the mem-tables (if + * the mem-table type supports it), data serialized to disk, or both. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family + * @param ranges the ranges over which to approximate sizes + * @param sizeApproximationFlags flags to determine what to include in the + * approximation. + * + * @return the sizes + */ + public long[] getApproximateSizes( + /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle, + final List ranges, + final SizeApproximationFlag... sizeApproximationFlags) { + + byte flags = 0x0; + for (final SizeApproximationFlag sizeApproximationFlag + : sizeApproximationFlags) { + flags |= sizeApproximationFlag.getValue(); + } + + return getApproximateSizes(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + toRangeSliceHandles(ranges), flags); + } + + /** + * Get the approximate file system space used by keys in each range for + * the default column family. + * + * Note that the returned sizes measure file system space usage, so + * if the user data compresses by a factor of ten, the returned + * sizes will be one-tenth the size of the corresponding user data size. + * + * If {@code sizeApproximationFlags} defines whether the returned size + * should include the recently written data in the mem-tables (if + * the mem-table type supports it), data serialized to disk, or both. + * + * @param ranges the ranges over which to approximate sizes + * @param sizeApproximationFlags flags to determine what to include in the + * approximation. + * + * @return the sizes. + */ + public long[] getApproximateSizes(final List ranges, + final SizeApproximationFlag... sizeApproximationFlags) { + return getApproximateSizes(null, ranges, sizeApproximationFlags); + } + + public static class CountAndSize { + public final long count; + public final long size; + + public CountAndSize(final long count, final long size) { + this.count = count; + this.size = size; + } + } + + /** + * This method is similar to + * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)}, + * except that it returns approximate number of records and size in memtables. 
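[Editor's illustration, not part of the patch: a sketch of the new size-approximation API, assuming an open RocksDB handle `db`.]

    final Range range = new Range(new Slice("a"), new Slice("z"));
    final long[] sizes = db.getApproximateSizes(
        Arrays.asList(range),
        SizeApproximationFlag.INCLUDE_FILES,
        SizeApproximationFlag.INCLUDE_MEMTABLES);
    // sizes[0] is the approximate file-system footprint of ["a", "z")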
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family + * @param range the ranges over which to get the memtable stats + * + * @return the count and size for the range */ - public void flush(final FlushOptions flushOptions) - throws RocksDBException { - flush(nativeHandle_, flushOptions.nativeHandle_); + public CountAndSize getApproximateMemTableStats( + /*@Nullable*/ final ColumnFamilyHandle columnFamilyHandle, + final Range range) { + final long[] result = getApproximateMemTableStats(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + range.start.getNativeHandle(), + range.limit.getNativeHandle()); + return new CountAndSize(result[0], result[1]); } /** - *
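[Editor's illustration, not part of the patch: the memtable counterpart returns the CountAndSize pair defined above; an open RocksDB handle `db` is assumed.]

    final RocksDB.CountAndSize stats =
        db.getApproximateMemTableStats(new Range(new Slice("a"), new Slice("z")));
    System.out.println("memtable entries=" + stats.count + " bytes=" + stats.size);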

-   * <p>Flush all memory table data.</p>
+   * This method is similar to
+   * {@link #getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)},
+   * except that it returns approximate number of records and size in memtables.
    *
-   * <p>Note: it must be ensured that the FlushOptions instance
-   * is not GC'ed before this method finishes. If the wait parameter is
-   * set to false, flush processing is asynchronous.</p>
    + * @param range the ranges over which to get the memtable stats * - * @param flushOptions {@link org.rocksdb.FlushOptions} instance. - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. - * @throws RocksDBException thrown if an error occurs within the native - * part of the library. + * @return the count and size for the range */ - public void flush(final FlushOptions flushOptions, - final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { - flush(nativeHandle_, flushOptions.nativeHandle_, - columnFamilyHandle.nativeHandle_); + public CountAndSize getApproximateMemTableStats( + final Range range) { + return getApproximateMemTableStats(null, range); } /** @@ -1778,7 +2828,40 @@ public void flush(final FlushOptions flushOptions, * part of the library. */ public void compactRange() throws RocksDBException { - compactRange0(nativeHandle_, false, -1, 0); + compactRange(null); + } + + /** + *

+   * <p>Range compaction of column family.</p>
+   * <p>Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
+   *
+   * <p>See also</p>
+   * <ul>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
+   * </ul>
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + compactRange(nativeHandle_, null, -1, null, -1, 0, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** @@ -1802,42 +2885,44 @@ public void compactRange() throws RocksDBException { */ public void compactRange(final byte[] begin, final byte[] end) throws RocksDBException { - compactRange0(nativeHandle_, begin, begin.length, end, - end.length, false, -1, 0); + compactRange(null, begin, end); } /** - *
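[Editor's illustration, not part of the patch: range compaction at the call site; with the new overloads, the two-argument form (or a null handle) targets the default column family. An open RocksDB handle `db` is assumed.]

    db.compactRange("a".getBytes(StandardCharsets.UTF_8),
                    "m".getBytes(StandardCharsets.UTF_8));  // just ["a", "m")
    db.compactRange();                                       // whole keyspace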

-   * <p>Range compaction of database.</p>
+   * <p>Range compaction of column family.</p>
    * <p>Note: After the database has been compacted,
    * all data will have been pushed down to the last level containing
    * any data.</p>
    *
-   * <p>Compaction outputs should be placed in options.db_paths
-   * [target_path_id]. Behavior is undefined if target_path_id is
-   * out of range.</p>
-   *
    * <p>See also</p>
    * <ul>
-   * <li>{@link #compactRange()}</li>
-   * <li>{@link #compactRange(byte[], byte[])}</li>
-   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
+   * </li>
+   * <li>
+   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[],
+   *   boolean, int, int)}
+   * </li>
    * </ul>
    * - * @param reduce_level reduce level after compaction - * @param target_level target level to compact to - * @param target_path_id the target path id of output path + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ - public void compactRange(final boolean reduce_level, - final int target_level, final int target_path_id) - throws RocksDBException { - compactRange0(nativeHandle_, reduce_level, - target_level, target_path_id); + public void compactRange( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end) throws RocksDBException { + compactRange(nativeHandle_, + begin, begin == null ? -1 : begin.length, + end, end == null ? -1 : end.length, + 0, columnFamilyHandle == null ? 0: columnFamilyHandle.nativeHandle_); } - /** *

    * <p>Range compaction of database.</p>
    *
    * <p>Note: After the database has been compacted,
@@ -1851,24 +2936,23 @@ public void compactRange(final boolean reduce_level,
    *
    * <p>See also</p>
    * <ul>
    * <li>{@link #compactRange()}</li>
-   * <li>{@link #compactRange(boolean, int, int)}</li>
    * <li>{@link #compactRange(byte[], byte[])}</li>
+   * <li>{@link #compactRange(byte[], byte[], boolean, int, int)}</li>
    * </ul>
    * - * @param begin start of key range (included in range) - * @param end end of key range (excluded from range) - * @param reduce_level reduce level after compaction - * @param target_level target level to compact to - * @param target_path_id the target path id of output path + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * + * @param changeLevel reduce level after compaction + * @param targetLevel target level to compact to + * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ - public void compactRange(final byte[] begin, final byte[] end, - final boolean reduce_level, final int target_level, - final int target_path_id) throws RocksDBException { - compactRange0(nativeHandle_, begin, begin.length, end, end.length, - reduce_level, target_level, target_path_id); + @Deprecated + public void compactRange(final boolean changeLevel, final int targetLevel, + final int targetPathId) throws RocksDBException { + compactRange(null, changeLevel, targetLevel, targetPathId); } /** @@ -1877,11 +2961,13 @@ public void compactRange(final byte[] begin, final byte[] end, * all data will have been pushed down to the last level containing * any data.
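[Editor's illustration, not part of the patch: migrating off the deprecated overload onto CompactRangeOptions, as the @deprecated tags above suggest; `db` (open RocksDB) and `cfHandle` (valid ColumnFamilyHandle) are assumed.]

    try (final CompactRangeOptions options = new CompactRangeOptions()
             .setChangeLevel(true)
             .setTargetLevel(1)) {
      db.compactRange(cfHandle, null, null, options);  // null bounds = full range
    }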

    * + *

+   * <p>Compaction outputs should be placed in options.db_paths
+   * [target_path_id]. Behavior is undefined if target_path_id is
+   * out of range.</p>
+   *
    * <p>See also</p>
    * <ul>
-   * <li>
-   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
-   * </li>
+   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
    * <li>
    *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
    * </li>
    • @@ -1891,48 +2977,67 @@ public void compactRange(final byte[] begin, final byte[] end, * *
    * + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance. + * instance, or null for the default column family. + * @param changeLevel reduce level after compaction + * @param targetLevel target level to compact to + * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ - public void compactRange(final ColumnFamilyHandle columnFamilyHandle) + @Deprecated + public void compactRange( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final boolean changeLevel, final int targetLevel, final int targetPathId) throws RocksDBException { - compactRange(nativeHandle_, false, -1, 0, - columnFamilyHandle.nativeHandle_); + final CompactRangeOptions options = new CompactRangeOptions(); + options.setChangeLevel(changeLevel); + options.setTargetLevel(targetLevel); + options.setTargetPathId(targetPathId); + compactRange(nativeHandle_, + null, -1, + null, -1, + options.nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } /** - *

    Range compaction of column family.

    + *

    Range compaction of database.

    *

    Note: After the database has been compacted, * all data will have been pushed down to the last level containing * any data.

    * + *

    Compaction outputs should be placed in options.db_paths + * [target_path_id]. Behavior is undefined if target_path_id is + * out of range.

    + * *

    See also

    *
      - *
    • {@link #compactRange(ColumnFamilyHandle)}
    • - *
    • - * {@link #compactRange(ColumnFamilyHandle, boolean, int, int)} - *
    • - *
    • - * {@link #compactRange(ColumnFamilyHandle, byte[], byte[], - * boolean, int, int)} - *
    • + *
    • {@link #compactRange()}
    • + *
    • {@link #compactRange(boolean, int, int)}
    • + *
    • {@link #compactRange(byte[], byte[])}
    • *
    * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance. + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} + * instead + * * @param begin start of key range (included in range) * @param end end of key range (excluded from range) + * @param changeLevel reduce level after compaction + * @param targetLevel target level to compact to + * @param targetPathId the target path id of output path * * @throws RocksDBException thrown if an error occurs within the native * part of the library. */ - public void compactRange(final ColumnFamilyHandle columnFamilyHandle, - final byte[] begin, final byte[] end) throws RocksDBException { - compactRange(nativeHandle_, begin, begin.length, end, end.length, - false, -1, 0, columnFamilyHandle.nativeHandle_); + @Deprecated + public void compactRange(final byte[] begin, final byte[] end, + final boolean changeLevel, final int targetLevel, + final int targetPathId) throws RocksDBException { + compactRange(null, begin, end, changeLevel, targetLevel, targetPathId); } /** @@ -1949,90 +3054,377 @@ public void compactRange(final ColumnFamilyHandle columnFamilyHandle, *
      *
    • {@link #compactRange(ColumnFamilyHandle)}
    • *
    • - * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + * {@link #compactRange(ColumnFamilyHandle, boolean, int, int)} *
    • *
    • - * {@link #compactRange(ColumnFamilyHandle, byte[], byte[], - * boolean, int, int)} + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} *
    • *
    * - * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} - * instance. - * @param reduce_level reduce level after compaction - * @param target_level target level to compact to - * @param target_path_id the target path id of output path + * @deprecated Use {@link #compactRange(ColumnFamilyHandle, byte[], byte[], CompactRangeOptions)} instead + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param changeLevel reduce level after compaction + * @param targetLevel target level to compact to + * @param targetPathId the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + @Deprecated + public void compactRange( + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end, final boolean changeLevel, + final int targetLevel, final int targetPathId) + throws RocksDBException { + final CompactRangeOptions options = new CompactRangeOptions(); + options.setChangeLevel(changeLevel); + options.setTargetLevel(targetLevel); + options.setTargetPathId(targetPathId); + compactRange(nativeHandle_, + begin, begin == null ? -1 : begin.length, + end, end == null ? -1 : end.length, + options.nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); + } + + /** + *

+   * <p>Range compaction of column family.</p>
+   *
+   * <p>Note: After the database has been compacted,
+   * all data will have been pushed down to the last level containing
+   * any data.</p>
    + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param compactRangeOptions options for the compaction + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(final ColumnFamilyHandle columnFamilyHandle, + final byte[] begin, final byte[] end, + final CompactRangeOptions compactRangeOptions) throws RocksDBException { + compactRange(nativeHandle_, + begin, begin == null ? -1 : begin.length, + end, end == null ? -1 : end.length, + compactRangeOptions.nativeHandle_, columnFamilyHandle.nativeHandle_); + } + + /** + * Change the options for the column family handle. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance, or null for the default column family. + * @param mutableColumnFamilyOptions the options. + */ + public void setOptions( + /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, + final MutableColumnFamilyOptions mutableColumnFamilyOptions) + throws RocksDBException { + setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_, + mutableColumnFamilyOptions.getKeys(), + mutableColumnFamilyOptions.getValues()); + } + + /** + * Change the options for the default column family handle. + * + * @param mutableColumnFamilyOptions the options. + */ + public void setOptions( + final MutableColumnFamilyOptions mutableColumnFamilyOptions) + throws RocksDBException { + setOptions(null, mutableColumnFamilyOptions); + } + + /** + * Set the options for the column family handle. + * + * @param mutableDBoptions the options. + */ + public void setDBOptions(final MutableDBOptions mutableDBoptions) + throws RocksDBException { + setDBOptions(nativeHandle_, + mutableDBoptions.getKeys(), + mutableDBoptions.getValues()); + } + + /** + * Takes nputs a list of files specified by file names and + * compacts them to the specified level. + * + * Note that the behavior is different from + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + * in that CompactFiles() performs the compaction job using the CURRENT + * thread. + * + * @param compactionOptions compaction options + * @param inputFileNames the name of the files to compact + * @param outputLevel the level to which they should be compacted + * @param outputPathId the id of the output path, or -1 + * @param compactionJobInfo the compaction job info, this parameter + * will be updated with the info from compacting the files, + * can just be null if you don't need it. + */ + public List compactFiles( + final CompactionOptions compactionOptions, + final List inputFileNames, + final int outputLevel, + final int outputPathId, + /* @Nullable */ final CompactionJobInfo compactionJobInfo) + throws RocksDBException { + return compactFiles(compactionOptions, null, inputFileNames, outputLevel, + outputPathId, compactionJobInfo); + } + + /** + * Takes a list of files specified by file names and + * compacts them to the specified level. + * + * Note that the behavior is different from + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + * in that CompactFiles() performs the compaction job using the CURRENT + * thread. 
+ * + * @param compactionOptions compaction options + * @param columnFamilyHandle columnFamilyHandle, or null for the + * default column family + * @param inputFileNames the name of the files to compact + * @param outputLevel the level to which they should be compacted + * @param outputPathId the id of the output path, or -1 + * @param compactionJobInfo the compaction job info, this parameter + * will be updated with the info from compacting the files, + * can just be null if you don't need it. + */ + public List compactFiles( + final CompactionOptions compactionOptions, + /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle, + final List inputFileNames, + final int outputLevel, + final int outputPathId, + /* @Nullable */ final CompactionJobInfo compactionJobInfo) + throws RocksDBException { + return Arrays.asList(compactFiles(nativeHandle_, compactionOptions.nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + inputFileNames.toArray(new String[0]), + outputLevel, + outputPathId, + compactionJobInfo == null ? 0 : compactionJobInfo.nativeHandle_)); + } + + /** + * This function will wait until all currently running background processes + * finish. After it returns, no background process will be run until + * {@link #continueBackgroundWork()} is called + * + * @throws RocksDBException If an error occurs when pausing background work + */ + public void pauseBackgroundWork() throws RocksDBException { + pauseBackgroundWork(nativeHandle_); + } + + /** + * Resumes background work which was suspended by + * previously calling {@link #pauseBackgroundWork()} + * + * @throws RocksDBException If an error occurs when resuming background work + */ + public void continueBackgroundWork() throws RocksDBException { + continueBackgroundWork(nativeHandle_); + } + + /** + * Enable automatic compactions for the given column + * families if they were previously disabled. + * + * The function will first set the + * {@link ColumnFamilyOptions#disableAutoCompactions()} option for each + * column family to false, after which it will schedule a flush/compaction. + * + * NOTE: Setting disableAutoCompactions to 'false' through + * {@link #setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} + * does NOT schedule a flush/compaction afterwards, and only changes the + * parameter itself within the column family option. + * + * @param columnFamilyHandles the column family handles + */ + public void enableAutoCompaction( + final List columnFamilyHandles) + throws RocksDBException { + enableAutoCompaction(nativeHandle_, + toNativeHandleList(columnFamilyHandles)); + } + + /** + * Number of levels used for this DB. + * + * @return the number of levels + */ + public int numberLevels() { + return numberLevels(null); + } + + /** + * Number of levels used for a column family in this DB. + * + * @param columnFamilyHandle the column family handle, or null + * for the default column family + * + * @return the number of levels + */ + public int numberLevels(/* @Nullable */final ColumnFamilyHandle columnFamilyHandle) { + return numberLevels(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); + } + + /** + * Maximum level to which a new compacted memtable is pushed if it + * does not create overlap. + */ + public int maxMemCompactionLevel() { + return maxMemCompactionLevel(null); + } + + /** + * Maximum level to which a new compacted memtable is pushed if it + * does not create overlap. 
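[Editor's illustration, not part of the patch: pauseBackgroundWork/continueBackgroundWork above pair naturally as a maintenance-window guard; an open RocksDB handle `db` is assumed.]

    db.pauseBackgroundWork();
    try {
      // exclusive maintenance: no flushes or compactions run here
    } finally {
      db.continueBackgroundWork();
    }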
+
+  /**
+   * Maximum level to which a new compacted memtable is pushed if it
+   * does not create overlap.
+   *
+   * @param columnFamilyHandle the column family handle
+   */
+  public int maxMemCompactionLevel(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) {
+    return maxMemCompactionLevel(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Number of files in level-0 that would stop writes.
+   */
+  public int level0StopWriteTrigger() {
+    return level0StopWriteTrigger(null);
+  }
+
+  /**
+   * Number of files in level-0 that would stop writes.
+   *
+   * @param columnFamilyHandle the column family handle
+   */
+  public int level0StopWriteTrigger(
+      /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) {
+    return level0StopWriteTrigger(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Get DB name -- the exact same name that was provided as the path
+   * argument to {@link #open(Options, String)}.
+   *
+   * @return the DB name
+   */
+  public String getName() {
+    return getName(nativeHandle_);
+  }
+
+  /**
+   * Get the Env object from the DB
+   *
+   * @return the env
+   */
+  public Env getEnv() {
+    final long envHandle = getEnv(nativeHandle_);
+    if (envHandle == Env.getDefault().nativeHandle_) {
+      return Env.getDefault();
+    } else {
+      final Env env = new RocksEnv(envHandle);
+      env.disOwnNativeHandle(); // we do not own the Env!
+      return env;
+    }
+  }
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
+   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @throws RocksDBException thrown if an error occurs within the native
+   *     part of the library.
+   */
+  public void flush(final FlushOptions flushOptions)
+      throws RocksDBException {
+    flush(flushOptions, (List<ColumnFamilyHandle>) null);
+  }
+
+  /**
+   * <p>Flush all memory table data.</p>
+   *
+   * <p>Note: it must be ensured that the FlushOptions instance
+   * is not GC'ed before this method finishes. If the wait parameter is
+   * set to false, flush processing is asynchronous.</p>
   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance.
   * @throws RocksDBException thrown if an error occurs within the native
   *     part of the library.
   */
-  public void compactRange(final ColumnFamilyHandle columnFamilyHandle,
-      final boolean reduce_level, final int target_level,
-      final int target_path_id) throws RocksDBException {
-    compactRange(nativeHandle_, reduce_level, target_level,
-        target_path_id, columnFamilyHandle.nativeHandle_);
+  public void flush(final FlushOptions flushOptions,
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle)
+      throws RocksDBException {
+    flush(flushOptions,
+        columnFamilyHandle == null ? null : Arrays.asList(columnFamilyHandle));
  }

  /**
-   * <p>Range compaction of column family.</p>
-   * <p>Note: After the database has been compacted,
-   * all data will have been pushed down to the last level containing
-   * any data.</p>
-   *
-   * <p>Compaction outputs should be placed in options.db_paths
-   * [target_path_id]. Behavior is undefined if target_path_id is
-   * out of range.</p>
+   * Flushes multiple column families.
   *
-   * <p>See also</p>
-   * <ul>
-   * <li>{@link #compactRange(ColumnFamilyHandle)}</li>
-   * <li>
-   *   {@link #compactRange(ColumnFamilyHandle, boolean, int, int)}
-   * </li>
-   * <li>
-   *   {@link #compactRange(ColumnFamilyHandle, byte[], byte[])}
-   * </li>
-   * </ul>
+   * If atomic flush is not enabled, this is equivalent to calling
+   * {@link #flush(FlushOptions, ColumnFamilyHandle)} multiple times.
   *
-   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
-   *     instance.
-   * @param begin start of key range (included in range)
-   * @param end end of key range (excluded from range)
-   * @param reduce_level reduce level after compaction
-   * @param target_level target level to compact to
-   * @param target_path_id the target path id of output path
+   * If atomic flush is enabled, this will flush all column families
+   * specified up to the latest sequence number at the time when flush is
+   * requested.
   *
+   * @param flushOptions {@link org.rocksdb.FlushOptions} instance.
+   * @param columnFamilyHandles column family handles.
   * @throws RocksDBException thrown if an error occurs within the native
   *     part of the library.
   */
-  public void compactRange(final ColumnFamilyHandle columnFamilyHandle,
-      final byte[] begin, final byte[] end, final boolean reduce_level,
-      final int target_level, final int target_path_id)
+  public void flush(final FlushOptions flushOptions,
+      /* @Nullable */ final List<ColumnFamilyHandle> columnFamilyHandles)
      throws RocksDBException {
-    compactRange(nativeHandle_, begin, begin.length, end, end.length,
-        reduce_level, target_level, target_path_id,
-        columnFamilyHandle.nativeHandle_);
+    flush(nativeHandle_, flushOptions.nativeHandle_,
+        toNativeHandleList(columnFamilyHandles));
  }

  /**
-   * This function will wait until all currently running background processes
-   * finish. After it returns, no background process will be run until
-   * {@link #continueBackgroundWork()} is called
+   * Flush the WAL memory buffer to the file. If {@code sync} is true,
+   * it calls {@link #syncWal()} afterwards.
   *
-   * @throws RocksDBException If an error occurs when pausing background work
+   * @param sync true to also fsync to disk.
   */
-  public void pauseBackgroundWork() throws RocksDBException {
-    pauseBackgroundWork(nativeHandle_);
+  public void flushWal(final boolean sync) throws RocksDBException {
+    flushWal(nativeHandle_, sync);
  }

  /**
-   * Resumes background work which was suspended by
-   * previously calling {@link #pauseBackgroundWork()}
+   * Sync the WAL.
   *
-   * @throws RocksDBException If an error occurs when resuming background work
+   * Note that {@link #write(WriteOptions, WriteBatch)} followed by
+   * {@link #syncWal()} is not exactly the same as
+   * {@link #write(WriteOptions, WriteBatch)} with
+   * {@link WriteOptions#sync()} set to true; in the latter case the changes
+   * won't be visible until the sync is done.
+   *
+   * Currently only works if {@link Options#allowMmapWrites()} is set to false.
   */
-  public void continueBackgroundWork() throws RocksDBException {
-    continueBackgroundWork(nativeHandle_);
+  public void syncWal() throws RocksDBException {
+    syncWal(nativeHandle_);
  }

  /**
@@ -2045,6 +3437,25 @@
  public long getLatestSequenceNumber() {
    return getLatestSequenceNumber(nativeHandle_);
  }

+  /**
+   * Instructs DB to preserve deletes with sequence numbers >= sequenceNumber.
+   *
+   * Has no effect if DBOptions#preserveDeletes() is set to false.
+   *
+   * This function assumes that the user calls it with monotonically
+   * increasing seqnums (otherwise we can't guarantee that a particular delete
+   * hasn't already been processed).
+   *
+   * @param sequenceNumber the minimum sequence number to preserve
+   *
+   * @return true if the value was successfully updated,
+   *     false if the user attempted to call it with
+   *     sequenceNumber <= current value.
+   */
+  public boolean setPreserveDeletesSequenceNumber(final long sequenceNumber) {
+    return setPreserveDeletesSequenceNumber(nativeHandle_, sequenceNumber);
+  }
+
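A small sketch of the flush family added above: flush the memtables with FlushOptions, then flush and sync the WAL as separate steps. The DB path and key/value are illustrative only:

import org.rocksdb.*;

public class FlushExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/flush-db");
         final FlushOptions flushOptions = new FlushOptions()
             .setWaitForFlush(true)) {
      db.put("key".getBytes(), "value".getBytes());
      db.flush(flushOptions);   // default column family
      db.flushWal(false);       // move the WAL buffer to the file...
      db.syncWal();             // ...and fsync it in a separate step
    }
  }
}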

  /**
   * <p>Prevent file deletions. Compactions will continue to occur,
   * but no obsolete files will be deleted. Calling this multiple
@@ -2082,6 +3493,78 @@
  public void enableFileDeletions(final boolean force)
    enableFileDeletions(nativeHandle_, force);
  }

+  public static class LiveFiles {
+    /**
+     * The valid size of the manifest file. The manifest file is an ever-growing
+     * file, but only the portion specified here is valid for this snapshot.
+     */
+    public final long manifestFileSize;
+
+    /**
+     * The files are relative to the {@link #getName()} and are not
+     * absolute paths. Despite being relative paths, the file names begin
+     * with "/".
+     */
+    public final List<String> files;
+
+    LiveFiles(final long manifestFileSize, final List<String> files) {
+      this.manifestFileSize = manifestFileSize;
+      this.files = files;
+    }
+  }
+
+  /**
+   * Retrieve the list of all files in the database after flushing the memtable.
+   *
+   * See {@link #getLiveFiles(boolean)}.
+   *
+   * @return the live files
+   */
+  public LiveFiles getLiveFiles() throws RocksDBException {
+    return getLiveFiles(true);
+  }
+
+  /**
+   * Retrieve the list of all files in the database.
+   *
+   * In case you have multiple column families, even if {@code flushMemtable}
+   * is true, you still need to call {@link #getSortedWalFiles()}
+   * after {@link #getLiveFiles(boolean)} to compensate for new data that
+   * arrived to already-flushed column families while other column families
+   * were flushing.
+   *
+   * NOTE: Calling {@link #getLiveFiles(boolean)} followed by
+   * {@link #getSortedWalFiles()} can generate a lossless backup.
+   *
+   * @param flushMemtable set to true to flush before recording the live
+   *     files. Setting it to false is useful when we don't want to wait for a
+   *     flush, which may in turn have to wait for a compaction to complete,
+   *     taking an indeterminate time.
+   *
+   * @return the live files
+   */
+  public LiveFiles getLiveFiles(final boolean flushMemtable)
+      throws RocksDBException {
+    final String[] result = getLiveFiles(nativeHandle_, flushMemtable);
+    if (result == null) {
+      return null;
+    }
+    final String[] files = Arrays.copyOf(result, result.length - 1);
+    final long manifestFileSize = Long.parseLong(result[result.length - 1]);
+
+    return new LiveFiles(manifestFileSize, Arrays.asList(files));
+  }
+
+  /**
+   * Retrieve the sorted list of all wal files with earliest file first.
+   *
+   * @return the log files
+   */
+  public List<LogFile> getSortedWalFiles() throws RocksDBException {
+    final LogFile[] logFiles = getSortedWalFiles(nativeHandle_);
+    return Arrays.asList(logFiles);
+  }
+
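The javadoc above outlines the lossless-backup recipe; a hedged sketch of listing what would need to be copied (the path is hypothetical, and a real backup would also pause file deletions first):

import org.rocksdb.*;

public class LiveFilesExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/live-files-db")) {
      // First the data files (flushing the memtable)...
      final RocksDB.LiveFiles liveFiles = db.getLiveFiles(true);
      System.out.println("manifest bytes to copy: " + liveFiles.manifestFileSize);
      for (final String file : liveFiles.files) {
        System.out.println(db.getName() + file);  // names begin with "/"
      }
      // ...then the WAL files, to pick up writes that landed in between.
      for (final LogFile walFile : db.getSortedWalFiles()) {
        System.out.println("wal: " + walFile.pathName());
      }
    }
  }
}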

  /**
   * <p>Returns an iterator that is positioned at a write-batch containing
   * seq_number. If the sequence number is non-existent, it returns an iterator
@@ -2105,21 +3588,46 @@
  public TransactionLogIterator getUpdatesSince(final long sequenceNumber)
        getUpdatesSince(nativeHandle_, sequenceNumber));
  }

-  public void setOptions(final ColumnFamilyHandle columnFamilyHandle,
-      final MutableColumnFamilyOptions mutableColumnFamilyOptions)
-      throws RocksDBException {
-    setOptions(nativeHandle_, columnFamilyHandle.nativeHandle_,
-        mutableColumnFamilyOptions.getKeys(),
-        mutableColumnFamilyOptions.getValues());
+  /**
+   * Delete the file name from the db directory and update the internal state to
+   * reflect that. Supports deletion of sst and log files only. 'name' must be
+   * path relative to the db directory, e.g. 000001.sst, /archive/000003.log
+   *
+   * @param name the file name
+   */
+  public void deleteFile(final String name) throws RocksDBException {
+    deleteFile(nativeHandle_, name);
  }

-  private long[] toNativeHandleList(final List<? extends RocksObject> objectList) {
-    final int len = objectList.size();
-    final long[] handleList = new long[len];
-    for (int i = 0; i < len; i++) {
-      handleList[i] = objectList.get(i).nativeHandle_;
-    }
-    return handleList;
+  /**
+   * Gets a list of all table files metadata.
+   *
+   * @return table files metadata.
+   */
+  public List<LiveFileMetaData> getLiveFilesMetaData() {
+    return Arrays.asList(getLiveFilesMetaData(nativeHandle_));
+  }
+
+  /**
+   * Obtains the meta data of the specified column family of the DB.
+   *
+   * @param columnFamilyHandle the column family
+   *
+   * @return the column family metadata
+   */
+  public ColumnFamilyMetaData getColumnFamilyMetaData(
+      /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) {
+    return getColumnFamilyMetaData(nativeHandle_,
+        columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_);
+  }
+
+  /**
+   * Obtains the meta data of the default column family of the DB.
+   *
+   * @return the column family metadata
+   */
+  public ColumnFamilyMetaData GetColumnFamilyMetaData() {
+    return getColumnFamilyMetaData(null);
  }

  /**
@@ -2143,7 +3651,7 @@
  public void ingestExternalFile(final List<String> filePathList,
      final IngestExternalFileOptions ingestExternalFileOptions)
      throws RocksDBException {
    ingestExternalFile(nativeHandle_, getDefaultColumnFamily().nativeHandle_,
-        filePathList.toArray(new String[filePathList.size()]),
+        filePathList.toArray(new String[0]),
        filePathList.size(), ingestExternalFileOptions.nativeHandle_);
  }

@@ -2170,21 +3678,218 @@
  public void ingestExternalFile(final ColumnFamilyHandle columnFamilyHandle,
      final List<String> filePathList,
      final IngestExternalFileOptions ingestExternalFileOptions)
      throws RocksDBException {
    ingestExternalFile(nativeHandle_, columnFamilyHandle.nativeHandle_,
-        filePathList.toArray(new String[filePathList.size()]),
+        filePathList.toArray(new String[0]),
        filePathList.size(), ingestExternalFileOptions.nativeHandle_);
  }

  /**
-   * Private constructor.
+ * Verify checksum * - * @param nativeHandle The native handle of the C++ RocksDB object + * @throws RocksDBException if the checksum is not valid */ - protected RocksDB(final long nativeHandle) { - super(nativeHandle); + public void verifyChecksum() throws RocksDBException { + verifyChecksum(nativeHandle_); + } + + /** + * Gets the handle for the default column family + * + * @return The handle of the default column family + */ + public ColumnFamilyHandle getDefaultColumnFamily() { + final ColumnFamilyHandle cfHandle = new ColumnFamilyHandle(this, + getDefaultColumnFamily(nativeHandle_)); + cfHandle.disOwnNativeHandle(); + return cfHandle; + } + + /** + * Get the properties of all tables. + * + * @param columnFamilyHandle the column family handle, or null for the default + * column family. + * + * @return the properties + */ + public Map getPropertiesOfAllTables( + /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + return getPropertiesOfAllTables(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); + } + + /** + * Get the properties of all tables in the default column family. + * + * @return the properties + */ + public Map getPropertiesOfAllTables() + throws RocksDBException { + return getPropertiesOfAllTables(null); + } + + /** + * Get the properties of tables in range. + * + * @param columnFamilyHandle the column family handle, or null for the default + * column family. + * @param ranges the ranges over which to get the table properties + * + * @return the properties + */ + public Map getPropertiesOfTablesInRange( + /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, + final List ranges) throws RocksDBException { + return getPropertiesOfTablesInRange(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + toRangeSliceHandles(ranges)); + } + + /** + * Get the properties of tables in range for the default column family. + * + * @param ranges the ranges over which to get the table properties + * + * @return the properties + */ + public Map getPropertiesOfTablesInRange( + final List ranges) throws RocksDBException { + return getPropertiesOfTablesInRange(null, ranges); + } + + /** + * Suggest the range to compact. + * + * @param columnFamilyHandle the column family handle, or null for the default + * column family. + * + * @return the suggested range. + */ + public Range suggestCompactRange( + /* @Nullable */final ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + final long[] rangeSliceHandles = suggestCompactRange(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); + return new Range(new Slice(rangeSliceHandles[0]), + new Slice(rangeSliceHandles[1])); + } + + /** + * Suggest the range to compact for the default column family. + * + * @return the suggested range. + */ + public Range suggestCompactRange() + throws RocksDBException { + return suggestCompactRange(null); + } + + /** + * Promote L0. + * + * @param columnFamilyHandle the column family handle, + * or null for the default column family. + */ + public void promoteL0( + /* @Nullable */final ColumnFamilyHandle columnFamilyHandle, + final int targetLevel) throws RocksDBException { + promoteL0(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + targetLevel); + } + + /** + * Promote L0 for the default column family. 
+ */ + public void promoteL0(final int targetLevel) + throws RocksDBException { + promoteL0(null, targetLevel); + } + + /** + * Trace DB operations. + * + * Use {@link #endTrace()} to stop tracing. + * + * @param traceOptions the options + * @param traceWriter the trace writer + */ + public void startTrace(final TraceOptions traceOptions, + final AbstractTraceWriter traceWriter) throws RocksDBException { + startTrace(nativeHandle_, traceOptions.getMaxTraceFileSize(), + traceWriter.nativeHandle_); + /** + * NOTE: {@link #startTrace(long, long, long) transfers the ownership + * from Java to C++, so we must disown the native handle here. + */ + traceWriter.disOwnNativeHandle(); + } + + /** + * Stop tracing DB operations. + * + * See {@link #startTrace(TraceOptions, AbstractTraceWriter)} + */ + public void endTrace() throws RocksDBException { + endTrace(nativeHandle_); + } + + /** + * Static method to destroy the contents of the specified database. + * Be very careful using this method. + * + * @param path the path to the Rocksdb database. + * @param options {@link org.rocksdb.Options} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static void destroyDB(final String path, final Options options) + throws RocksDBException { + destroyDB(path, options.nativeHandle_); + } + + private /* @Nullable */ long[] toNativeHandleList( + /* @Nullable */ final List objectList) { + if (objectList == null) { + return null; + } + final int len = objectList.size(); + final long[] handleList = new long[len]; + for (int i = 0; i < len; i++) { + handleList[i] = objectList.get(i).nativeHandle_; + } + return handleList; + } + + private static long[] toRangeSliceHandles(final List ranges) { + final long rangeSliceHandles[] = new long [ranges.size() * 2]; + for (int i = 0, j = 0; i < ranges.size(); i++) { + final Range range = ranges.get(i); + rangeSliceHandles[j++] = range.start.getNativeHandle(); + rangeSliceHandles[j++] = range.limit.getNativeHandle(); + } + return rangeSliceHandles; + } + + protected void storeOptionsInstance(DBOptionsInterface options) { + options_ = options; + } + + private static void checkBounds(int offset, int len, int size) { + if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { + throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size)); + } + } + + private static int computeCapacityHint(final int estimatedNumberOfItems) { + // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load + // limit. We add +1 for a buffer. 
+ return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0); } // native methods - protected native static long open(final long optionsHandle, + private native static long open(final long optionsHandle, final String path) throws RocksDBException; /** @@ -2199,11 +3904,11 @@ protected native static long open(final long optionsHandle, * * @throws RocksDBException thrown if the database could not be opened */ - protected native static long[] open(final long optionsHandle, + private native static long[] open(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; - protected native static long openROnly(final long optionsHandle, + private native static long openROnly(final long optionsHandle, final String path) throws RocksDBException; /** @@ -2218,167 +3923,258 @@ protected native static long openROnly(final long optionsHandle, * * @throws RocksDBException thrown if the database could not be opened */ - protected native static long[] openROnly(final long optionsHandle, + private native static long[] openROnly(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions ) throws RocksDBException; - protected native static byte[][] listColumnFamilies(long optionsHandle, - String path) throws RocksDBException; - protected native void put(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength) + @Override protected native void disposeInternal(final long handle); + + private native static void closeDatabase(final long handle) + throws RocksDBException; + private native static byte[][] listColumnFamilies(final long optionsHandle, + final String path) throws RocksDBException; + private native long createColumnFamily(final long handle, + final byte[] columnFamilyName, final int columnFamilyNamelen, + final long columnFamilyOptions) throws RocksDBException; + private native long[] createColumnFamilies(final long handle, + final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames) + throws RocksDBException; + private native long[] createColumnFamilies(final long handle, + final long columnFamilyOptionsHandles[], final byte[][] columnFamilyNames) + throws RocksDBException; + private native void dropColumnFamily( + final long handle, final long cfHandle) throws RocksDBException; + private native void dropColumnFamilies(final long handle, + final long[] cfHandles) throws RocksDBException; + //TODO(AR) best way to express DestroyColumnFamilyHandle? ...maybe in ColumnFamilyHandle? 
+ private native void put(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, + final int valueOffset, int valueLength) throws RocksDBException; + private native void put(final long handle, final byte[] key, final int keyOffset, + final int keyLength, final byte[] value, final int valueOffset, + final int valueLength, final long cfHandle) throws RocksDBException; + private native void put(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength) + throws RocksDBException; + private native void put(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength, + final long cfHandle) throws RocksDBException; + private native void delete(final long handle, final byte[] key, + final int keyOffset, final int keyLength) throws RocksDBException; + private native void delete(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final long cfHandle) + throws RocksDBException; + private native void delete(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength) + throws RocksDBException; + private native void delete(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final long cfHandle) throws RocksDBException; + private native void singleDelete( + final long handle, final byte[] key, final int keyLen) throws RocksDBException; - protected native void put(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength, - long cfHandle) throws RocksDBException; - protected native void put(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength) throws RocksDBException; - protected native void put(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength, long cfHandle) throws RocksDBException; - protected native void write0(final long handle, long writeOptHandle, - long wbHandle) throws RocksDBException; - protected native void write1(final long handle, long writeOptHandle, - long wbwiHandle) throws RocksDBException; - protected native boolean keyMayExist(final long handle, final byte[] key, + private native void singleDelete( + final long handle, final byte[] key, final int keyLen, + final long cfHandle) throws RocksDBException; + private native void singleDelete( + final long handle, final long writeOptHandle, final byte[] key, + final int keyLen) throws RocksDBException; + private native void singleDelete( + final long handle, final long writeOptHandle, + final byte[] key, final int keyLen, final long cfHandle) + throws RocksDBException; + private native void deleteRange(final long handle, final byte[] beginKey, + final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, + final int endKeyOffset, final int endKeyLength) throws RocksDBException; + private native void deleteRange(final long handle, final byte[] beginKey, + final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, + final int endKeyOffset, final int endKeyLength, final long cfHandle) + throws RocksDBException; + private native void deleteRange(final long handle, final long writeOptHandle, + final byte[] beginKey, final 
int beginKeyOffset, final int beginKeyLength, + final byte[] endKey, final int endKeyOffset, final int endKeyLength) + throws RocksDBException; + private native void deleteRange( + final long handle, final long writeOptHandle, final byte[] beginKey, + final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, + final int endKeyOffset, final int endKeyLength, final long cfHandle) + throws RocksDBException; + private native void merge(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, + final int valueOffset, final int valueLength) throws RocksDBException; + private native void merge(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, + final int valueOffset, final int valueLength, final long cfHandle) + throws RocksDBException; + private native void merge(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength) + throws RocksDBException; + private native void merge(final long handle, final long writeOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength, + final long cfHandle) throws RocksDBException; + private native void write0(final long handle, final long writeOptHandle, + final long wbHandle) throws RocksDBException; + private native void write1(final long handle, final long writeOptHandle, + final long wbwiHandle) throws RocksDBException; + private native int get(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final byte[] value, + final int valueOffset, final int valueLength) throws RocksDBException; + private native int get(final long handle, final byte[] key, + final int keyOffset, final int keyLength, byte[] value, + final int valueOffset, final int valueLength, final long cfHandle) + throws RocksDBException; + private native int get(final long handle, final long readOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength) + throws RocksDBException; + private native int get(final long handle, final long readOptHandle, + final byte[] key, final int keyOffset, final int keyLength, + final byte[] value, final int valueOffset, final int valueLength, + final long cfHandle) throws RocksDBException; + private native byte[] get(final long handle, byte[] key, final int keyOffset, + final int keyLength) throws RocksDBException; + private native byte[] get(final long handle, final byte[] key, + final int keyOffset, final int keyLength, final long cfHandle) + throws RocksDBException; + private native byte[] get(final long handle, final long readOptHandle, + final byte[] key, final int keyOffset, final int keyLength) + throws RocksDBException; + private native byte[] get(final long handle, + final long readOptHandle, final byte[] key, final int keyOffset, + final int keyLength, final long cfHandle) throws RocksDBException; + private native byte[][] multiGet(final long dbHandle, final byte[][] keys, + final int[] keyOffsets, final int[] keyLengths); + private native byte[][] multiGet(final long dbHandle, final byte[][] keys, + final int[] keyOffsets, final int[] keyLengths, + final long[] columnFamilyHandles); + private native byte[][] multiGet(final long dbHandle, final long rOptHandle, + final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); + private 
native byte[][] multiGet(final long dbHandle, final long rOptHandle, + final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, + final long[] columnFamilyHandles); + private native boolean keyMayExist(final long handle, final byte[] key, final int keyOffset, final int keyLength, final StringBuilder stringBuilder); - protected native boolean keyMayExist(final long handle, final byte[] key, + private native boolean keyMayExist(final long handle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle, final StringBuilder stringBuilder); - protected native boolean keyMayExist(final long handle, + private native boolean keyMayExist(final long handle, final long optionsHandle, final byte[] key, final int keyOffset, final int keyLength, final StringBuilder stringBuilder); - protected native boolean keyMayExist(final long handle, + private native boolean keyMayExist(final long handle, final long optionsHandle, final byte[] key, final int keyOffset, final int keyLength, final long cfHandle, final StringBuilder stringBuilder); - protected native void merge(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength) + private native long iterator(final long handle); + private native long iterator(final long handle, final long readOptHandle); + private native long iteratorCF(final long handle, final long cfHandle); + private native long iteratorCF(final long handle, final long cfHandle, + final long readOptHandle); + private native long[] iterators(final long handle, + final long[] columnFamilyHandles, final long readOptHandle) throws RocksDBException; - protected native void merge(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength, - long cfHandle) throws RocksDBException; - protected native void merge(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength) throws RocksDBException; - protected native void merge(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength, long cfHandle) throws RocksDBException; - protected native int get(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength) + private native long getSnapshot(final long nativeHandle); + private native void releaseSnapshot( + final long nativeHandle, final long snapshotHandle); + private native String getProperty(final long nativeHandle, + final long cfHandle, final String property, final int propertyLength) throws RocksDBException; - protected native int get(long handle, byte[] key, int keyOffset, - int keyLength, byte[] value, int valueOffset, int valueLength, - long cfHandle) throws RocksDBException; - protected native int get(long handle, long readOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength) throws RocksDBException; - protected native int get(long handle, long readOptHandle, byte[] key, - int keyOffset, int keyLength, byte[] value, int valueOffset, - int valueLength, long cfHandle) throws RocksDBException; - protected native byte[][] multiGet(final long dbHandle, final byte[][] keys, - final int[] keyOffsets, final int[] keyLengths); - protected native byte[][] multiGet(final long dbHandle, final byte[][] keys, - final int[] keyOffsets, final int[] keyLengths, - final long[] columnFamilyHandles); - protected native byte[][] multiGet(final long dbHandle, 
final long rOptHandle, - final byte[][] keys, final int[] keyOffsets, final int[] keyLengths); - protected native byte[][] multiGet(final long dbHandle, final long rOptHandle, - final byte[][] keys, final int[] keyOffsets, final int[] keyLengths, - final long[] columnFamilyHandles); - protected native byte[] get(long handle, byte[] key, int keyOffset, - int keyLength) throws RocksDBException; - protected native byte[] get(long handle, byte[] key, int keyOffset, - int keyLength, long cfHandle) throws RocksDBException; - protected native byte[] get(long handle, long readOptHandle, - byte[] key, int keyOffset, int keyLength) throws RocksDBException; - protected native byte[] get(long handle, long readOptHandle, byte[] key, - int keyOffset, int keyLength, long cfHandle) throws RocksDBException; - protected native void delete(long handle, byte[] key, int keyOffset, - int keyLength) throws RocksDBException; - protected native void delete(long handle, byte[] key, int keyOffset, - int keyLength, long cfHandle) throws RocksDBException; - protected native void delete(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength) throws RocksDBException; - protected native void delete(long handle, long writeOptHandle, byte[] key, - int keyOffset, int keyLength, long cfHandle) throws RocksDBException; - protected native void singleDelete( - long handle, byte[] key, int keyLen) throws RocksDBException; - protected native void singleDelete( - long handle, byte[] key, int keyLen, long cfHandle) + private native Map getMapProperty(final long nativeHandle, + final long cfHandle, final String property, final int propertyLength) throws RocksDBException; - protected native void singleDelete( - long handle, long writeOptHandle, - byte[] key, int keyLen) throws RocksDBException; - protected native void singleDelete( - long handle, long writeOptHandle, - byte[] key, int keyLen, long cfHandle) throws RocksDBException; - protected native void deleteRange(long handle, byte[] beginKey, int beginKeyOffset, - int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength) + private native long getLongProperty(final long nativeHandle, + final long cfHandle, final String property, final int propertyLength) throws RocksDBException; - protected native void deleteRange(long handle, byte[] beginKey, int beginKeyOffset, - int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength, long cfHandle) + private native void resetStats(final long nativeHandle) throws RocksDBException; - protected native void deleteRange(long handle, long writeOptHandle, byte[] beginKey, - int beginKeyOffset, int beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength) + private native long getAggregatedLongProperty(final long nativeHandle, + final String property, int propertyLength) throws RocksDBException; + private native long[] getApproximateSizes(final long nativeHandle, + final long columnFamilyHandle, final long[] rangeSliceHandles, + final byte includeFlags); + private final native long[] getApproximateMemTableStats( + final long nativeHandle, final long columnFamilyHandle, + final long rangeStartSliceHandle, final long rangeLimitSliceHandle); + private native void compactRange(final long handle, + /* @Nullable */ final byte[] begin, final int beginLen, + /* @Nullable */ final byte[] end, final int endLen, + final long compactRangeOptHandle, final long cfHandle) throws RocksDBException; - protected native void deleteRange(long handle, long writeOptHandle, byte[] beginKey, - int beginKeyOffset, int 
beginKeyLength, byte[] endKey, int endKeyOffset, int endKeyLength, - long cfHandle) throws RocksDBException; - protected native String getProperty0(long nativeHandle, - String property, int propertyLength) throws RocksDBException; - protected native String getProperty0(long nativeHandle, long cfHandle, - String property, int propertyLength) throws RocksDBException; - protected native long getLongProperty(long nativeHandle, String property, - int propertyLength) throws RocksDBException; - protected native long getLongProperty(long nativeHandle, long cfHandle, - String property, int propertyLength) throws RocksDBException; - protected native long iterator(long handle); - protected native long iterator(long handle, long readOptHandle); - protected native long iteratorCF(long handle, long cfHandle); - protected native long iteratorCF(long handle, long cfHandle, - long readOptHandle); - protected native long[] iterators(final long handle, - final long[] columnFamilyHandles, final long readOptHandle) + private native void setOptions(final long handle, final long cfHandle, + final String[] keys, final String[] values) throws RocksDBException; + private native void setDBOptions(final long handle, + final String[] keys, final String[] values) throws RocksDBException; + private native String[] compactFiles(final long handle, + final long compactionOptionsHandle, + final long columnFamilyHandle, + final String[] inputFileNames, + final int outputLevel, + final int outputPathId, + final long compactionJobInfoHandle) throws RocksDBException; + private native void pauseBackgroundWork(final long handle) throws RocksDBException; - protected native long getSnapshot(long nativeHandle); - protected native void releaseSnapshot(long nativeHandle, long snapshotHandle); - @Override protected final native void disposeInternal(final long handle); - private native long getDefaultColumnFamily(long handle); - private native long createColumnFamily(final long handle, - final byte[] columnFamilyName, final long columnFamilyOptions) + private native void continueBackgroundWork(final long handle) throws RocksDBException; - private native void dropColumnFamily(long handle, long cfHandle) + private native void enableAutoCompaction(final long handle, + final long[] columnFamilyHandles) throws RocksDBException; + private native int numberLevels(final long handle, + final long columnFamilyHandle); + private native int maxMemCompactionLevel(final long handle, + final long columnFamilyHandle); + private native int level0StopWriteTrigger(final long handle, + final long columnFamilyHandle); + private native String getName(final long handle); + private native long getEnv(final long handle); + private native void flush(final long handle, final long flushOptHandle, + /* @Nullable */ final long[] cfHandles) throws RocksDBException; + private native void flushWal(final long handle, final boolean sync) throws RocksDBException; - private native void flush(long handle, long flushOptHandle) + private native void syncWal(final long handle) throws RocksDBException; + private native long getLatestSequenceNumber(final long handle); + private native boolean setPreserveDeletesSequenceNumber(final long handle, + final long sequenceNumber); + private native void disableFileDeletions(long handle) throws RocksDBException; + private native void enableFileDeletions(long handle, boolean force) throws RocksDBException; - private native void flush(long handle, long flushOptHandle, long cfHandle) + private native String[] getLiveFiles(final long handle, 
+      final boolean flushMemtable) throws RocksDBException;
+  private native LogFile[] getSortedWalFiles(final long handle)
      throws RocksDBException;
-  private native void compactRange0(long handle, boolean reduce_level,
-      int target_level, int target_path_id) throws RocksDBException;
-  private native void compactRange0(long handle, byte[] begin, int beginLen,
-      byte[] end, int endLen, boolean reduce_level, int target_level,
-      int target_path_id) throws RocksDBException;
-  private native void compactRange(long handle, boolean reduce_level,
-      int target_level, int target_path_id, long cfHandle)
+  private native long getUpdatesSince(final long handle,
+      final long sequenceNumber) throws RocksDBException;
+  private native void deleteFile(final long handle, final String name)
      throws RocksDBException;
-  private native void compactRange(long handle, byte[] begin, int beginLen,
-      byte[] end, int endLen, boolean reduce_level, int target_level,
-      int target_path_id, long cfHandle) throws RocksDBException;
-  private native void pauseBackgroundWork(long handle) throws RocksDBException;
-  private native void continueBackgroundWork(long handle) throws RocksDBException;
-  private native long getLatestSequenceNumber(long handle);
-  private native void disableFileDeletions(long handle) throws RocksDBException;
-  private native void enableFileDeletions(long handle, boolean force)
+  private native LiveFileMetaData[] getLiveFilesMetaData(final long handle);
+  private native ColumnFamilyMetaData getColumnFamilyMetaData(
+      final long handle, final long columnFamilyHandle);
+  private native void ingestExternalFile(final long handle,
+      final long columnFamilyHandle, final String[] filePathList,
+      final int filePathListLen, final long ingestExternalFileOptionsHandle)
      throws RocksDBException;
-  private native long getUpdatesSince(long handle, long sequenceNumber)
+  private native void verifyChecksum(final long handle) throws RocksDBException;
+  private native long getDefaultColumnFamily(final long handle);
+  private native Map<String, TableProperties> getPropertiesOfAllTables(
+      final long handle, final long columnFamilyHandle) throws RocksDBException;
+  private native Map<String, TableProperties> getPropertiesOfTablesInRange(
+      final long handle, final long columnFamilyHandle,
+      final long[] rangeSliceHandles);
+  private native long[] suggestCompactRange(final long handle,
+      final long columnFamilyHandle) throws RocksDBException;
+  private native void promoteL0(final long handle,
+      final long columnFamilyHandle, final int targetLevel)
      throws RocksDBException;
-  private native void setOptions(long handle, long cfHandle, String[] keys,
-      String[] values) throws RocksDBException;
-  private native void ingestExternalFile(long handle, long cfHandle,
-      String[] filePathList, int filePathListLen,
-      long ingest_external_file_options_handle) throws RocksDBException;
+  private native void startTrace(final long handle, final long maxTraceFileSize,
+      final long traceWriterHandle) throws RocksDBException;
+  private native void endTrace(final long handle) throws RocksDBException;
+
+
+  private native static void destroyDB(final String path,
+      final long optionsHandle) throws RocksDBException;
+  protected DBOptionsInterface options_;
}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java index 8fe61fd451..b3681d77db 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksEnv.java @@ -25,19 +25,8 @@ public class
RocksEnv extends Env {
   */
  RocksEnv(final long handle) {
    super(handle);
-    disOwnNativeHandle();
  }

-  /**
-   * <p>The helper function of {@link #dispose()} which all subclasses of
-   * {@link RocksObject} must implement to release their associated C++
-   * resource.</p>
-   *
-   * <p>Note: this class is used to use the default
-   * RocksEnv with RocksJava. The default env allocation is managed
-   * by C++.</p>
-   */
  @Override
-  protected final void disposeInternal(final long handle) {
-  }
+  protected native final void disposeInternal(final long handle);
}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java index 9e9c648092..12c06f04e5 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIterator.java @@ -57,6 +57,7 @@ public byte[] value() {
  @Override final native void next0(long handle);
  @Override final native void prev0(long handle);
  @Override final native void seek0(long handle, byte[] target, int targetLen);
+  @Override final native void seekForPrev0(long handle, byte[] target, int targetLen);
  @Override final native void status0(long handle) throws RocksDBException;

  private native byte[] key0(long handle);
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java index 12fdbb1973..a5a9eb88d8 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksIteratorInterface.java @@ -41,7 +41,7 @@ public interface RocksIteratorInterface {
  void seekToLast();

  /**
-   * <p>Position at the first entry in the source whose key is that or
+   * <p>Position at the first entry in the source whose key is at or
   * past target.</p>
   *
   * <p>The iterator is valid after this call if the source contains
@@ -52,6 +52,18 @@ public interface RocksIteratorInterface {
   */
  void seek(byte[] target);

+  /**
+   * <p>Position at the first entry in the source whose key is at or
+   * before target.</p>
+   *
+   * <p>The iterator is valid after this call if the source contains
+   * a key that comes at or before target.</p>
+   *
+   * @param target byte array describing a key or a
+   *     key prefix to seek for.
+   */
+  void seekForPrev(byte[] target);
+
  /**
   * <p>Moves to the next entry in the source. After this call, Valid() is
   * true if the iterator was not positioned at the last entry in the source.</p>
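A hedged sketch of the new seekForPrev(): it is the mirror image of seek(), landing on the last key <= the target. Keys and the DB path are illustrative:

import org.rocksdb.*;

public class SeekForPrevExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/seek-db")) {
      db.put("a".getBytes(), "1".getBytes());
      db.put("c".getBytes(), "3".getBytes());
      try (final RocksIterator it = db.newIterator()) {
        it.seek("b".getBytes());         // lands on "c" (first key >= "b")
        it.seekForPrev("b".getBytes());  // lands on "a" (first key <= "b")
        if (it.isValid()) {
          System.out.println(new String(it.key()));  // prints "a"
        }
      }
    }
  }
}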

diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java index d18d0ceb97..0afa5f6623 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/RocksMemEnv.java @@ -6,22 +6,34 @@ package org.rocksdb;

/**
- * RocksDB memory environment.
+ * Memory environment.
 */
+//TODO(AR) rename to MemEnv
public class RocksMemEnv extends Env {

  /**
-   * <p>Creates a new RocksDB environment that stores its data
+   * <p>Creates a new environment that stores its data
   * in memory and delegates all non-file-storage tasks to
-   * base_env. The caller must delete the result when it is
+   * {@code baseEnv}.</p>
+   *
+   * <p>The caller must delete the result when it is
   * no longer needed.</p>
   *
-   * <p>{@code *base_env} must remain live while the result is in use.</p>
    + * @param baseEnv the base environment, + * must remain live while the result is in use. + */ + public RocksMemEnv(final Env baseEnv) { + super(createMemEnv(baseEnv.nativeHandle_)); + } + + /** + * @deprecated Use {@link #RocksMemEnv(Env)}. */ + @Deprecated public RocksMemEnv() { - super(createMemEnv()); + this(Env.getDefault()); } - private static native long createMemEnv(); + private static native long createMemEnv(final long baseEnvHandle); @Override protected final native void disposeInternal(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java new file mode 100644 index 0000000000..7807e7c835 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SizeApproximationFlag.java @@ -0,0 +1,30 @@ +package org.rocksdb; + +import java.util.List; + +/** + * Flags for + * {@link RocksDB#getApproximateSizes(ColumnFamilyHandle, List, SizeApproximationFlag...)} + * that specify whether memtable stats should be included, + * or file stats approximation or both. + */ +public enum SizeApproximationFlag { + NONE((byte)0x0), + INCLUDE_MEMTABLES((byte)0x1), + INCLUDE_FILES((byte)0x2); + + private final byte value; + + SizeApproximationFlag(final byte value) { + this.value = value; + } + + /** + * Get the internal byte representation. + * + * @return the internal representation. + */ + byte getValue() { + return value; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Slice.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Slice.java index a122c3769d..50d9f76525 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Slice.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Slice.java @@ -39,6 +39,30 @@ private Slice() { super(); } + /** + *

  /**
+   * <p>Package-private Slice constructor which is used to construct
+   * Slice instances from C++ side. As the reference to this
+   * object is also managed from C++ side the handle will be disowned.</p>
+   *
+   * @param nativeHandle address of native instance.
+   */
+  Slice(final long nativeHandle) {
+    this(nativeHandle, false);
+  }
+
+  /**
+   * <p>Package-private Slice constructor which is used to construct
+   * Slice instances using a handle.</p>
+   *
+   * @param nativeHandle address of native instance.
+   * @param owningNativeHandle true if the Java side owns the memory pointed to
+   *     by this reference, false if ownership belongs to the C++ side
+   */
+  Slice(final long nativeHandle, final boolean owningNativeHandle) {
+    super();
+    setNativeHandle(nativeHandle, owningNativeHandle);
+  }
+
  /**
   * <p>Constructs a slice where the data is taken from
   * a String.</p>
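As an illustration tying the public Slice(String) constructor to the new SizeApproximationFlag enum, a hedged sketch of asking for the approximate size of a key range; it assumes the companion getApproximateSizes(List, SizeApproximationFlag...) overload referenced by the enum's javadoc, and the path and key bounds are hypothetical:

import java.util.Arrays;
import org.rocksdb.*;

public class ApproximateSizesExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final Options options = new Options().setCreateIfMissing(true);
         final RocksDB db = RocksDB.open(options, "/tmp/sizes-db")) {
      final Range range = new Range(new Slice("a"), new Slice("z"));
      // Include both flushed files and memtable contents in the estimate.
      final long[] sizes = db.getApproximateSizes(Arrays.asList(range),
          SizeApproximationFlag.INCLUDE_FILES,
          SizeApproximationFlag.INCLUDE_MEMTABLES);
      System.out.println("~bytes in [a, z): " + sizes[0]);
    }
  }
}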

diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java index a6b53f495f..39cdf0c2d2 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Snapshot.java @@ -11,6 +11,10 @@ public class Snapshot extends RocksObject {
  Snapshot(final long nativeHandle) {
    super(nativeHandle);
+
+    // The pointer to the snapshot is always released
+    // by the database instance.
+    disOwnNativeHandle();
  }

  /**
@@ -20,17 +24,17 @@ public class Snapshot extends RocksObject {
   * this snapshot.
   */
  public long getSequenceNumber() {
-    assert(isOwningHandle());
    return getSequenceNumber(nativeHandle_);
  }

-  /**
-   * Don't release the C++ Snapshot pointer. The pointer
-   * to the snapshot is released by the database
-   * instance.
-   */
  @Override
  protected final void disposeInternal(final long handle) {
+    /**
+     * Nothing to release, we never own the pointer for a
+     * Snapshot. The pointer
+     * to the snapshot is released by the database
+     * instance.
+     */
  }

  private native long getSequenceNumber(long handle);
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java new file mode 100644 index 0000000000..8805410aa8 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileManager.java @@ -0,0 +1,251 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.Map;
+
+/**
+ * SstFileManager is used to track SST files in the DB and control their
+ * deletion rate.
+ *
+ * All SstFileManager public functions are thread-safe.
+ *
+ * SstFileManager is not extensible.
+ */
+//@ThreadSafe
+public final class SstFileManager extends RocksObject {
+
+  public static final long RATE_BYTES_PER_SEC_DEFAULT = 0;
+  public static final boolean DELETE_EXISTING_TRASH_DEFAULT = true;
+  public static final double MAX_TRASH_DB_RATION_DEFAULT = 0.25;
+  public static final long BYTES_MAX_DELETE_CHUNK_DEFAULT = 64 * 1024 * 1024;
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST files and control their deletion rate.
+   *
+   * @param env the environment.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env) throws RocksDBException {
+    this(env, null);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST files and control their deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/ final Logger logger)
+      throws RocksDBException {
+    this(env, logger, RATE_BYTES_PER_SEC_DEFAULT);
+  }
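A hedged sketch of wiring an SstFileManager into a DB so file deletion is rate-limited; it assumes the companion setSstFileManager() option setter from this patch, and the 4 MB/s rate and path are arbitrary:

import org.rocksdb.*;

public class SstFileManagerExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final SstFileManager sstFileManager =
             new SstFileManager(Env.getDefault(), null,
                 4L * 1024 * 1024 /* rateBytesPerSec */);
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setSstFileManager(sstFileManager);
         final RocksDB db = RocksDB.open(options, "/tmp/sst-mgr-db")) {
      // One manager may be shared by several DB instances.
      System.out.println("tracked bytes: " + sstFileManager.getTotalSize());
    }
  }
}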
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST files and control their deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second. If
+   *     this value is set to 1024 (1 KB/sec) and we deleted a file of size
+   *     4 KB in 1 second, we will wait for another 3 seconds before we delete
+   *     other files. Set to 0 to disable deletion rate limiting.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/ final Logger logger,
+      final long rateBytesPerSec) throws RocksDBException {
+    this(env, logger, rateBytesPerSec, MAX_TRASH_DB_RATION_DEFAULT);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST files and control their deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second. If
+   *     this value is set to 1024 (1 KB/sec) and we deleted a file of size
+   *     4 KB in 1 second, we will wait for another 3 seconds before we delete
+   *     other files. Set to 0 to disable deletion rate limiting.
+   * @param maxTrashDbRatio if the trash size constitutes more than this
+   *     fraction of the total DB size we will start deleting new files passed
+   *     to DeleteScheduler immediately.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/ final Logger logger,
+      final long rateBytesPerSec, final double maxTrashDbRatio)
+      throws RocksDBException {
+    this(env, logger, rateBytesPerSec, maxTrashDbRatio,
+        BYTES_MAX_DELETE_CHUNK_DEFAULT);
+  }
+
+  /**
+   * Create a new SstFileManager that can be shared among multiple RocksDB
+   * instances to track SST files and control their deletion rate.
+   *
+   * @param env the environment.
+   * @param logger if not null, the logger will be used to log errors.
+   *
+   * == Deletion rate limiting specific arguments ==
+   * @param rateBytesPerSec how many bytes should be deleted per second. If
+   *     this value is set to 1024 (1 KB/sec) and we deleted a file of size
+   *     4 KB in 1 second, we will wait for another 3 seconds before we delete
+   *     other files. Set to 0 to disable deletion rate limiting.
+   * @param maxTrashDbRatio if the trash size constitutes more than this
+   *     fraction of the total DB size we will start deleting new files passed
+   *     to DeleteScheduler immediately.
+   * @param bytesMaxDeleteChunk if a single file is larger than delete chunk,
+   *     ftruncate the file by this size each time, rather than dropping the whole
+   *     file. 0 means to always delete the whole file.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native library.
+   */
+  public SstFileManager(final Env env, /*@Nullable*/final Logger logger,
+      final long rateBytesPerSec, final double maxTrashDbRatio,
+      final long bytesMaxDeleteChunk) throws RocksDBException {
+    super(newSstFileManager(env.nativeHandle_,
+        logger != null ? logger.nativeHandle_ : 0,
+        rateBytesPerSec, maxTrashDbRatio, bytesMaxDeleteChunk));
+  }
+
+
+  /**
+   * Update the maximum allowed space that should be used by RocksDB. If
+   * the total size of the SST files exceeds {@code maxAllowedSpace}, writes to
+   * RocksDB will fail.
+   *
+   * Setting {@code maxAllowedSpace} to 0 will disable this feature;
+   * maximum allowed space will be infinite (Default value).
+   *
+   * @param maxAllowedSpace the maximum allowed space that should be used by
+   *     RocksDB.
+   */
+  public void setMaxAllowedSpaceUsage(final long maxAllowedSpace) {
+    setMaxAllowedSpaceUsage(nativeHandle_, maxAllowedSpace);
+  }
+
+  /**
+   * Set the amount of buffer room each compaction should be able to leave.
+   * In other words, at its maximum disk space consumption, the compaction
+   * should still leave {@code compactionBufferSize} available on the disk so
+   * that other background functions may continue, such as logging and flushing.
+   *
+   * @param compactionBufferSize the amount of buffer room each compaction
+   *     should be able to leave.
+   */
+  public void setCompactionBufferSize(final long compactionBufferSize) {
+    setCompactionBufferSize(nativeHandle_, compactionBufferSize);
+  }
+
+  /**
+   * Determines if the total size of SST files exceeded the maximum allowed
+   * space usage.
+   *
+   * @return true when the maximum allowed space usage has been exceeded.
+   */
+  public boolean isMaxAllowedSpaceReached() {
+    return isMaxAllowedSpaceReached(nativeHandle_);
+  }
+
+  /**
+   * Determines if the total size of SST files as well as estimated size
+   * of ongoing compactions exceeds the maximum allowed space usage.
+   *
+   * @return true when the total size of SST files as well as estimated size
+   *     of ongoing compactions exceeds the maximum allowed space usage.
+   */
+  public boolean isMaxAllowedSpaceReachedIncludingCompactions() {
+    return isMaxAllowedSpaceReachedIncludingCompactions(nativeHandle_);
+  }
+
+  /**
+   * Get the total size of all tracked files.
+   *
+   * @return the total size of all tracked files.
+   */
+  public long getTotalSize() {
+    return getTotalSize(nativeHandle_);
+  }
+
+  /**
+   * Gets all tracked files and their corresponding sizes.
+   *
+   * @return a map containing all tracked files and their corresponding sizes.
+   */
+  public Map<String, Long> getTrackedFiles() {
+    return getTrackedFiles(nativeHandle_);
+  }
+
+  /**
+   * Gets the delete rate limit.
+   *
+   * @return the delete rate limit (in bytes per second).
+   */
+  public long getDeleteRateBytesPerSecond() {
+    return getDeleteRateBytesPerSecond(nativeHandle_);
+  }
+
+  /**
+   * Set the delete rate limit.
+   *
+   * Zero means disable delete rate limiting and delete files immediately.
+   *
+   * @param deleteRate the delete rate limit (in bytes per second).
+   */
+  public void setDeleteRateBytesPerSecond(final long deleteRate) {
+    setDeleteRateBytesPerSecond(nativeHandle_, deleteRate);
+  }
+
+  /**
+   * Get the trash/DB size ratio where new files will be deleted immediately.
+   *
+   * @return the trash/DB size ratio.
+   */
+  public double getMaxTrashDBRatio() {
+    return getMaxTrashDBRatio(nativeHandle_);
+  }
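A hedged sketch of the space-cap API above: fail writes once SST files would exceed 1 GiB, then inspect the tracked files when the cap is hit. The cap and path are arbitrary values for illustration:

import java.util.Map;
import org.rocksdb.*;

public class SpaceCapExample {
  static { RocksDB.loadLibrary(); }

  public static void main(String[] args) throws RocksDBException {
    try (final SstFileManager sstFileManager =
             new SstFileManager(Env.getDefault());
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setSstFileManager(sstFileManager);
         final RocksDB db = RocksDB.open(options, "/tmp/space-cap-db")) {
      sstFileManager.setMaxAllowedSpaceUsage(1024L * 1024 * 1024);  // 1 GiB
      if (sstFileManager.isMaxAllowedSpaceReached()) {
        final Map<String, Long> tracked = sstFileManager.getTrackedFiles();
        tracked.forEach((file, size) ->
            System.out.println(file + " -> " + size + " bytes"));
      }
    }
  }
}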
+
+  /**
+   * Gets the delete rate limit.
+   *
+   * @return the delete rate limit (in bytes per second).
+   */
+  public long getDeleteRateBytesPerSecond() {
+    return getDeleteRateBytesPerSecond(nativeHandle_);
+  }
+
+  /**
+   * Set the delete rate limit.
+   *
+   * Zero means disable delete rate limiting and delete files immediately.
+   *
+   * @param deleteRate the delete rate limit (in bytes per second).
+   */
+  public void setDeleteRateBytesPerSecond(final long deleteRate) {
+    setDeleteRateBytesPerSecond(nativeHandle_, deleteRate);
+  }
+
+  /**
+   * Get the trash/DB size ratio where new files will be deleted immediately.
+   *
+   * @return the trash/DB size ratio.
+   */
+  public double getMaxTrashDBRatio() {
+    return getMaxTrashDBRatio(nativeHandle_);
+  }
+
+  /**
+   * Set the trash/DB size ratio where new files will be deleted immediately.
+   *
+   * @param ratio the trash/DB size ratio.
+   */
+  public void setMaxTrashDBRatio(final double ratio) {
+    setMaxTrashDBRatio(nativeHandle_, ratio);
+  }
+
+  private native static long newSstFileManager(final long handle,
+      final long logger_handle, final long rateBytesPerSec,
+      final double maxTrashDbRatio, final long bytesMaxDeleteChunk)
+      throws RocksDBException;
+  private native void setMaxAllowedSpaceUsage(final long handle,
+      final long maxAllowedSpace);
+  private native void setCompactionBufferSize(final long handle,
+      final long compactionBufferSize);
+  private native boolean isMaxAllowedSpaceReached(final long handle);
+  private native boolean isMaxAllowedSpaceReachedIncludingCompactions(
+      final long handle);
+  private native long getTotalSize(final long handle);
+  private native Map<String, Long> getTrackedFiles(final long handle);
+  private native long getDeleteRateBytesPerSecond(final long handle);
+  private native void setDeleteRateBytesPerSecond(final long handle,
+      final long deleteRate);
+  private native double getMaxTrashDBRatio(final long handle);
+  private native void setMaxTrashDBRatio(final long handle, final double ratio);
+  @Override protected final native void disposeInternal(final long handle);
+}
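The next file adds SstFileMetaData. As a hedged illustration of how such metadata is typically consumed, the RocksDB#getLiveFilesMetaData() accessor assumed here comes from the same RocksJava line (its LiveFileMetaData entries extend SstFileMetaData) and is not part of this hunk:

    for (final LiveFileMetaData meta : db.getLiveFilesMetaData()) {
      // LiveFileMetaData extends SstFileMetaData in RocksJava (assumption).
      System.out.printf("%s/%s: %d bytes, %d entries, beingCompacted=%b%n",
          meta.path(), meta.fileName(), meta.size(), meta.numEntries(),
          meta.beingCompacted());
    }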
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java
new file mode 100644
index 0000000000..52e984dff2
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileMetaData.java
@@ -0,0 +1,150 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The metadata that describes an SST file.
+ */
+public class SstFileMetaData {
+  private final String fileName;
+  private final String path;
+  private final long size;
+  private final long smallestSeqno;
+  private final long largestSeqno;
+  private final byte[] smallestKey;
+  private final byte[] largestKey;
+  private final long numReadsSampled;
+  private final boolean beingCompacted;
+  private final long numEntries;
+  private final long numDeletions;
+
+  /**
+   * Called from JNI C++
+   */
+  protected SstFileMetaData(
+      final String fileName,
+      final String path,
+      final long size,
+      final long smallestSeqno,
+      final long largestSeqno,
+      final byte[] smallestKey,
+      final byte[] largestKey,
+      final long numReadsSampled,
+      final boolean beingCompacted,
+      final long numEntries,
+      final long numDeletions) {
+    this.fileName = fileName;
+    this.path = path;
+    this.size = size;
+    this.smallestSeqno = smallestSeqno;
+    this.largestSeqno = largestSeqno;
+    this.smallestKey = smallestKey;
+    this.largestKey = largestKey;
+    this.numReadsSampled = numReadsSampled;
+    this.beingCompacted = beingCompacted;
+    this.numEntries = numEntries;
+    this.numDeletions = numDeletions;
+  }
+
+  /**
+   * Get the name of the file.
+   *
+   * @return the name of the file.
+   */
+  public String fileName() {
+    return fileName;
+  }
+
+  /**
+   * Get the full path where the file is located.
+   *
+   * @return the full path
+   */
+  public String path() {
+    return path;
+  }
+
+  /**
+   * Get the file size in bytes.
+   *
+   * @return file size
+   */
+  public long size() {
+    return size;
+  }
+
+  /**
+   * Get the smallest sequence number in the file.
+   *
+   * @return the smallest sequence number
+   */
+  public long smallestSeqno() {
+    return smallestSeqno;
+  }
+
+  /**
+   * Get the largest sequence number in the file.
+   *
+   * @return the largest sequence number
+   */
+  public long largestSeqno() {
+    return largestSeqno;
+  }
+
+  /**
+   * Get the smallest user-defined key in the file.
+   *
+   * @return the smallest user-defined key
+   */
+  public byte[] smallestKey() {
+    return smallestKey;
+  }
+
+  /**
+   * Get the largest user-defined key in the file.
+   *
+   * @return the largest user-defined key
+   */
+  public byte[] largestKey() {
+    return largestKey;
+  }
+
+  /**
+   * Get the number of times the file has been read.
+   *
+   * @return the number of times the file has been read
+   */
+  public long numReadsSampled() {
+    return numReadsSampled;
+  }
+
+  /**
+   * Returns true if the file is currently being compacted.
+   *
+   * @return true if the file is currently being compacted, false otherwise.
+   */
+  public boolean beingCompacted() {
+    return beingCompacted;
+  }
+
+  /**
+   * Get the number of entries.
+   *
+   * @return the number of entries.
+   */
+  public long numEntries() {
+    return numEntries;
+  }
+
+  /**
+   * Get the number of deletions.
+   *
+   * @return the number of deletions.
+   */
+  public long numDeletions() {
+    return numDeletions;
+  }
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java
index 5f35f0f61d..447e41ea9d 100644
--- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/SstFileWriter.java
@@ -30,7 +30,8 @@ public class SstFileWriter extends RocksObject {
   public SstFileWriter(final EnvOptions envOptions, final Options options,
       final AbstractComparator<? extends AbstractSlice<?>> comparator) {
     super(newSstFileWriter(
-        envOptions.nativeHandle_, options.nativeHandle_, comparator.getNativeHandle()));
+        envOptions.nativeHandle_, options.nativeHandle_, comparator.nativeHandle_,
+        comparator.getComparatorType().getValue()));
   }
 
   /**
@@ -224,7 +225,7 @@ public void finish() throws RocksDBException {
 
   private native static long newSstFileWriter(
       final long envOptionsHandle, final long optionsHandle,
-      final long userComparatorHandle);
+      final long userComparatorHandle, final byte comparatorType);
   private native static long newSstFileWriter(final long envOptionsHandle,
       final long optionsHandle);
 
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StateType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StateType.java
new file mode 100644
index 0000000000..803456bb2d
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StateType.java
@@ -0,0 +1,53 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The type used to refer to a thread state.
+ *
+ * A state describes a lower-level action of a thread,
+ * such as reading / writing a file or waiting for a mutex.
+ */
+public enum StateType {
+  STATE_UNKNOWN((byte)0x0),
+  STATE_MUTEX_WAIT((byte)0x1);
+
+  private final byte value;
+
+  StateType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+ */ + byte getValue() { + return value; + } + + /** + * Get the State type from the internal representation value. + * + * @param value the internal representation value. + * + * @return the state type + * + * @throws IllegalArgumentException if the value does not match + * a StateType + */ + static StateType fromValue(final byte value) + throws IllegalArgumentException { + for (final StateType threadType : StateType.values()) { + if (threadType.value == value) { + return threadType; + } + } + throw new IllegalArgumentException( + "Unknown value for StateType: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Statistics.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Statistics.java index 10c072c897..0938a6d583 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Statistics.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Statistics.java @@ -117,6 +117,8 @@ public String getHistogramString(final HistogramType histogramType) { /** * Resets all ticker and histogram stats. + * + * @throws RocksDBException if an error occurs when resetting the statistics. */ public void reset() throws RocksDBException { assert(isOwningHandle()); @@ -126,6 +128,7 @@ public void reset() throws RocksDBException { /** * String representation of the statistic object. */ + @Override public String toString() { assert(isOwningHandle()); return toString(nativeHandle_); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java index 48cf8af88e..fb3f57150f 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -93,9 +93,9 @@ public void run() { statsCallback.histogramCallback(histogramType, histogramData); } } - - Thread.sleep(_statsCollectionInterval); } + + Thread.sleep(_statsCollectionInterval); } catch (final InterruptedException e) { Thread.currentThread().interrupt(); diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java index cc2a87c6a2..58504b84a2 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StatsLevel.java @@ -60,6 +60,6 @@ public static StatsLevel getStatsLevel(final byte value) { } } throw new IllegalArgumentException( - "Illegal value provided for InfoLogLevel."); + "Illegal value provided for StatsLevel."); } } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Status.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Status.java index d34b72c691..e633940c29 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Status.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Status.java @@ -54,6 +54,7 @@ public String getCodeString() { return builder.toString(); } + // should stay in sync with /include/rocksdb/status.h:Code and /java/rocksjni/portal.h:toJavaStatusCode public enum Code { Ok( (byte)0x0), NotFound( (byte)0x1), @@ -68,7 +69,8 @@ public enum Code { Aborted( (byte)0xA), Busy( (byte)0xB), Expired( (byte)0xC), - TryAgain( (byte)0xD); + TryAgain( (byte)0xD), + Undefined( (byte)0x7F); private final byte value; @@ -83,16 +85,30 @@ public static Code getCode(final byte value) { } } throw new IllegalArgumentException( - "Illegal value provided for Code."); + "Illegal value provided for Code (" + value + ")."); + 
    }
+
+    /**
+     * Returns the byte value of the enumeration's value.
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+      return value;
     }
   }
 
+  // should stay in sync with /include/rocksdb/status.h:SubCode and /java/rocksjni/portal.h:toJavaStatusSubCode
   public enum SubCode {
     None( (byte)0x0),
     MutexTimeout( (byte)0x1),
     LockTimeout( (byte)0x2),
     LockLimit( (byte)0x3),
-    MaxSubCode( (byte)0x7E);
+    NoSpace( (byte)0x4),
+    Deadlock( (byte)0x5),
+    StaleFile( (byte)0x6),
+    MemoryLimit( (byte)0x7),
+    Undefined( (byte)0x7F);
 
     private final byte value;
 
@@ -107,7 +123,16 @@ public static SubCode getSubCode(final byte value) {
       }
     }
     throw new IllegalArgumentException(
-        "Illegal value provided for SubCode.");
+        "Illegal value provided for SubCode (" + value + ").");
+    }
+
+    /**
+     * Returns the byte value of the enumeration's value.
+     *
+     * @return byte representation
+     */
+    public byte getValue() {
+      return value;
     }
   }
 }
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
index 85c36adc7c..978cad6ccf 100644
--- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/StringAppendOperator.java
@@ -11,9 +11,13 @@
  */
 public class StringAppendOperator extends MergeOperator {
   public StringAppendOperator() {
-    super(newSharedStringAppendOperator());
+    this(',');
   }
 
-  private native static long newSharedStringAppendOperator();
+  public StringAppendOperator(char delim) {
+    super(newSharedStringAppendOperator(delim));
+  }
+
+  private native static long newSharedStringAppendOperator(final char delim);
   @Override protected final native void disposeInternal(final long handle);
 }
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java
new file mode 100644
index 0000000000..45605063b5
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableFilter.java
@@ -0,0 +1,20 @@
+package org.rocksdb;
+
+/**
+ * Filter for iterating a table.
+ */
+public interface TableFilter {
+
+  /**
+   * A callback to determine whether relevant keys for this scan exist in a
+   * given table based on the table's properties. The callback is passed the
+   * properties of each table during iteration. If the callback returns false,
+   * the table will not be scanned. This option only affects Iterators and has
+   * no impact on point lookups.
+   *
+   * @param tableProperties the table properties.
+   *
+   * @return true if the table should be scanned, false otherwise.
+   */
+  boolean filter(final TableProperties tableProperties);
+}
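A sketch of implementing the TableFilter interface above to skip empty SST files during iteration; the ReadOptions#setTableFilter-style registration hook is an assumption about the surrounding RocksJava API, not shown in this patch:

    final TableFilter nonEmptyTablesOnly = new TableFilter() {
      @Override
      public boolean filter(final TableProperties tableProperties) {
        // Scan a table only if it contains at least one entry; assumes this
        // filter is registered via a ReadOptions#setTableFilter-style binding.
        return tableProperties.getNumEntries() > 0;
      }
    };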
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java
new file mode 100644
index 0000000000..5fe98da67b
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TableProperties.java
@@ -0,0 +1,365 @@
+package org.rocksdb;
+
+import java.util.Map;
+
+/**
+ * TableProperties contains read-only properties of its associated
+ * table.
+ */
+public class TableProperties {
+  private final long dataSize;
+  private final long indexSize;
+  private final long indexPartitions;
+  private final long topLevelIndexSize;
+  private final long indexKeyIsUserKey;
+  private final long indexValueIsDeltaEncoded;
+  private final long filterSize;
+  private final long rawKeySize;
+  private final long rawValueSize;
+  private final long numDataBlocks;
+  private final long numEntries;
+  private final long numDeletions;
+  private final long numMergeOperands;
+  private final long numRangeDeletions;
+  private final long formatVersion;
+  private final long fixedKeyLen;
+  private final long columnFamilyId;
+  private final long creationTime;
+  private final long oldestKeyTime;
+  private final byte[] columnFamilyName;
+  private final String filterPolicyName;
+  private final String comparatorName;
+  private final String mergeOperatorName;
+  private final String prefixExtractorName;
+  private final String propertyCollectorsNames;
+  private final String compressionName;
+  private final Map<String, String> userCollectedProperties;
+  private final Map<String, String> readableProperties;
+  private final Map<String, Long> propertiesOffsets;
+
+  /**
+   * Access is private as this will only be constructed from
+   * C++ via JNI.
+   */
+  private TableProperties(final long dataSize, final long indexSize,
+      final long indexPartitions, final long topLevelIndexSize,
+      final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded,
+      final long filterSize, final long rawKeySize, final long rawValueSize,
+      final long numDataBlocks, final long numEntries, final long numDeletions,
+      final long numMergeOperands, final long numRangeDeletions,
+      final long formatVersion, final long fixedKeyLen,
+      final long columnFamilyId, final long creationTime,
+      final long oldestKeyTime, final byte[] columnFamilyName,
+      final String filterPolicyName, final String comparatorName,
+      final String mergeOperatorName, final String prefixExtractorName,
+      final String propertyCollectorsNames, final String compressionName,
+      final Map<String, String> userCollectedProperties,
+      final Map<String, String> readableProperties,
+      final Map<String, Long> propertiesOffsets) {
+    this.dataSize = dataSize;
+    this.indexSize = indexSize;
+    this.indexPartitions = indexPartitions;
+    this.topLevelIndexSize = topLevelIndexSize;
+    this.indexKeyIsUserKey = indexKeyIsUserKey;
+    this.indexValueIsDeltaEncoded = indexValueIsDeltaEncoded;
+    this.filterSize = filterSize;
+    this.rawKeySize = rawKeySize;
+    this.rawValueSize = rawValueSize;
+    this.numDataBlocks = numDataBlocks;
+    this.numEntries = numEntries;
+    this.numDeletions = numDeletions;
+    this.numMergeOperands = numMergeOperands;
+    this.numRangeDeletions = numRangeDeletions;
+    this.formatVersion = formatVersion;
+    this.fixedKeyLen = fixedKeyLen;
+    this.columnFamilyId = columnFamilyId;
+    this.creationTime = creationTime;
+    this.oldestKeyTime = oldestKeyTime;
+    this.columnFamilyName = columnFamilyName;
+    this.filterPolicyName = filterPolicyName;
+    this.comparatorName = comparatorName;
+    this.mergeOperatorName = mergeOperatorName;
+    this.prefixExtractorName = prefixExtractorName;
+    this.propertyCollectorsNames = propertyCollectorsNames;
+    this.compressionName = compressionName;
+    this.userCollectedProperties = userCollectedProperties;
+    this.readableProperties = readableProperties;
+    this.propertiesOffsets = propertiesOffsets;
+  }
+
+  /**
+   * Get the total size of all data blocks.
+   *
+   * @return the total size of all data blocks.
+   */
+  public long getDataSize() {
+    return dataSize;
+  }
+
+  /**
+   * Get the size of index block.
+ * + * @return the size of index block. + */ + public long getIndexSize() { + return indexSize; + } + + /** + * Get the total number of index partitions + * if {@link IndexType#kTwoLevelIndexSearch} is used. + * + * @return the total number of index partitions. + */ + public long getIndexPartitions() { + return indexPartitions; + } + + /** + * Size of the top-level index + * if {@link IndexType#kTwoLevelIndexSearch} is used. + * + * @return the size of the top-level index. + */ + public long getTopLevelIndexSize() { + return topLevelIndexSize; + } + + /** + * Whether the index key is user key. + * Otherwise it includes 8 byte of sequence + * number added by internal key format. + * + * @return the index key + */ + public long getIndexKeyIsUserKey() { + return indexKeyIsUserKey; + } + + /** + * Whether delta encoding is used to encode the index values. + * + * @return whether delta encoding is used to encode the index values. + */ + public long getIndexValueIsDeltaEncoded() { + return indexValueIsDeltaEncoded; + } + + /** + * Get the size of filter block. + * + * @return the size of filter block. + */ + public long getFilterSize() { + return filterSize; + } + + /** + * Get the total raw key size. + * + * @return the total raw key size. + */ + public long getRawKeySize() { + return rawKeySize; + } + + /** + * Get the total raw value size. + * + * @return the total raw value size. + */ + public long getRawValueSize() { + return rawValueSize; + } + + /** + * Get the number of blocks in this table. + * + * @return the number of blocks in this table. + */ + public long getNumDataBlocks() { + return numDataBlocks; + } + + /** + * Get the number of entries in this table. + * + * @return the number of entries in this table. + */ + public long getNumEntries() { + return numEntries; + } + + /** + * Get the number of deletions in the table. + * + * @return the number of deletions in the table. + */ + public long getNumDeletions() { + return numDeletions; + } + + /** + * Get the number of merge operands in the table. + * + * @return the number of merge operands in the table. + */ + public long getNumMergeOperands() { + return numMergeOperands; + } + + /** + * Get the number of range deletions in this table. + * + * @return the number of range deletions in this table. + */ + public long getNumRangeDeletions() { + return numRangeDeletions; + } + + /** + * Get the format version, reserved for backward compatibility. + * + * @return the format version. + */ + public long getFormatVersion() { + return formatVersion; + } + + /** + * Get the length of the keys. + * + * @return 0 when the key is variable length, otherwise number of + * bytes for each key. + */ + public long getFixedKeyLen() { + return fixedKeyLen; + } + + /** + * Get the ID of column family for this SST file, + * corresponding to the column family identified by + * {@link #getColumnFamilyName()}. + * + * @return the id of the column family. + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * The time when the SST file was created. + * Since SST files are immutable, this is equivalent + * to last modified time. + * + * @return the created time. + */ + public long getCreationTime() { + return creationTime; + } + + /** + * Get the timestamp of the earliest key. + * + * @return 0 means unknown, otherwise the timestamp. + */ + public long getOldestKeyTime() { + return oldestKeyTime; + } + + /** + * Get the name of the column family with which this + * SST file is associated. 
+ * + * @return the name of the column family, or null if the + * column family is unknown. + */ + /*@Nullable*/ public byte[] getColumnFamilyName() { + return columnFamilyName; + } + + /** + * Get the name of the filter policy used in this table. + * + * @return the name of the filter policy, or null if + * no filter policy is used. + */ + /*@Nullable*/ public String getFilterPolicyName() { + return filterPolicyName; + } + + /** + * Get the name of the comparator used in this table. + * + * @return the name of the comparator. + */ + public String getComparatorName() { + return comparatorName; + } + + /** + * Get the name of the merge operator used in this table. + * + * @return the name of the merge operator, or null if no merge operator + * is used. + */ + /*@Nullable*/ public String getMergeOperatorName() { + return mergeOperatorName; + } + + /** + * Get the name of the prefix extractor used in this table. + * + * @return the name of the prefix extractor, or null if no prefix + * extractor is used. + */ + /*@Nullable*/ public String getPrefixExtractorName() { + return prefixExtractorName; + } + + /** + * Get the names of the property collectors factories used in this table. + * + * @return the names of the property collector factories separated + * by commas, e.g. {collector_name[1]},{collector_name[2]},... + */ + public String getPropertyCollectorsNames() { + return propertyCollectorsNames; + } + + /** + * Get the name of the compression algorithm used to compress the SST files. + * + * @return the name of the compression algorithm. + */ + public String getCompressionName() { + return compressionName; + } + + /** + * Get the user collected properties. + * + * @return the user collected properties. + */ + public Map getUserCollectedProperties() { + return userCollectedProperties; + } + + /** + * Get the readable properties. + * + * @return the readable properties. + */ + public Map getReadableProperties() { + return readableProperties; + } + + /** + * The offset of the value of each property in the file. + * + * @return the offset of each property. + */ + public Map getPropertiesOffsets() { + return propertiesOffsets; + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java new file mode 100644 index 0000000000..062df5889e --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadStatus.java @@ -0,0 +1,224 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+package org.rocksdb;
+
+import java.util.Map;
+
+public class ThreadStatus {
+  private final long threadId;
+  private final ThreadType threadType;
+  private final String dbName;
+  private final String cfName;
+  private final OperationType operationType;
+  private final long operationElapsedTime; // microseconds
+  private final OperationStage operationStage;
+  private final long operationProperties[];
+  private final StateType stateType;
+
+  /**
+   * Invoked from C++ via JNI
+   */
+  private ThreadStatus(final long threadId,
+      final byte threadTypeValue,
+      final String dbName,
+      final String cfName,
+      final byte operationTypeValue,
+      final long operationElapsedTime,
+      final byte operationStageValue,
+      final long[] operationProperties,
+      final byte stateTypeValue) {
+    this.threadId = threadId;
+    this.threadType = ThreadType.fromValue(threadTypeValue);
+    this.dbName = dbName;
+    this.cfName = cfName;
+    this.operationType = OperationType.fromValue(operationTypeValue);
+    this.operationElapsedTime = operationElapsedTime;
+    this.operationStage = OperationStage.fromValue(operationStageValue);
+    this.operationProperties = operationProperties;
+    this.stateType = StateType.fromValue(stateTypeValue);
+  }
+
+  /**
+   * Get the unique ID of the thread.
+   *
+   * @return the thread id
+   */
+  public long getThreadId() {
+    return threadId;
+  }
+
+  /**
+   * Get the type of the thread.
+   *
+   * @return the type of the thread.
+   */
+  public ThreadType getThreadType() {
+    return threadType;
+  }
+
+  /**
+   * The name of the DB instance that the thread is currently
+   * involved with.
+   *
+   * @return the name of the db, or null if the thread is not involved
+   *     in any DB operation.
+   */
+  /* @Nullable */ public String getDbName() {
+    return dbName;
+  }
+
+  /**
+   * The name of the Column Family that the thread is currently
+   * involved with.
+   *
+   * @return the name of the column family, or null if the thread is not
+   *     involved in any column family operation.
+   */
+  /* @Nullable */ public String getCfName() {
+    return cfName;
+  }
+
+  /**
+   * Get the operation (high-level action) that the current thread is involved
+   * with.
+   *
+   * @return the operation
+   */
+  public OperationType getOperationType() {
+    return operationType;
+  }
+
+  /**
+   * Get the elapsed time of the current thread operation in microseconds.
+   *
+   * @return the elapsed time
+   */
+  public long getOperationElapsedTime() {
+    return operationElapsedTime;
+  }
+
+  /**
+   * Get the current stage where the thread is involved in the current
+   * operation.
+   *
+   * @return the current stage of the current operation
+   */
+  public OperationStage getOperationStage() {
+    return operationStage;
+  }
+
+  /**
+   * Get the list of properties that describe some details about the current
+   * operation.
+   *
+   * Each field might have different meanings for different operations.
+   *
+   * @return the properties
+   */
+  public long[] getOperationProperties() {
+    return operationProperties;
+  }
+
+  /**
+   * Get the state (lower-level action) that the current thread is involved
+   * with.
+   *
+   * @return the state
+   */
+  public StateType getStateType() {
+    return stateType;
+  }
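A hedged monitoring sketch built on the getters above and the static helpers that follow; it assumes the matching Env#getThreadList() accessor from this RocksJava line and an enclosing method that declares throws RocksDBException:

    for (final ThreadStatus ts : db.getEnv().getThreadList()) {
      // Env#getThreadList() is an assumption; it surfaces ThreadStatus objects.
      System.out.printf("thread=%d type=%s db=%s op=%s elapsed=%s state=%s%n",
          ts.getThreadId(),
          ThreadStatus.getThreadTypeName(ts.getThreadType()),
          ts.getDbName(),
          ThreadStatus.getOperationName(ts.getOperationType()),
          ThreadStatus.microsToString(ts.getOperationElapsedTime()),
          ThreadStatus.getStateName(ts.getStateType()));
    }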
+
+  /**
+   * Get the name of the thread type.
+   *
+   * @param threadType the thread type
+   *
+   * @return the name of the thread type.
+   */
+  public static String getThreadTypeName(final ThreadType threadType) {
+    return getThreadTypeName(threadType.getValue());
+  }
+
+  /**
+   * Get the name of an operation given its type.
+   *
+   * @param operationType the type of operation.
+   *
+   * @return the name of the operation.
+   */
+  public static String getOperationName(final OperationType operationType) {
+    return getOperationName(operationType.getValue());
+  }
+
+  public static String microsToString(final long operationElapsedTime) {
+    return microsToStringNative(operationElapsedTime);
+  }
+
+  /**
+   * Obtain a human-readable string describing the specified operation stage.
+   *
+   * @param operationStage the stage of the operation.
+   *
+   * @return the description of the operation stage.
+   */
+  public static String getOperationStageName(
+      final OperationStage operationStage) {
+    return getOperationStageName(operationStage.getValue());
+  }
+
+  /**
+   * Obtain the name of the "i"th operation property of the
+   * specified operation.
+   *
+   * @param operationType the operation type.
+   * @param i the index of the operation property.
+   *
+   * @return the name of the operation property
+   */
+  public static String getOperationPropertyName(
+      final OperationType operationType, final int i) {
+    return getOperationPropertyName(operationType.getValue(), i);
+  }
+
+  /**
+   * Translate the "i"th property of the specified operation given
+   * a property value.
+   *
+   * @param operationType the operation type.
+   * @param operationProperties the operation properties.
+   *
+   * @return the property values.
+   */
+  public static Map<String, Long> interpretOperationProperties(
+      final OperationType operationType, final long[] operationProperties) {
+    return interpretOperationProperties(operationType.getValue(),
+        operationProperties);
+  }
+
+  /**
+   * Obtain the name of a state given its type.
+   *
+   * @param stateType the state type.
+   *
+   * @return the name of the state.
+   */
+  public static String getStateName(final StateType stateType) {
+    return getStateName(stateType.getValue());
+  }
+
+  private static native String getThreadTypeName(final byte threadTypeValue);
+  private static native String getOperationName(final byte operationTypeValue);
+  private static native String microsToStringNative(
+      final long operationElapsedTime);
+  private static native String getOperationStageName(
+      final byte operationStageTypeValue);
+  private static native String getOperationPropertyName(
+      final byte operationTypeValue, final int i);
+  private static native Map<String, Long> interpretOperationProperties(
+      final byte operationTypeValue, final long[] operationProperties);
+  private static native String getStateName(final byte stateTypeValue);
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java
new file mode 100644
index 0000000000..cc329f4425
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/ThreadType.java
@@ -0,0 +1,65 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * The type of a thread.
+ */
+public enum ThreadType {
+  /**
+   * RocksDB BG thread in high-pri thread pool.
+   */
+  HIGH_PRIORITY((byte)0x0),
+
+  /**
+   * RocksDB BG thread in low-pri thread pool.
+   */
+  LOW_PRIORITY((byte)0x1),
+
+  /**
+   * User thread (Non-RocksDB BG thread).
+   */
+  USER((byte)0x2),
+
+  /**
+   * RocksDB BG thread in bottom-pri thread pool.
+   */
+  BOTTOM_PRIORITY((byte)0x3);
+
+  private final byte value;
+
+  ThreadType(final byte value) {
+    this.value = value;
+  }
+
+  /**
+   * Get the internal representation value.
+   *
+   * @return the internal representation value.
+   */
+  byte getValue() {
+    return value;
+  }
+
+  /**
+   * Get the Thread type from the internal representation value.
+   *
+   * @param value the internal representation value.
+   *
+   * @return the thread type
+   *
+   * @throws IllegalArgumentException if the value does not match a ThreadType
+   */
+  static ThreadType fromValue(final byte value)
+      throws IllegalArgumentException {
+    for (final ThreadType threadType : ThreadType.values()) {
+      if (threadType.value == value) {
+        return threadType;
+      }
+    }
+    throw new IllegalArgumentException("Unknown value for ThreadType: " + value);
+  }
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TickerType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
index 948079c75a..551e366dc5 100644
--- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TickerType.java
@@ -5,6 +5,16 @@
 
 package org.rocksdb;
 
+/**
+ * The logical mapping of tickers defined in rocksdb::Tickers.
+ *
+ * Java byte value mappings don't align 1:1 to the C++ values. The C++
+ * rocksdb::Tickers enumeration type is uint32_t while the Java
+ * org.rocksdb.TickerType is byte; this causes mapping issues when a
+ * rocksdb::Tickers value is greater than 127 (0x7F), as no greater range is
+ * available through the jbyte JNI interface. To avoid breaking the interface
+ * in minor versions, value mappings for org.rocksdb.TickerType leverage the
+ * full byte range [-128 (-0x80), 127 (0x7F)]. Newer tickers added should
+ * descend into negative values until TICKER_ENUM_MAX reaches -128 (-0x80).
+ */
 public enum TickerType {
 
   /**
@@ -304,7 +314,8 @@ public enum TickerType {
   RATE_LIMIT_DELAY_MILLIS((byte) 0x37),
 
   /**
-   * Number of iterators currently open.
+   * Number of iterators created.
+   *
   */
   NO_ITERATORS((byte) 0x38),
 
@@ -465,8 +476,248 @@ public enum TickerType {
   */
   NUMBER_RATE_LIMITER_DRAINS((byte) 0x5C),
 
-  TICKER_ENUM_MAX((byte) 0x5D);
+  /**
+   * Number of internal keys skipped during iteration.
+   */
+  NUMBER_ITER_SKIP((byte) 0x5D),
+
+  /**
+   * Number of MultiGet keys found (vs number requested).
+   */
+  NUMBER_MULTIGET_KEYS_FOUND((byte) 0x5E),
+
+  // -0x01 to fixate the new value that incorrectly changed TICKER_ENUM_MAX
+  /**
+   * Number of iterators created.
+   */
+  NO_ITERATOR_CREATED((byte) -0x01),
+
+  /**
+   * Number of iterators deleted.
+   */
+  NO_ITERATOR_DELETED((byte) 0x60),
+
+  /**
+   * Deletions obsoleted before bottom level due to file gap optimization.
+   */
+  COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE((byte) 0x61),
+
+  /**
+   * If a compaction was cancelled in sfm to prevent ENOSPC.
+   */
+  COMPACTION_CANCELLED((byte) 0x62),
+
+  /**
+   * # of times bloom FullFilter has not avoided the reads.
+   */
+  BLOOM_FILTER_FULL_POSITIVE((byte) 0x63),
+
+  /**
+   * # of times bloom FullFilter has not avoided the reads and data actually
+   * exist.
+   */
+  BLOOM_FILTER_FULL_TRUE_POSITIVE((byte) 0x64),
+
+  /**
+   * BlobDB specific stats
+   * # of Put/PutTTL/PutUntil to BlobDB.
+   */
+  BLOB_DB_NUM_PUT((byte) 0x65),
+
+  /**
+   * # of Write to BlobDB.
+   */
+  BLOB_DB_NUM_WRITE((byte) 0x66),
+
+  /**
+   * # of Get to BlobDB.
+   */
+  BLOB_DB_NUM_GET((byte) 0x67),
+
+  /**
+   * # of MultiGet to BlobDB.
+   */
+  BLOB_DB_NUM_MULTIGET((byte) 0x68),
+
+  /**
+   * # of Seek/SeekToFirst/SeekToLast/SeekForPrev to BlobDB iterator.
+   */
+  BLOB_DB_NUM_SEEK((byte) 0x69),
+
+  /**
+   * # of Next to BlobDB iterator.
+   */
+  BLOB_DB_NUM_NEXT((byte) 0x6A),
+
+  /**
+   * # of Prev to BlobDB iterator.
+   */
+  BLOB_DB_NUM_PREV((byte) 0x6B),
+
+  /**
+   * # of keys written to BlobDB.
+   */
+  BLOB_DB_NUM_KEYS_WRITTEN((byte) 0x6C),
+
+  /**
+   * # of keys read from BlobDB.
+   */
+  BLOB_DB_NUM_KEYS_READ((byte) 0x6D),
+
+  /**
+   * # of bytes (key + value) written to BlobDB.
+   */
+  BLOB_DB_BYTES_WRITTEN((byte) 0x6E),
+
+  /**
+   * # of bytes (key + value) read from BlobDB.
+   */
+  BLOB_DB_BYTES_READ((byte) 0x6F),
+
+  /**
+   * # of keys written by BlobDB as non-TTL inlined value.
+   */
+  BLOB_DB_WRITE_INLINED((byte) 0x70),
+
+  /**
+   * # of keys written by BlobDB as TTL inlined value.
+   */
+  BLOB_DB_WRITE_INLINED_TTL((byte) 0x71),
+
+  /**
+   * # of keys written by BlobDB as non-TTL blob value.
+   */
+  BLOB_DB_WRITE_BLOB((byte) 0x72),
+
+  /**
+   * # of keys written by BlobDB as TTL blob value.
+   */
+  BLOB_DB_WRITE_BLOB_TTL((byte) 0x73),
+
+  /**
+   * # of bytes written to blob file.
+   */
+  BLOB_DB_BLOB_FILE_BYTES_WRITTEN((byte) 0x74),
+
+  /**
+   * # of bytes read from blob file.
+   */
+  BLOB_DB_BLOB_FILE_BYTES_READ((byte) 0x75),
+
+  /**
+   * # of times a blob file is synced.
+   */
+  BLOB_DB_BLOB_FILE_SYNCED((byte) 0x76),
+
+  /**
+   * # of blob index entries evicted from the base DB by the BlobDB compaction
+   * filter because of expiration.
+   */
+  BLOB_DB_BLOB_INDEX_EXPIRED_COUNT((byte) 0x77),
+
+  /**
+   * Size of blob index entries evicted from the base DB by the BlobDB
+   * compaction filter because of expiration.
+   */
+  BLOB_DB_BLOB_INDEX_EXPIRED_SIZE((byte) 0x78),
+
+  /**
+   * # of blob index entries evicted from the base DB by the BlobDB compaction
+   * filter because the corresponding file was deleted.
+   */
+  BLOB_DB_BLOB_INDEX_EVICTED_COUNT((byte) 0x79),
+
+  /**
+   * Size of blob index entries evicted from the base DB by the BlobDB
+   * compaction filter because the corresponding file was deleted.
+   */
+  BLOB_DB_BLOB_INDEX_EVICTED_SIZE((byte) 0x7A),
+
+  /**
+   * # of blob files being garbage collected.
+   */
+  BLOB_DB_GC_NUM_FILES((byte) 0x7B),
+
+  /**
+   * # of blob files generated by garbage collection.
+   */
+  BLOB_DB_GC_NUM_NEW_FILES((byte) 0x7C),
+
+  /**
+   * # of BlobDB garbage collection failures.
+   */
+  BLOB_DB_GC_FAILURES((byte) 0x7D),
+
+  /**
+   * # of keys dropped by BlobDB garbage collection because they had been
+   * overwritten.
+   */
+  BLOB_DB_GC_NUM_KEYS_OVERWRITTEN((byte) 0x7E),
+
+  /**
+   * # of keys dropped by BlobDB garbage collection because of expiration.
+   */
+  BLOB_DB_GC_NUM_KEYS_EXPIRED((byte) 0x7F),
+
+  /**
+   * # of keys relocated to new blob file by garbage collection.
+   */
+  BLOB_DB_GC_NUM_KEYS_RELOCATED((byte) -0x02),
+
+  /**
+   * # of bytes dropped by BlobDB garbage collection because they had been
+   * overwritten.
+   */
+  BLOB_DB_GC_BYTES_OVERWRITTEN((byte) -0x03),
+
+  /**
+   * # of bytes dropped by BlobDB garbage collection because of expiration.
+   */
+  BLOB_DB_GC_BYTES_EXPIRED((byte) -0x04),
+
+  /**
+   * # of bytes relocated to new blob file by garbage collection.
+   */
+  BLOB_DB_GC_BYTES_RELOCATED((byte) -0x05),
+
+  /**
+   * # of blob files evicted because BlobDB is full.
+   */
+  BLOB_DB_FIFO_NUM_FILES_EVICTED((byte) -0x06),
+
+  /**
+   * # of keys in the blob files evicted because BlobDB is full.
+   */
+  BLOB_DB_FIFO_NUM_KEYS_EVICTED((byte) -0x07),
+
+  /**
+   * # of bytes in the blob files evicted because BlobDB is full.
+   */
+  BLOB_DB_FIFO_BYTES_EVICTED((byte) -0x08),
+
+  /**
+   * These counters indicate a performance issue in WritePrepared transactions.
+   * We should not see them ticking much.
+   * # of times prepare_mutex_ is acquired in the fast path.
+ */ + TXN_PREPARE_MUTEX_OVERHEAD((byte) -0x09), + + /** + * # of times old_commit_map_mutex_ is acquired in the fast path. + */ + TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD((byte) -0x0A), + + /** + * # of times we checked a batch for duplicate keys. + */ + TXN_DUPLICATE_KEY_OVERHEAD((byte) -0x0B), + + /** + * # of times snapshot_mutex_ is acquired in the fast path. + */ + TXN_SNAPSHOT_MUTEX_OVERHEAD((byte) -0x0C), + + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; @@ -474,6 +725,13 @@ public enum TickerType { this.value = value; } + /** + * @deprecated Exposes internal value of native enum mappings. + * This method will be marked package private in the next major release. + * + * @return the internal representation + */ + @Deprecated public byte getValue() { return value; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java new file mode 100644 index 0000000000..dc8b5d6efb --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TimedEnv.java @@ -0,0 +1,30 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Timed environment. + */ +public class TimedEnv extends Env { + + /** + *

+   * <p>Creates a new environment that measures function call times for
+   * filesystem operations, reporting results to variables in PerfContext.</p>
+   *
+   * <p>The caller must delete the result when it is
+   * no longer needed.</p>
+   *
+   * @param baseEnv the base environment,
+   *     must remain live while the result is in use.
+   */
+  public TimedEnv(final Env baseEnv) {
+    super(createTimedEnv(baseEnv.nativeHandle_));
+  }
+
+  private static native long createTimedEnv(final long baseEnvHandle);
+  @Override protected final native void disposeInternal(final long handle);
+}
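A minimal sketch of wiring the new TimedEnv into a database; Options#setEnv is an existing RocksJava setter, while the path, option values, and enclosing method (assumed to declare throws RocksDBException) are illustrative:

    try (final TimedEnv timedEnv = new TimedEnv(Env.getDefault());
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setEnv(timedEnv);
         final RocksDB db = RocksDB.open(options, "/tmp/timed-db")) {
      // Filesystem calls made on behalf of this DB now report per-call
      // timings to PerfContext counters.
    }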

diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java
new file mode 100644
index 0000000000..657b263c6d
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceOptions.java
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * TraceOptions is used for
+ * {@link RocksDB#startTrace(TraceOptions, AbstractTraceWriter)}.
+ */
+public class TraceOptions {
+  private final long maxTraceFileSize;
+
+  public TraceOptions() {
+    this.maxTraceFileSize = 64L * 1024 * 1024 * 1024; // 64 GB
+  }
+
+  public TraceOptions(final long maxTraceFileSize) {
+    this.maxTraceFileSize = maxTraceFileSize;
+  }
+
+  /**
+   * To avoid the trace file size growing larger than the storage space,
+   * the user can set the max trace file size in bytes. Default is 64 GB.
+   *
+   * @return the max trace size
+   */
+  public long getMaxTraceFileSize() {
+    return maxTraceFileSize;
+  }
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java
new file mode 100644
index 0000000000..cb0234e9b2
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TraceWriter.java
@@ -0,0 +1,36 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+/**
+ * TraceWriter allows exporting RocksDB traces to any system,
+ * one operation at a time.
+ */
+public interface TraceWriter {
+
+  /**
+   * Write the data.
+   *
+   * @param data the data
+   *
+   * @throws RocksDBException if an error occurs whilst writing.
+   */
+  void write(final Slice data) throws RocksDBException;
+
+  /**
+   * Close the writer.
+   *
+   * @throws RocksDBException if an error occurs whilst closing the writer.
+   */
+  void closeWriter() throws RocksDBException;
+
+  /**
+   * Get the size of the file that this writer is writing to.
+   *
+   * @return the file size
+   */
+  long getFileSize();
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Transaction.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Transaction.java
new file mode 100644
index 0000000000..96f1143d4f
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/Transaction.java
@@ -0,0 +1,1868 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.List;
+
+/**
+ * Provides BEGIN/COMMIT/ROLLBACK transactions.
+ * + * To use transactions, you must first create either an + * {@link OptimisticTransactionDB} or a {@link TransactionDB} + * + * To create a transaction, use + * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)} or + * {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * + * It is up to the caller to synchronize access to this object. + * + * See samples/src/main/java/OptimisticTransactionSample.java and + * samples/src/main/java/TransactionSample.java for some simple + * examples. + */ +public class Transaction extends RocksObject { + + private final RocksDB parent; + + /** + * Intentionally package private + * as this is called from + * {@link OptimisticTransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * or {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)} + * + * @param parent This must be either {@link TransactionDB} or + * {@link OptimisticTransactionDB} + * @param transactionHandle The native handle to the underlying C++ + * transaction object + */ + Transaction(final RocksDB parent, final long transactionHandle) { + super(transactionHandle); + this.parent = parent; + } + + /** + * If a transaction has a snapshot set, the transaction will ensure that + * any keys successfully written(or fetched via {@link #getForUpdate}) have + * not been modified outside of this transaction since the time the snapshot + * was set. + * + * If a snapshot has not been set, the transaction guarantees that keys have + * not been modified since the time each key was first written (or fetched via + * {@link #getForUpdate}). + * + * Using {@link #setSnapshot()} will provide stricter isolation guarantees + * at the expense of potentially more transaction failures due to conflicts + * with other writes. + * + * Calling {@link #setSnapshot()} has no effect on keys written before this + * function has been called. + * + * {@link #setSnapshot()} may be called multiple times if you would like to + * change the snapshot used for different operations in this transaction. + * + * Calling {@link #setSnapshot()} will not affect the version of Data returned + * by get(...) methods. See {@link #get} for more details. + */ + public void setSnapshot() { + assert(isOwningHandle()); + setSnapshot(nativeHandle_); + } + + /** + * Similar to {@link #setSnapshot()}, but will not change the current snapshot + * until put/merge/delete/getForUpdate/multiGetForUpdate is called. + * By calling this function, the transaction will essentially call + * {@link #setSnapshot()} for you right before performing the next + * write/getForUpdate. + * + * Calling {@link #setSnapshotOnNextOperation()} will not affect what + * snapshot is returned by {@link #getSnapshot} until the next + * write/getForUpdate is executed. + * + * When the snapshot is created the notifier's snapshotCreated method will + * be called so that the caller can get access to the snapshot. + * + * This is an optimization to reduce the likelihood of conflicts that + * could occur in between the time {@link #setSnapshot()} is called and the + * first write/getForUpdate operation. i.e. this prevents the following + * race-condition: + * + * txn1->setSnapshot(); + * txn2->put("A", ...); + * txn2->commit(); + * txn1->getForUpdate(opts, "A", ...); * FAIL! 
+ */ + public void setSnapshotOnNextOperation() { + assert(isOwningHandle()); + setSnapshotOnNextOperation(nativeHandle_); + } + + /** + * Similar to {@link #setSnapshot()}, but will not change the current snapshot + * until put/merge/delete/getForUpdate/multiGetForUpdate is called. + * By calling this function, the transaction will essentially call + * {@link #setSnapshot()} for you right before performing the next + * write/getForUpdate. + * + * Calling {@link #setSnapshotOnNextOperation()} will not affect what + * snapshot is returned by {@link #getSnapshot} until the next + * write/getForUpdate is executed. + * + * When the snapshot is created the + * {@link AbstractTransactionNotifier#snapshotCreated(Snapshot)} method will + * be called so that the caller can get access to the snapshot. + * + * This is an optimization to reduce the likelihood of conflicts that + * could occur in between the time {@link #setSnapshot()} is called and the + * first write/getForUpdate operation. i.e. this prevents the following + * race-condition: + * + * txn1->setSnapshot(); + * txn2->put("A", ...); + * txn2->commit(); + * txn1->getForUpdate(opts, "A", ...); * FAIL! + * + * @param transactionNotifier A handler for receiving snapshot notifications + * for the transaction + * + */ + public void setSnapshotOnNextOperation( + final AbstractTransactionNotifier transactionNotifier) { + assert(isOwningHandle()); + setSnapshotOnNextOperation(nativeHandle_, transactionNotifier.nativeHandle_); + } + + /** + * Returns the Snapshot created by the last call to {@link #setSnapshot()}. + * + * REQUIRED: The returned Snapshot is only valid up until the next time + * {@link #setSnapshot()}/{@link #setSnapshotOnNextOperation()} is called, + * {@link #clearSnapshot()} is called, or the Transaction is deleted. + * + * @return The snapshot or null if there is no snapshot + */ + public Snapshot getSnapshot() { + assert(isOwningHandle()); + final long snapshotNativeHandle = getSnapshot(nativeHandle_); + if(snapshotNativeHandle == 0) { + return null; + } else { + final Snapshot snapshot = new Snapshot(snapshotNativeHandle); + return snapshot; + } + } + + /** + * Clears the current snapshot (i.e. no snapshot will be 'set') + * + * This removes any snapshot that currently exists or is set to be created + * on the next update operation ({@link #setSnapshotOnNextOperation()}). + * + * Calling {@link #clearSnapshot()} has no effect on keys written before this + * function has been called. + * + * If a reference to a snapshot was retrieved via {@link #getSnapshot()}, it + * will no longer be valid and should be discarded after a call to + * {@link #clearSnapshot()}. + */ + public void clearSnapshot() { + assert(isOwningHandle()); + clearSnapshot(nativeHandle_); + } + + /** + * Prepare the current transaction for 2PC + */ + void prepare() throws RocksDBException { + //TODO(AR) consider a Java'ish version of this function, which returns an AutoCloseable (commit) + assert(isOwningHandle()); + prepare(nativeHandle_); + } + + /** + * Write all batched keys to the db atomically. + * + * Returns OK on success. + * + * May return any error status that could be returned by DB:Write(). + * + * If this transaction was created by an {@link OptimisticTransactionDB} + * Status::Busy() may be returned if the transaction could not guarantee + * that there are no write conflicts. Status::TryAgain() may be returned + * if the memtable history size is not large enough + * (See max_write_buffer_number_to_maintain). 
+ * + * If this transaction was created by a {@link TransactionDB}, + * Status::Expired() may be returned if this transaction has lived for + * longer than {@link TransactionOptions#getExpiration()}. + * + * @throws RocksDBException if an error occurs when committing the transaction + */ + public void commit() throws RocksDBException { + assert(isOwningHandle()); + commit(nativeHandle_); + } + + /** + * Discard all batched writes in this transaction. + * + * @throws RocksDBException if an error occurs when rolling back the transaction + */ + public void rollback() throws RocksDBException { + assert(isOwningHandle()); + rollback(nativeHandle_); + } + + /** + * Records the state of the transaction for future calls to + * {@link #rollbackToSavePoint()}. + * + * May be called multiple times to set multiple save points. + * + * @throws RocksDBException if an error occurs whilst setting a save point + */ + public void setSavePoint() throws RocksDBException { + assert(isOwningHandle()); + setSavePoint(nativeHandle_); + } + + /** + * Undo all operations in this transaction (put, merge, delete, putLogData) + * since the most recent call to {@link #setSavePoint()} and removes the most + * recent {@link #setSavePoint()}. + * + * If there is no previous call to {@link #setSavePoint()}, + * returns Status::NotFound() + * + * @throws RocksDBException if an error occurs when rolling back to a save point + */ + public void rollbackToSavePoint() throws RocksDBException { + assert(isOwningHandle()); + rollbackToSavePoint(nativeHandle_); + } + + /** + * This function is similar to + * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. + * + * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect + * what is read from the DB but will NOT change which keys are read from this + * transaction (the keys in this transaction do not yet belong to any snapshot + * and will be fetched regardless). + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying native + * library. + */ + public byte[] get(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final byte[] key) throws RocksDBException { + assert(isOwningHandle()); + return get(nativeHandle_, readOptions.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * This function is similar to + * {@link RocksDB#get(ReadOptions, byte[])} except it will + * also read pending changes in this transaction. + * Currently, this function will return Status::MergeInProgress if the most + * recent write to the queried key in this batch is a Merge. + * + * If {@link ReadOptions#snapshot()} is not set, the current version of the + * key will be read. Calling {@link #setSnapshot()} does not affect the + * version of the data returned. 
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param key the key to retrieve the value for.
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying native
+   *     library.
+   */
+  public byte[] get(final ReadOptions readOptions, final byte[] key)
+      throws RocksDBException {
+    assert(isOwningHandle());
+    return get(nativeHandle_, readOptions.nativeHandle_, key, key.length);
+  }
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGet(ReadOptions, List, List)} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandles {@link java.util.List} containing
+   *     {@link org.rocksdb.ColumnFamilyHandle} instances.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   * @throws IllegalArgumentException thrown if the size of passed keys is not
+   *     equal to the amount of passed column family handles.
+   */
+  public byte[][] multiGet(final ReadOptions readOptions,
+      final List<ColumnFamilyHandle> columnFamilyHandles,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    // Check if key size equals cfList size. If not, an exception must be
+    // thrown; otherwise a segmentation fault happens.
+    if (keys.length != columnFamilyHandles.size()) {
+      throw new IllegalArgumentException(
+          "For each key there must be a ColumnFamilyHandle.");
+    }
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+    final long[] cfHandles = new long[columnFamilyHandles.size()];
+    for (int i = 0; i < columnFamilyHandles.size(); i++) {
+      cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+        keys, cfHandles);
+  }
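A hedged usage sketch of the transactional multi-get above; txn, the column family handles, and the UTF_8 static import are hypothetical, and the enclosing method is assumed to declare throws RocksDBException:

    try (final ReadOptions readOptions = new ReadOptions()) {
      // Assumes: import java.util.Arrays; and
      // import static java.nio.charset.StandardCharsets.UTF_8;
      final byte[][] values = txn.multiGet(readOptions,
          Arrays.asList(cfHandleA, cfHandleB),
          new byte[][]{ "k1".getBytes(UTF_8), "k2".getBytes(UTF_8) });
      // values[i] is null where keys[i] was not found.
    }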
+
+  /**
+   * This function is similar to
+   * {@link RocksDB#multiGet(ReadOptions, List)} except it will
+   * also read pending changes in this transaction.
+   * Currently, this function will return Status::MergeInProgress if the most
+   * recent write to the queried key in this batch is a Merge.
+   *
+   * If {@link ReadOptions#snapshot()} is not set, the current version of the
+   * key will be read. Calling {@link #setSnapshot()} does not affect the
+   * version of the data returned.
+   *
+   * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect
+   * what is read from the DB but will NOT change which keys are read from this
+   * transaction (the keys in this transaction do not yet belong to any snapshot
+   * and will be fetched regardless).
+   *
+   * @param readOptions Read options.
+   * @param keys of keys for which values need to be retrieved.
+   *
+   * @return Array of values, one for each key
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+   */
+  public byte[][] multiGet(final ReadOptions readOptions,
+      final byte[][] keys) throws RocksDBException {
+    assert(isOwningHandle());
+    if(keys.length == 0) {
+      return new byte[0][0];
+    }
+
+    return multiGet(nativeHandle_, readOptions.nativeHandle_,
+        keys);
+  }
+
+  /**
+   * Read this key and ensure that this transaction will only
+   * be able to be committed if this key is not written outside this
+   * transaction after it has first been read (or after the snapshot if a
+   * snapshot is set in this transaction). The transaction behavior is the
+   * same regardless of whether the key exists or not.
+   *
+   * Note: Currently, this function will return Status::MergeInProgress
+   * if the most recent write to the queried key in this batch is a Merge.
+   *
+   * The values returned by this function are similar to
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
+   * If value==nullptr, then this function will not read any data, but will
+   * still ensure that this key cannot be written to by outside of this
+   * transaction.
+   *
+   * If this transaction was created by an {@link OptimisticTransactionDB},
+   * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}
+   * could cause {@link #commit()} to fail. Otherwise, it could return any error
+   * that could be returned by
+   * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}.
+   *
+   * If this transaction was created on a {@link TransactionDB}, an
+   * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+   * when:
+   * {@link Status.Code#Busy} if there is a write conflict,
+   * {@link Status.Code#TimedOut} if a lock could not be acquired,
+   * {@link Status.Code#TryAgain} if the memtable history size is not large
+   * enough. See
+   * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+   * {@link Status.Code#MergeInProgress} if merge operations cannot be
+   * resolved.
+   *
+   * @param readOptions Read options.
+   * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+   *     instance
+   * @param key the key to retrieve the value for.
+   * @param exclusive true if the transaction should have exclusive access to
+   *     the key, otherwise false for shared access.
+   * @param do_validate true if it should validate the snapshot before doing the read
+   *
+   * @return a byte array storing the value associated with the input key if
+   *     any. null if it does not find the specified key.
+   *
+   * @throws RocksDBException thrown if error happens in underlying
+   *     native library.
+ */ + public byte[] getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final boolean exclusive, + final boolean do_validate) throws RocksDBException { + assert (isOwningHandle()); + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_, exclusive, do_validate); + } + + /** + * Same as + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean, boolean)} + * with do_validate=true. + * + * @param readOptions Read options. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value for. + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] getForUpdate(final ReadOptions readOptions, + final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final boolean exclusive) throws RocksDBException { + assert(isOwningHandle()); + return getForUpdate(nativeHandle_, readOptions.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_, exclusive, true /*do_validate*/); + } + + /** + * Read this key and ensure that this transaction will only + * be able to be committed if this key is not written outside this + * transaction after it has first been read (or after the snapshot if a + * snapshot is set in this transaction). The transaction behavior is the + * same regardless of whether the key exists or not. + * + * Note: Currently, this function will return Status::MergeInProgress + * if the most recent write to the queried key in this batch is a Merge. + * + * The values returned by this function are similar to + * {@link RocksDB#get(ReadOptions, byte[])}. + * If value==nullptr, then this function will not read any data, but will + * still ensure that this key cannot be written to by outside of this + * transaction. + * + * If this transaction was created on an {@link OptimisticTransactionDB}, + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} + * could cause {@link #commit()} to fail. Otherwise, it could return any error + * that could be returned by + * {@link RocksDB#get(ReadOptions, byte[])}. + * + * If this transaction was created on a {@link TransactionDB}, an + * {@link RocksDBException} may be thrown with an accompanying {@link Status} + * when: + * {@link Status.Code#Busy} if there is a write conflict, + * {@link Status.Code#TimedOut} if a lock could not be acquired, + * {@link Status.Code#TryAgain} if the memtable history size is not large + * enough. See + * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()} + * {@link Status.Code#MergeInProgress} if merge operations cannot be + * resolved. + * + * @param readOptions Read options. + * @param key the key to retrieve the value for. + * @param exclusive true if the transaction should have exclusive access to + * the key, otherwise false for shared access. + * + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */
+ public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key,
+ final boolean exclusive) throws RocksDBException {
+ assert(isOwningHandle());
+ return getForUpdate(
+ nativeHandle_, readOptions.nativeHandle_, key, key.length, exclusive, true /*do_validate*/);
+ }
+
+ /**
+ * A multi-key version of
+ * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}.
+ *
+ * @param readOptions Read options.
+ * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle}
+ * instances
+ * @param keys the keys to retrieve the values for.
+ *
+ * @return Array of values, one for each key
+ *
+ * @throws RocksDBException thrown if error happens in underlying
+ * native library.
+ * @throws IllegalArgumentException thrown if the size of passed keys is not
+ * equal to the number of passed column family handles.
+ */
+ public byte[][] multiGetForUpdate(final ReadOptions readOptions,
+ final List<ColumnFamilyHandle> columnFamilyHandles,
+ final byte[][] keys) throws RocksDBException {
+ assert(isOwningHandle());
+ // Check that the number of keys matches the number of column family
+ // handles; otherwise a segmentation fault occurs in the native code.
+ if (keys.length != columnFamilyHandles.size()) {
+ throw new IllegalArgumentException(
+ "For each key there must be a ColumnFamilyHandle.");
+ }
+ if (keys.length == 0) {
+ return new byte[0][0];
+ }
+ final long[] cfHandles = new long[columnFamilyHandles.size()];
+ for (int i = 0; i < columnFamilyHandles.size(); i++) {
+ cfHandles[i] = columnFamilyHandles.get(i).nativeHandle_;
+ }
+ return multiGetForUpdate(nativeHandle_, readOptions.nativeHandle_,
+ keys, cfHandles);
+ }
+
+ /**
+ * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}.
+ *
+ * @param readOptions Read options.
+ * @param keys the keys to retrieve the values for.
+ *
+ * @return Array of values, one for each key
+ *
+ * @throws RocksDBException thrown if error happens in underlying
+ * native library.
+ */
+ public byte[][] multiGetForUpdate(final ReadOptions readOptions,
+ final byte[][] keys) throws RocksDBException {
+ assert(isOwningHandle());
+ if (keys.length == 0) {
+ return new byte[0][0];
+ }
+
+ return multiGetForUpdate(nativeHandle_,
+ readOptions.nativeHandle_, keys);
+ }
+
+ /**
+ * Returns an iterator that will iterate on all keys in the default
+ * column family including both keys in the DB and uncommitted keys in this
+ * transaction.
+ *
+ * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+ * from the DB but will NOT change which keys are read from this transaction
+ * (the keys in this transaction do not yet belong to any snapshot and will be
+ * fetched regardless).
+ *
+ * Caller is responsible for calling {@link RocksIterator#close()} on
+ * the returned Iterator.
+ *
+ * The returned iterator is only valid until {@link #commit()},
+ * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called.
+ *
+ * @param readOptions Read options.
+ *
+ * @return instance of iterator object.
+ */
+ public RocksIterator getIterator(final ReadOptions readOptions) {
+ assert(isOwningHandle());
+ return new RocksIterator(parent, getIterator(nativeHandle_,
+ readOptions.nativeHandle_));
+ }
+
+ /**
+ * Returns an iterator that will iterate on all keys in the given
+ * column family, including both keys in the DB and uncommitted keys in this
+ * transaction.
+ *
+ * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
+ * from the DB but will NOT change which keys are read from this transaction
+ * (the keys in this transaction do not yet belong to any snapshot and will be
+ * fetched regardless).
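+ *
+ * <p>A minimal sketch (assuming an open transaction {@code txn}; the
+ * {@code process} call is a hypothetical placeholder):
+ * <pre>{@code
+ * try (RocksIterator it = txn.getIterator(readOptions, cfHandle)) {
+ *   for (it.seekToFirst(); it.isValid(); it.next()) {
+ *     process(it.key(), it.value()); // sees DB keys and uncommitted keys
+ *   }
+ * }
+ * }</pre>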
+ *
+ * Caller is responsible for calling {@link RocksIterator#close()} on
+ * the returned Iterator.
+ *
+ * The returned iterator is only valid until {@link #commit()},
+ * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called.
+ *
+ * @param readOptions Read options.
+ * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle}
+ * instance
+ *
+ * @return instance of iterator object.
+ */
+ public RocksIterator getIterator(final ReadOptions readOptions,
+ final ColumnFamilyHandle columnFamilyHandle) {
+ assert(isOwningHandle());
+ return new RocksIterator(parent, getIterator(nativeHandle_,
+ readOptions.nativeHandle_, columnFamilyHandle.nativeHandle_));
+ }
+
+ /**
+ * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param columnFamilyHandle The column family to put the key/value into
+ * @param key the specified key to be inserted.
+ * @param value the value associated with the specified key.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value,
+ final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ put(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_,
+ assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #put(ColumnFamilyHandle, byte[], byte[], boolean)}
+ * with assume_tracked=false.
+ */
+ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+ final byte[] value) throws RocksDBException {
+ assert(isOwningHandle());
+ put(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ /**
+ * Similar to {@link RocksDB#put(byte[], byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param key the specified key to be inserted.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void put(final byte[] key, final byte[] value)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ put(nativeHandle_, key, key.length, value, value.length);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #put(ColumnFamilyHandle, byte[], byte[])} but allows
+ * you to specify the key and value in several parts that will be
+ * concatenated together.
+ *
+ * @param columnFamilyHandle The column family to put the key/value into
+ * @param keyParts the specified key to be inserted.
+ * @param valueParts the value associated with the specified key.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts,
+ final byte[][] valueParts, final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ put(nativeHandle_, keyParts, keyParts.length, valueParts, valueParts.length,
+ columnFamilyHandle.nativeHandle_, assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #put(ColumnFamilyHandle, byte[][], byte[][], boolean)}
+ * with assume_tracked=false.
+ */
+ public void put(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[][] keyParts, final byte[][] valueParts)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ put(nativeHandle_, keyParts, keyParts.length, valueParts, valueParts.length,
+ columnFamilyHandle.nativeHandle_, /*assume_tracked*/ false);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #put(byte[], byte[])} but allows
+ * you to specify the key and value in several parts that will be
+ * concatenated together.
+ *
+ * @param keyParts the specified key to be inserted.
+ * @param valueParts the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void put(final byte[][] keyParts, final byte[][] valueParts)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ put(nativeHandle_, keyParts, keyParts.length, valueParts,
+ valueParts.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param columnFamilyHandle The column family to merge the key/value into
+ * @param key the specified key to be merged.
+ * @param value the value associated with the specified key.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+ final byte[] value, final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ merge(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_,
+ assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #merge(ColumnFamilyHandle, byte[], byte[], boolean)}
+ * with assume_tracked=false.
+ */
+ public void merge(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[] key, final byte[] value) throws RocksDBException {
+ assert(isOwningHandle());
+ merge(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ /**
+ * Similar to {@link RocksDB#merge(byte[], byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param key the specified key to be merged.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void merge(final byte[] key, final byte[] value)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ merge(nativeHandle_, key, key.length, value, value.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param key the specified key to be deleted.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+ final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_, assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #delete(ColumnFamilyHandle, byte[], boolean)}
+ * with assume_tracked=false.
+ */
+ public void delete(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[] key) throws RocksDBException {
+ assert(isOwningHandle());
+ delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ /**
+ * Similar to {@link RocksDB#delete(byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param key the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void delete(final byte[] key) throws RocksDBException {
+ assert(isOwningHandle());
+ delete(nativeHandle_, key, key.length);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #delete(ColumnFamilyHandle, byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param keyParts the specified key to be deleted.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts,
+ final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ delete(
+ nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_, assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #delete(ColumnFamilyHandle, byte[][], boolean)}
+ * with assume_tracked=false.
+ */
+ public void delete(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[][] keyParts) throws RocksDBException {
+ assert(isOwningHandle());
+ delete(nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #delete(byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param keyParts the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void delete(final byte[][] keyParts) throws RocksDBException {
+ assert(isOwningHandle());
+ delete(nativeHandle_, keyParts, keyParts.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#singleDelete(ColumnFamilyHandle, byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
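+ *
+ * <p>A hedged sketch; {@code singleDelete} is intended for keys that are
+ * written at most once and never overwritten or merged:
+ * <pre>{@code
+ * txn.put(cfHandle, key, value);   // key written exactly once
+ * txn.singleDelete(cfHandle, key); // removes that single version
+ * }</pre>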
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param key the specified key to be deleted.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key,
+ final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_, assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #singleDelete(ColumnFamilyHandle, byte[], boolean)}
+ * with assume_tracked=false.
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ /**
+ * Similar to {@link RocksDB#singleDelete(byte[])}, but
+ * will also perform conflict checking on the keys being written.
+ *
+ * If this Transaction was created on an {@link OptimisticTransactionDB},
+ * these functions should always succeed.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, an
+ * {@link RocksDBException} may be thrown with an accompanying {@link Status}
+ * when:
+ * {@link Status.Code#Busy} if there is a write conflict,
+ * {@link Status.Code#TimedOut} if a lock could not be acquired,
+ * {@link Status.Code#TryAgain} if the memtable history size is not large
+ * enough. See
+ * {@link ColumnFamilyOptions#maxWriteBufferNumberToMaintain()}
+ *
+ * @param key the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final byte[] key) throws RocksDBException {
+ assert(isOwningHandle());
+ singleDelete(nativeHandle_, key, key.length);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #singleDelete(ColumnFamilyHandle, byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param keyParts the specified key to be deleted.
+ * @param assume_tracked true when it is expected that the key is already
+ * tracked by this transaction
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts,
+ final boolean assume_tracked) throws RocksDBException {
+ assert (isOwningHandle());
+ singleDelete(
+ nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_, assume_tracked);
+ }
+
+ /*
+ * Same as
+ * {@link #singleDelete(ColumnFamilyHandle, byte[][], boolean)}
+ * with assume_tracked=false.
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ singleDelete(nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_,
+ /*assume_tracked*/ false);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #singleDelete(byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param keyParts the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ @Experimental("Performance optimization for a very specific workload")
+ public void singleDelete(final byte[][] keyParts) throws RocksDBException {
+ assert(isOwningHandle());
+ singleDelete(nativeHandle_, keyParts, keyParts.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
+ *
+ * Unlike {@link #put(ColumnFamilyHandle, byte[], byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param columnFamilyHandle The column family to put the key/value into
+ * @param key the specified key to be inserted.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void putUntracked(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[] key, final byte[] value) throws RocksDBException {
+ assert(isOwningHandle());
+ putUntracked(nativeHandle_, key, key.length, value, value.length,
+ columnFamilyHandle.nativeHandle_);
+ }
+
+ /**
+ * Similar to {@link RocksDB#put(byte[], byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
+ *
+ * Unlike {@link #put(byte[], byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param key the specified key to be inserted.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void putUntracked(final byte[] key, final byte[] value)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ putUntracked(nativeHandle_, key, key.length, value, value.length);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #putUntracked(ColumnFamilyHandle, byte[], byte[])} but
+ * allows you to specify the key and value in several parts that will be
+ * concatenated together.
+ *
+ * @param columnFamilyHandle The column family to put the key/value into
+ * @param keyParts the specified key to be inserted.
+ * @param valueParts the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void putUntracked(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[][] keyParts, final byte[][] valueParts)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+ valueParts.length, columnFamilyHandle.nativeHandle_);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #putUntracked(byte[], byte[])} but
+ * allows you to specify the key and value in several parts that will be
+ * concatenated together.
+ *
+ * @param keyParts the specified key to be inserted.
+ * @param valueParts the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void putUntracked(final byte[][] keyParts, final byte[][] valueParts)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ putUntracked(nativeHandle_, keyParts, keyParts.length, valueParts,
+ valueParts.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
+ *
+ * Unlike {@link #merge(ColumnFamilyHandle, byte[], byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param columnFamilyHandle The column family to merge the key/value into
+ * @param key the specified key to be merged.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[] key, final byte[] value) throws RocksDBException {
+ assert(isOwningHandle());
+ mergeUntracked(nativeHandle_, key, key.length, value, value.length,
+ columnFamilyHandle.nativeHandle_);
+ }
+
+ /**
+ * Similar to {@link RocksDB#merge(byte[], byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
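+ *
+ * <p>A minimal sketch (assuming a merge operator is configured on the DB):
+ * <pre>{@code
+ * txn.mergeUntracked(key, delta); // queued in the write batch, this key is
+ *                                 // not tracked for conflict checking
+ * txn.commit();                   // the merge is applied here
+ * }</pre>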
+ *
+ * Unlike {@link #merge(byte[], byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param key the specified key to be merged.
+ * @param value the value associated with the specified key.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void mergeUntracked(final byte[] key, final byte[] value)
+ throws RocksDBException {
+ assert(isOwningHandle());
+ mergeUntracked(nativeHandle_, key, key.length, value, value.length);
+ }
+
+ /**
+ * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
+ *
+ * Unlike {@link #delete(ColumnFamilyHandle, byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param key the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[] key) throws RocksDBException {
+ assert(isOwningHandle());
+ deleteUntracked(nativeHandle_, key, key.length,
+ columnFamilyHandle.nativeHandle_);
+ }
+
+ /**
+ * Similar to {@link RocksDB#delete(byte[])},
+ * but operates on the transaction's write batch. This write will only happen
+ * if this transaction gets committed successfully.
+ *
+ * Unlike {@link #delete(byte[])}, no conflict
+ * checking will be performed for this key.
+ *
+ * If this Transaction was created on a {@link TransactionDB}, this function
+ * will still acquire locks necessary to make sure this write doesn't cause
+ * conflicts in other transactions; this may cause a {@link RocksDBException}
+ * with associated {@link Status.Code#Busy}.
+ *
+ * @param key the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void deleteUntracked(final byte[] key) throws RocksDBException {
+ assert(isOwningHandle());
+ deleteUntracked(nativeHandle_, key, key.length);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #deleteUntracked(ColumnFamilyHandle, byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param columnFamilyHandle The column family to delete the key/value from
+ * @param keyParts the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle,
+ final byte[][] keyParts) throws RocksDBException {
+ assert(isOwningHandle());
+ deleteUntracked(nativeHandle_, keyParts, keyParts.length,
+ columnFamilyHandle.nativeHandle_);
+ }
+
+ //TODO(AR) refactor if we implement org.rocksdb.SliceParts in future
+ /**
+ * Similar to {@link #deleteUntracked(byte[])} but allows
+ * you to specify the key in several parts that will be
+ * concatenated together.
+ *
+ * @param keyParts the specified key to be deleted.
+ *
+ * @throws RocksDBException when one of the TransactionalDB conditions
+ * described above occurs, or in the case of an unexpected error
+ */
+ public void deleteUntracked(final byte[][] keyParts) throws RocksDBException {
+ assert(isOwningHandle());
+ deleteUntracked(nativeHandle_, keyParts, keyParts.length);
+ }
+
+ /**
+ * Similar to {@link WriteBatch#putLogData(byte[])}.
+ *
+ * @param blob binary object to be inserted
+ */
+ public void putLogData(final byte[] blob) {
+ assert(isOwningHandle());
+ putLogData(nativeHandle_, blob, blob.length);
+ }
+
+ /**
+ * By default, all put/merge/delete operations will be indexed in the
+ * transaction so that get/getForUpdate/getIterator can search for these
+ * keys.
+ *
+ * If the caller does not want to fetch the keys about to be written,
+ * they may want to avoid indexing as a performance optimization.
+ * Calling {@link #disableIndexing()} will turn off indexing for all future
+ * put/merge/delete operations until {@link #enableIndexing()} is called.
+ *
+ * If a key is put/merge/deleted after {@link #disableIndexing()} is called
+ * and then is fetched via get/getForUpdate/getIterator, the result of the
+ * fetch is undefined.
+ */
+ public void disableIndexing() {
+ assert(isOwningHandle());
+ disableIndexing(nativeHandle_);
+ }
+
+ /**
+ * Re-enables indexing after a previous call to {@link #disableIndexing()}
+ */
+ public void enableIndexing() {
+ assert(isOwningHandle());
+ enableIndexing(nativeHandle_);
+ }
+
+ /**
+ * Returns the number of distinct keys being tracked by this transaction.
+ * If this transaction was created by a {@link TransactionDB}, this is the
+ * number of keys that are currently locked by this transaction.
+ * If this transaction was created by an {@link OptimisticTransactionDB},
+ * this is the number of keys that need to be checked for conflicts at commit
+ * time.
+ *
+ * @return the number of distinct keys being tracked by this transaction
+ */
+ public long getNumKeys() {
+ assert(isOwningHandle());
+ return getNumKeys(nativeHandle_);
+ }
+
+ /**
+ * Returns the number of puts that have been applied to this
+ * transaction so far.
+ *
+ * @return the number of puts that have been applied to this transaction
+ */
+ public long getNumPuts() {
+ assert(isOwningHandle());
+ return getNumPuts(nativeHandle_);
+ }
+
+ /**
+ * Returns the number of deletes that have been applied to this
+ * transaction so far.
+ *
+ * @return the number of deletes that have been applied to this transaction
+ */
+ public long getNumDeletes() {
+ assert(isOwningHandle());
+ return getNumDeletes(nativeHandle_);
+ }
+
+ /**
+ * Returns the number of merges that have been applied to this
+ * transaction so far.
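+ *
+ * <p>Together with {@link #getNumPuts()} and {@link #getNumDeletes()} this
+ * can be used to inspect a transaction, e.g. (illustrative):
+ * <pre>{@code
+ * txn.put(k1, v1);
+ * txn.merge(k2, delta);
+ * txn.delete(k3);
+ * // now getNumPuts() == 1, getNumMerges() == 1, getNumDeletes() == 1
+ * }</pre>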
+ * + * @return the number of merges that have been applied to this transaction + */ + public long getNumMerges() { + assert(isOwningHandle()); + return getNumMerges(nativeHandle_); + } + + /** + * Returns the elapsed time in milliseconds since this Transaction began. + * + * @return the elapsed time in milliseconds since this transaction began. + */ + public long getElapsedTime() { + assert(isOwningHandle()); + return getElapsedTime(nativeHandle_); + } + + /** + * Fetch the underlying write batch that contains all pending changes to be + * committed. + * + * Note: You should not write or delete anything from the batch directly and + * should only use the functions in the {@link Transaction} class to + * write to this transaction. + * + * @return The write batch + */ + public WriteBatchWithIndex getWriteBatch() { + assert(isOwningHandle()); + final WriteBatchWithIndex writeBatchWithIndex = + new WriteBatchWithIndex(getWriteBatch(nativeHandle_)); + return writeBatchWithIndex; + } + + /** + * Change the value of {@link TransactionOptions#getLockTimeout()} + * (in milliseconds) for this transaction. + * + * Has no effect on OptimisticTransactions. + * + * @param lockTimeout the timeout (in milliseconds) for locks used by this + * transaction. + */ + public void setLockTimeout(final long lockTimeout) { + assert(isOwningHandle()); + setLockTimeout(nativeHandle_, lockTimeout); + } + + /** + * Return the WriteOptions that will be used during {@link #commit()}. + * + * @return the WriteOptions that will be used + */ + public WriteOptions getWriteOptions() { + assert(isOwningHandle()); + final WriteOptions writeOptions = + new WriteOptions(getWriteOptions(nativeHandle_)); + return writeOptions; + } + + /** + * Reset the WriteOptions that will be used during {@link #commit()}. + * + * @param writeOptions The new WriteOptions + */ + public void setWriteOptions(final WriteOptions writeOptions) { + assert(isOwningHandle()); + setWriteOptions(nativeHandle_, writeOptions.nativeHandle_); + } + + /** + * If this key was previously fetched in this transaction using + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will tell + * the transaction that it no longer needs to do any conflict checking + * for this key. + * + * If a key has been fetched N times via + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will only have an + * effect if it is also called N times. If this key has been written to in + * this transaction, {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} + * will have no effect. + * + * If {@link #setSavePoint()} has been called after the + * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}, + * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will not have any + * effect. + * + * If this Transaction was created by an {@link OptimisticTransactionDB}, + * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} can affect + * whether this key is conflict checked at commit time. + * If this Transaction was created by a {@link TransactionDB}, + * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} may release + * any held locks for this key. 
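+ *
+ * <p>A minimal sketch of pairing the two calls; {@code needsWrite} is a
+ * hypothetical predicate:
+ * <pre>{@code
+ * byte[] v = txn.getForUpdate(readOptions, cfHandle, key, true);
+ * if (!needsWrite(v)) {
+ *   txn.undoGetForUpdate(cfHandle, key); // stop conflict checking for key
+ * }
+ * }</pre>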
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value for. + */ + public void undoGetForUpdate(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) { + assert(isOwningHandle()); + undoGetForUpdate(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + /** + * If this key was previously fetched in this transaction using + * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling + * {@link #undoGetForUpdate(byte[])} will tell + * the transaction that it no longer needs to do any conflict checking + * for this key. + * + * If a key has been fetched N times via + * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ + * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then + * {@link #undoGetForUpdate(byte[])} will only have an + * effect if it is also called N times. If this key has been written to in + * this transaction, {@link #undoGetForUpdate(byte[])} + * will have no effect. + * + * If {@link #setSavePoint()} has been called after the + * {@link #getForUpdate(ReadOptions, byte[], boolean)}, + * {@link #undoGetForUpdate(byte[])} will not have any + * effect. + * + * If this Transaction was created by an {@link OptimisticTransactionDB}, + * calling {@link #undoGetForUpdate(byte[])} can affect + * whether this key is conflict checked at commit time. + * If this Transaction was created by a {@link TransactionDB}, + * calling {@link #undoGetForUpdate(byte[])} may release + * any held locks for this key. + * + * @param key the key to retrieve the value for. + */ + public void undoGetForUpdate(final byte[] key) { + assert(isOwningHandle()); + undoGetForUpdate(nativeHandle_, key, key.length); + } + + /** + * Adds the keys from the WriteBatch to the transaction + * + * @param writeBatch The write batch to read from + * + * @throws RocksDBException if an error occurs whilst rebuilding from the + * write batch. + */ + public void rebuildFromWriteBatch(final WriteBatch writeBatch) + throws RocksDBException { + assert(isOwningHandle()); + rebuildFromWriteBatch(nativeHandle_, writeBatch.nativeHandle_); + } + + /** + * Get the Commit time Write Batch. + * + * @return the commit time write batch. + */ + public WriteBatch getCommitTimeWriteBatch() { + assert(isOwningHandle()); + final WriteBatch writeBatch = + new WriteBatch(getCommitTimeWriteBatch(nativeHandle_)); + return writeBatch; + } + + /** + * Set the log number. + * + * @param logNumber the log number + */ + public void setLogNumber(final long logNumber) { + assert(isOwningHandle()); + setLogNumber(nativeHandle_, logNumber); + } + + /** + * Get the log number. + * + * @return the log number + */ + public long getLogNumber() { + assert(isOwningHandle()); + return getLogNumber(nativeHandle_); + } + + /** + * Set the name of the transaction. + * + * @param transactionName the name of the transaction + * + * @throws RocksDBException if an error occurs when setting the transaction + * name. + */ + public void setName(final String transactionName) throws RocksDBException { + assert(isOwningHandle()); + setName(nativeHandle_, transactionName); + } + + /** + * Get the name of the transaction. + * + * @return the name of the transaction + */ + public String getName() { + assert(isOwningHandle()); + return getName(nativeHandle_); + } + + /** + * Get the ID of the transaction. + * + * @return the ID of the transaction. 
+ */
+ public long getID() {
+ assert(isOwningHandle());
+ return getID(nativeHandle_);
+ }
+
+ /**
+ * Determine if a deadlock has been detected.
+ *
+ * @return true if a deadlock has been detected.
+ */
+ public boolean isDeadlockDetect() {
+ assert(isOwningHandle());
+ return isDeadlockDetect(nativeHandle_);
+ }
+
+ /**
+ * Get the list of waiting transactions.
+ *
+ * @return The list of waiting transactions.
+ */
+ public WaitingTransactions getWaitingTxns() {
+ assert(isOwningHandle());
+ return getWaitingTxns(nativeHandle_);
+ }
+
+ /**
+ * Get the execution status of the transaction.
+ *
+ * NOTE: The execution status of an Optimistic Transaction
+ * never changes. This is only useful for non-optimistic transactions!
+ *
+ * @return The execution status of the transaction
+ */
+ public TransactionState getState() {
+ assert(isOwningHandle());
+ return TransactionState.getTransactionState(
+ getState(nativeHandle_));
+ }
+
+ /**
+ * The globally unique id with which the transaction is identified. This id
+ * might or might not be set depending on the implementation. Similarly the
+ * implementation decides the point in lifetime of a transaction at which it
+ * assigns the id. Although currently it is the case, the id is not guaranteed
+ * to remain the same across restarts.
+ *
+ * @return the transaction id.
+ */
+ @Experimental("NOTE: Experimental feature")
+ public long getId() {
+ assert(isOwningHandle());
+ return getId(nativeHandle_);
+ }
+
+ public enum TransactionState {
+ STARTED((byte)0),
+ AWAITING_PREPARE((byte)1),
+ PREPARED((byte)2),
+ AWAITING_COMMIT((byte)3),
+ COMMITED((byte)4),
+ AWAITING_ROLLBACK((byte)5),
+ ROLLEDBACK((byte)6),
+ LOCKS_STOLEN((byte)7);
+
+ private final byte value;
+
+ TransactionState(final byte value) {
+ this.value = value;
+ }
+
+ /**
+ * Get TransactionState by byte value.
+ *
+ * @param value byte representation of TransactionState.
+ *
+ * @return {@link org.rocksdb.Transaction.TransactionState} instance.
+ * @throws java.lang.IllegalArgumentException if an invalid
+ * value is provided.
+ */
+ public static TransactionState getTransactionState(final byte value) {
+ for (final TransactionState transactionState : TransactionState.values()) {
+ if (transactionState.value == value) {
+ return transactionState;
+ }
+ }
+ throw new IllegalArgumentException(
+ "Illegal value provided for TransactionState.");
+ }
+ }
+
+ /**
+ * Called from C++ native method {@link #getWaitingTxns(long)}
+ * to construct a WaitingTransactions object.
+ *
+ * @param columnFamilyId The id of the {@link ColumnFamilyHandle}
+ * @param key The key
+ * @param transactionIds The transaction ids
+ *
+ * @return The waiting transactions
+ */
+ private WaitingTransactions newWaitingTransactions(
+ final long columnFamilyId, final String key,
+ final long[] transactionIds) {
+ return new WaitingTransactions(columnFamilyId, key, transactionIds);
+ }
+
+ public static class WaitingTransactions {
+ private final long columnFamilyId;
+ private final String key;
+ private final long[] transactionIds;
+
+ private WaitingTransactions(final long columnFamilyId, final String key,
+ final long[] transactionIds) {
+ this.columnFamilyId = columnFamilyId;
+ this.key = key;
+ this.transactionIds = transactionIds;
+ }
+
+ /**
+ * Get the Column Family ID.
+ *
+ * @return The column family ID
+ */
+ public long getColumnFamilyId() {
+ return columnFamilyId;
+ }
+
+ /**
+ * Get the key on which the transactions are waiting.
+ * + * @return The key + */ + public String getKey() { + return key; + } + + /** + * Get the IDs of the waiting transactions. + * + * @return The IDs of the waiting transactions + */ + public long[] getTransactionIds() { + return transactionIds; + } + } + + private native void setSnapshot(final long handle); + private native void setSnapshotOnNextOperation(final long handle); + private native void setSnapshotOnNextOperation(final long handle, + final long transactionNotifierHandle); + private native long getSnapshot(final long handle); + private native void clearSnapshot(final long handle); + private native void prepare(final long handle) throws RocksDBException; + private native void commit(final long handle) throws RocksDBException; + private native void rollback(final long handle) throws RocksDBException; + private native void setSavePoint(final long handle) throws RocksDBException; + private native void rollbackToSavePoint(final long handle) + throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, + final byte key[], final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, + final byte key[], final int keyLen) throws RocksDBException; + private native byte[][] multiGet(final long handle, + final long readOptionsHandle, final byte[][] keys, + final long[] columnFamilyHandles) throws RocksDBException; + private native byte[][] multiGet(final long handle, + final long readOptionsHandle, final byte[][] keys) + throws RocksDBException; + private native byte[] getForUpdate(final long handle, final long readOptionsHandle, + final byte key[], final int keyLength, final long columnFamilyHandle, final boolean exclusive, + final boolean do_validate) throws RocksDBException; + private native byte[] getForUpdate(final long handle, final long readOptionsHandle, + final byte key[], final int keyLen, final boolean exclusive, final boolean do_validate) + throws RocksDBException; + private native byte[][] multiGetForUpdate(final long handle, + final long readOptionsHandle, final byte[][] keys, + final long[] columnFamilyHandles) throws RocksDBException; + private native byte[][] multiGetForUpdate(final long handle, + final long readOptionsHandle, final byte[][] keys) + throws RocksDBException; + private native long getIterator(final long handle, + final long readOptionsHandle); + private native long getIterator(final long handle, + final long readOptionsHandle, final long columnFamilyHandle); + private native void put(final long handle, final byte[] key, final int keyLength, + final byte[] value, final int valueLength, final long columnFamilyHandle, + final boolean assume_tracked) throws RocksDBException; + private native void put(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void put(final long handle, final byte[][] keys, final int keysLength, + final byte[][] values, final int valuesLength, final long columnFamilyHandle, + final boolean assume_tracked) throws RocksDBException; + private native void put(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength) + throws RocksDBException; + private native void merge(final long handle, final byte[] key, final int keyLength, + final byte[] value, final int valueLength, final long columnFamilyHandle, + final boolean assume_tracked) throws RocksDBException; + 
private native void merge(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void delete(final long handle, final byte[] key, final int keyLength, + final long columnFamilyHandle, final boolean assume_tracked) throws RocksDBException; + private native void delete(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void delete(final long handle, final byte[][] keys, final int keysLength, + final long columnFamilyHandle, final boolean assume_tracked) throws RocksDBException; + private native void delete(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void singleDelete(final long handle, final byte[] key, final int keyLength, + final long columnFamilyHandle, final boolean assume_tracked) throws RocksDBException; + private native void singleDelete(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void singleDelete(final long handle, final byte[][] keys, final int keysLength, + final long columnFamilyHandle, final boolean assume_tracked) throws RocksDBException; + private native void singleDelete(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void putUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void putUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void putUntracked(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength, + final long columnFamilyHandle) throws RocksDBException; + private native void putUntracked(final long handle, final byte[][] keys, + final int keysLength, final byte[][] values, final int valuesLength) + throws RocksDBException; + private native void mergeUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength, + final long columnFamilyHandle) throws RocksDBException; + private native void mergeUntracked(final long handle, final byte[] key, + final int keyLength, final byte[] value, final int valueLength) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[] key, + final int keyLength, final long columnFamilyHandle) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[] key, + final int keyLength) throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[][] keys, + final int keysLength, final long columnFamilyHandle) + throws RocksDBException; + private native void deleteUntracked(final long handle, final byte[][] keys, + final int keysLength) throws RocksDBException; + private native void putLogData(final long handle, final byte[] blob, + final int blobLength); + private native void disableIndexing(final long handle); + private native void enableIndexing(final long handle); + private native long getNumKeys(final long handle); + private native long getNumPuts(final long handle); + private native long getNumDeletes(final long handle); + private native long getNumMerges(final long handle); + private native long getElapsedTime(final long handle); + private native long getWriteBatch(final long 
handle);
+ private native void setLockTimeout(final long handle, final long lockTimeout);
+ private native long getWriteOptions(final long handle);
+ private native void setWriteOptions(final long handle,
+ final long writeOptionsHandle);
+ private native void undoGetForUpdate(final long handle, final byte[] key,
+ final int keyLength, final long columnFamilyHandle);
+ private native void undoGetForUpdate(final long handle, final byte[] key,
+ final int keyLength);
+ private native void rebuildFromWriteBatch(final long handle,
+ final long writeBatchHandle) throws RocksDBException;
+ private native long getCommitTimeWriteBatch(final long handle);
+ private native void setLogNumber(final long handle, final long logNumber);
+ private native long getLogNumber(final long handle);
+ private native void setName(final long handle, final String name)
+ throws RocksDBException;
+ private native String getName(final long handle);
+ private native long getID(final long handle);
+ private native boolean isDeadlockDetect(final long handle);
+ private native WaitingTransactions getWaitingTxns(final long handle);
+ private native byte getState(final long handle);
+ private native long getId(final long handle);
+
+ @Override protected final native void disposeInternal(final long handle);
+}
diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java
new file mode 100644
index 0000000000..a1a09cf963
--- /dev/null
+++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDB.java
@@ -0,0 +1,404 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Database with Transaction support
+ */
+public class TransactionDB extends RocksDB
+ implements TransactionalDB<TransactionDBOptions> {
+
+ private TransactionDBOptions transactionDbOptions_;
+
+ /**
+ * Private constructor.
+ *
+ * @param nativeHandle The native handle of the C++ TransactionDB object
+ */
+ private TransactionDB(final long nativeHandle) {
+ super(nativeHandle);
+ }
+
+ /**
+ * Open a TransactionDB, similar to {@link RocksDB#open(Options, String)}.
+ *
+ * @param options {@link org.rocksdb.Options} instance.
+ * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions}
+ * instance.
+ * @param path the path to the rocksdb.
+ *
+ * @return a {@link TransactionDB} instance on success, null if the specified
+ * {@link TransactionDB} cannot be opened.
+ *
+ * @throws RocksDBException if an error occurs whilst opening the database.
+ */
+ public static TransactionDB open(final Options options,
+ final TransactionDBOptions transactionDbOptions, final String path)
+ throws RocksDBException {
+ final TransactionDB tdb = new TransactionDB(open(options.nativeHandle_,
+ transactionDbOptions.nativeHandle_, path));
+
+ // when a non-default Options instance is used, keeping a reference to it
+ // in RocksDB prevents Java from garbage-collecting it during the
+ // lifetime of the currently-created RocksDB.
+ tdb.storeOptionsInstance(options);
+ tdb.storeTransactionDbOptions(transactionDbOptions);
+
+ return tdb;
+ }
+
+ /**
+ * Open a TransactionDB, similar to
+ * {@link RocksDB#open(DBOptions, String, List, List)}.
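+ *
+ * <p>A minimal sketch of opening with just the default column family (the
+ * path and option values are illustrative):
+ * <pre>{@code
+ * List<ColumnFamilyDescriptor> descriptors = Arrays.asList(
+ *     new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
+ * List<ColumnFamilyHandle> handles = new ArrayList<>();
+ * try (DBOptions dbOptions = new DBOptions().setCreateIfMissing(true);
+ *      TransactionDBOptions txnDbOptions = new TransactionDBOptions();
+ *      TransactionDB db = TransactionDB.open(
+ *          dbOptions, txnDbOptions, "/tmp/txn-db", descriptors, handles)) {
+ *   // use db ...
+ * }
+ * }</pre>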
+ * + * @param dbOptions {@link org.rocksdb.DBOptions} instance. + * @param transactionDbOptions {@link org.rocksdb.TransactionDBOptions} + * instance. + * @param path the path to the rocksdb. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * + * @return a {@link TransactionDB} instance on success, null if the specified + * {@link TransactionDB} can not be opened. + * + * @throws RocksDBException if an error occurs whilst opening the database. + */ + public static TransactionDB open(final DBOptions dbOptions, + final TransactionDBOptions transactionDbOptions, + final String path, + final List columnFamilyDescriptors, + final List columnFamilyHandles) + throws RocksDBException { + + final byte[][] cfNames = new byte[columnFamilyDescriptors.size()][]; + final long[] cfOptionHandles = new long[columnFamilyDescriptors.size()]; + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + final ColumnFamilyDescriptor cfDescriptor = columnFamilyDescriptors + .get(i); + cfNames[i] = cfDescriptor.columnFamilyName(); + cfOptionHandles[i] = cfDescriptor.columnFamilyOptions().nativeHandle_; + } + + final long[] handles = open(dbOptions.nativeHandle_, + transactionDbOptions.nativeHandle_, path, cfNames, cfOptionHandles); + final TransactionDB tdb = new TransactionDB(handles[0]); + + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + tdb.storeOptionsInstance(dbOptions); + tdb.storeTransactionDbOptions(transactionDbOptions); + + for (int i = 1; i < handles.length; i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(tdb, handles[i])); + } + + return tdb; + } + + /** + * This is similar to {@link #close()} except that it + * throws an exception if any error occurs. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + * + * @throws RocksDBException if an error occurs whilst closing. + */ + public void closeE() throws RocksDBException { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } finally { + disposeInternal(); + } + } + } + + /** + * This is similar to {@link #closeE()} except that it + * silently ignores any errors. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. 
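The closeE()/close() pair documented here leaves WAL syncing to the caller. A hedged shutdown sketch (not part of the upstream diff), assuming a db opened as in the previous example:

// flush the WAL with an empty, synced batch before closing, as the javadoc advises
try (final WriteOptions syncOptions = new WriteOptions();
     final WriteBatch emptyBatch = new WriteBatch()) {
  syncOptions.setSync(true);
  db.write(syncOptions, emptyBatch);
}
db.closeE(); // unlike close(), propagates failures as RocksDBException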
+ */ + @Override + public void close() { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } catch (final RocksDBException e) { + // silently ignore the error report + } finally { + disposeInternal(); + } + } + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_)); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final TransactionOptions transactionOptions) { + return new Transaction(this, beginTransaction(nativeHandle_, + writeOptions.nativeHandle_, transactionOptions.nativeHandle_)); + } + + // TODO(AR) consider having beingTransaction(... oldTransaction) set a + // reference count inside Transaction, so that we can always call + // Transaction#close but the object is only disposed when there are as many + // closes as beginTransaction. Makes the try-with-resources paradigm easier for + // java developers + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final Transaction oldTransaction) { + final long jtxnHandle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxnHandle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions, + final TransactionOptions transactionOptions, + final Transaction oldTransaction) { + final long jtxn_handle = beginTransaction_withOld(nativeHandle_, + writeOptions.nativeHandle_, transactionOptions.nativeHandle_, + oldTransaction.nativeHandle_); + + // RocksJava relies on the assumption that + // we do not allocate a new Transaction object + // when providing an old_txn + assert(jtxn_handle == oldTransaction.nativeHandle_); + + return oldTransaction; + } + + public Transaction getTransactionByName(final String transactionName) { + final long jtxnHandle = getTransactionByName(nativeHandle_, transactionName); + if(jtxnHandle == 0) { + return null; + } + + final Transaction txn = new Transaction(this, jtxnHandle); + + // this instance doesn't own the underlying C++ object + txn.disOwnNativeHandle(); + + return txn; + } + + public List getAllPreparedTransactions() { + final long[] jtxnHandles = getAllPreparedTransactions(nativeHandle_); + + final List txns = new ArrayList<>(); + for(final long jtxnHandle : jtxnHandles) { + final Transaction txn = new Transaction(this, jtxnHandle); + + // this instance doesn't own the underlying C++ object + txn.disOwnNativeHandle(); + + txns.add(txn); + } + return txns; + } + + public static class KeyLockInfo { + private final String key; + private final long[] transactionIDs; + private final boolean exclusive; + + public KeyLockInfo(final String key, final long transactionIDs[], + final boolean exclusive) { + this.key = key; + this.transactionIDs = transactionIDs; + this.exclusive = exclusive; + } + + /** + * Get the key. + * + * @return the key + */ + public String getKey() { + return key; + } + + /** + * Get the Transaction IDs. + * + * @return the Transaction IDs. + */ + public long[] getTransactionIDs() { + return transactionIDs; + } + + /** + * Get the Lock status. + * + * @return true if the lock is exclusive, false if the lock is shared. 
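The oldTransaction overloads above exist purely to avoid reallocation. A sketch of the reuse pattern (not part of the upstream diff; keyOf/valueOf are hypothetical byte[] helpers):

Transaction txn = db.beginTransaction(writeOptions);
try {
  for (int i = 0; i < 1_000; i++) {
    txn.put(keyOf(i), valueOf(i));
    txn.commit();
    // documented contract: the same object is reinitialized, not reallocated
    txn = db.beginTransaction(writeOptions, txn);
  }
} finally {
  txn.close();
}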
+ */ + public boolean isExclusive() { + return exclusive; + } + } + + /** + * Returns map of all locks held. + * + * @return a map of all the locks held. + */ + public Map getLockStatusData() { + return getLockStatusData(nativeHandle_); + } + + /** + * Called from C++ native method {@link #getDeadlockInfoBuffer(long)} + * to construct a DeadlockInfo object. + * + * @param transactionID The transaction id + * @param columnFamilyId The id of the {@link ColumnFamilyHandle} + * @param waitingKey the key that we are waiting on + * @param exclusive true if the lock is exclusive, false if the lock is shared + * + * @return The waiting transactions + */ + private DeadlockInfo newDeadlockInfo( + final long transactionID, final long columnFamilyId, + final String waitingKey, final boolean exclusive) { + return new DeadlockInfo(transactionID, columnFamilyId, + waitingKey, exclusive); + } + + public static class DeadlockInfo { + private final long transactionID; + private final long columnFamilyId; + private final String waitingKey; + private final boolean exclusive; + + private DeadlockInfo(final long transactionID, final long columnFamilyId, + final String waitingKey, final boolean exclusive) { + this.transactionID = transactionID; + this.columnFamilyId = columnFamilyId; + this.waitingKey = waitingKey; + this.exclusive = exclusive; + } + + /** + * Get the Transaction ID. + * + * @return the transaction ID + */ + public long getTransactionID() { + return transactionID; + } + + /** + * Get the Column Family ID. + * + * @return The column family ID + */ + public long getColumnFamilyId() { + return columnFamilyId; + } + + /** + * Get the key that we are waiting on. + * + * @return the key that we are waiting on + */ + public String getWaitingKey() { + return waitingKey; + } + + /** + * Get the Lock status. + * + * @return true if the lock is exclusive, false if the lock is shared. 
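getLockStatusData() above is useful for diagnostics. A sketch (not part of the upstream diff), assuming the generic type lost in this rendering is Map<Long, KeyLockInfo> as in upstream RocksJava:

final java.util.Map<Long, TransactionDB.KeyLockInfo> locks = db.getLockStatusData();
for (final java.util.Map.Entry<Long, TransactionDB.KeyLockInfo> entry : locks.entrySet()) {
  final TransactionDB.KeyLockInfo info = entry.getValue();
  System.out.printf("key=%s exclusive=%b holders=%s%n", info.getKey(),
      info.isExclusive(), java.util.Arrays.toString(info.getTransactionIDs()));
}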
+ */ + public boolean isExclusive() { + return exclusive; + } + } + + public static class DeadlockPath { + final DeadlockInfo[] path; + final boolean limitExceeded; + + public DeadlockPath(final DeadlockInfo[] path, final boolean limitExceeded) { + this.path = path; + this.limitExceeded = limitExceeded; + } + + public boolean isEmpty() { + return path.length == 0 && !limitExceeded; + } + } + + public DeadlockPath[] getDeadlockInfoBuffer() { + return getDeadlockInfoBuffer(nativeHandle_); + } + + public void setDeadlockInfoBufferSize(final int targetSize) { + setDeadlockInfoBufferSize(nativeHandle_, targetSize); + } + + private void storeTransactionDbOptions( + final TransactionDBOptions transactionDbOptions) { + this.transactionDbOptions_ = transactionDbOptions; + } + + @Override protected final native void disposeInternal(final long handle); + + private static native long open(final long optionsHandle, + final long transactionDbOptionsHandle, final String path) + throws RocksDBException; + private static native long[] open(final long dbOptionsHandle, + final long transactionDbOptionsHandle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions); + private native static void closeDatabase(final long handle) + throws RocksDBException; + private native long beginTransaction(final long handle, + final long writeOptionsHandle); + private native long beginTransaction(final long handle, + final long writeOptionsHandle, final long transactionOptionsHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, final long oldTransactionHandle); + private native long beginTransaction_withOld(final long handle, + final long writeOptionsHandle, final long transactionOptionsHandle, + final long oldTransactionHandle); + private native long getTransactionByName(final long handle, + final String name); + private native long[] getAllPreparedTransactions(final long handle); + private native Map getLockStatusData( + final long handle); + private native DeadlockPath[] getDeadlockInfoBuffer(final long handle); + private native void setDeadlockInfoBufferSize(final long handle, + final int targetSize); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java new file mode 100644 index 0000000000..76f545cde6 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionDBOptions.java @@ -0,0 +1,217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class TransactionDBOptions extends RocksObject { + + public TransactionDBOptions() { + super(newTransactionDBOptions()); + } + + /** + * Specifies the maximum number of keys that can be locked at the same time + * per column family. + * + * If the number of locked keys is greater than {@link #getMaxNumLocks()}, + * transaction writes (or GetForUpdate) will return an error. + * + * @return The maximum number of keys that can be locked + */ + public long getMaxNumLocks() { + assert(isOwningHandle()); + return getMaxNumLocks(nativeHandle_); + } + + /** + * Specifies the maximum number of keys that can be locked at the same time + * per column family. 
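Since every setter on TransactionDBOptions (defined here and just below) returns this, configuration chains naturally. A hedged sketch with illustrative values, not part of the upstream diff:

try (final TransactionDBOptions txnDbOptions = new TransactionDBOptions()
         .setMaxNumLocks(1_000_000)       // cap on locked keys per column family
         .setNumStripes(32)               // more lock-table stripes, less contention
         .setTransactionLockTimeout(50)   // ms; 0 fails immediately, negative waits forever
         .setDefaultLockTimeout(1_000)) { // ms, for writes issued outside any transaction
  // hand txnDbOptions to TransactionDB.open(...)
}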
+ * + * If the number of locked keys is greater than {@link #getMaxNumLocks()}, + * transaction writes (or GetForUpdate) will return an error. + * + * @param maxNumLocks The maximum number of keys that can be locked; + * If this value is not positive, no limit will be enforced. + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setMaxNumLocks(final long maxNumLocks) { + assert(isOwningHandle()); + setMaxNumLocks(nativeHandle_, maxNumLocks); + return this; + } + + /** + * The number of sub-tables per lock table (per column family) + * + * @return The number of sub-tables + */ + public long getNumStripes() { + assert(isOwningHandle()); + return getNumStripes(nativeHandle_); + } + + /** + * Increasing this value will increase the concurrency by dividing the lock + * table (per column family) into more sub-tables, each with their own + * separate mutex. + * + * Default: 16 + * + * @param numStripes The number of sub-tables + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setNumStripes(final long numStripes) { + assert(isOwningHandle()); + setNumStripes(nativeHandle_, numStripes); + return this; + } + + /** + * The default wait timeout in milliseconds when + * a transaction attempts to lock a key if not specified by + * {@link TransactionOptions#setLockTimeout(long)} + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout. + * + * @return the default wait timeout in milliseconds + */ + public long getTransactionLockTimeout() { + assert(isOwningHandle()); + return getTransactionLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the default wait timeout in milliseconds when + * a transaction attempts to lock a key if not specified by + * {@link TransactionOptions#setLockTimeout(long)} + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout. Not using a timeout is not recommended + * as it can lead to deadlocks. Currently, there is no deadlock-detection to + * recover from a deadlock. + * + * Default: 1000 + * + * @param transactionLockTimeout the default wait timeout in milliseconds + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setTransactionLockTimeout( + final long transactionLockTimeout) { + assert(isOwningHandle()); + setTransactionLockTimeout(nativeHandle_, transactionLockTimeout); + return this; + } + + /** + * The wait timeout in milliseconds when writing a key + * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, + * {@link RocksDB#merge}, {@link RocksDB#remove} or {@link RocksDB#write} + * directly). + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout and will block indefinitely when acquiring + * a lock. + * + * @return the timeout in milliseconds when writing a key OUTSIDE of a + * transaction + */ + public long getDefaultLockTimeout() { + assert(isOwningHandle()); + return getDefaultLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the wait timeout in milliseconds when writing a key + * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, + * {@link RocksDB#merge}, {@link RocksDB#remove} or {@link RocksDB#write} + * directly). + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, there is no timeout and will block indefinitely when acquiring + * a lock. + * + * Not using a timeout can lead to deadlocks. 
Currently, there + * is no deadlock-detection to recover from a deadlock. While DB writes + * cannot deadlock with other DB writes, they can deadlock with a transaction. + * A negative timeout should only be used if all transactions have a small + * expiration set. + * + * Default: 1000 + * + * @param defaultLockTimeout the timeout in milliseconds when writing a key + * OUTSIDE of a transaction + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setDefaultLockTimeout( + final long defaultLockTimeout) { + assert(isOwningHandle()); + setDefaultLockTimeout(nativeHandle_, defaultLockTimeout); + return this; + } + +// /** +// * If set, the {@link TransactionDB} will use this implementation of a mutex +// * and condition variable for all transaction locking instead of the default +// * mutex/condvar implementation. +// * +// * @param transactionDbMutexFactory the mutex factory for the transactions +// * +// * @return this TransactionDBOptions instance +// */ +// public TransactionDBOptions setCustomMutexFactory( +// final TransactionDBMutexFactory transactionDbMutexFactory) { +// +// } + + /** + * The policy for when to write the data into the DB. The default policy is to + * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}. + * The data could be written before the commit phase. The DB then needs to + * provide the mechanisms to tell apart committed from uncommitted data. + * + * @return The write policy. + */ + public TxnDBWritePolicy getWritePolicy() { + assert(isOwningHandle()); + return TxnDBWritePolicy.getTxnDBWritePolicy(getWritePolicy(nativeHandle_)); + } + + /** + * The policy for when to write the data into the DB. The default policy is to + * write only the committed data {@link TxnDBWritePolicy#WRITE_COMMITTED}. + * The data could be written before the commit phase. The DB then needs to + * provide the mechanisms to tell apart committed from uncommitted data. + * + * @param writePolicy The write policy. + * + * @return this TransactionDBOptions instance + */ + public TransactionDBOptions setWritePolicy( + final TxnDBWritePolicy writePolicy) { + assert(isOwningHandle()); + setWritePolicy(nativeHandle_, writePolicy.getValue()); + return this; + } + + private native static long newTransactionDBOptions(); + private native long getMaxNumLocks(final long handle); + private native void setMaxNumLocks(final long handle, + final long maxNumLocks); + private native long getNumStripes(final long handle); + private native void setNumStripes(final long handle, final long numStripes); + private native long getTransactionLockTimeout(final long handle); + private native void setTransactionLockTimeout(final long handle, + final long transactionLockTimeout); + private native long getDefaultLockTimeout(final long handle); + private native void setDefaultLockTimeout(final long handle, + final long transactionLockTimeout); + private native byte getWritePolicy(final long handle); + private native void setWritePolicy(final long handle, final byte writePolicy); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java new file mode 100644 index 0000000000..1cd936ae64 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionOptions.java @@ -0,0 +1,189 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class TransactionOptions extends RocksObject + implements TransactionalOptions { + + public TransactionOptions() { + super(newTransactionOptions()); + } + + @Override + public boolean isSetSnapshot() { + assert(isOwningHandle()); + return isSetSnapshot(nativeHandle_); + } + + @Override + public TransactionOptions setSetSnapshot(final boolean setSnapshot) { + assert(isOwningHandle()); + setSetSnapshot(nativeHandle_, setSnapshot); + return this; + } + + /** + * True means that before acquiring locks, this transaction will + * check if doing so will cause a deadlock. If so, it will return with + * {@link Status.Code#Busy}. The user should retry their transaction. + * + * @return true if a deadlock is detected. + */ + public boolean isDeadlockDetect() { + assert(isOwningHandle()); + return isDeadlockDetect(nativeHandle_); + } + + /** + * Setting to true means that before acquiring locks, this transaction will + * check if doing so will cause a deadlock. If so, it will return with + * {@link Status.Code#Busy}. The user should retry their transaction. + * + * @param deadlockDetect true if we should detect deadlocks. + * + * @return this TransactionOptions instance + */ + public TransactionOptions setDeadlockDetect(final boolean deadlockDetect) { + assert(isOwningHandle()); + setDeadlockDetect(nativeHandle_, deadlockDetect); + return this; + } + + /** + * The wait timeout in milliseconds when a transaction attempts to lock a key. + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} + * will be used + * + * @return the lock timeout in milliseconds + */ + public long getLockTimeout() { + assert(isOwningHandle()); + return getLockTimeout(nativeHandle_); + } + + /** + * If positive, specifies the wait timeout in milliseconds when + * a transaction attempts to lock a key. + * + * If 0, no waiting is done if a lock cannot instantly be acquired. + * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} + * will be used + * + * Default: -1 + * + * @param lockTimeout the lock timeout in milliseconds + * + * @return this TransactionOptions instance + */ + public TransactionOptions setLockTimeout(final long lockTimeout) { + assert(isOwningHandle()); + setLockTimeout(nativeHandle_, lockTimeout); + return this; + } + + /** + * Expiration duration in milliseconds. + * + * If non-negative, transactions that last longer than this many milliseconds + * will fail to commit. If not set, a forgotten transaction that is never + * committed, rolled back, or deleted will never relinquish any locks it + * holds. This could prevent keys from being written by other writers. + * + * @return expiration the expiration duration in milliseconds + */ + public long getExpiration() { + assert(isOwningHandle()); + return getExpiration(nativeHandle_); + } + + /** + * Expiration duration in milliseconds. + * + * If non-negative, transactions that last longer than this many milliseconds + * will fail to commit. If not set, a forgotten transaction that is never + * committed, rolled back, or deleted will never relinquish any locks it + * holds. This could prevent keys from being written by other writers. 
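TransactionOptions setters also chain (setExpiration follows just below). A sketch of per-transaction tuning layered over the DB-wide defaults, not part of the upstream diff; values are illustrative:

try (final TransactionOptions txnOptions = new TransactionOptions()
         .setSetSnapshot(true)      // same effect as Transaction#setSnapshot()
         .setDeadlockDetect(true)   // fail with Status.Code#Busy instead of deadlocking
         .setLockTimeout(100)       // ms; negative falls back to the TransactionDBOptions value
         .setExpiration(60_000)) {  // ms; an abandoned transaction then releases its locks
  try (final Transaction txn = db.beginTransaction(writeOptions, txnOptions)) {
    // ... guarded work ...
    txn.commit();
  }
}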
+ * + * Default: -1 + * + * @param expiration the expiration duration in milliseconds + * + * @return this TransactionOptions instance + */ + public TransactionOptions setExpiration(final long expiration) { + assert(isOwningHandle()); + setExpiration(nativeHandle_, expiration); + return this; + } + + /** + * Gets the number of traversals to make during deadlock detection. + * + * @return the number of traversals to make during + * deadlock detection + */ + public long getDeadlockDetectDepth() { + return getDeadlockDetectDepth(nativeHandle_); + } + + /** + * Sets the number of traversals to make during deadlock detection. + * + * Default: 50 + * + * @param deadlockDetectDepth the number of traversals to make during + * deadlock detection + * + * @return this TransactionOptions instance + */ + public TransactionOptions setDeadlockDetectDepth( + final long deadlockDetectDepth) { + setDeadlockDetectDepth(nativeHandle_, deadlockDetectDepth); + return this; + } + + /** + * Get the maximum number of bytes that may be used for the write batch. + * + * @return the maximum number of bytes, 0 means no limit. + */ + public long getMaxWriteBatchSize() { + return getMaxWriteBatchSize(nativeHandle_); + } + + /** + * Set the maximum number of bytes that may be used for the write batch. + * + * @param maxWriteBatchSize the maximum number of bytes, 0 means no limit. + * + * @return this TransactionOptions instance + */ + public TransactionOptions setMaxWriteBatchSize(final long maxWriteBatchSize) { + setMaxWriteBatchSize(nativeHandle_, maxWriteBatchSize); + return this; + } + + private native static long newTransactionOptions(); + private native boolean isSetSnapshot(final long handle); + private native void setSetSnapshot(final long handle, + final boolean setSnapshot); + private native boolean isDeadlockDetect(final long handle); + private native void setDeadlockDetect(final long handle, + final boolean deadlockDetect); + private native long getLockTimeout(final long handle); + private native void setLockTimeout(final long handle, final long lockTimeout); + private native long getExpiration(final long handle); + private native void setExpiration(final long handle, final long expiration); + private native long getDeadlockDetectDepth(final long handle); + private native void setDeadlockDetectDepth(final long handle, + final long deadlockDetectDepth); + private native long getMaxWriteBatchSize(final long handle); + private native void setMaxWriteBatchSize(final long handle, + final long maxWriteBatchSize); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java new file mode 100644 index 0000000000..3f0eceda85 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalDB.java @@ -0,0 +1,68 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + + +interface TransactionalDB + extends AutoCloseable { + + /** + * Starts a new Transaction. + * + * Caller is responsible for calling {@link #close()} on the returned + * transaction when it is no longer needed. 
+ * + * @param writeOptions Any write options for the transaction + * @return a new transaction + */ + Transaction beginTransaction(final WriteOptions writeOptions); + + /** + * Starts a new Transaction. + * + * Caller is responsible for calling {@link #close()} on the returned + * transaction when it is no longer needed. + * + * @param writeOptions Any write options for the transaction + * @param transactionOptions Any options for the transaction + * @return a new transaction + */ + Transaction beginTransaction(final WriteOptions writeOptions, + final T transactionOptions); + + /** + * Starts a new Transaction. + * + * Caller is responsible for calling {@link #close()} on the returned + * transaction when it is no longer needed. + * + * @param writeOptions Any write options for the transaction + * @param oldTransaction this Transaction will be reused instead of allocating + * a new one. This is an optimization to avoid extra allocations + * when repeatedly creating transactions. + * @return The oldTransaction which has been reinitialized as a new + * transaction + */ + Transaction beginTransaction(final WriteOptions writeOptions, + final Transaction oldTransaction); + + /** + * Starts a new Transaction. + * + * Caller is responsible for calling {@link #close()} on the returned + * transaction when it is no longer needed. + * + * @param writeOptions Any write options for the transaction + * @param transactionOptions Any options for the transaction + * @param oldTransaction this Transaction will be reused instead of allocating + * a new one. This is an optimization to avoid extra allocations + * when repeatedly creating transactions. + * @return The oldTransaction which has been reinitialized as a new + * transaction + */ + Transaction beginTransaction(final WriteOptions writeOptions, + final T transactionOptions, final Transaction oldTransaction); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java new file mode 100644 index 0000000000..87aaa7986f --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TransactionalOptions.java @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + + +interface TransactionalOptions extends AutoCloseable { + + /** + * True indicates snapshots will be set, just like if + * {@link Transaction#setSnapshot()} had been called + * + * @return whether a snapshot will be set + */ + boolean isSetSnapshot(); + + /** + * Setting the setSnapshot to true is the same as calling + * {@link Transaction#setSnapshot()}. + * + * Default: false + * + * @param The type of transactional options. + * @param setSnapshot Whether to set a snapshot + * + * @return this TransactionalOptions instance + */ + T setSetSnapshot(final boolean setSnapshot); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java index 740f51268e..26eee4a878 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TtlDB.java @@ -139,6 +139,55 @@ public static TtlDB open(final DBOptions options, final String db_path, return ttlDB; } + /** + *

<p>Close the TtlDB instance and release resource.</p>
    + * + * This is similar to {@link #close()} except that it + * throws an exception if any error occurs. + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + * + * @throws RocksDBException if an error occurs whilst closing. + */ + public void closeE() throws RocksDBException { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } finally { + disposeInternal(); + } + } + } + + /** + *

<p>Close the TtlDB instance and release resource.</p>
    + * + * + * This will not fsync the WAL files. + * If syncing is required, the caller must first call {@link #syncWal()} + * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch + * with {@link WriteOptions#setSync(boolean)} set to true. + * + * See also {@link #close()}. + */ + @Override + public void close() { + if (owningHandle_.compareAndSet(true, false)) { + try { + closeDatabase(nativeHandle_); + } catch (final RocksDBException e) { + // silently ignore the error report + } finally { + disposeInternal(); + } + } + } + /** *

<p>Creates a new ttl based column family with a name defined * in given ColumnFamilyDescriptor and allocates a @@ -160,22 +209,8 @@ public ColumnFamilyHandle createColumnFamilyWithTtl( final int ttl) throws RocksDBException { return new ColumnFamilyHandle(this, createColumnFamilyWithTtl(nativeHandle_, - columnFamilyDescriptor.columnFamilyName(), - columnFamilyDescriptor.columnFamilyOptions().nativeHandle_, ttl)); - } - - /** - * <p>Close the TtlDB instance and release resource.</p> - * - * <p>Internally, TtlDB owns the {@code rocksdb::DB} pointer - * to its associated {@link org.rocksdb.RocksDB}. The release - * of that RocksDB pointer is handled in the destructor of the - * c++ {@code rocksdb::TtlDB} and should be transparent to - * Java developers.</p>
    - */ - @Override - public void close() { - super.close(); + columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getOptions().nativeHandle_, ttl)); } /** @@ -193,10 +228,7 @@ protected TtlDB(final long nativeHandle) { super(nativeHandle); } - @Override protected void finalize() throws Throwable { - close(); //TODO(AR) revisit here when implementing AutoCloseable - super.finalize(); - } + @Override protected native void disposeInternal(final long handle); private native static long open(final long optionsHandle, final String db_path, final int ttl, final boolean readOnly) @@ -208,4 +240,6 @@ private native static long[] openCF(final long optionsHandle, private native long createColumnFamilyWithTtl(final long handle, final byte[] columnFamilyName, final long columnFamilyOptions, int ttl) throws RocksDBException; + private native static void closeDatabase(final long handle) + throws RocksDBException; } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java new file mode 100644 index 0000000000..837ce6157f --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +/** + * The transaction db write policy. + */ +public enum TxnDBWritePolicy { + /** + * Write only the committed data. + */ + WRITE_COMMITTED((byte)0x00), + + /** + * Write data after the prepare phase of 2pc. + */ + WRITE_PREPARED((byte)0x1), + + /** + * Write data before the prepare phase of 2pc. + */ + WRITE_UNPREPARED((byte)0x2); + + private byte value; + + TxnDBWritePolicy(final byte value) { + this.value = value; + } + + /** + *

<p>Returns the byte value of the enumeration value.</p>
    + * + * @return byte representation + */ + public byte getValue() { + return value; + } + + /** + *

<p>Get the TxnDBWritePolicy enumeration value by + * passing the byte identifier to this method.</p>
    + * + * @param byteIdentifier of TxnDBWritePolicy. + * + * @return TxnDBWritePolicy instance. + * + * @throws IllegalArgumentException If TxnDBWritePolicy cannot be found for + * the provided byteIdentifier + */ + public static TxnDBWritePolicy getTxnDBWritePolicy(final byte byteIdentifier) { + for (final TxnDBWritePolicy txnDBWritePolicy : TxnDBWritePolicy.values()) { + if (txnDBWritePolicy.getValue() == byteIdentifier) { + return txnDBWritePolicy; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for TxnDBWritePolicy."); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java new file mode 100644 index 0000000000..cce9b298d8 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/UInt64AddOperator.java @@ -0,0 +1,19 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Uint64AddOperator is a merge operator that accumlates a long + * integer value. + */ +public class UInt64AddOperator extends MergeOperator { + public UInt64AddOperator() { + super(newSharedUInt64AddOperator()); + } + + private native static long newSharedUInt64AddOperator(); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java index d3fc47b631..d8b9eeceda 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WALRecoveryMode.java @@ -65,7 +65,7 @@ public byte getValue() { * * @param byteIdentifier of WALRecoveryMode. * - * @return CompressionType instance. + * @return WALRecoveryMode instance. 
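A sketch of UInt64AddOperator (added above) used as a counter, not part of the upstream diff; it assumes the operand encoding is the little-endian fixed64 the native operator decodes, and the path is illustrative:

final byte[] one = java.nio.ByteBuffer.allocate(8)
    .order(java.nio.ByteOrder.LITTLE_ENDIAN).putLong(1L).array();
try (final UInt64AddOperator addOperator = new UInt64AddOperator();
     final Options options = new Options().setCreateIfMissing(true).setMergeOperator(addOperator);
     final RocksDB db = RocksDB.open(options, "/tmp/counter-example")) {
  db.merge("hits".getBytes(), one);
  db.merge("hits".getBytes(), one); // "hits" now decodes to 2
}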
* * @throws IllegalArgumentException If WALRecoveryMode cannot be found for the * provided byteIdentifier diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index d45da2b3a1..482351e996 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -45,6 +45,7 @@ public WriteEntry entry() { @Override final native void next0(long handle); @Override final native void prev0(long handle); @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void seekForPrev0(long handle, byte[] target, int targetLen); @Override final native void status0(long handle) throws RocksDBException; private native long[] entry1(final long handle); @@ -54,10 +55,13 @@ public WriteEntry entry() { * that created the record in the Write Batch */ public enum WriteType { - PUT((byte)0x1), - MERGE((byte)0x2), - DELETE((byte)0x4), - LOG((byte)0x8); + PUT((byte)0x0), + MERGE((byte)0x1), + DELETE((byte)0x2), + SINGLE_DELETE((byte)0x3), + DELETE_RANGE((byte)0x4), + LOG((byte)0x5), + XID((byte)0x6); final byte id; WriteType(final byte id) { diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java new file mode 100644 index 0000000000..fed27ed117 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFileType.java @@ -0,0 +1,55 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WalFileType { + /** + * Indicates that WAL file is in archive directory. WAL files are moved from + * the main db directory to archive directory once they are not live and stay + * there until cleaned up. Files are cleaned depending on archive size + * (Options::WAL_size_limit_MB) and time since last cleaning + * (Options::WAL_ttl_seconds). + */ + kArchivedLogFile((byte)0x0), + + /** + * Indicates that WAL file is live and resides in the main db directory + */ + kAliveLogFile((byte)0x1); + + private final byte value; + + WalFileType(final byte value) { + this.value = value; + } + + /** + * Get the internal representation value. + * + * @return the internal representation value + */ + byte getValue() { + return value; + } + + /** + * Get the WalFileType from the internal representation value. + * + * @return the wal file type. + * + * @throws IllegalArgumentException if the value is unknown. + */ + static WalFileType fromValue(final byte value) { + for (final WalFileType walFileType : WalFileType.values()) { + if(walFileType.value == value) { + return walFileType; + } + } + + throw new IllegalArgumentException( + "Illegal value provided for WalFileType: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java new file mode 100644 index 0000000000..37e36213ae --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalFilter.java @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import java.util.Map; + +/** + * WALFilter allows an application to inspect write-ahead-log (WAL) + * records or modify their processing on recovery. + */ +public interface WalFilter { + + /** + * Provide ColumnFamily->LogNumber map to filter + * so that filter can determine whether a log number applies to a given + * column family (i.e. that log hasn't been flushed to SST already for the + * column family). + * + * We also pass in name>id map as only name is known during + * recovery (as handles are opened post-recovery). + * while write batch callbacks happen in terms of column family id. + * + * @param cfLognumber column_family_id to lognumber map + * @param cfNameId column_family_name to column_family_id map + */ + void columnFamilyLogNumberMap(final Map cfLognumber, + final Map cfNameId); + + /** + * LogRecord is invoked for each log record encountered for all the logs + * during replay on logs on recovery. This method can be used to: + * * inspect the record (using the batch parameter) + * * ignoring current record + * (by returning WalProcessingOption::kIgnoreCurrentRecord) + * * reporting corrupted record + * (by returning WalProcessingOption::kCorruptedRecord) + * * stop log replay + * (by returning kStop replay) - please note that this implies + * discarding the logs from current record onwards. + * + * @param logNumber log number of the current log. + * Filter might use this to determine if the log + * record is applicable to a certain column family. + * @param logFileName log file name - only for informational purposes + * @param batch batch encountered in the log during recovery + * @param newBatch new batch to populate if filter wants to change + * the batch (for example to filter some records out, or alter some + * records). Please note that the new batch MUST NOT contain + * more records than original, else recovery would be failed. + * + * @return Processing option for the current record. + */ + LogRecordFoundResult logRecordFound(final long logNumber, + final String logFileName, final WriteBatch batch, + final WriteBatch newBatch); + + class LogRecordFoundResult { + public static LogRecordFoundResult CONTINUE_UNCHANGED = + new LogRecordFoundResult(WalProcessingOption.CONTINUE_PROCESSING, false); + + final WalProcessingOption walProcessingOption; + final boolean batchChanged; + + /** + * @param walProcessingOption the processing option + * @param batchChanged Whether batch was changed by the filter. + * It must be set to true if newBatch was populated, + * else newBatch has no effect. + */ + public LogRecordFoundResult(final WalProcessingOption walProcessingOption, + final boolean batchChanged) { + this.walProcessingOption = walProcessingOption; + this.batchChanged = batchChanged; + } + } + + /** + * Returns a name that identifies this WAL filter. + * The name will be printed to LOG file on start up for diagnosis. + * + * @return the name + */ + String name(); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java new file mode 100644 index 0000000000..889602edc9 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WalProcessingOption.java @@ -0,0 +1,54 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public enum WalProcessingOption { + /** + * Continue processing as usual. + */ + CONTINUE_PROCESSING((byte)0x0), + + /** + * Ignore the current record but continue processing of log(s). + */ + IGNORE_CURRENT_RECORD((byte)0x1), + + /** + * Stop replay of logs and discard logs. + * Logs won't be replayed on subsequent recovery. + */ + STOP_REPLAY((byte)0x2), + + /** + * Corrupted record detected by filter. + */ + CORRUPTED_RECORD((byte)0x3); + + private final byte value; + + WalProcessingOption(final byte value) { + this.value = value; + } + + /** + * Get the internal representation. + * + * @return the internal representation. + */ + byte getValue() { + return value; + } + + public static WalProcessingOption fromValue(final byte value) { + for (final WalProcessingOption walProcessingOption : WalProcessingOption.values()) { + if (walProcessingOption.value == value) { + return walProcessingOption; + } + } + throw new IllegalArgumentException( + "Illegal value provided for WalProcessingOption: " + value); + } +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java index 272e9b4cdf..5673a25efb 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatch.java @@ -39,6 +39,16 @@ public WriteBatch(final int reserved_bytes) { super(newWriteBatch(reserved_bytes)); } + /** + * Constructs a WriteBatch instance from a serialized representation + * as returned by {@link #data()}. + * + * @param serialized the serialized representation. + */ + public WriteBatch(final byte[] serialized) { + super(newWriteBatch(serialized, serialized.length)); + } + /** * Support for iterating over the contents of a batch. * @@ -51,6 +61,137 @@ public void iterate(final Handler handler) throws RocksDBException { iterate(nativeHandle_, handler.nativeHandle_); } + /** + * Retrieve the serialized version of this batch. + * + * @return the serialized representation of this write batch. + * + * @throws RocksDBException if an error occurs whilst retrieving + * the serialized batch data. + */ + public byte[] data() throws RocksDBException { + return data(nativeHandle_); + } + + /** + * Retrieve data size of the batch. + * + * @return the serialized data size of the batch. + */ + public long getDataSize() { + return getDataSize(nativeHandle_); + } + + /** + * Returns true if Put will be called during Iterate. + * + * @return true if Put will be called during Iterate. + */ + public boolean hasPut() { + return hasPut(nativeHandle_); + } + + /** + * Returns true if Delete will be called during Iterate. + * + * @return true if Delete will be called during Iterate. + */ + public boolean hasDelete() { + return hasDelete(nativeHandle_); + } + + /** + * Returns true if SingleDelete will be called during Iterate. + * + * @return true if SingleDelete will be called during Iterate. + */ + public boolean hasSingleDelete() { + return hasSingleDelete(nativeHandle_); + } + + /** + * Returns true if DeleteRange will be called during Iterate. + * + * @return true if DeleteRange will be called during Iterate. + */ + public boolean hasDeleteRange() { + return hasDeleteRange(nativeHandle_); + } + + /** + * Returns true if Merge will be called during Iterate. 
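A hedged sketch of a WalFilter implementation following the contract above, not part of the upstream diff; the map generics (lost in this rendering) are assumed to be Map<Integer, Long> and Map<String, Integer> as upstream, and registration normally goes through a native-backed wrapper not shown here:

import java.util.Map;

public class SkipOneLogFilter implements WalFilter {
  @Override
  public void columnFamilyLogNumberMap(final Map<Integer, Long> cfLognumber,
      final Map<String, Integer> cfNameId) {
    // this sketch keeps no per-column-family state
  }

  @Override
  public LogRecordFoundResult logRecordFound(final long logNumber,
      final String logFileName, final WriteBatch batch, final WriteBatch newBatch) {
    if (logNumber == 42) { // hypothetical log to drop during recovery
      return new LogRecordFoundResult(WalProcessingOption.IGNORE_CURRENT_RECORD, false);
    }
    return LogRecordFoundResult.CONTINUE_UNCHANGED;
  }

  @Override
  public String name() {
    return "SkipOneLogFilter";
  }
}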
+ * + * @return true if Merge will be called during Iterate. + */ + public boolean hasMerge() { + return hasMerge(nativeHandle_); + } + + /** + * Returns true if MarkBeginPrepare will be called during Iterate. + * + * @return true if MarkBeginPrepare will be called during Iterate. + */ + public boolean hasBeginPrepare() { + return hasBeginPrepare(nativeHandle_); + } + + /** + * Returns true if MarkEndPrepare will be called during Iterate. + * + * @return true if MarkEndPrepare will be called during Iterate. + */ + public boolean hasEndPrepare() { + return hasEndPrepare(nativeHandle_); + } + + /** + * Returns true if MarkCommit will be called during Iterate. + * + * @return true if MarkCommit will be called during Iterate. + */ + public boolean hasCommit() { + return hasCommit(nativeHandle_); + } + + /** + * Returns true if MarkRollback will be called during Iterate. + * + * @return true if MarkRollback will be called during Iterate. + */ + public boolean hasRollback() { + return hasRollback(nativeHandle_); + } + + @Override + public WriteBatch getWriteBatch() { + return this; + } + + /** + * Marks this point in the WriteBatch as the last record to + * be inserted into the WAL, provided the WAL is enabled. + */ + public void markWalTerminationPoint() { + markWalTerminationPoint(nativeHandle_); + } + + /** + * Gets the WAL termination point. + * + * See {@link #markWalTerminationPoint()} + * + * @return the WAL termination point + */ + public SavePoint getWalTerminationPoint() { + return getWalTerminationPoint(nativeHandle_); + } + + @Override + WriteBatch getWriteBatch(final long handle) { + return this; + } + /** *

    Private WriteBatch constructor which is used to construct * WriteBatch instances from C++ side. As the reference to this @@ -87,10 +228,14 @@ public void iterate(final Handler handler) throws RocksDBException { @Override final native void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, final int valueLen, final long cfHandle); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + @Override final native void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen); @@ -98,36 +243,79 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); @Override final native void putLogData(final long handle, - final byte[] blob, final int blobLen); + final byte[] blob, final int blobLen) throws RocksDBException; @Override final native void clear0(final long handle); @Override final native void setSavePoint0(final long handle); @Override final native void rollbackToSavePoint0(final long handle); + @Override final native void popSavePoint(final long handle) throws RocksDBException; + @Override final native void setMaxBytes(final long nativeHandle, + final long maxBytes); private native static long newWriteBatch(final int reserved_bytes); + private native static long newWriteBatch(final byte[] serialized, + final int serializedLength); private native void iterate(final long handle, final long handlerHandle) throws RocksDBException; - + private native byte[] data(final long nativeHandle) throws RocksDBException; + private native long getDataSize(final long nativeHandle); + private native boolean hasPut(final long nativeHandle); + private native boolean hasDelete(final long nativeHandle); + private native boolean hasSingleDelete(final long nativeHandle); + private native boolean hasDeleteRange(final long nativeHandle); + private native boolean hasMerge(final long nativeHandle); + private native boolean hasBeginPrepare(final long nativeHandle); + private native boolean hasEndPrepare(final long nativeHandle); + private native boolean hasCommit(final long nativeHandle); + private native boolean hasRollback(final long nativeHandle); + private native void markWalTerminationPoint(final long nativeHandle); + private native SavePoint getWalTerminationPoint(final long nativeHandle); /** * Handler callback for iterating over the contents of a batch. */ public static abstract class Handler - extends AbstractImmutableNativeReference { - private final long nativeHandle_; + extends RocksCallbackObject { public Handler() { - super(true); - this.nativeHandle_ = createNewHandler0(); + super(null); + } + + @Override + protected long initializeNative(final long... 
nativeParameterHandles) { + return createNewHandler0(); } - public abstract void put(byte[] key, byte[] value); - public abstract void merge(byte[] key, byte[] value); - public abstract void delete(byte[] key); - public abstract void deleteRange(byte[] beginKey, byte[] endKey); - public abstract void logData(byte[] blob); + public abstract void put(final int columnFamilyId, final byte[] key, + final byte[] value) throws RocksDBException; + public abstract void put(final byte[] key, final byte[] value); + public abstract void merge(final int columnFamilyId, final byte[] key, + final byte[] value) throws RocksDBException; + public abstract void merge(final byte[] key, final byte[] value); + public abstract void delete(final int columnFamilyId, final byte[] key) + throws RocksDBException; + public abstract void delete(final byte[] key); + public abstract void singleDelete(final int columnFamilyId, + final byte[] key) throws RocksDBException; + public abstract void singleDelete(final byte[] key); + public abstract void deleteRange(final int columnFamilyId, + final byte[] beginKey, final byte[] endKey) throws RocksDBException; + public abstract void deleteRange(final byte[] beginKey, + final byte[] endKey); + public abstract void logData(final byte[] blob); + public abstract void putBlobIndex(final int columnFamilyId, + final byte[] key, final byte[] value) throws RocksDBException; + public abstract void markBeginPrepare() throws RocksDBException; + public abstract void markEndPrepare(final byte[] xid) + throws RocksDBException; + public abstract void markNoop(final boolean emptyBatch) + throws RocksDBException; + public abstract void markRollback(final byte[] xid) + throws RocksDBException; + public abstract void markCommit(final byte[] xid) + throws RocksDBException; /** * shouldContinue is called by the underlying iterator - * WriteBatch::Iterate. If it returns false, + * {@link WriteBatch#iterate(Handler)}. If it returns false, * iteration is halted. Otherwise, it continues * iterating. The default implementation always * returns true. @@ -139,15 +327,59 @@ public boolean shouldContinue() { return true; } + private native long createNewHandler0(); + } + + /** + * A structure for describing the save point in the Write Batch. + */ + public static class SavePoint { + private long size; + private long count; + private long contentFlags; + + public SavePoint(final long size, final long count, + final long contentFlags) { + this.size = size; + this.count = count; + this.contentFlags = contentFlags; + } + + public void clear() { + this.size = 0; + this.count = 0; + this.contentFlags = 0; + } + + /** + * Get the size of the serialized representation. + * + * @return the size of the serialized representation. + */ + public long getSize() { + return size; + } + /** - * Deletes underlying C++ handler pointer. + * Get the number of elements. + * + * @return the number of elements. */ - @Override - protected void disposeInternal() { - disposeInternal(nativeHandle_); + public long getCount() { + return count; } - private native long createNewHandler0(); - private native void disposeInternal(final long handle); + /** + * Get the content flags. + * + * @return the content flags. 
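A sketch reading the SavePoint captured by markWalTerminationPoint()/getWalTerminationPoint() (defined earlier in this file); not part of the upstream diff:

try (final WriteBatch batch = new WriteBatch()) {
  batch.put("k".getBytes(), "v".getBytes());
  batch.markWalTerminationPoint(); // nothing after this point reaches the WAL
  final WriteBatch.SavePoint sp = batch.getWalTerminationPoint();
  System.out.printf("size=%d count=%d flags=%d%n",
      sp.getSize(), sp.getCount(), sp.getContentFlags());
}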
+ */ + public long getContentFlags() { + return contentFlags; + } + + public boolean isCleared() { + return (size | count | contentFlags) == 0; + } } } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java index cd024ad58d..e0999e21b6 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -23,8 +23,9 @@ public interface WriteBatchInterface { * * @param key the specified key to be inserted. * @param value the value associated with the specified key. + * @throws RocksDBException thrown if error happens in underlying native library. */ - void put(byte[] key, byte[] value); + void put(byte[] key, byte[] value) throws RocksDBException; /** *

    Store the mapping "key->value" within given column @@ -34,9 +35,10 @@ public interface WriteBatchInterface { * instance * @param key the specified key to be inserted. * @param value the value associated with the specified key. + * @throws RocksDBException thrown if error happens in underlying native library. */ void put(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value); + byte[] key, byte[] value) throws RocksDBException; /** *

    Merge "value" with the existing value of "key" in the database. @@ -45,8 +47,9 @@ void put(ColumnFamilyHandle columnFamilyHandle, * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. + * @throws RocksDBException thrown if error happens in underlying native library. */ - void merge(byte[] key, byte[] value); + void merge(byte[] key, byte[] value) throws RocksDBException; /** *

    Merge "value" with the existing value of "key" in given column family. @@ -56,24 +59,102 @@ void put(ColumnFamilyHandle columnFamilyHandle, * @param key the specified key to be merged. * @param value the value to be merged with the current value for * the specified key. + * @throws RocksDBException thrown if error happens in underlying native library. */ void merge(ColumnFamilyHandle columnFamilyHandle, - byte[] key, byte[] value); + byte[] key, byte[] value) throws RocksDBException; /** *

<p>If the database contains a mapping for "key", erase it. Else do nothing.</p>
    * * @param key Key to delete within database + * + * @deprecated Use {@link #delete(byte[])} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated + void remove(byte[] key) throws RocksDBException; + + /** + *
    If column family contains a mapping for "key", erase it. Else do nothing.
    + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database + * + * @deprecated Use {@link #delete(ColumnFamilyHandle, byte[])} + * @throws RocksDBException thrown if error happens in underlying native library. + */ + @Deprecated + void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException; + + /** + *
    If the database contains a mapping for "key", erase it. Else do nothing.
    + * + * @param key Key to delete within database + * @throws RocksDBException thrown if error happens in underlying native library. */ - void remove(byte[] key); + void delete(byte[] key) throws RocksDBException; /** *
    If column family contains a mapping for "key", erase it. Else do nothing.
    * * @param columnFamilyHandle {@link ColumnFamilyHandle} instance * @param key Key to delete within database + * @throws RocksDBException thrown if error happens in underlying native library. + */ + void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException; + + /** + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. + * + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ - void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key); + @Experimental("Performance optimization for a very specific workload") + void singleDelete(final byte[] key) throws RocksDBException; + + /** + * Remove the database entry for {@code key}. Requires that the key exists + * and was not overwritten. It is not an error if the key did not exist + * in the database. + * + * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple + * times), then the result of calling SingleDelete() on this key is undefined. + * SingleDelete() only behaves correctly if there has been only one Put() + * for this key since the previous call to SingleDelete() for this key. + * + * This feature is currently an experimental performance optimization + * for a very specific workload. It is up to the caller to ensure that + * SingleDelete is only used for a key that is not deleted using Delete() or + * written using Merge(). Mixing SingleDelete operations with Deletes and + * Merges can result in undefined behavior. + * + * @param columnFamilyHandle The column family to delete the key from + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + @Experimental("Performance optimization for a very specific workload") + void singleDelete(final ColumnFamilyHandle columnFamilyHandle, + final byte[] key) throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -88,8 +169,9 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) + * @throws RocksDBException thrown if error happens in underlying native library. */ - void deleteRange(byte[] beginKey, byte[] endKey); + void deleteRange(byte[] beginKey, byte[] endKey) throws RocksDBException; /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., @@ -105,8 +187,10 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * First key to delete within database (included) * @param endKey * Last key to delete within database (excluded) + * @throws RocksDBException thrown if error happens in underlying native library. 
*/ - void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] endKey); + void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, + byte[] endKey) throws RocksDBException; /** * Append a blob of arbitrary size to the records in this batch. The blob will @@ -121,8 +205,9 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * replication. * * @param blob binary object to be inserted + * @throws RocksDBException thrown if error happens in underlying native library. */ - void putLogData(byte[] blob); + void putLogData(byte[] blob) throws RocksDBException; /** * Clear all updates buffered in this batch @@ -143,4 +228,30 @@ void merge(ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException if there is no previous call to SetSavePoint() */ void rollbackToSavePoint() throws RocksDBException; + + /** + * Pop the most recent save point. + * + * That is to say that it removes the last save point, + * which was set by {@link #setSavePoint()}. + * + * @throws RocksDBException If there is no previous call to + * {@link #setSavePoint()}, an exception with + * {@link Status.Code#NotFound} will be thrown. + */ + void popSavePoint() throws RocksDBException; + + /** + * Set the maximum size of the write batch. + * + * @param maxBytes the maximum size in bytes. + */ + void setMaxBytes(long maxBytes); + + /** + * Get the underlying Write Batch. + * + * @return the underlying WriteBatch. + */ + WriteBatch getWriteBatch(); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index fdf89b2798..2ad91042d4 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -60,8 +60,21 @@ public WriteBatchWithIndex( final AbstractComparator<? extends AbstractSlice<?>> fallbackIndexComparator, final int reservedBytes, final boolean overwriteKey) { - super(newWriteBatchWithIndex(fallbackIndexComparator.getNativeHandle(), - reservedBytes, overwriteKey)); + super(newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, + fallbackIndexComparator.getComparatorType().getValue(), reservedBytes, + overwriteKey)); + } + + /** + *
    Private WriteBatchWithIndex constructor which is used to construct + * WriteBatchWithIndex instances from C++ side. As the reference to this + * object is also managed from C++ side the handle will be disowned.
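+ * A disowned handle is freed by the owning C++ side rather than through this
+ * object's own disposal.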
+ * + * @param nativeHandle address of native instance. + */ + WriteBatchWithIndex(final long nativeHandle) { + super(nativeHandle); + disOwnNativeHandle(); } /** @@ -101,6 +114,12 @@ public WBWIRocksIterator newIterator() { * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} * as a delta and baseIterator as a base * + * Updating the write batch with the current key of the iterator is not safe. + * We strongly recommend users not to do it. It will invalidate the current + * key() and value() of the iterator. This invalidation happens even before + * the write batch update finishes. The state may recover after Next() is + * called. + * * @param columnFamilyHandle The column family to iterate over * @param baseIterator The base iterator, * e.g. {@link org.rocksdb.RocksDB#newIterator()} @@ -110,12 +129,10 @@ public WBWIRocksIterator newIterator() { public RocksIterator newIteratorWithBase( final ColumnFamilyHandle columnFamilyHandle, final RocksIterator baseIterator) { - RocksIterator iterator = new RocksIterator( - baseIterator.parent_, - iteratorWithBase(nativeHandle_, - columnFamilyHandle.nativeHandle_, - baseIterator.nativeHandle_)); - //when the iterator is deleted it will also delete the baseIterator + RocksIterator iterator = new RocksIterator(baseIterator.parent_, + iteratorWithBase( + nativeHandle_, columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + // when the iterator is deleted it will also delete the baseIterator baseIterator.disOwnNativeHandle(); return iterator; } @@ -132,8 +149,7 @@ public RocksIterator newIteratorWithBase( * point-in-time from baseIterator and modifications made in this write batch. */ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { - return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), - baseIterator); + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); } /** @@ -244,10 +260,14 @@ public byte[] getFromBatchAndDB(final RocksDB db, final ReadOptions options, @Override final native void merge(final long handle, final byte[] key, final int keyLen, final byte[] value, final int valueLen, final long cfHandle); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen); - @Override final native void remove(final long handle, final byte[] key, - final int keyLen, final long cfHandle); + @Override final native void delete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void delete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen) throws RocksDBException; + @Override final native void singleDelete(final long handle, final byte[] key, + final int keyLen, final long cfHandle) throws RocksDBException; @Override final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen); @@ -255,20 +275,25 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in final native void deleteRange(final long handle, final byte[] beginKey, final int beginKeyLen, final byte[] endKey, final int endKeyLen, final long cfHandle); @Override final native void putLogData(final long handle, final byte[] blob, - final int blobLen); + final int blobLen) throws RocksDBException; @Override final native void clear0(final long handle);
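+ // A usage sketch for the delta/base pairing documented above (illustrative only;
+ // it assumes an open RocksDB instance named `db`): entries still buffered in the
+ // batch become visible to reads through the iterator from newIteratorWithBase().
+ //
+ //   try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true);
+ //        final ReadOptions readOptions = new ReadOptions()) {
+ //     wbwi.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); // only in the batch so far
+ //     try (final RocksIterator it =
+ //              wbwi.newIteratorWithBase(db.newIterator(readOptions))) {
+ //       it.seek("k1".getBytes(UTF_8));
+ //       // "k1" is visible here although the batch was never written to the DB; the
+ //       // returned iterator also takes ownership of the base iterator.
+ //     }
+ //   }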
@Override final native void setSavePoint0(final long handle); @Override final native void rollbackToSavePoint0(final long handle); + @Override final native void popSavePoint(final long handle) throws RocksDBException; + @Override final native void setMaxBytes(final long nativeHandle, + final long maxBytes); + @Override final native WriteBatch getWriteBatch(final long handle); private native static long newWriteBatchWithIndex(); private native static long newWriteBatchWithIndex(final boolean overwriteKey); private native static long newWriteBatchWithIndex( - final long fallbackIndexComparatorHandle, final int reservedBytes, + final long fallbackIndexComparatorHandle, + final byte comparatorType, final int reservedBytes, final boolean overwriteKey); private native long iterator0(final long handle); private native long iterator1(final long handle, final long cfHandle); - private native long iteratorWithBase(final long handle, - final long baseIteratorHandle, final long cfHandle); + private native long iteratorWithBase( + final long handle, final long baseIteratorHandle, final long cfHandle); private native byte[] getFromBatch(final long handle, final long optHandle, final byte[] key, final int keyLen); private native byte[] getFromBatch(final long handle, final long optHandle, diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java new file mode 100644 index 0000000000..b244aa9522 --- /dev/null +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteBufferManager.java @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * Java wrapper over native write_buffer_manager class + */ +public class WriteBufferManager extends RocksObject { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct a new instance of WriteBufferManager. + * + * Check + * https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager + * for more details on when to use it + * + * @param bufferSizeBytes buffer size (in bytes) to use for native write_buffer_manager + * @param cache cache whose memory should be bounded by this write buffer manager + */ + public WriteBufferManager(final long bufferSizeBytes, final Cache cache) { + super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_)); + } + + private native static long newWriteBufferManager(final long bufferSizeBytes, final long cacheHandle); + @Override + protected native void disposeInternal(final long handle); +} diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java index b9e8ad81c2..71789ed1fd 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/WriteOptions.java @@ -20,6 +20,25 @@ public WriteOptions() { } + // TODO(AR) consider ownership + WriteOptions(final long nativeHandle) { + super(nativeHandle); + disOwnNativeHandle(); + } + + /** + * Copy constructor for WriteOptions. + * + * NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter, + * compaction_filter_factory and other pointers will be cloned! + * + * @param other The WriteOptions to copy.
+ */ + public WriteOptions(WriteOptions other) { + super(copyWriteOptions(other.nativeHandle_)); + } + + + /** * If true, the write will be flushed from the operating system * buffer cache (by calling WritableFile::Sync()) before the write @@ -71,7 +90,10 @@ public boolean sync() { /** * If true, writes will not first go to the write ahead log, - * and the write may got lost after a crash. + * and the write may get lost after a crash. The backup engine + * relies on write-ahead logs to back up the memtable, so if + * you disable write-ahead logs, you must create backups with + * flush_before_backup=true to avoid losing unflushed memtable data. * * @param flag a boolean flag to specify whether to disable * write-ahead-log on writes. @@ -84,7 +106,10 @@ public WriteOptions setDisableWAL(final boolean flag) { /** * If true, writes will not first go to the write ahead log, - * and the write may got lost after a crash. + * and the write may get lost after a crash. The backup engine + * relies on write-ahead logs to back up the memtable, so if + * you disable write-ahead logs, you must create backups with + * flush_before_backup=true to avoid losing unflushed memtable data. * * @return boolean value indicating if WAL is disabled. */ @@ -144,7 +169,41 @@ public boolean noSlowdown() { return noSlowdown(nativeHandle_); } + /** + * If true, this write request is of lower priority if compaction is + * behind. If, in that case, {@link #noSlowdown()} == true, the request + * will be cancelled immediately with {@link Status.Code#Incomplete} returned. + * Otherwise, it will be slowed down. The slowdown value is determined by + * RocksDB to guarantee it introduces minimum impacts to high priority writes. + * + * Default: false + * + * @param lowPri true if the write request should be of lower priority than + * compactions which are behind. + * + * @return the instance of the current WriteOptions. + */ + public WriteOptions setLowPri(final boolean lowPri) { + setLowPri(nativeHandle_, lowPri); + return this; + } + + /** + * Returns true if this write request is of lower priority if compaction is + * behind. + * + * See {@link #setLowPri(boolean)}. + * + * @return true if this write request is of lower priority, false otherwise.
+ */ + public boolean lowPri() { + return lowPri(nativeHandle_); + } + private native static long newWriteOptions(); + private native static long copyWriteOptions(long handle); + @Override protected final native void disposeInternal(final long handle); + private native void setSync(long handle, boolean flag); private native boolean sync(long handle); private native void setDisableWAL(long handle, boolean flag); @@ -155,5 +214,6 @@ private native void setIgnoreMissingColumnFamilies(final long handle, private native void setNoSlowdown(final long handle, final boolean noSlowdown); private native boolean noSlowdown(final long handle); - @Override protected final native void disposeInternal(final long handle); + private native void setLowPri(final long handle, final boolean lowPri); + private native boolean lowPri(final long handle); } diff --git a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java index f84e14bc19..bf005a3481 100644 --- a/thirdparty/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java +++ b/thirdparty/rocksdb/java/src/main/java/org/rocksdb/util/Environment.java @@ -8,10 +8,18 @@ public static boolean isPowerPC() { return ARCH.contains("ppc"); } + public static boolean isS390x() { + return ARCH.contains("s390x"); + } + public static boolean isWindows() { return (OS.contains("win")); } + public static boolean isFreeBSD() { + return (OS.contains("freebsd")); + } + public static boolean isMac() { return (OS.contains("mac")); } @@ -29,6 +37,10 @@ public static boolean isSolaris() { return OS.contains("sunos"); } + public static boolean isOpenBSD() { + return (OS.contains("openbsd")); + } + public static boolean is64Bit() { if (ARCH.indexOf("sparcv9") >= 0) { return true; @@ -49,11 +61,15 @@ public static String getJniLibraryName(final String name) { final String arch = is64Bit() ? "64" : "32"; if(isPowerPC()) { return String.format("%sjni-linux-%s", name, ARCH); + } else if(isS390x()) { + return String.format("%sjni-linux%s", name, ARCH); } else { return String.format("%sjni-linux%s", name, arch); } } else if (isMac()) { return String.format("%sjni-osx", name); + } else if (isFreeBSD()) { + return String.format("%sjni-freebsd%s", name, is64Bit() ? "64" : "32"); } else if (isAix() && is64Bit()) { return String.format("%sjni-aix64", name); } else if (isSolaris()) { @@ -61,6 +77,8 @@ public static String getJniLibraryName(final String name) { return String.format("%sjni-solaris%s", name, arch); } else if (isWindows() && is64Bit()) { return String.format("%sjni-win64", name); + } else if (isOpenBSD()) { + return String.format("%sjni-openbsd%s", name, is64Bit() ? "64" : "32"); } throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name)); @@ -71,7 +89,7 @@ public static String getJniLibraryFileName(final String name) { } private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { - if (isUnix() || isAix() || isSolaris()) { + if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) { return libraryFileName + ".so"; } else if (isMac()) { return libraryFileName + (shared ? 
".dylib" : ".jnilib"); diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java new file mode 100644 index 0000000000..7cac3015b9 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/AbstractTransactionTest.java @@ -0,0 +1,902 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +/** + * Base class of {@link TransactionTest} and {@link OptimisticTransactionTest} + */ +public abstract class AbstractTransactionTest { + + protected final static byte[] TXN_TEST_COLUMN_FAMILY = "txn_test_cf" + .getBytes(); + + protected static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + public abstract DBContainer startDb() + throws RocksDBException; + + @Test + public void setSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + } + } + + @Test + public void setSnapshotOnNextOperation() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshotOnNextOperation(); + txn.put("key1".getBytes(), "value1".getBytes()); + } + } + + @Test + public void setSnapshotOnNextOperation_transactionNotifier() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + try(final TestTransactionNotifier notifier = new TestTransactionNotifier()) { + txn.setSnapshotOnNextOperation(notifier); + txn.put("key1".getBytes(), "value1".getBytes()); + + txn.setSnapshotOnNextOperation(notifier); + txn.put("key2".getBytes(), "value2".getBytes()); + + assertThat(notifier.getCreatedSnapshots().size()).isEqualTo(2); + } + } + } + + @Test + public void getSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + final Snapshot snapshot = txn.getSnapshot(); + assertThat(snapshot.isOwningHandle()).isFalse(); + } + } + + @Test + public void getSnapshot_null() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final Snapshot snapshot = txn.getSnapshot(); + assertThat(snapshot).isNull(); + } + } + + @Test + public void clearSnapshot() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setSnapshot(); + txn.clearSnapshot(); + } + } + + @Test + public void clearSnapshot_none() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.clearSnapshot(); + } + } + + @Test + public 
void commit() throws RocksDBException { + final byte k1[] = "commit-key1".getBytes(UTF_8); + final byte v1[] = "commit-value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + txn.commit(); + } + + try(final ReadOptions readOptions = new ReadOptions(); + final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isEqualTo(v1); + } + } + } + + @Test + public void rollback() throws RocksDBException { + final byte k1[] = "rollback-key1".getBytes(UTF_8); + final byte v1[] = "rollback-value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + txn.rollback(); + } + + try(final ReadOptions readOptions = new ReadOptions(); + final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isNull(); + } + } + } + + @Test + public void savePoint() throws RocksDBException { + final byte k1[] = "savePoint-key1".getBytes(UTF_8); + final byte v1[] = "savePoint-value1".getBytes(UTF_8); + final byte k2[] = "savePoint-key2".getBytes(UTF_8); + final byte v2[] = "savePoint-value2".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.setSavePoint(); + + txn.put(k2, v2); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isEqualTo(v2); + + txn.rollbackToSavePoint(); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isNull(); + + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + assertThat(txn2.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn2.get(readOptions, k2)).isNull(); + } + } + } + + @Test + public void getPut_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void getPut() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.get(readOptions, k1)).isNull(); + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void multiGetPut_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + 
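+ // multiGet() returns one slot per requested key, with null for keys the
+ // transaction cannot yet see, which is exactly what the next assertion expects.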
assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null }); + + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + } + } + + @Test + public void multiGetPut() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null }); + + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void getForUpdate_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull(); + txn.put(testCf, k1, v1); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + } + } + + @Test + public void getForUpdate() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getForUpdate(readOptions, k1, true)).isNull(); + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + } + } + + @Test + public void multiGetForUpdate_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(new byte[][] { null, null }); + + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + } + } + + @Test + public void multiGetForUpdate() throws RocksDBException { + final byte keys[][] = new byte[][]{ + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][]{ + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(new byte[][]{null, null}); + + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGetForUpdate(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void getIterator() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); 
+ final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + txn.put(k1, v1); + + try(final RocksIterator iterator = txn.getIterator(readOptions)) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } + } + } + + @Test + public void getIterator_cf() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + txn.put(testCf, k1, v1); + + try(final RocksIterator iterator = txn.getIterator(readOptions, testCf)) { + iterator.seek(k1); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo(k1); + assertThat(iterator.value()).isEqualTo(v1); + } + } + } + + @Test + public void merge_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.merge(testCf, k1, v1); + } + } + + @Test + public void merge() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.merge(k1, v1); + } + } + + + @Test + public void delete_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + + txn.delete(testCf, k1); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + } + } + + @Test + public void delete() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.delete(k1); + assertThat(txn.get(readOptions, k1)).isNull(); + } + } + + @Test + public void delete_parts_cf() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, keyParts, valueParts); + assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); + + txn.delete(testCf, keyParts); + + 
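+ // put(byte[][], byte[][]) treats the arrays as slice parts and concatenates them,
+ // so deleting by the same keyParts removes the assembled key checked just below.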
assertThat(txn.get(testCf, readOptions, key)) + .isNull(); + } + } + + @Test + public void delete_parts() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(keyParts, valueParts); + + assertThat(txn.get(readOptions, key)).isEqualTo(value); + + txn.delete(keyParts); + + assertThat(txn.get(readOptions, key)).isNull(); + } + } + + @Test + public void getPutUntracked_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + txn.putUntracked(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void getPutUntracked() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.get(readOptions, k1)).isNull(); + txn.putUntracked(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + } + } + + @Test + public void multiGetPutUntracked_cf() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(new byte[][] { null, null }); + txn.putUntracked(testCf, keys[0], values[0]); + txn.putUntracked(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + } + } + + @Test + public void multiGetPutUntracked() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(new byte[][] { null, null }); + txn.putUntracked(keys[0], values[0]); + txn.putUntracked(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + } + } + + @Test + public void mergeUntracked_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final 
ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.mergeUntracked(testCf, k1, v1); + } + } + + @Test + public void mergeUntracked() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.mergeUntracked(k1, v1); + } + } + + @Test + public void deleteUntracked_cf() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + + txn.deleteUntracked(testCf, k1); + assertThat(txn.get(testCf, readOptions, k1)).isNull(); + } + } + + @Test + public void deleteUntracked() throws RocksDBException { + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + + txn.deleteUntracked(k1); + assertThat(txn.get(readOptions, k1)).isNull(); + } + } + + @Test + public void deleteUntracked_parts_cf() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(testCf, keyParts, valueParts); + assertThat(txn.get(testCf, readOptions, key)).isEqualTo(value); + + txn.deleteUntracked(testCf, keyParts); + assertThat(txn.get(testCf, readOptions, key)).isNull(); + } + } + + @Test + public void deleteUntracked_parts() throws RocksDBException { + final byte keyParts[][] = new byte[][] { + "ke".getBytes(UTF_8), + "y1".getBytes(UTF_8)}; + final byte valueParts[][] = new byte[][] { + "val".getBytes(UTF_8), + "ue1".getBytes(UTF_8)}; + final byte[] key = concat(keyParts); + final byte[] value = concat(valueParts); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keyParts, valueParts); + assertThat(txn.get(readOptions, key)).isEqualTo(value); + + txn.deleteUntracked(keyParts); + assertThat(txn.get(readOptions, key)).isNull(); + } + } + + @Test + public void putLogData() throws RocksDBException { + final byte[] blob = "blobby".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.putLogData(blob); + } + } + + @Test + public void enabledDisableIndexing() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.disableIndexing(); + txn.enableIndexing(); + txn.disableIndexing(); + txn.enableIndexing(); + } + } + + @Test + public void numKeys() 
throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte k2[] = "key2".getBytes(UTF_8); + final byte v2[] = "value2".getBytes(UTF_8); + final byte k3[] = "key3".getBytes(UTF_8); + final byte v3[] = "value3".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + txn.put(k1, v1); + txn.put(testCf, k2, v2); + txn.merge(k3, v3); + txn.delete(testCf, k2); + + assertThat(txn.getNumKeys()).isEqualTo(3); + assertThat(txn.getNumPuts()).isEqualTo(2); + assertThat(txn.getNumMerges()).isEqualTo(1); + assertThat(txn.getNumDeletes()).isEqualTo(1); + } + } + + @Test + public void elapsedTime() throws RocksDBException, InterruptedException { + final long preStartTxnTime = System.currentTimeMillis(); + try (final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + Thread.sleep(2); + + final long txnElapsedTime = txn.getElapsedTime(); + assertThat(txnElapsedTime).isLessThan(System.currentTimeMillis() - preStartTxnTime); + assertThat(txnElapsedTime).isGreaterThan(0); + } + } + + @Test + public void getWriteBatch() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + + final WriteBatchWithIndex writeBatch = txn.getWriteBatch(); + assertThat(writeBatch).isNotNull(); + assertThat(writeBatch.isOwningHandle()).isFalse(); + assertThat(writeBatch.count()).isEqualTo(1); + } + } + + @Test + public void setLockTimeout() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + txn.setLockTimeout(1000); + } + } + + @Test + public void writeOptions() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final WriteOptions writeOptions = new WriteOptions() + .setDisableWAL(true) + .setSync(true); + final Transaction txn = dbContainer.beginTransaction(writeOptions)) { + + txn.put(k1, v1); + + WriteOptions txnWriteOptions = txn.getWriteOptions(); + assertThat(txnWriteOptions).isNotNull(); + assertThat(txnWriteOptions.isOwningHandle()).isFalse(); + assertThat(txnWriteOptions).isNotSameAs(writeOptions); + assertThat(txnWriteOptions.disableWAL()).isTrue(); + assertThat(txnWriteOptions.sync()).isTrue(); + + txn.setWriteOptions(txnWriteOptions.setSync(false)); + txnWriteOptions = txn.getWriteOptions(); + assertThat(txnWriteOptions).isNotNull(); + assertThat(txnWriteOptions.isOwningHandle()).isFalse(); + assertThat(txnWriteOptions).isNotSameAs(writeOptions); + assertThat(txnWriteOptions.disableWAL()).isTrue(); + assertThat(txnWriteOptions.sync()).isFalse(); + } + } + + @Test + public void undoGetForUpdate_cf() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isNull(); + txn.put(testCf, k1, v1); + 
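+ // getForUpdate() reads the key and registers it for conflict detection (taking a
+ // lock in the pessimistic case); undoGetForUpdate() then tells the transaction to
+ // forget that read.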
assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + txn.undoGetForUpdate(testCf, k1); + } + } + + @Test + public void undoGetForUpdate() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getForUpdate(readOptions, k1, true)).isNull(); + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + txn.undoGetForUpdate(k1); + } + } + + @Test + public void rebuildFromWriteBatch() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte k2[] = "key2".getBytes(UTF_8); + final byte v2[] = "value2".getBytes(UTF_8); + final byte k3[] = "key3".getBytes(UTF_8); + final byte v3[] = "value3".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.getNumKeys()).isEqualTo(1); + + try(final WriteBatch writeBatch = new WriteBatch()) { + writeBatch.put(k2, v2); + writeBatch.put(k3, v3); + txn.rebuildFromWriteBatch(writeBatch); + + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + assertThat(txn.get(readOptions, k2)).isEqualTo(v2); + assertThat(txn.get(readOptions, k3)).isEqualTo(v3); + assertThat(txn.getNumKeys()).isEqualTo(3); + } + } + } + + @Test + public void getCommitTimeWriteBatch() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + + txn.put(k1, v1); + final WriteBatch writeBatch = txn.getCommitTimeWriteBatch(); + + assertThat(writeBatch).isNotNull(); + assertThat(writeBatch.isOwningHandle()).isFalse(); + assertThat(writeBatch.count()).isEqualTo(0); + } + } + + @Test + public void logNumber() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getLogNumber()).isEqualTo(0); + final long logNumber = rand.nextLong(); + txn.setLogNumber(logNumber); + assertThat(txn.getLogNumber()).isEqualTo(logNumber); + } + } + + private static byte[] concat(final byte[][] bufs) { + int resultLength = 0; + for(final byte[] buf : bufs) { + resultLength += buf.length; + } + + final byte[] result = new byte[resultLength]; + int resultOffset = 0; + for(final byte[] buf : bufs) { + final int srcLength = buf.length; + System.arraycopy(buf, 0, result, resultOffset, srcLength); + resultOffset += srcLength; + } + + return result; + } + + private static class TestTransactionNotifier + extends AbstractTransactionNotifier { + private final List<Snapshot> createdSnapshots = new ArrayList<>(); + + @Override + public void snapshotCreated(final Snapshot newSnapshot) { + createdSnapshots.add(newSnapshot); + } + + public List<Snapshot> getCreatedSnapshots() { + return createdSnapshots; + } + } + + protected static abstract class DBContainer + implements AutoCloseable { + protected final WriteOptions writeOptions; + protected final List<ColumnFamilyHandle> columnFamilyHandles; + protected final ColumnFamilyOptions columnFamilyOptions; + protected final DBOptions options; + + public DBContainer(final WriteOptions 
writeOptions, + final List<ColumnFamilyHandle> columnFamilyHandles, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + this.writeOptions = writeOptions; + this.columnFamilyHandles = columnFamilyHandles; + this.columnFamilyOptions = columnFamilyOptions; + this.options = options; + } + + public abstract Transaction beginTransaction(); + + public abstract Transaction beginTransaction( + final WriteOptions writeOptions); + + public ColumnFamilyHandle getTestColumnFamily() { + return columnFamilyHandles.get(1); + } + + @Override + public abstract void close(); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java index 1caae5098e..7c50df3571 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupEngineTest.java @@ -11,6 +11,7 @@ import org.junit.rules.TemporaryFolder; import java.util.List; +import java.util.concurrent.ThreadLocalRandom; import static org.assertj.core.api.Assertions.assertThat; @@ -205,6 +206,26 @@ public void restoreFromBackup() } } + @Test + public void backupDbWithMetadata() throws RocksDBException { + // Open empty database. + try (final Options opt = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + // Fill database with some test values + prepareDatabase(db); + + // Create a backup carrying the metadata string + try (final BackupableDBOptions bopt = + new BackupableDBOptions(backupFolder.getRoot().getAbsolutePath()); + final BackupEngine be = BackupEngine.open(opt.getEnv(), bopt)) { + final String metadata = String.valueOf(ThreadLocalRandom.current().nextInt()); + be.createNewBackupWithMetadata(db, metadata, true); + final List<BackupInfo> backupInfoList = verifyNumberOfValidBackups(be, 1); + assertThat(backupInfoList.get(0).appMetadata()).isEqualTo(metadata); + } + } + } + /** * Verify backups. * diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java index c223014fd2..0b4992184c 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java @@ -45,7 +45,7 @@ public void env() { assertThat(backupableDBOptions.backupEnv()). 
isNull(); - try(final Env env = new RocksMemEnv()) { + try(final Env env = new RocksMemEnv(Env.getDefault())) { backupableDBOptions.setBackupEnv(env); assertThat(backupableDBOptions.backupEnv()) .isEqualTo(env); diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 8edc8b89fd..fe9f863250 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -6,7 +6,12 @@ package org.rocksdb; import org.junit.ClassRule; +import org.junit.Ignore; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.charset.StandardCharsets; import static org.assertj.core.api.Assertions.assertThat; @@ -16,73 +21,68 @@ public class BlockBasedTableConfigTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); - @Test - public void noBlockCache() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setNoBlockCache(true); - assertThat(blockBasedTableConfig.noBlockCache()).isTrue(); - } + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void blockCacheSize() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCacheSize(8 * 1024); - assertThat(blockBasedTableConfig.blockCacheSize()). - isEqualTo(8 * 1024); - } + public void cacheIndexAndFilterBlocks() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()). + isTrue(); - @Test - public void blockSizeDeviation() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockSizeDeviation(12); - assertThat(blockBasedTableConfig.blockSizeDeviation()). - isEqualTo(12); } @Test - public void blockRestartInterval() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockRestartInterval(15); - assertThat(blockBasedTableConfig.blockRestartInterval()). - isEqualTo(15); + public void cacheIndexAndFilterBlocksWithHighPriority() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(true); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocksWithHighPriority()). + isTrue(); } @Test - public void wholeKeyFiltering() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setWholeKeyFiltering(false); - assertThat(blockBasedTableConfig.wholeKeyFiltering()). - isFalse(); + public void pinL0FilterAndIndexBlocksInCache() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPinL0FilterAndIndexBlocksInCache(true); + assertThat(blockBasedTableConfig.pinL0FilterAndIndexBlocksInCache()). + isTrue(); } @Test - public void cacheIndexAndFilterBlocks() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); - assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()). 
- isTrue(); - + public void pinTopLevelIndexAndFilter() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPinTopLevelIndexAndFilter(false); + assertThat(blockBasedTableConfig.pinTopLevelIndexAndFilter()). + isFalse(); } @Test - public void hashIndexAllowCollision() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setHashIndexAllowCollision(false); - assertThat(blockBasedTableConfig.hashIndexAllowCollision()). - isFalse(); + public void indexType() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(IndexType.values().length).isEqualTo(3); + blockBasedTableConfig.setIndexType(IndexType.kHashSearch); + assertThat(blockBasedTableConfig.indexType()).isEqualTo(IndexType.kHashSearch); + assertThat(IndexType.valueOf("kBinarySearch")).isNotNull(); + blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch")); + assertThat(blockBasedTableConfig.indexType()).isEqualTo(IndexType.kBinarySearch); } @Test - public void blockCacheCompressedSize() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCacheCompressedSize(40); - assertThat(blockBasedTableConfig.blockCacheCompressedSize()). - isEqualTo(40); + public void dataBlockIndexType() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinaryAndHash); + assertThat(blockBasedTableConfig.dataBlockIndexType()).isEqualTo(DataBlockIndexType.kDataBlockBinaryAndHash); + blockBasedTableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch); + assertThat(blockBasedTableConfig.dataBlockIndexType()).isEqualTo(DataBlockIndexType.kDataBlockBinarySearch); } @Test public void checksumType() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); assertThat(ChecksumType.values().length).isEqualTo(3); assertThat(ChecksumType.valueOf("kxxHash")). isEqualTo(ChecksumType.kxxHash); @@ -93,79 +93,301 @@ public void checksumType() { } @Test - public void indexType() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - assertThat(IndexType.values().length).isEqualTo(3); - blockBasedTableConfig.setIndexType(IndexType.kHashSearch); - assertThat(blockBasedTableConfig.indexType().equals( - IndexType.kHashSearch)); - assertThat(IndexType.valueOf("kBinarySearch")).isNotNull(); - blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch")); - assertThat(blockBasedTableConfig.indexType().equals( - IndexType.kBinarySearch)); + public void noBlockCache() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setNoBlockCache(true); + assertThat(blockBasedTableConfig.noBlockCache()).isTrue(); } @Test - public void blockCacheCompressedNumShardBits() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); - assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). 
- isEqualTo(4); + public void blockCache() { + try ( + final Cache cache = new LRUCache(17 * 1024 * 1024); + final Options options = new Options().setTableFormatConfig( + new BlockBasedTableConfig().setBlockCache(cache))) { + assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable"); + } } @Test - public void cacheNumShardBits() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setCacheNumShardBits(5); - assertThat(blockBasedTableConfig.cacheNumShardBits()). - isEqualTo(5); + public void blockCacheIntegration() throws RocksDBException { + try (final Cache cache = new LRUCache(8 * 1024 * 1024); + final Statistics statistics = new Statistics()) { + for (int shard = 0; shard < 8; shard++) { + try (final Options options = + new Options() + .setCreateIfMissing(true) + .setStatistics(statistics) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache)); + final RocksDB db = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) { + final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8); + final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); + + db.put(key, value); + db.flush(new FlushOptions()); + db.get(key); + + assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_ADD)).isEqualTo(shard + 1); + } + } + } + } + + @Test + public void persistentCache() throws RocksDBException { + try (final DBOptions dbOptions = new DBOptions(). + setInfoLogLevel(InfoLogLevel.INFO_LEVEL). + setCreateIfMissing(true); + final Logger logger = new Logger(dbOptions) { + @Override + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { + System.out.println(infoLogLevel.name() + ": " + logMsg); + } + }) { + try (final PersistentCache persistentCache = + new PersistentCache(Env.getDefault(), dbFolder.getRoot().getPath(), 1024 * 1024 * 100, logger, false); + final Options options = new Options().setTableFormatConfig( + new BlockBasedTableConfig().setPersistentCache(persistentCache))) { + assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable"); + } + } + } + + @Test + public void blockCacheCompressed() { + try (final Cache cache = new LRUCache(17 * 1024 * 1024); + final Options options = new Options().setTableFormatConfig( + new BlockBasedTableConfig().setBlockCacheCompressed(cache))) { + assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable"); + } + } + + @Ignore("See issue: https://github.com/facebook/rocksdb/issues/4822") + @Test + public void blockCacheCompressedIntegration() throws RocksDBException { + final byte[] key1 = "some-key1".getBytes(StandardCharsets.UTF_8); + final byte[] key2 = "some-key2".getBytes(StandardCharsets.UTF_8); + final byte[] key3 = "some-key3".getBytes(StandardCharsets.UTF_8); + final byte[] key4 = "some-key4".getBytes(StandardCharsets.UTF_8); + final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); + + try (final Cache compressedCache = new LRUCache(8 * 1024 * 1024); + final Statistics statistics = new Statistics()) { + + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig() + .setNoBlockCache(true) + .setBlockCache(null) + .setBlockCacheCompressed(compressedCache) + .setFormatVersion(4); + + try (final Options options = new Options() + .setCreateIfMissing(true) + .setStatistics(statistics) + .setTableFormatConfig(blockBasedTableConfig)) { + + for (int shard = 0; shard < 8; shard++) { + try (final FlushOptions flushOptions = new FlushOptions(); + final WriteOptions writeOptions = new
WriteOptions(); + final ReadOptions readOptions = new ReadOptions(); + final RocksDB db = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) { + + db.put(writeOptions, key1, value); + db.put(writeOptions, key2, value); + db.put(writeOptions, key3, value); + db.put(writeOptions, key4, value); + db.flush(flushOptions); + + db.get(readOptions, key1); + db.get(readOptions, key2); + db.get(readOptions, key3); + db.get(readOptions, key4); + + assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_COMPRESSED_ADD)).isEqualTo(shard + 1); + } + } + } + } } @Test public void blockSize() { - BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); blockBasedTableConfig.setBlockSize(10); assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10); } + @Test + public void blockSizeDeviation() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockSizeDeviation(12); + assertThat(blockBasedTableConfig.blockSizeDeviation()). + isEqualTo(12); + } + + @Test + public void blockRestartInterval() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockRestartInterval(15); + assertThat(blockBasedTableConfig.blockRestartInterval()). + isEqualTo(15); + } + + @Test + public void indexBlockRestartInterval() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setIndexBlockRestartInterval(15); + assertThat(blockBasedTableConfig.indexBlockRestartInterval()). + isEqualTo(15); + } @Test - public void blockBasedTableWithFilter() { + public void metadataBlockSize() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setMetadataBlockSize(1024); + assertThat(blockBasedTableConfig.metadataBlockSize()). + isEqualTo(1024); + } + + @Test + public void partitionFilters() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setPartitionFilters(true); + assertThat(blockBasedTableConfig.partitionFilters()). + isTrue(); + } + + @Test + public void useDeltaEncoding() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setUseDeltaEncoding(false); + assertThat(blockBasedTableConfig.useDeltaEncoding()). + isFalse(); + } + + @Test + public void blockBasedTableWithFilterPolicy() { try(final Options options = new Options() .setTableFormatConfig(new BlockBasedTableConfig() - .setFilter(new BloomFilter(10)))) { + .setFilterPolicy(new BloomFilter(10)))) { assertThat(options.tableFactoryName()). isEqualTo("BlockBasedTable"); } } @Test - public void blockBasedTableWithoutFilter() { + public void blockBasedTableWithoutFilterPolicy() { try(final Options options = new Options().setTableFormatConfig( - new BlockBasedTableConfig().setFilter(null))) { + new BlockBasedTableConfig().setFilterPolicy(null))) { assertThat(options.tableFactoryName()). 
isEqualTo("BlockBasedTable"); } } @Test - public void blockBasedTableFormatVersion() { - BlockBasedTableConfig config = new BlockBasedTableConfig(); - for (int version=0; version<=2; version++) { - config.setFormatVersion(version); - assertThat(config.formatVersion()).isEqualTo(version); + public void wholeKeyFiltering() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setWholeKeyFiltering(false); + assertThat(blockBasedTableConfig.wholeKeyFiltering()). + isFalse(); + } + + @Test + public void verifyCompression() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setVerifyCompression(true); + assertThat(blockBasedTableConfig.verifyCompression()). + isTrue(); + } + + @Test + public void readAmpBytesPerBit() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setReadAmpBytesPerBit(2); + assertThat(blockBasedTableConfig.readAmpBytesPerBit()). + isEqualTo(2); + } + + @Test + public void formatVersion() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + for (int version = 0; version < 5; version++) { + blockBasedTableConfig.setFormatVersion(version); + assertThat(blockBasedTableConfig.formatVersion()).isEqualTo(version); } } @Test(expected = AssertionError.class) - public void blockBasedTableFormatVersionFailNegative() { - BlockBasedTableConfig config = new BlockBasedTableConfig(); - config.setFormatVersion(-1); + public void formatVersionFailNegative() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setFormatVersion(-1); } @Test(expected = AssertionError.class) - public void blockBasedTableFormatVersionFailIllegalVersion() { - BlockBasedTableConfig config = new BlockBasedTableConfig(); - config.setFormatVersion(3); + public void formatVersionFailIllegalVersion() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setFormatVersion(99); + } + + @Test + public void enableIndexCompression() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setEnableIndexCompression(false); + assertThat(blockBasedTableConfig.enableIndexCompression()). + isFalse(); + } + + @Test + public void blockAlign() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockAlign(true); + assertThat(blockBasedTableConfig.blockAlign()). + isTrue(); + } + + @Deprecated + @Test + public void hashIndexAllowCollision() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setHashIndexAllowCollision(false); + assertThat(blockBasedTableConfig.hashIndexAllowCollision()). + isTrue(); // NOTE: setHashIndexAllowCollision should do nothing! + } + + @Deprecated + @Test + public void blockCacheSize() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheSize(8 * 1024); + assertThat(blockBasedTableConfig.blockCacheSize()). + isEqualTo(8 * 1024); + } + + @Deprecated + @Test + public void blockCacheNumShardBits() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheNumShardBits(5); + assertThat(blockBasedTableConfig.cacheNumShardBits()). 
+ isEqualTo(5); + } + + @Deprecated + @Test + public void blockCacheCompressedSize() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheCompressedSize(40); + assertThat(blockBasedTableConfig.blockCacheCompressedSize()). + isEqualTo(40); + } + + @Deprecated + @Test + public void blockCacheCompressedNumShardBits() { + final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); + assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). + isEqualTo(4); } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java index 75749437b8..2cd8f0de9c 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -7,6 +7,7 @@ import org.junit.ClassRule; import org.junit.Test; +import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; import java.util.ArrayList; import java.util.List; @@ -24,6 +25,18 @@ public class ColumnFamilyOptionsTest { public static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); + @Test + public void copyConstructor() { + ColumnFamilyOptions origOpts = new ColumnFamilyOptions(); + origOpts.setNumLevels(rand.nextInt(8)); + origOpts.setTargetFileSizeMultiplier(rand.nextInt(100)); + origOpts.setLevel0StopWritesTrigger(rand.nextInt(50)); + ColumnFamilyOptions copyOpts = new ColumnFamilyOptions(origOpts); + assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels()); + assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier()); + assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger()); + } + @Test public void getColumnFamilyOptionsFromProps() { Properties properties = new Properties(); @@ -451,6 +464,23 @@ public void bottommostCompressionType() { } } + @Test + public void bottommostCompressionOptions() { + try (final ColumnFamilyOptions columnFamilyOptions = + new ColumnFamilyOptions(); + final CompressionOptions bottommostCompressionOptions = + new CompressionOptions() + .setMaxDictBytes(123)) { + + columnFamilyOptions.setBottommostCompressionOptions( + bottommostCompressionOptions); + assertThat(columnFamilyOptions.bottommostCompressionOptions()) + .isEqualTo(bottommostCompressionOptions); + assertThat(columnFamilyOptions.bottommostCompressionOptions() + .maxDictBytes()).isEqualTo(123); + } + } + @Test public void compressionOptions() { try (final ColumnFamilyOptions columnFamilyOptions @@ -529,6 +559,15 @@ public void reportBgIoStats() { } } + @Test + public void ttl() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + options.setTtl(1000 * 60); + assertThat(options.ttl()). 
+ isEqualTo(1000 * 60); + } + } + @Test public void compactionOptionsUniversal() { try (final ColumnFamilyOptions opt = new ColumnFamilyOptions(); @@ -564,4 +603,23 @@ public void forceConsistencyChecks() { isEqualTo(booleanValue); } } + + @Test + public void compactionFilter() { + try(final ColumnFamilyOptions options = new ColumnFamilyOptions(); + final RemoveEmptyValueCompactionFilter cf = new RemoveEmptyValueCompactionFilter()) { + options.setCompactionFilter(cf); + assertThat(options.compactionFilter()).isEqualTo(cf); + } + } + + @Test + public void compactionFilterFactory() { + try(final ColumnFamilyOptions options = new ColumnFamilyOptions(); + final RemoveEmptyValueCompactionFilterFactory cff = new RemoveEmptyValueCompactionFilterFactory()) { + options.setCompactionFilterFactory(cff); + assertThat(options.compactionFilterFactory()).isEqualTo(cff); + } + } + } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index 19fe332df9..84815b4766 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -12,6 +12,7 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; public class ColumnFamilyTest { @@ -23,6 +24,31 @@ public class ColumnFamilyTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Test + public void columnFamilyDescriptorName() throws RocksDBException { + final byte[] cfName = "some_name".getBytes(UTF_8); + + try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { + final ColumnFamilyDescriptor cfDescriptor = + new ColumnFamilyDescriptor(cfName, cfOptions); + assertThat(cfDescriptor.getName()).isEqualTo(cfName); + } + } + + @Test + public void columnFamilyDescriptorOptions() throws RocksDBException { + final byte[] cfName = "some_name".getBytes(UTF_8); + + try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions() + .setCompressionType(CompressionType.BZLIB2_COMPRESSION)) { + final ColumnFamilyDescriptor cfDescriptor = + new ColumnFamilyDescriptor(cfName, cfOptions); + + assertThat(cfDescriptor.getOptions().compressionType()) + .isEqualTo(CompressionType.BZLIB2_COMPRESSION); + } + } + @Test public void listColumnFamilies() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); @@ -47,6 +73,9 @@ public void defaultColumnFamily() throws RocksDBException { try { assertThat(cfh).isNotNull(); + assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8)); + assertThat(cfh.getID()).isEqualTo(0); + final byte[] key = "key".getBytes(); final byte[] value = "value".getBytes(); @@ -64,15 +93,25 @@ public void defaultColumnFamily() throws RocksDBException { @Test public void createColumnFamily() throws RocksDBException { + final byte[] cfName = "new_cf".getBytes(UTF_8); + final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, + new ColumnFamilyOptions()); + try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily( - new ColumnFamilyDescriptor("new_cf".getBytes(), - new ColumnFamilyOptions())); + dbFolder.getRoot().getAbsolutePath())) { + + final ColumnFamilyHandle 
columnFamilyHandle = db.createColumnFamily(cfDescriptor); + try { + assertThat(columnFamilyHandle.getName()).isEqualTo(cfName); + assertThat(columnFamilyHandle.getID()).isEqualTo(1); + + final ColumnFamilyDescriptor latestDescriptor = columnFamilyHandle.getDescriptor(); + assertThat(latestDescriptor.getName()).isEqualTo(cfName); + final List<byte[]> columnFamilyNames = RocksDB.listColumnFamilies( - options, dbFolder.getRoot().getAbsolutePath()); + options, dbFolder.getRoot().getAbsolutePath()); assertThat(columnFamilyNames).isNotNull(); assertThat(columnFamilyNames.size()).isGreaterThan(0); assertThat(columnFamilyNames.size()).isEqualTo(2); @@ -190,10 +229,51 @@ public void createWriteDropColumnFamily() throws RocksDBException { new ColumnFamilyOptions())); db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); db.dropColumnFamily(tmpColumnFamilyHandle); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + } finally { + if (tmpColumnFamilyHandle != null) { + tmpColumnFamilyHandle.close(); + } + for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + + @Test + public void createWriteDropColumnFamilies() throws RocksDBException { + final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + ColumnFamilyHandle tmpColumnFamilyHandle = null; + ColumnFamilyHandle tmpColumnFamilyHandle2 = null; + try { + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), + new ColumnFamilyOptions())); + tmpColumnFamilyHandle2 = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF2".getBytes(), + new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.put(tmpColumnFamilyHandle2, "key".getBytes(), "value".getBytes()); + db.dropColumnFamilies(Arrays.asList(tmpColumnFamilyHandle, tmpColumnFamilyHandle2)); + assertThat(tmpColumnFamilyHandle.isOwningHandle()).isTrue(); + assertThat(tmpColumnFamilyHandle2.isOwningHandle()).isTrue(); } finally { if (tmpColumnFamilyHandle != null) { tmpColumnFamilyHandle.close(); } + if (tmpColumnFamilyHandle2 != null) { + tmpColumnFamilyHandle2.close(); + } for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { columnFamilyHandle.close(); } @@ -339,6 +419,50 @@ public void multiGet() throws RocksDBException { } } + @Test + public void multiGetAsList() throws RocksDBException { + final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList)) { + try { + db.put(columnFamilyHandleList.get(0), "key".getBytes(), + "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + + final List<byte[]> keys = Arrays.asList(new byte[][]{ + "key".getBytes(), "newcfkey".getBytes() + }); + List<byte[]>
retValues = db.multiGetAsList(columnFamilyHandleList, + keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))) + .isEqualTo("value"); + assertThat(new String(retValues.get(1))) + .isEqualTo("value"); + retValues = db.multiGetAsList(new ReadOptions(), columnFamilyHandleList, + keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(0))) + .isEqualTo("value"); + assertThat(new String(retValues.get(1))) + .isEqualTo("value"); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + @Test public void properties() throws RocksDBException { final List cfDescriptors = Arrays.asList( @@ -365,6 +489,10 @@ public void properties() throws RocksDBException { "rocksdb.stats")).isNotNull(); assertThat(db.getProperty(columnFamilyHandleList.get(1), "rocksdb.sstables")).isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isNotNull(); + assertThat(db.getAggregatedLongProperty("rocksdb.estimate-num-keys")). + isGreaterThanOrEqualTo(0); } finally { for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java new file mode 100644 index 0000000000..18c187ddba --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java @@ -0,0 +1,98 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
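+
+// Usage sketch: CompactRangeOptions parameterises manual compactions. A minimal
+// example, assuming an already-open RocksDB handle `db` (hypothetical here) and
+// the compactRange overload that accepts a ColumnFamilyHandle plus these options:
+//
+//   try (final CompactRangeOptions cro = new CompactRangeOptions()
+//       .setChangeLevel(true)
+//       .setTargetLevel(1)
+//       .setBottommostLevelCompaction(BottommostLevelCompaction.kForce)) {
+//     db.compactRange(null, null, null, cro); // null CF and bounds = whole default CF
+//   }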
+ +package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.CompactRangeOptions.BottommostLevelCompaction; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactRangeOptionsTest { + + static { + RocksDB.loadLibrary(); + } + + @Test + public void exclusiveManualCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + value = true; + opt.setExclusiveManualCompaction(value); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + } + + @Test + public void bottommostLevelCompaction() { + CompactRangeOptions opt = new CompactRangeOptions(); + BottommostLevelCompaction value = BottommostLevelCompaction.kSkip; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kForce; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kIfHaveCompactionFilter; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + } + + @Test + public void changeLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + value = true; + opt.setChangeLevel(value); + assertThat(opt.changeLevel()).isEqualTo(value); + } + + @Test + public void targetLevel() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + value = 3; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + } + + @Test + public void targetPathId() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + value = 3; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + } + + @Test + public void allowWriteStall() { + CompactRangeOptions opt = new CompactRangeOptions(); + boolean value = false; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + value = true; + opt.setAllowWriteStall(value); + assertThat(opt.allowWriteStall()).isEqualTo(value); + } + + @Test + public void maxSubcompactions() { + CompactRangeOptions opt = new CompactRangeOptions(); + int value = 2; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + value = 3; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java new file mode 100644 index 0000000000..efa29b1d9f --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionFilterFactoryTest.java @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
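+
+// Shape sketch: a compaction filter factory subclasses
+// AbstractCompactionFilterFactory and hands RocksDB a fresh filter per
+// compaction run. Roughly how the RemoveEmptyValueCompactionFilterFactory
+// helper exercised below is assumed to look:
+//
+//   public class RemoveEmptyValueCompactionFilterFactory
+//       extends AbstractCompactionFilterFactory<RemoveEmptyValueCompactionFilter> {
+//     @Override
+//     public RemoveEmptyValueCompactionFilter createCompactionFilter(
+//         final AbstractCompactionFilter.Context context) {
+//       return new RemoveEmptyValueCompactionFilter(); // drops empty values during compaction
+//     }
+//     @Override
+//     public String name() { return "RemoveEmptyValueCompactionFilterFactory"; }
+//   }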
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactionFilterFactoryTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void columnFamilyOptions_setCompactionFilterFactory() + throws RocksDBException { + try(final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RemoveEmptyValueCompactionFilterFactory compactionFilterFactory + = new RemoveEmptyValueCompactionFilterFactory(); + final ColumnFamilyOptions new_cf_opts + = new ColumnFamilyOptions() + .setCompactionFilterFactory(compactionFilterFactory)) { + + final List cfNames = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)); + + final List cfHandles = new ArrayList<>(); + + try (final RocksDB rocksDb = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), cfNames, cfHandles); + ) { + try { + final byte[] key1 = "key1".getBytes(); + final byte[] key2 = "key2".getBytes(); + + final byte[] value1 = "value1".getBytes(); + final byte[] value2 = new byte[0]; + + rocksDb.put(cfHandles.get(1), key1, value1); + rocksDb.put(cfHandles.get(1), key2, value2); + + rocksDb.compactRange(cfHandles.get(1)); + + assertThat(rocksDb.get(cfHandles.get(1), key1)).isEqualTo(value1); + assertThat(rocksDb.keyMayExist(cfHandles.get(1), key2, new StringBuilder())).isFalse(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java new file mode 100644 index 0000000000..6c920439c5 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobInfoTest.java @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
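+
+// Usage sketch: a freshly constructed CompactionJobInfo only carries the
+// defaults asserted below; in practice RocksDB fills one in, e.g. when it is
+// passed to compactFiles. A hedged example, assuming an open `db`, a
+// `columnFamilyHandle` and a list of SST `inputFileNames` (all hypothetical):
+//
+//   try (final CompactionOptions co = new CompactionOptions();
+//        final CompactionJobInfo info = new CompactionJobInfo()) {
+//     db.compactFiles(co, columnFamilyHandle, inputFileNames,
+//         /* outputLevel */ 1, /* outputPathId */ 0, info);
+//     // info.outputFiles(), info.stats(), ... now describe the finished job
+//   }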
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactionJobInfoTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void columnFamilyName() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.columnFamilyName()) + .isEmpty(); + } + } + + @Test + public void status() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.status().getCode()) + .isEqualTo(Status.Code.Ok); + } + } + + @Test + public void threadId() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.threadId()) + .isEqualTo(0); + } + } + + @Test + public void jobId() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.jobId()) + .isEqualTo(0); + } + } + + @Test + public void baseInputLevel() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.baseInputLevel()) + .isEqualTo(0); + } + } + + @Test + public void outputLevel() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.outputLevel()) + .isEqualTo(0); + } + } + + @Test + public void inputFiles() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.inputFiles()) + .isEmpty(); + } + } + + @Test + public void outputFiles() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.outputFiles()) + .isEmpty(); + } + } + + @Test + public void tableProperties() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.tableProperties()) + .isEmpty(); + } + } + + @Test + public void compactionReason() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.compactionReason()) + .isEqualTo(CompactionReason.kUnknown); + } + } + + @Test + public void compression() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.compression()) + .isEqualTo(CompressionType.NO_COMPRESSION); + } + } + + @Test + public void stats() { + try (final CompactionJobInfo compactionJobInfo = new CompactionJobInfo()) { + assertThat(compactionJobInfo.stats()) + .isNotNull(); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java new file mode 100644 index 0000000000..7be7226dac --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionJobStatsTest.java @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
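+
+// Semantics sketch: CompactionJobStats is a plain bundle of counters, and the
+// tests below only exercise its all-zero defaults. add() folds another
+// instance's counters into this one; reset() zeroes them again:
+//
+//   try (final CompactionJobStats total = new CompactionJobStats();
+//        final CompactionJobStats job = new CompactionJobStats()) {
+//     total.add(job);   // accumulate one job's counters into the running total
+//     total.reset();    // back to all-zero
+//   }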
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactionJobStatsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void reset() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + compactionJobStats.reset(); + assertThat(compactionJobStats.elapsedMicros()).isEqualTo(0); + } + } + + @Test + public void add() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats(); + final CompactionJobStats otherCompactionJobStats = new CompactionJobStats()) { + compactionJobStats.add(otherCompactionJobStats); + } + } + + @Test + public void elapsedMicros() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.elapsedMicros()).isEqualTo(0); + } + } + + @Test + public void numInputRecords() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numInputRecords()).isEqualTo(0); + } + } + + @Test + public void numInputFiles() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numInputFiles()).isEqualTo(0); + } + } + + @Test + public void numInputFilesAtOutputLevel() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numInputFilesAtOutputLevel()).isEqualTo(0); + } + } + + @Test + public void numOutputRecords() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numOutputRecords()).isEqualTo(0); + } + } + + @Test + public void numOutputFiles() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numOutputFiles()).isEqualTo(0); + } + } + + @Test + public void isManualCompaction() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.isManualCompaction()).isFalse(); + } + } + + @Test + public void totalInputBytes() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.totalInputBytes()).isEqualTo(0); + } + } + + @Test + public void totalOutputBytes() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.totalOutputBytes()).isEqualTo(0); + } + } + + + @Test + public void numRecordsReplaced() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numRecordsReplaced()).isEqualTo(0); + } + } + + @Test + public void totalInputRawKeyBytes() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.totalInputRawKeyBytes()).isEqualTo(0); + } + } + + @Test + public void totalInputRawValueBytes() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.totalInputRawValueBytes()).isEqualTo(0); + } + } + + @Test + public void numInputDeletionRecords() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numInputDeletionRecords()).isEqualTo(0); + } + } + + @Test + public void numExpiredDeletionRecords() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + 
assertThat(compactionJobStats.numExpiredDeletionRecords()).isEqualTo(0); + } + } + + @Test + public void numCorruptKeys() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numCorruptKeys()).isEqualTo(0); + } + } + + @Test + public void fileWriteNanos() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.fileWriteNanos()).isEqualTo(0); + } + } + + @Test + public void fileRangeSyncNanos() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.fileRangeSyncNanos()).isEqualTo(0); + } + } + + @Test + public void fileFsyncNanos() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.fileFsyncNanos()).isEqualTo(0); + } + } + + @Test + public void filePrepareWriteNanos() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.filePrepareWriteNanos()).isEqualTo(0); + } + } + + @Test + public void smallestOutputKeyPrefix() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.smallestOutputKeyPrefix()).isEmpty(); + } + } + + @Test + public void largestOutputKeyPrefix() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.largestOutputKeyPrefix()).isEmpty(); + } + } + + @Test + public void numSingleDelFallthru() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numSingleDelFallthru()).isEqualTo(0); + } + } + + @Test + public void numSingleDelMismatch() { + try (final CompactionJobStats compactionJobStats = new CompactionJobStats()) { + assertThat(compactionJobStats.numSingleDelMismatch()).isEqualTo(0); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java index 370a28e819..841615e67e 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java @@ -18,9 +18,18 @@ public class CompactionOptionsFIFOTest { @Test public void maxTableFilesSize() { final long size = 500 * 1024 * 1026; - try(final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { + try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { opt.setMaxTableFilesSize(size); assertThat(opt.maxTableFilesSize()).isEqualTo(size); } } + + @Test + public void allowCompaction() { + final boolean allowCompaction = true; + try (final CompactionOptionsFIFO opt = new CompactionOptionsFIFO()) { + opt.setAllowCompaction(allowCompaction); + assertThat(opt.allowCompaction()).isEqualTo(allowCompaction); + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java new file mode 100644 index 0000000000..b1726e866c --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompactionOptionsTest.java @@ -0,0 +1,52 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
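+
+// Usage sketch: CompactionOptions carries the per-call knobs for the manual
+// compactFiles API. A minimal example of overriding the defaults asserted
+// below, before handing the options to RocksDB#compactFiles:
+//
+//   try (final CompactionOptions co = new CompactionOptions()
+//       .setCompression(CompressionType.NO_COMPRESSION)
+//       .setOutputFileSizeLimit(250 * 1024 * 1024)
+//       .setMaxSubcompactions(4)) {
+//     // pass `co` to db.compactFiles(...) together with the input file names
+//   }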
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CompactionOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void compression() { + try (final CompactionOptions compactionOptions = new CompactionOptions()) { + assertThat(compactionOptions.compression()) + .isEqualTo(CompressionType.SNAPPY_COMPRESSION); + compactionOptions.setCompression(CompressionType.NO_COMPRESSION); + assertThat(compactionOptions.compression()) + .isEqualTo(CompressionType.NO_COMPRESSION); + } + } + + @Test + public void outputFileSizeLimit() { + final long mb250 = 1024 * 1024 * 250; + try (final CompactionOptions compactionOptions = new CompactionOptions()) { + assertThat(compactionOptions.outputFileSizeLimit()) + .isEqualTo(-1); + compactionOptions.setOutputFileSizeLimit(mb250); + assertThat(compactionOptions.outputFileSizeLimit()) + .isEqualTo(mb250); + } + } + + @Test + public void maxSubcompactions() { + try (final CompactionOptions compactionOptions = new CompactionOptions()) { + assertThat(compactionOptions.maxSubcompactions()) + .isEqualTo(0); + compactionOptions.setMaxSubcompactions(9); + assertThat(compactionOptions.maxSubcompactions()) + .isEqualTo(9); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java index c49224ca36..116552c328 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/CompressionOptionsTest.java @@ -50,4 +50,22 @@ public void maxDictBytes() { assertThat(opt.maxDictBytes()).isEqualTo(maxDictBytes); } } + + @Test + public void zstdMaxTrainBytes() { + final int zstdMaxTrainBytes = 999; + try(final CompressionOptions opt = new CompressionOptions()) { + opt.setZStdMaxTrainBytes(zstdMaxTrainBytes); + assertThat(opt.zstdMaxTrainBytes()).isEqualTo(zstdMaxTrainBytes); + } + } + + @Test + public void enabled() { + try(final CompressionOptions opt = new CompressionOptions()) { + assertThat(opt.enabled()).isFalse(); + opt.setEnabled(true); + assertThat(opt.enabled()).isTrue(); + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java index 11b7435d8a..e6ebc46cd2 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -22,6 +22,19 @@ public class DBOptionsTest { public static final Random rand = PlatformRandomHelper. 
getPlatformSpecificRandomFactory(); + @Test + public void copyConstructor() { + DBOptions origOpts = new DBOptions(); + origOpts.setCreateIfMissing(rand.nextBoolean()); + origOpts.setAllow2pc(rand.nextBoolean()); + origOpts.setBaseBackgroundCompactions(rand.nextInt(10)); + DBOptions copyOpts = new DBOptions(origOpts); + assertThat(origOpts.createIfMissing()).isEqualTo(copyOpts.createIfMissing()); + assertThat(origOpts.allow2pc()).isEqualTo(copyOpts.allow2pc()); + assertThat(origOpts.baseBackgroundCompactions()).isEqualTo( + copyOpts.baseBackgroundCompactions()); + } + @Test public void getDBOptionsFromProps() { // setup sample properties @@ -240,6 +253,15 @@ public void maxBackgroundFlushes() { } } + @Test + public void maxBackgroundJobs() { + try (final DBOptions opt = new DBOptions()) { + final int intValue = rand.nextInt(); + opt.setMaxBackgroundJobs(intValue); + assertThat(opt.maxBackgroundJobs()).isEqualTo(intValue); + } + } + @Test public void maxLogFileSize() throws RocksDBException { try(final DBOptions opt = new DBOptions()) { @@ -402,6 +424,26 @@ public void dbWriteBufferSize() { } } + @Test + public void setWriteBufferManager() throws RocksDBException { + try (final DBOptions opt = new DBOptions(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000L, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + + @Test + public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { + try (final DBOptions opt = new DBOptions(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0L, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + @Test public void accessHintOnCompactionStart() { try(final DBOptions opt = new DBOptions()) { @@ -492,6 +534,15 @@ public void delayedWriteRate() { } } + @Test + public void enablePipelinedWrite() { + try(final DBOptions opt = new DBOptions()) { + assertThat(opt.enablePipelinedWrite()).isFalse(); + opt.setEnablePipelinedWrite(true); + assertThat(opt.enablePipelinedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final DBOptions opt = new DBOptions()) { @@ -573,6 +624,38 @@ public void rowCache() { } } + @Test + public void walFilter() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.walFilter()).isNull(); + + try (final AbstractWalFilter walFilter = new AbstractWalFilter() { + @Override + public void columnFamilyLogNumberMap( + final Map<Integer, Long> cfLognumber, + final Map<String, Integer> cfNameId) { + // no-op + } + + @Override + public LogRecordFoundResult logRecordFound(final long logNumber, + final String logFileName, final WriteBatch batch, + final WriteBatch newBatch) { + return new LogRecordFoundResult( + WalProcessingOption.CONTINUE_PROCESSING, false); + } + + @Override + public String name() { + return "test-wal-filter"; + } + }) { + opt.setWalFilter(walFilter); + assertThat(opt.walFilter()).isEqualTo(walFilter); + } + } + } + @Test public void failIfOptionsFileError() { try (final DBOptions opt = new DBOptions()) { @@ -609,6 +692,51 @@ public void avoidFlushDuringShutdown() { } } + @Test + public void allowIngestBehind() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.allowIngestBehind()).isFalse(); + opt.setAllowIngestBehind(true); +
assertThat(opt.allowIngestBehind()).isTrue(); + } + } + + @Test + public void preserveDeletes() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.preserveDeletes()).isFalse(); + opt.setPreserveDeletes(true); + assertThat(opt.preserveDeletes()).isTrue(); + } + } + + @Test + public void twoWriteQueues() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.twoWriteQueues()).isFalse(); + opt.setTwoWriteQueues(true); + assertThat(opt.twoWriteQueues()).isTrue(); + } + } + + @Test + public void manualWalFlush() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.manualWalFlush()).isFalse(); + opt.setManualWalFlush(true); + assertThat(opt.manualWalFlush()).isTrue(); + } + } + + @Test + public void atomicFlush() { + try (final DBOptions opt = new DBOptions()) { + assertThat(opt.atomicFlush()).isFalse(); + opt.setAtomicFlush(true); + assertThat(opt.atomicFlush()).isTrue(); + } + } + @Test public void rateLimiter() { try(final DBOptions options = new DBOptions(); @@ -621,6 +749,15 @@ public void rateLimiter() { } } + @Test + public void sstFileManager() throws RocksDBException { + try (final DBOptions options = new DBOptions(); + final SstFileManager sstFileManager = + new SstFileManager(Env.getDefault())) { + options.setSstFileManager(sstFileManager); + } + } + @Test public void statistics() { try(final DBOptions options = new DBOptions()) { diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java new file mode 100644 index 0000000000..9e4f043871 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/DefaultEnvTest.java @@ -0,0 +1,113 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
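+
+// Usage sketch: Env.getDefault() (equivalently RocksEnv.getDefault()) is the
+// shared process-wide environment, so thread-pool sizing done here affects
+// every DB opened against it. For example (the `path` variable is an
+// assumption):
+//
+//   final Env env = Env.getDefault();
+//   env.setBackgroundThreads(4, Priority.LOW);   // compaction pool
+//   env.setBackgroundThreads(2, Priority.HIGH);  // flush pool
+//   try (final Options options = new Options().setCreateIfMissing(true).setEnv(env);
+//        final RocksDB db = RocksDB.open(options, path)) {
+//     // ...
+//   }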
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.Collection; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DefaultEnvTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void backgroundThreads() { + try (final Env defaultEnv = RocksEnv.getDefault()) { + defaultEnv.setBackgroundThreads(5, Priority.BOTTOM); + assertThat(defaultEnv.getBackgroundThreads(Priority.BOTTOM)).isEqualTo(5); + + defaultEnv.setBackgroundThreads(5); + assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isEqualTo(5); + + defaultEnv.setBackgroundThreads(5, Priority.LOW); + assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isEqualTo(5); + + defaultEnv.setBackgroundThreads(5, Priority.HIGH); + assertThat(defaultEnv.getBackgroundThreads(Priority.HIGH)).isEqualTo(5); + } + } + + @Test + public void threadPoolQueueLen() { + try (final Env defaultEnv = RocksEnv.getDefault()) { + assertThat(defaultEnv.getThreadPoolQueueLen(Priority.BOTTOM)).isEqualTo(0); + assertThat(defaultEnv.getThreadPoolQueueLen(Priority.LOW)).isEqualTo(0); + assertThat(defaultEnv.getThreadPoolQueueLen(Priority.HIGH)).isEqualTo(0); + } + } + + @Test + public void incBackgroundThreadsIfNeeded() { + try (final Env defaultEnv = RocksEnv.getDefault()) { + defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.BOTTOM); + assertThat(defaultEnv.getBackgroundThreads(Priority.BOTTOM)).isGreaterThanOrEqualTo(20); + + defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.LOW); + assertThat(defaultEnv.getBackgroundThreads(Priority.LOW)).isGreaterThanOrEqualTo(20); + + defaultEnv.incBackgroundThreadsIfNeeded(20, Priority.HIGH); + assertThat(defaultEnv.getBackgroundThreads(Priority.HIGH)).isGreaterThanOrEqualTo(20); + } + } + + @Test + public void lowerThreadPoolIOPriority() { + try (final Env defaultEnv = RocksEnv.getDefault()) { + defaultEnv.lowerThreadPoolIOPriority(Priority.BOTTOM); + + defaultEnv.lowerThreadPoolIOPriority(Priority.LOW); + + defaultEnv.lowerThreadPoolIOPriority(Priority.HIGH); + } + } + + @Test + public void lowerThreadPoolCPUPriority() { + try (final Env defaultEnv = RocksEnv.getDefault()) { + defaultEnv.lowerThreadPoolCPUPriority(Priority.BOTTOM); + + defaultEnv.lowerThreadPoolCPUPriority(Priority.LOW); + + defaultEnv.lowerThreadPoolCPUPriority(Priority.HIGH); + } + } + + @Test + public void threadList() throws RocksDBException { + try (final Env defaultEnv = RocksEnv.getDefault()) { + final Collection threadList = defaultEnv.getThreadList(); + assertThat(threadList.size()).isGreaterThan(0); + } + } + + @Test + public void threadList_integration() throws RocksDBException { + try (final Env env = RocksEnv.getDefault(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true) + .setEnv(env)) { + // open database + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + + final List threadList = env.getThreadList(); + assertThat(threadList.size()).isGreaterThan(0); + } + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java index 9933b1e1db..9be61b7d70 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java +++ 
b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/EnvOptionsTest.java @@ -18,6 +18,18 @@ public class EnvOptionsTest { public static final Random rand = PlatformRandomHelper.getPlatformSpecificRandomFactory(); + @Test + public void dbOptionsConstructor() { + final long compactionReadaheadSize = 4 * 1024 * 1024; + try (final DBOptions dbOptions = new DBOptions() + .setCompactionReadaheadSize(compactionReadaheadSize)) { + try (final EnvOptions envOptions = new EnvOptions(dbOptions)) { + assertThat(envOptions.compactionReadaheadSize()) + .isEqualTo(compactionReadaheadSize); + } + } + } + @Test public void useMmapReads() { try (final EnvOptions envOptions = new EnvOptions()) { diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java new file mode 100644 index 0000000000..f90ae911d8 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/FlushOptionsTest.java @@ -0,0 +1,31 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FlushOptionsTest { + + @Test + public void waitForFlush() { + try (final FlushOptions flushOptions = new FlushOptions()) { + assertThat(flushOptions.waitForFlush()).isTrue(); + flushOptions.setWaitForFlush(false); + assertThat(flushOptions.waitForFlush()).isFalse(); + } + } + + @Test + public void allowWriteStall() { + try (final FlushOptions flushOptions = new FlushOptions()) { + assertThat(flushOptions.allowWriteStall()).isFalse(); + flushOptions.setAllowWriteStall(true); + assertThat(flushOptions.allowWriteStall()).isTrue(); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/HdfsEnvTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/HdfsEnvTest.java new file mode 100644 index 0000000000..3a91c5cad4 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/HdfsEnvTest.java @@ -0,0 +1,45 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
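+
+// Note: both tests below expect RocksDBException because the bundled
+// librocksdb is built without HDFS support; HdfsEnv only constructs when
+// RocksDB was compiled with its HDFS backend enabled (via a USE_HDFS-style
+// build flag — named here as an assumption). The URI takes the usual
+// hdfs://host:port form:
+//
+//   try (final Env env = new HdfsEnv("hdfs://namenode:8020");
+//        final Options options = new Options().setEnv(env)) { /* ... */ }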
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class HdfsEnvTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + // expect org.rocksdb.RocksDBException: Not compiled with hdfs support + @Test(expected = RocksDBException.class) + public void construct() throws RocksDBException { + try (final Env env = new HdfsEnv("hdfs://localhost:5000")) { + // no-op + } + } + + // expect org.rocksdb.RocksDBException: Not compiled with hdfs support + @Test(expected = RocksDBException.class) + public void construct_integration() throws RocksDBException { + try (final Env env = new HdfsEnv("hdfs://localhost:5000"); + final Options options = new Options() + .setCreateIfMissing(true) + .setEnv(env); + ) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getPath())) { + db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + } + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java index 48ecfa16a9..b215dd17ff 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -27,6 +27,7 @@ public void testInfoLogLevel() throws RocksDBException, try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { db.put("key".getBytes(), "value".getBytes()); + db.flush(new FlushOptions().setWaitForFlush(true)); assertThat(getLogContentsWithoutHeader()).isNotEmpty(); } } @@ -93,7 +94,7 @@ private String getLogContentsWithoutHeader() throws IOException { int first_non_header = lines.length; // Identify the last line of the header for (int i = lines.length - 1; i >= 0; --i) { - if (lines[i].indexOf("Options.") >= 0 && lines[i].indexOf(':') >= 0) { + if (lines[i].indexOf("DB pointer") >= 0) { first_non_header = i + 1; break; } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java index 83e0dd17af..a3973ccd9c 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java @@ -84,4 +84,24 @@ public void allowBlockingFlush() { assertThat(options.allowBlockingFlush()).isEqualTo(allowBlockingFlush); } } + + @Test + public void ingestBehind() { + try (final IngestExternalFileOptions options = + new IngestExternalFileOptions()) { + assertThat(options.ingestBehind()).isFalse(); + options.setIngestBehind(true); + assertThat(options.ingestBehind()).isTrue(); + } + } + + @Test + public void writeGlobalSeqno() { + try (final IngestExternalFileOptions options = + new IngestExternalFileOptions()) { + assertThat(options.writeGlobalSeqno()).isTrue(); + options.setWriteGlobalSeqno(false); + assertThat(options.writeGlobalSeqno()).isFalse(); + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java index 8092270eb2..577fe2eadf 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java +++ 
b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -48,12 +48,33 @@ public void keyMayExist() throws RocksDBException { assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + // Slice key + StringBuilder builder = new StringBuilder("prefix"); + int offset = builder.toString().length(); + builder.append("slice key 0"); + int len = builder.toString().length() - offset; + builder.append("suffix"); + + byte[] sliceKey = builder.toString().getBytes(); + byte[] sliceValue = "slice value 0".getBytes(); + db.put(sliceKey, offset, len, sliceValue, 0, sliceValue.length); + + retValue = new StringBuilder(); + exists = db.keyMayExist(sliceKey, offset, len, retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); + // Test without column family but with readOptions try (final ReadOptions readOptions = new ReadOptions()) { retValue = new StringBuilder(); exists = db.keyMayExist(readOptions, "key".getBytes(), retValue); assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + + retValue = new StringBuilder(); + exists = db.keyMayExist(readOptions, sliceKey, offset, len, retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); } // Test with column family @@ -63,6 +84,13 @@ public void keyMayExist() throws RocksDBException { assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + // Test slice key with column family + retValue = new StringBuilder(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), sliceKey, offset, len, + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); + // Test with column family and readOptions try (final ReadOptions readOptions = new ReadOptions()) { retValue = new StringBuilder(); @@ -71,11 +99,23 @@ public void keyMayExist() throws RocksDBException { retValue); assertThat(exists).isTrue(); assertThat(retValue.toString()).isEqualTo("value"); + + // Test slice key with column family and read options + retValue = new StringBuilder(); + exists = db.keyMayExist(readOptions, + columnFamilyHandleList.get(0), sliceKey, offset, len, + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString().getBytes()).isEqualTo(sliceValue); } // KeyMayExist in CF1 must return false assertThat(db.keyMayExist(columnFamilyHandleList.get(1), "key".getBytes(), retValue)).isFalse(); + + // slice key + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + sliceKey, 1, 3, retValue)).isFalse(); } finally { for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java new file mode 100644 index 0000000000..73fcc87c32 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MemoryUtilTest.java @@ -0,0 +1,143 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
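+
+// Usage sketch: MemoryUtil.getApproximateMemoryUsageByType takes the DB
+// instances and caches to inspect and returns a Map keyed by MemoryUsageType.
+// A minimal call, assuming an already-open `db` and its block `cache`:
+//
+//   final Map<MemoryUsageType, Long> usage =
+//       MemoryUtil.getApproximateMemoryUsageByType(
+//           Collections.singletonList(db), Collections.singleton(cache));
+//   final long memtableBytes = usage.get(MemoryUsageType.kMemTableTotal);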
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.charset.StandardCharsets; +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MemoryUtilTest { + + private static final String MEMTABLE_SIZE = "rocksdb.size-all-mem-tables"; + private static final String UNFLUSHED_MEMTABLE_SIZE = "rocksdb.cur-size-all-mem-tables"; + private static final String TABLE_READERS = "rocksdb.estimate-table-readers-mem"; + + private final byte[] key = "some-key".getBytes(StandardCharsets.UTF_8); + private final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule public TemporaryFolder dbFolder1 = new TemporaryFolder(); + @Rule public TemporaryFolder dbFolder2 = new TemporaryFolder(); + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType before and after a put + get + */ + @Test + public void getApproximateMemoryUsageByType() throws RocksDBException { + try (final Cache cache = new LRUCache(8 * 1024 * 1024); + final Options options = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache)); + final FlushOptions flushOptions = + new FlushOptions().setWaitForFlush(true); + final RocksDB db = + RocksDB.open(options, dbFolder1.getRoot().getAbsolutePath())) { + + List<RocksDB> dbs = new ArrayList<>(1); + dbs.add(db); + Set<Cache> caches = new HashSet<>(1); + caches.add(cache); + Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(0); + + db.put(key, value); + db.flush(flushOptions); + db.get(key); + + usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isGreaterThan(0); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0); + + } + } + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType with null inputs + */ + @Test + public void getApproximateMemoryUsageByTypeNulls() throws RocksDBException { + Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(null, null); + + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo(null); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isEqualTo(null); + } + + /** + * Test MemoryUtil.getApproximateMemoryUsageByType with two DBs and two caches + */ + 
@Test + public void getApproximateMemoryUsageByTypeMultiple() throws RocksDBException { + try (final Cache cache1 = new LRUCache(1 * 1024 * 1024); + final Options options1 = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache1)); + final RocksDB db1 = + RocksDB.open(options1, dbFolder1.getRoot().getAbsolutePath()); + final Cache cache2 = new LRUCache(1 * 1024 * 1024); + final Options options2 = + new Options() + .setCreateIfMissing(true) + .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache2)); + final RocksDB db2 = + RocksDB.open(options2, dbFolder2.getRoot().getAbsolutePath()); + final FlushOptions flushOptions = + new FlushOptions().setWaitForFlush(true); + + ) { + List<RocksDB> dbs = new ArrayList<>(2); + dbs.add(db1); + dbs.add(db2); + Set<Cache> caches = new HashSet<>(2); + caches.add(cache1); + caches.add(cache2); + + for (RocksDB db: dbs) { + db.put(key, value); + db.flush(flushOptions); + db.get(key); + } + + Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( + db1.getAggregatedLongProperty(MEMTABLE_SIZE) + db2.getAggregatedLongProperty(MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( + db1.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE) + db2.getAggregatedLongProperty(UNFLUSHED_MEMTABLE_SIZE)); + assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( + db1.getAggregatedLongProperty(TABLE_READERS) + db2.getAggregatedLongProperty(TABLE_READERS)); + assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0); + + } + } + +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java index 73b90869cf..5546984761 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MergeTest.java @@ -5,6 +5,7 @@ package org.rocksdb; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; import java.util.ArrayList; @@ -44,6 +45,38 @@ public void stringOption() } } + private byte[] longToByteArray(long l) { + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + buf.putLong(l); + return buf.array(); + } + + private long longFromByteArray(byte[] a) { + ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE); + buf.put(a); + buf.flip(); + return buf.getLong(); + } + + @Test + public void uint64AddOption() + throws InterruptedException, RocksDBException { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperatorName("uint64add"); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + // writing (long)100 under key + db.put("key".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge("key".getBytes(), longToByteArray(1)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + assertThat(longValue).isEqualTo(101); + } + } + @Test public void cFStringOption() throws InterruptedException, RocksDBException { @@ -86,6 +119,48 @@ public void cFStringOption() } } + @Test + public void cFUInt64AddOption() + throws InterruptedException, RocksDBException { + + try (final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperatorName("uint64add"); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperatorName("uint64add") + ) { + final List<ColumnFamilyDescriptor> 
cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt2) + ); + + final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList)) { + try { + // writing (long)100 under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(1)); + + byte[] value = db.get(columnFamilyHandleList.get(1), + "cfkey".getBytes()); + long longValue = longFromByteArray(value); + assertThat(longValue).isEqualTo(101); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.close(); + } + } + } + } + } + @Test public void operatorOption() throws InterruptedException, RocksDBException { @@ -108,6 +183,28 @@ public void operatorOption() } } + @Test + public void uint64AddOperatorOption() + throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + // writing (long)100 under key + db.put("key".getBytes(), longToByteArray(100)); + + // merge (long)1 under key + db.merge("key".getBytes(), longToByteArray(1)); + + final byte[] value = db.get("key".getBytes()); + final long longValue = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + } + } + @Test public void cFOperatorOption() throws InterruptedException, RocksDBException { @@ -170,6 +267,68 @@ public void cFOperatorOption() } } + @Test + public void cFUInt64AddOperatorOption() + throws InterruptedException, RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator(); + final ColumnFamilyOptions cfOpt1 = new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator); + final ColumnFamilyOptions cfOpt2 = new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator) + ) { + final List<ColumnFamilyDescriptor> cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpt1), + new ColumnFamilyDescriptor("new_cf".getBytes(), cfOpt2) + ); + final List<ColumnFamilyHandle> columnFamilyHandleList = new ArrayList<>(); + try (final DBOptions opt = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + columnFamilyHandleList) + ) { + try { + // writing (long)100 under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(100)); + // merge (long)1 under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), longToByteArray(1)); + byte[] value = db.get(columnFamilyHandleList.get(1), + "cfkey".getBytes()); + long longValue = longFromByteArray(value); + + // Test also with createColumnFamily + try (final ColumnFamilyOptions cfHandleOpts = + new ColumnFamilyOptions() + .setMergeOperator(uint64AddOperator); + final ColumnFamilyHandle cfHandle = + db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), + cfHandleOpts)) + ) { + // writing (long)200 under cfkey2 + db.put(cfHandle, "cfkey2".getBytes(), 
longToByteArray(200)); + // merge (long)50 under cfkey2 + db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), + longToByteArray(50)); + value = db.get(cfHandle, "cfkey2".getBytes()); + long longValueTmpCf = longFromByteArray(value); + + assertThat(longValue).isEqualTo(101); + assertThat(longValueTmpCf).isEqualTo(250); + } + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.close(); + } + } + } + } + } + @Test public void operatorGcBehaviour() throws RocksDBException { @@ -182,7 +341,6 @@ public void operatorGcBehaviour() //no-op } - // test reuse try (final Options opt = new Options() .setMergeOperator(stringAppendOperator); @@ -213,6 +371,48 @@ public void operatorGcBehaviour() } } + @Test + public void uint64AddOperatorGcBehaviour() + throws RocksDBException { + try (final UInt64AddOperator uint64AddOperator = new UInt64AddOperator()) { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test reuse + try (final Options opt = new Options() + .setMergeOperator(uint64AddOperator); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test param init + try (final UInt64AddOperator uint64AddOperator2 = new UInt64AddOperator(); + final Options opt = new Options() + .setMergeOperator(uint64AddOperator2); + final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + + // test replace one with another merge operator instance + try (final Options opt = new Options() + .setMergeOperator(uint64AddOperator); + final UInt64AddOperator newUInt64AddOperator = new UInt64AddOperator()) { + opt.setMergeOperator(newUInt64AddOperator); + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath())) { + //no-op + } + } + } + } + @Test public void emptyStringInSetMergeOperatorByName() { try (final Options opt = new Options() diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java new file mode 100644 index 0000000000..1ce3e11775 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/MutableDBOptionsTest.java @@ -0,0 +1,84 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
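+// Round-trips the mutable subset of DBOptions through MutableDBOptionsBuilder, its
+// "key1=value1;key2=value2" string form, and back through MutableDBOptions.parse().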
+package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.MutableDBOptions.MutableDBOptionsBuilder; + +import java.util.NoSuchElementException; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MutableDBOptionsTest { + + @Test + public void builder() { + final MutableDBOptionsBuilder builder = + MutableDBOptions.builder(); + builder + .setBytesPerSync(1024 * 1024 * 7) + .setMaxBackgroundJobs(5) + .setAvoidFlushDuringShutdown(false); + + assertThat(builder.bytesPerSync()).isEqualTo(1024 * 1024 * 7); + assertThat(builder.maxBackgroundJobs()).isEqualTo(5); + assertThat(builder.avoidFlushDuringShutdown()).isEqualTo(false); + } + + @Test(expected = NoSuchElementException.class) + public void builder_getWhenNotSet() { + final MutableDBOptionsBuilder builder = + MutableDBOptions.builder(); + + builder.bytesPerSync(); + } + + @Test + public void builder_build() { + final MutableDBOptions options = MutableDBOptions + .builder() + .setBytesPerSync(1024 * 1024 * 7) + .setMaxBackgroundJobs(5) + .build(); + + assertThat(options.getKeys().length).isEqualTo(2); + assertThat(options.getValues().length).isEqualTo(2); + assertThat(options.getKeys()[0]) + .isEqualTo( + MutableDBOptions.DBOption.bytes_per_sync.name()); + assertThat(options.getValues()[0]).isEqualTo("7340032"); + assertThat(options.getKeys()[1]) + .isEqualTo( + MutableDBOptions.DBOption.max_background_jobs.name()); + assertThat(options.getValues()[1]).isEqualTo("5"); + } + + @Test + public void mutableColumnFamilyOptions_toString() { + final String str = MutableDBOptions + .builder() + .setMaxOpenFiles(99) + .setDelayedWriteRate(789) + .setAvoidFlushDuringShutdown(true) + .build() + .toString(); + + assertThat(str).isEqualTo("max_open_files=99;delayed_write_rate=789;" + + "avoid_flush_during_shutdown=true"); + } + + @Test + public void mutableColumnFamilyOptions_parse() { + final String str = "max_open_files=99;delayed_write_rate=789;" + + "avoid_flush_during_shutdown=true"; + + final MutableDBOptionsBuilder builder = + MutableDBOptions.parse(str); + + assertThat(builder.maxOpenFiles()).isEqualTo(99); + assertThat(builder.delayedWriteRate()).isEqualTo(789); + assertThat(builder.avoidFlushDuringShutdown()).isEqualTo(true); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java new file mode 100644 index 0000000000..d1bdf0f884 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -0,0 +1,92 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
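+// The comparator under test lives in native code: NativeStringComparatorWrapper only supplies
+// the native handle, and iteration order is expected to match plain java.lang.String ordering.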
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.*; +import java.util.Comparator; + +import static org.junit.Assert.assertEquals; + +public class NativeComparatorWrapperTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + private static final Random random = new Random(); + + @Test + public void roundtrip() throws RocksDBException { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + final int ITERATIONS = 1_000; + + final String[] storedKeys = new String[ITERATIONS]; + try (final NativeStringComparatorWrapper comparator = new NativeStringComparatorWrapper(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setComparator(comparator)) { + + // store random string keys + try (final RocksDB db = RocksDB.open(opt, dbPath)) { + for (int i = 0; i < ITERATIONS; i++) { + final String strKey = randomString(); + final byte key[] = strKey.getBytes(); + // does key already exist (avoid duplicates) + if (i > 0 && db.get(key) != null) { + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + storedKeys[i] = strKey; + } + } + } + + // sort the stored keys into ascending alpha-numeric order + Arrays.sort(storedKeys, new Comparator<String>() { + @Override + public int compare(final String o1, final String o2) { + return o1.compareTo(o2); + } + }); + + // re-open db and read from start to end + // string keys should be in ascending + // order + try (final RocksDB db = RocksDB.open(opt, dbPath); + final RocksIterator it = db.newIterator()) { + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final String strKey = new String(it.key()); + assertEquals(storedKeys[count++], strKey); + } + } + } + } + + private String randomString() { + final char[] chars = new char[12]; + for(int i = 0; i < 12; i++) { + final int letterCode = random.nextInt(24); + final char letter = (char) (((int) 'a') + letterCode); + chars[i] = letter; + } + return String.copyValueOf(chars); + } + + public static class NativeStringComparatorWrapper + extends NativeComparatorWrapper { + + @Override + protected long initializeNative(final long... nativeParameterHandles) { + return newStringComparator(); + } + + private native long newStringComparator(); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java new file mode 100644 index 0000000000..519b70b1d2 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionDBTest.java @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
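+// Covers opening an OptimisticTransactionDB with and without column families, each
+// beginTransaction() overload (including reuse of an existing Transaction), and the
+// non-owning base RocksDB handle returned by getBaseDB().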
+ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class OptimisticTransactionDBTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void open() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(otdb).isNotNull(); + } + } + + @Test + public void open_columnFamilies() throws RocksDBException { + try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + + final List<ColumnFamilyDescriptor> columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(); + + try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(dbOptions, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try { + assertThat(otdb).isNotNull(); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void beginTransaction() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_transactionOptions() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions, + optimisticTxnOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_withOld() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + final Transaction txnReused = otdb.beginTransaction(writeOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void beginTransaction_withOld_transactionOptions() + throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions()) { + + try(final Transaction txn = otdb.beginTransaction(writeOptions)) { + final Transaction txnReused = 
otdb.beginTransaction(writeOptions, + optimisticTxnOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void baseDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final OptimisticTransactionDB otdb = OptimisticTransactionDB.open(options, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(otdb).isNotNull(); + final RocksDB db = otdb.getBaseDB(); + assertThat(db).isNotNull(); + assertThat(db.isOwningHandle()).isFalse(); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java new file mode 100644 index 0000000000..4a57e33568 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java @@ -0,0 +1,37 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; +import org.rocksdb.util.DirectBytewiseComparator; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class OptimisticTransactionOptionsTest { + + private static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void setSnapshot() { + try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setSetSnapshot(boolValue); + assertThat(opt.isSetSnapshot()).isEqualTo(boolValue); + } + } + + @Test + public void comparator() { + try (final OptimisticTransactionOptions opt = new OptimisticTransactionOptions(); + final ComparatorOptions copt = new ComparatorOptions(); + final DirectComparator comparator = new DirectBytewiseComparator(copt)) { + opt.setComparator(comparator); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java new file mode 100644 index 0000000000..f44816e64b --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptimisticTransactionTest.java @@ -0,0 +1,350 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
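+// Conflict pattern used throughout: txn3 reads a key with getForUpdate(), txn2 then writes
+// the same key and commits, so txn3's commit must fail with Status.Code.Busy unless the
+// read was first released via undoGetForUpdate().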
+ +package org.rocksdb; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +public class OptimisticTransactionTest extends AbstractTransactionTest { + + @Test + public void getForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, k1, v12); + assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + " transactions"); + } + } + + @Test + public void getForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(k1, v12); + assertThat(txn2.get(readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! 
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + " transactions"); + } + } + + @Test + public void multiGetForUpdate_cf_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List<ColumnFamilyHandle> cfList = Arrays.asList(testCf, testCf); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, keys[0], otherValue); + assertThat(txn2.get(testCf, readOptions, keys[0])) + .isEqualTo(otherValue); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + " transactions"); + } + } + + @Test + public void multiGetForUpdate_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(keys[0], otherValue); + assertThat(txn2.get(readOptions, keys[0])) + .isEqualTo(otherValue); + txn2.commit(); + + try { + txn3.commit(); // should cause an exception! 
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.Busy); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + " transactions"); + } + } + + @Test + public void undoGetForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.get(testCf, readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // undo the getForUpdate + txn3.undoGetForUpdate(testCf, k1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(testCf, k1, v12); + assertThat(txn2.get(testCf, readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + // should not cause an exception + // because we undid the getForUpdate above! + txn3.commit(); + } + } + } + } + + @Test + public void undoGetForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.get(readOptions, k1)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // undo the getForUpdate + txn3.undoGetForUpdate(k1); + + // NOTE: txn2 updates k1, during txn3 + txn2.put(k1, v12); + assertThat(txn2.get(readOptions, k1)).isEqualTo(v12); + txn2.commit(); + + // should not cause an exception + // because we undid the getForUpdate above! 
+ txn3.commit(); + } + } + } + } + + @Test + public void name() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getName()).isEmpty(); + final String name = "my-transaction-" + rand.nextLong(); + + try { + txn.setName(name); + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.InvalidArgument); + return; + } + + fail("Optimistic transactions cannot be named."); + } + } + + @Override + public OptimisticTransactionDBContainer startDb() + throws RocksDBException { + final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + + final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); + final List<ColumnFamilyDescriptor> columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, + columnFamilyOptions)); + final List<ColumnFamilyHandle> columnFamilyHandles = new ArrayList<>(); + + final OptimisticTransactionDB optimisticTxnDb; + try { + optimisticTxnDb = OptimisticTransactionDB.open( + options, dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles); + } catch(final RocksDBException e) { + columnFamilyOptions.close(); + options.close(); + throw e; + } + + final WriteOptions writeOptions = new WriteOptions(); + final OptimisticTransactionOptions optimisticTxnOptions = + new OptimisticTransactionOptions(); + + return new OptimisticTransactionDBContainer(optimisticTxnOptions, + writeOptions, columnFamilyHandles, optimisticTxnDb, columnFamilyOptions, + options); + } + + private static class OptimisticTransactionDBContainer + extends DBContainer { + + private final OptimisticTransactionOptions optimisticTxnOptions; + private final OptimisticTransactionDB optimisticTxnDb; + + public OptimisticTransactionDBContainer( + final OptimisticTransactionOptions optimisticTxnOptions, + final WriteOptions writeOptions, + final List<ColumnFamilyHandle> columnFamilyHandles, + final OptimisticTransactionDB optimisticTxnDb, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + super(writeOptions, columnFamilyHandles, columnFamilyOptions, + options); + this.optimisticTxnOptions = optimisticTxnOptions; + this.optimisticTxnDb = optimisticTxnDb; + } + + @Override + public Transaction beginTransaction() { + return optimisticTxnDb.beginTransaction(writeOptions, + optimisticTxnOptions); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return optimisticTxnDb.beginTransaction(writeOptions, + optimisticTxnOptions); + } + + @Override + public void close() { + optimisticTxnOptions.close(); + writeOptions.close(); + for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + columnFamilyHandle.close(); + } + optimisticTxnDb.close(); + options.close(); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java index 6afcab3300..e27a33d7df 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsTest.java @@ -6,13 +6,11 @@ package org.rocksdb; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Random; +import java.util.*; import org.junit.ClassRule; import org.junit.Test; +import 
org.rocksdb.test.RemoveEmptyValueCompactionFilterFactory; import static org.assertj.core.api.Assertions.assertThat; @@ -26,6 +24,18 @@ public class OptionsTest { public static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); + @Test + public void copyConstructor() { + Options origOpts = new Options(); + origOpts.setNumLevels(rand.nextInt(8)); + origOpts.setTargetFileSizeMultiplier(rand.nextInt(100)); + origOpts.setLevel0StopWritesTrigger(rand.nextInt(50)); + Options copyOpts = new Options(origOpts); + assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels()); + assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier()); + assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger()); + } + @Test public void setIncreaseParallelism() { try (final Options opt = new Options()) { @@ -458,6 +468,15 @@ public void maxBackgroundFlushes() { } } + @Test + public void maxBackgroundJobs() { + try (final Options opt = new Options()) { + final int intValue = rand.nextInt(); + opt.setMaxBackgroundJobs(intValue); + assertThat(opt.maxBackgroundJobs()).isEqualTo(intValue); + } + } + @Test public void maxLogFileSize() throws RocksDBException { try (final Options opt = new Options()) { @@ -624,6 +643,26 @@ public void dbWriteBufferSize() { } } + @Test + public void setWriteBufferManager() throws RocksDBException { + try (final Options opt = new Options(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000L, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + + @Test + public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { + try (final Options opt = new Options(); + final Cache cache = new LRUCache(1 * 1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0L, cache)) { + opt.setWriteBufferManager(writeBufferManager); + assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); + } + } + @Test public void accessHintOnCompactionStart() { try (final Options opt = new Options()) { @@ -714,6 +753,15 @@ public void delayedWriteRate() { } } + @Test + public void enablePipelinedWrite() { + try(final Options opt = new Options()) { + assertThat(opt.enablePipelinedWrite()).isFalse(); + opt.setEnablePipelinedWrite(true); + assertThat(opt.enablePipelinedWrite()).isTrue(); + } + } + @Test public void allowConcurrentMemtableWrite() { try (final Options opt = new Options()) { @@ -795,6 +843,38 @@ public void rowCache() { } } + @Test + public void walFilter() { + try (final Options opt = new Options()) { + assertThat(opt.walFilter()).isNull(); + + try (final AbstractWalFilter walFilter = new AbstractWalFilter() { + @Override + public void columnFamilyLogNumberMap( + final Map<Integer, Long> cfLognumber, + final Map<String, Integer> cfNameId) { + // no-op + } + + @Override + public LogRecordFoundResult logRecordFound(final long logNumber, + final String logFileName, final WriteBatch batch, + final WriteBatch newBatch) { + return new LogRecordFoundResult( + WalProcessingOption.CONTINUE_PROCESSING, false); + } + + @Override + public String name() { + return "test-wal-filter"; + } + }) { + opt.setWalFilter(walFilter); + assertThat(opt.walFilter()).isEqualTo(walFilter); + } + } + } + @Test public void failIfOptionsFileError() { try (final Options opt = new Options()) { @@ -831,6 +911,52 @@ public void avoidFlushDuringShutdown() { } } + 
+ @Test + public void allowIngestBehind() { + try (final Options opt = new Options()) { + assertThat(opt.allowIngestBehind()).isFalse(); + opt.setAllowIngestBehind(true); + assertThat(opt.allowIngestBehind()).isTrue(); + } + } + + @Test + public void preserveDeletes() { + try (final Options opt = new Options()) { + assertThat(opt.preserveDeletes()).isFalse(); + opt.setPreserveDeletes(true); + assertThat(opt.preserveDeletes()).isTrue(); + } + } + + @Test + public void twoWriteQueues() { + try (final Options opt = new Options()) { + assertThat(opt.twoWriteQueues()).isFalse(); + opt.setTwoWriteQueues(true); + assertThat(opt.twoWriteQueues()).isTrue(); + } + } + + @Test + public void manualWalFlush() { + try (final Options opt = new Options()) { + assertThat(opt.manualWalFlush()).isFalse(); + opt.setManualWalFlush(true); + assertThat(opt.manualWalFlush()).isTrue(); + } + } + + @Test + public void atomicFlush() { + try (final Options opt = new Options()) { + assertThat(opt.atomicFlush()).isFalse(); + opt.setAtomicFlush(true); + assertThat(opt.atomicFlush()).isTrue(); + } + } + @Test public void env() { try (final Options options = new Options(); @@ -924,6 +1050,20 @@ public void bottommostCompressionType() { } } + @Test + public void bottommostCompressionOptions() { + try (final Options options = new Options(); + final CompressionOptions bottommostCompressionOptions = new CompressionOptions() + .setMaxDictBytes(123)) { + + options.setBottommostCompressionOptions(bottommostCompressionOptions); + assertThat(options.bottommostCompressionOptions()) + .isEqualTo(bottommostCompressionOptions); + assertThat(options.bottommostCompressionOptions().maxDictBytes()) + .isEqualTo(123); + } + } + @Test public void compressionOptions() { try (final Options options = new Options(); @@ -978,6 +1118,15 @@ public void rateLimiter() { } } + @Test + public void sstFileManager() throws RocksDBException { + try (final Options options = new Options(); + final SstFileManager sstFileManager = + new SstFileManager(Env.getDefault())) { + options.setSstFileManager(sstFileManager); + } + } + @Test public void shouldSetTestPrefixExtractor() { try (final Options options = new Options()) { @@ -1057,6 +1206,15 @@ public void reportBgIoStats() { } } + @Test + public void ttl() { + try (final Options options = new Options()) { + options.setTtl(1000 * 60); + assertThat(options.ttl()). 
+ isEqualTo(1000 * 60); + } + } + @Test public void compactionOptionsUniversal() { try (final Options options = new Options(); @@ -1092,4 +1250,23 @@ public void forceConsistencyChecks() { isEqualTo(booleanValue); } } + + @Test + public void compactionFilter() { + try(final Options options = new Options(); + final RemoveEmptyValueCompactionFilter cf = new RemoveEmptyValueCompactionFilter()) { + options.setCompactionFilter(cf); + assertThat(options.compactionFilter()).isEqualTo(cf); + } + } + + @Test + public void compactionFilterFactory() { + try(final Options options = new Options(); + final RemoveEmptyValueCompactionFilterFactory cff = new RemoveEmptyValueCompactionFilterFactory()) { + options.setCompactionFilterFactory(cff); + assertThat(options.compactionFilterFactory()).isEqualTo(cff); + } + } + } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java new file mode 100644 index 0000000000..e79951aa85 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/OptionsUtilTest.java @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; + +public class OptionsUtilTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + enum TestAPI { LOAD_LATEST_OPTIONS, LOAD_OPTIONS_FROM_FILE } + + @Test + public void loadLatestOptions() throws RocksDBException { + verifyOptions(TestAPI.LOAD_LATEST_OPTIONS); + } + + @Test + public void loadOptionsFromFile() throws RocksDBException { + verifyOptions(TestAPI.LOAD_OPTIONS_FROM_FILE); + } + + @Test + public void getLatestOptionsFileName() throws RocksDBException { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final Options options = new Options().setCreateIfMissing(true); + final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db).isNotNull(); + } + + String fName = OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + assertThat(fName).isNotNull(); + assert(fName.startsWith("OPTIONS-") == true); + // System.out.println("latest options fileName: " + fName); + } + + private void verifyOptions(TestAPI apiType) throws RocksDBException { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + final Options options = new Options() + .setCreateIfMissing(true) + .setParanoidChecks(false) + .setMaxOpenFiles(478) + .setDelayedWriteRate(1234567L); + final ColumnFamilyOptions baseDefaultCFOpts = new ColumnFamilyOptions(); + final byte[] secondCFName = "new_cf".getBytes(); + final ColumnFamilyOptions baseSecondCFOpts = + new ColumnFamilyOptions() + .setWriteBufferSize(70 * 1024) + .setMaxWriteBufferNumber(7) + .setMaxBytesForLevelBase(53 * 1024 * 1024) + .setLevel0FileNumCompactionTrigger(3) + .setLevel0SlowdownWritesTrigger(51) + .setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION); + + // Create a database with a new column family + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db).isNotNull(); + + // 
create column family + try (final ColumnFamilyHandle columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(secondCFName, baseSecondCFOpts))) { + assert(columnFamilyHandle != null); + } + } + + // Read the options back and verify + DBOptions dbOptions = new DBOptions(); + final List<ColumnFamilyDescriptor> cfDescs = new ArrayList<>(); + String path = dbPath; + if (apiType == TestAPI.LOAD_LATEST_OPTIONS) { + OptionsUtil.loadLatestOptions(path, Env.getDefault(), dbOptions, cfDescs, false); + } else if (apiType == TestAPI.LOAD_OPTIONS_FROM_FILE) { + path = dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + OptionsUtil.loadOptionsFromFile(path, Env.getDefault(), dbOptions, cfDescs, false); + } + + assertThat(dbOptions.createIfMissing()).isEqualTo(options.createIfMissing()); + assertThat(dbOptions.paranoidChecks()).isEqualTo(options.paranoidChecks()); + assertThat(dbOptions.maxOpenFiles()).isEqualTo(options.maxOpenFiles()); + assertThat(dbOptions.delayedWriteRate()).isEqualTo(options.delayedWriteRate()); + + assertThat(cfDescs.size()).isEqualTo(2); + assertThat(cfDescs.get(0)).isNotNull(); + assertThat(cfDescs.get(1)).isNotNull(); + assertThat(cfDescs.get(0).columnFamilyName()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY); + assertThat(cfDescs.get(1).columnFamilyName()).isEqualTo(secondCFName); + + ColumnFamilyOptions defaultCFOpts = cfDescs.get(0).columnFamilyOptions(); + assertThat(defaultCFOpts.writeBufferSize()).isEqualTo(baseDefaultCFOpts.writeBufferSize()); + assertThat(defaultCFOpts.maxWriteBufferNumber()) + .isEqualTo(baseDefaultCFOpts.maxWriteBufferNumber()); + assertThat(defaultCFOpts.maxBytesForLevelBase()) + .isEqualTo(baseDefaultCFOpts.maxBytesForLevelBase()); + assertThat(defaultCFOpts.level0FileNumCompactionTrigger()) + .isEqualTo(baseDefaultCFOpts.level0FileNumCompactionTrigger()); + assertThat(defaultCFOpts.level0SlowdownWritesTrigger()) + .isEqualTo(baseDefaultCFOpts.level0SlowdownWritesTrigger()); + assertThat(defaultCFOpts.bottommostCompressionType()) + .isEqualTo(baseDefaultCFOpts.bottommostCompressionType()); + + ColumnFamilyOptions secondCFOpts = cfDescs.get(1).columnFamilyOptions(); + assertThat(secondCFOpts.writeBufferSize()).isEqualTo(baseSecondCFOpts.writeBufferSize()); + assertThat(secondCFOpts.maxWriteBufferNumber()) + .isEqualTo(baseSecondCFOpts.maxWriteBufferNumber()); + assertThat(secondCFOpts.maxBytesForLevelBase()) + .isEqualTo(baseSecondCFOpts.maxBytesForLevelBase()); + assertThat(secondCFOpts.level0FileNumCompactionTrigger()) + .isEqualTo(baseSecondCFOpts.level0FileNumCompactionTrigger()); + assertThat(secondCFOpts.level0SlowdownWritesTrigger()) + .isEqualTo(baseSecondCFOpts.level0SlowdownWritesTrigger()); + assertThat(secondCFOpts.bottommostCompressionType()) + .isEqualTo(baseSecondCFOpts.bottommostCompressionType()); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java index 27567e89d1..c78f9876e6 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RateLimiterTest.java @@ -8,6 +8,7 @@ import org.junit.Test; import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.RateLimiter.*; public class RateLimiterTest { @@ -16,17 +17,21 @@ public class RateLimiterTest { new RocksMemoryResource(); @Test - public void setBytesPerSecond() { + public void bytesPerSecond() { try(final RateLimiter rateLimiter = - new 
RateLimiter(1000, 100 * 1000, 1)) { + new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS, + DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) { + assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0); rateLimiter.setBytesPerSecond(2000); + assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0); } } @Test public void getSingleBurstBytes() { try(final RateLimiter rateLimiter = - new RateLimiter(1000, 100 * 1000, 1)) { + new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS, + DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) { assertThat(rateLimiter.getSingleBurstBytes()).isEqualTo(100); } } @@ -34,7 +39,8 @@ public void getSingleBurstBytes() { @Test public void getTotalBytesThrough() { try(final RateLimiter rateLimiter = - new RateLimiter(1000, 100 * 1000, 1)) { + new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS, + DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) { assertThat(rateLimiter.getTotalBytesThrough()).isEqualTo(0); } } @@ -42,8 +48,18 @@ public void getTotalBytesThrough() { @Test public void getTotalRequests() { try(final RateLimiter rateLimiter = - new RateLimiter(1000, 100 * 1000, 1)) { + new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS, + DEFAULT_FAIRNESS, DEFAULT_MODE, DEFAULT_AUTOTUNE)) { assertThat(rateLimiter.getTotalRequests()).isEqualTo(0); } } + + @Test + public void autoTune() { + try(final RateLimiter rateLimiter = + new RateLimiter(1000, DEFAULT_REFILL_PERIOD_MICROS, + DEFAULT_FAIRNESS, DEFAULT_MODE, true)) { + assertThat(rateLimiter.getBytesPerSecond()).isGreaterThan(0); + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java index da048c4431..9708cd0b1f 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -5,6 +5,7 @@ package org.rocksdb; +import java.util.Arrays; import java.util.Random; import org.junit.ClassRule; @@ -23,6 +24,30 @@ public class ReadOptionsTest { @Rule public ExpectedException exception = ExpectedException.none(); + @Test + public void altConstructor() { + try (final ReadOptions opt = new ReadOptions(true, true)) { + assertThat(opt.verifyChecksums()).isTrue(); + assertThat(opt.fillCache()).isTrue(); + } + } + + @Test + public void copyConstructor() { + try (final ReadOptions opt = new ReadOptions()) { + opt.setVerifyChecksums(false); + opt.setFillCache(false); + opt.setIterateUpperBound(buildRandomSlice()); + opt.setIterateLowerBound(buildRandomSlice()); + try (final ReadOptions other = new ReadOptions(opt)) { + assertThat(opt.verifyChecksums()).isEqualTo(other.verifyChecksums()); + assertThat(opt.fillCache()).isEqualTo(other.fillCache()); + assertThat(Arrays.equals(opt.iterateUpperBound().data(), other.iterateUpperBound().data())).isTrue(); + assertThat(Arrays.equals(opt.iterateLowerBound().data(), other.iterateLowerBound().data())).isTrue(); + } + } + } + @Test public void verifyChecksum() { try (final ReadOptions opt = new ReadOptions()) { @@ -127,6 +152,56 @@ public void ignoreRangeDeletions() { } } + @Test + public void iterateUpperBound() { + try (final ReadOptions opt = new ReadOptions()) { + Slice upperBound = buildRandomSlice(); + opt.setIterateUpperBound(upperBound); + assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); + } + } + + @Test + public void iterateUpperBoundNull() { + try (final ReadOptions opt = new ReadOptions()) { + 
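+ // a newly constructed ReadOptions carries no upper bound, so the getter returns null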
assertThat(opt.iterateUpperBound()).isNull(); + } + } + + @Test + public void iterateLowerBound() { + try (final ReadOptions opt = new ReadOptions()) { + Slice lowerBound = buildRandomSlice(); + opt.setIterateLowerBound(lowerBound); + assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); + } + } + + @Test + public void iterateLowerBoundNull() { + try (final ReadOptions opt = new ReadOptions()) { + assertThat(opt.iterateLowerBound()).isNull(); + } + } + + @Test + public void tableFilter() { + try (final ReadOptions opt = new ReadOptions(); + final AbstractTableFilter allTablesFilter = new AllTablesFilter()) { + opt.setTableFilter(allTablesFilter); + } + } + + @Test + public void iterStartSeqnum() { + try (final ReadOptions opt = new ReadOptions()) { + assertThat(opt.iterStartSeqnum()).isEqualTo(0); + + opt.setIterStartSeqnum(10); + assertThat(opt.iterStartSeqnum()).isEqualTo(10); + } + } + @Test public void failSetVerifyChecksumUninitialized() { try (final ReadOptions readOptions = @@ -191,6 +266,38 @@ public void failSnapshotUninitialized() { } } + @Test + public void failSetIterateUpperBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setIterateUpperBound(null); + } + } + + @Test + public void failIterateUpperBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.iterateUpperBound(); + } + } + + @Test + public void failSetIterateLowerBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.setIterateLowerBound(null); + } + } + + @Test + public void failIterateLowerBoundUninitialized() { + try (final ReadOptions readOptions = + setupUninitializedReadOptions(exception)) { + readOptions.iterateLowerBound(); + } + } + private ReadOptions setupUninitializedReadOptions( ExpectedException exception) { final ReadOptions readOptions = new ReadOptions(); @@ -198,4 +305,18 @@ private ReadOptions setupUninitializedReadOptions( exception.expect(AssertionError.class); return readOptions; } + + private Slice buildRandomSlice() { + final Random rand = new Random(); + byte[] sliceBytes = new byte[rand.nextInt(100) + 1]; + rand.nextBytes(sliceBytes); + return new Slice(sliceBytes); + } + + private static class AllTablesFilter extends AbstractTableFilter { + @Override + public boolean filter(final TableProperties tableProperties) { + return true; + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java index 89894746d2..a7d7fee14f 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -4,13 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). 
package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; +import org.junit.*; +import org.junit.rules.ExpectedException; import org.junit.rules.TemporaryFolder; +import java.nio.ByteBuffer; import java.util.*; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.fail; @@ -58,6 +59,130 @@ public void openWhenOpen() throws RocksDBException { } } + @Test + public void createColumnFamily() throws RocksDBException { + final byte[] col1Name = "col1".getBytes(UTF_8); + + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions() + ) { + try (final ColumnFamilyHandle col1 = + db.createColumnFamily(new ColumnFamilyDescriptor(col1Name, cfOpts))) { + assertThat(col1).isNotNull(); + assertThat(col1.getName()).isEqualTo(col1Name); + } + } + + final List<ColumnFamilyHandle> cfHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name)), + cfHandles)) { + try { + assertThat(cfHandles.size()).isEqualTo(2); + assertThat(cfHandles.get(1)).isNotNull(); + assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name); + } finally { + for (final ColumnFamilyHandle cfHandle : + cfHandles) { + cfHandle.close(); + } + } + } + } + + + @Test + public void createColumnFamilies() throws RocksDBException { + final byte[] col1Name = "col1".getBytes(UTF_8); + final byte[] col2Name = "col2".getBytes(UTF_8); + + List<ColumnFamilyHandle> cfHandles; + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions() + ) { + cfHandles = + db.createColumnFamilies(cfOpts, Arrays.asList(col1Name, col2Name)); + try { + assertThat(cfHandles).isNotNull(); + assertThat(cfHandles.size()).isEqualTo(2); + assertThat(cfHandles.get(0).getName()).isEqualTo(col1Name); + assertThat(cfHandles.get(1).getName()).isEqualTo(col2Name); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + + cfHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name), + new ColumnFamilyDescriptor(col2Name)), + cfHandles)) { + try { + assertThat(cfHandles.size()).isEqualTo(3); + assertThat(cfHandles.get(1)).isNotNull(); + assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name); + assertThat(cfHandles.get(2)).isNotNull(); + assertThat(cfHandles.get(2).getName()).isEqualTo(col2Name); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + + @Test + public void createColumnFamiliesfromDescriptors() throws RocksDBException { + final byte[] col1Name = "col1".getBytes(UTF_8); + final byte[] col2Name = "col2".getBytes(UTF_8); + + List<ColumnFamilyHandle> cfHandles; + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions() + ) { + cfHandles = + db.createColumnFamilies(Arrays.asList( + new ColumnFamilyDescriptor(col1Name, cfOpts), + new ColumnFamilyDescriptor(col2Name, cfOpts))); + try { + assertThat(cfHandles).isNotNull(); + assertThat(cfHandles.size()).isEqualTo(2); + assertThat(cfHandles.get(0).getName()).isEqualTo(col1Name); + 
assertThat(cfHandles.get(1).getName()).isEqualTo(col2Name); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + + cfHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name), + new ColumnFamilyDescriptor(col2Name)), + cfHandles)) { + try { + assertThat(cfHandles.size()).isEqualTo(3); + assertThat(cfHandles.get(1)).isNotNull(); + assertThat(cfHandles.get(1).getName()).isEqualTo(col1Name); + assertThat(cfHandles.get(2)).isNotNull(); + assertThat(cfHandles.get(2).getName()).isEqualTo(col2Name); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + @Test public void put() throws RocksDBException { try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); @@ -68,6 +193,57 @@ public void put() throws RocksDBException { "value".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "12345678".getBytes()); + + + // put + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + Segment value0 = sliceSegment("value 0"); + Segment value1 = sliceSegment("value 1"); + db.put(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); + db.put(opt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); + + // compare + Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len))); + Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len))); + } + } + + private static Segment sliceSegment(String key) { + ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); + rawKey.put((byte)0); + rawKey.put((byte)0); + rawKey.put(key.getBytes()); + + return new Segment(rawKey.array(), 2, key.length()); + } + + private static class Segment { + final byte[] data; + final int offset; + final int len; + + public boolean isSamePayload(byte[] value) { + if (value == null) { + return false; + } + if (value.length != len) { + return false; + } + + for (int i = 0; i < value.length; i++) { + if (data[i + offset] != value[i]) { + return false; + } + } + + return true; + } + + public Segment(byte[] value, int offset, int len) { + this.data = value; + this.offset = offset; + this.len = len; } } @@ -143,6 +319,39 @@ public void getWithOutValueReadOptions() throws RocksDBException { } } + @Rule + public ExpectedException thrown = ExpectedException.none(); + + @Test + public void getOutOfArrayMaxSizeValue() throws RocksDBException { + final int numberOfValueSplits = 10; + final int splitSize = Integer.MAX_VALUE / numberOfValueSplits; + + Runtime runtime = Runtime.getRuntime(); + long neededMemory = ((long)(splitSize)) * (((long)numberOfValueSplits) + 3); + boolean isEnoughMemory = runtime.maxMemory() - runtime.totalMemory() > neededMemory; + Assume.assumeTrue(isEnoughMemory); + + final byte[] valueSplit = new byte[splitSize]; + final byte[] key = "key".getBytes(); + + thrown.expect(RocksDBException.class); + thrown.expectMessage("Requested array size exceeds VM limit"); + + // merge (numberOfValueSplits + 1) valueSplit's to get value size exceeding Integer.MAX_VALUE + try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(); + final Options opt = new Options() + .setCreateIfMissing(true) + .setMergeOperator(stringAppendOperator); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { + 
db.put(key, valueSplit); + for (int i = 0; i < numberOfValueSplits; i++) { + db.merge(key, valueSplit); + } + db.get(key); + } + } + @Test public void multiGet() throws RocksDBException, InterruptedException { try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); @@ -182,6 +391,41 @@ public void multiGet() throws RocksDBException, InterruptedException { } } + @Test + public void multiGetAsList() throws RocksDBException, InterruptedException { + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final ReadOptions rOpt = new ReadOptions()) { + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + List lookupKeys = new ArrayList<>(); + lookupKeys.add("key1".getBytes()); + lookupKeys.add("key2".getBytes()); + List results = db.multiGetAsList(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results).hasSize(lookupKeys.size()); + assertThat(results). + containsExactly("value".getBytes(), "12345678".getBytes()); + // test same method with ReadOptions + results = db.multiGetAsList(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results). + contains("value".getBytes(), "12345678".getBytes()); + + // remove existing key + lookupKeys.remove(1); + // add non existing key + lookupKeys.add("key3".getBytes()); + results = db.multiGetAsList(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results). + containsExactly("value".getBytes(), null); + // test same call with readOptions + results = db.multiGetAsList(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results).contains("value".getBytes()); + } + } + @Test public void merge() throws RocksDBException { try (final StringAppendOperator stringAppendOperator = new StringAppendOperator(); @@ -207,6 +451,18 @@ public void merge() throws RocksDBException { db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo( "xxxx".getBytes()); + + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + Segment value0 = sliceSegment("value 0"); + Segment value1 = sliceSegment("value 1"); + + db.merge(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); + db.merge(wOpt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); + + // compare + Assert.assertTrue(value0.isSamePayload(db.get(key3.data, key3.offset, key3.len))); + Assert.assertTrue(value1.isSamePayload(db.get(key4.data, key4.offset, key4.len))); } } @@ -224,6 +480,18 @@ public void delete() throws RocksDBException { db.delete(wOpt, "key2".getBytes()); assertThat(db.get("key1".getBytes())).isNull(); assertThat(db.get("key2".getBytes())).isNull(); + + + Segment key3 = sliceSegment("key3"); + Segment key4 = sliceSegment("key4"); + db.put("key3".getBytes(), "key3 value".getBytes()); + db.put("key4".getBytes(), "key4 value".getBytes()); + + db.delete(key3.data, key3.offset, key3.len); + db.delete(wOpt, key4.data, key4.offset, key4.len); + + assertThat(db.get("key3".getBytes())).isNull(); + assertThat(db.get("key4".getBytes())).isNull(); } } @@ -257,8 +525,7 @@ public void singleDelete_nonExisting() throws RocksDBException { @Test public void deleteRange() throws RocksDBException { - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); - final WriteOptions wOpt = new WriteOptions()) { + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), 
"12345678".getBytes()); db.put("key3".getBytes(), "abcdefg".getBytes()); @@ -763,4 +1030,525 @@ public void setOptions() throws RocksDBException { } } } + + @Test + public void destroyDB() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put("key1".getBytes(), "value".getBytes()); + } + assertThat(dbFolder.getRoot().exists()).isTrue(); + RocksDB.destroyDB(dbPath, options); + assertThat(dbFolder.getRoot().exists()).isFalse(); + } + } + + @Test(expected = RocksDBException.class) + public void destroyDBFailIfOpen() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + // Fails as the db is open and locked. + RocksDB.destroyDB(dbPath, options); + } + } + } + + @Ignore("This test crashes. Re-enable after fixing.") + @Test + public void getApproximateSizes() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + db.put(key2, key2); + db.put(key3, key3); + + final long[] sizes = db.getApproximateSizes( + Arrays.asList( + new Range(new Slice(key1), new Slice(key2)), + new Range(new Slice(key2), new Slice(key3)) + ), + SizeApproximationFlag.INCLUDE_FILES, + SizeApproximationFlag.INCLUDE_MEMTABLES); + + assertThat(sizes.length).isEqualTo(2); + assertThat(sizes[0]).isEqualTo(0); + assertThat(sizes[1]).isGreaterThanOrEqualTo(1); + } + } + } + + @Test + public void getApproximateMemTableStats() throws RocksDBException { + final byte key1[] = "key1".getBytes(UTF_8); + final byte key2[] = "key2".getBytes(UTF_8); + final byte key3[] = "key3".getBytes(UTF_8); + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put(key1, key1); + db.put(key2, key2); + db.put(key3, key3); + + final RocksDB.CountAndSize stats = + db.getApproximateMemTableStats( + new Range(new Slice(key1), new Slice(key3))); + + assertThat(stats).isNotNull(); + assertThat(stats.count).isGreaterThan(1); + assertThat(stats.size).isGreaterThan(1); + } + } + } + + @Ignore("TODO(AR) re-enable when ready!") + @Test + public void compactFiles() throws RocksDBException { + final int kTestKeySize = 16; + final int kTestValueSize = 984; + final int kEntrySize = kTestKeySize + kTestValueSize; + final int kEntriesPerBuffer = 100; + final int writeBufferSize = kEntrySize * kEntriesPerBuffer; + final byte[] cfName = "pikachu".getBytes(UTF_8); + + try (final Options options = new Options() + .setCreateIfMissing(true) + .setWriteBufferSize(writeBufferSize) + .setCompactionStyle(CompactionStyle.LEVEL) + .setTargetFileSizeBase(writeBufferSize) + .setMaxBytesForLevelBase(writeBufferSize * 2) + .setLevel0StopWritesTrigger(2) + .setMaxBytesForLevelMultiplier(2) + .setCompressionType(CompressionType.NO_COMPRESSION) + .setMaxSubcompactions(4)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath); + final 
ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(options)) { + db.createColumnFamily(new ColumnFamilyDescriptor(cfName, + cfOptions)).close(); + } + + try (final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(options)) { + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOptions), + new ColumnFamilyDescriptor(cfName, cfOptions) + ); + final List cfHandles = new ArrayList<>(); + try (final DBOptions dbOptions = new DBOptions(options); + final RocksDB db = RocksDB.open(dbOptions, dbPath, cfDescriptors, + cfHandles); + ) { + try (final FlushOptions flushOptions = new FlushOptions() + .setWaitForFlush(true) + .setAllowWriteStall(true); + final CompactionOptions compactionOptions = new CompactionOptions()) { + final Random rnd = new Random(301); + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + final byte[] value = new byte[kTestValueSize]; + rnd.nextBytes(value); + db.put(cfHandles.get(1), Integer.toString(key).getBytes(UTF_8), + value); + } + db.flush(flushOptions, cfHandles); + + final RocksDB.LiveFiles liveFiles = db.getLiveFiles(); + final List compactedFiles = + db.compactFiles(compactionOptions, cfHandles.get(1), + liveFiles.files, 1, -1, null); + assertThat(compactedFiles).isNotEmpty(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + } + + @Test + public void enableAutoCompaction() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true)) { + final List cfDescs = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) + ); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + try { + db.enableAutoCompaction(cfHandles); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void numberLevels() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db.numberLevels()).isEqualTo(7); + } + } + } + + @Test + public void maxMemCompactionLevel() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db.maxMemCompactionLevel()).isEqualTo(0); + } + } + } + + @Test + public void level0StopWriteTrigger() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db.level0StopWriteTrigger()).isEqualTo(36); + } + } + } + + @Test + public void getName() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db.getName()).isEqualTo(dbPath); + } + } + } + + @Test + public void getEnv() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, 
dbPath)) { + assertThat(db.getEnv()).isEqualTo(Env.getDefault()); + } + } + } + + @Test + public void flush() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath); + final FlushOptions flushOptions = new FlushOptions()) { + db.flush(flushOptions); + } + } + } + + @Test + public void flushWal() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.flushWal(true); + } + } + } + + @Test + public void syncWal() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.syncWal(); + } + } + } + + @Test + public void setPreserveDeletesSequenceNumber() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db.setPreserveDeletesSequenceNumber(db.getLatestSequenceNumber())) + .isFalse(); + } + } + } + + @Test + public void getLiveFiles() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); + assertThat(livefiles).isNotNull(); + assertThat(livefiles.manifestFileSize).isEqualTo(13); + assertThat(livefiles.files.size()).isEqualTo(3); + assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); + assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001"); + assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005"); + } + } + } + + @Test + public void getSortedWalFiles() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + final List logFiles = db.getSortedWalFiles(); + assertThat(logFiles).isNotNull(); + assertThat(logFiles.size()).isEqualTo(1); + assertThat(logFiles.get(0).type()) + .isEqualTo(WalFileType.kAliveLogFile); + } + } + } + + @Test + public void deleteFile() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.deleteFile("unknown"); + } + } + } + + @Test + public void getLiveFilesMetaData() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + final List liveFilesMetaData + = db.getLiveFilesMetaData(); + assertThat(liveFilesMetaData).isEmpty(); + } + } + } + + @Test + public void getColumnFamilyMetaData() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true)) { + final List cfDescs = Arrays.asList( + new 
ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) + ); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + try { + final ColumnFamilyMetaData cfMetadata = + db.getColumnFamilyMetaData(cfHandles.get(0)); + assertThat(cfMetadata).isNotNull(); + assertThat(cfMetadata.name()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY); + assertThat(cfMetadata.levels().size()).isEqualTo(7); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void verifyChecksum() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.verifyChecksum(); + } + } + } + + @Test + public void getPropertiesOfAllTables() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true)) { + final List cfDescs = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) + ); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + try { + final Map properties = + db.getPropertiesOfAllTables(cfHandles.get(0)); + assertThat(properties).isNotNull(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void getPropertiesOfTablesInRange() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true)) { + final List cfDescs = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) + ); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + try { + final Range range = new Range( + new Slice("key1".getBytes(UTF_8)), + new Slice("key3".getBytes(UTF_8))); + final Map properties = + db.getPropertiesOfTablesInRange( + cfHandles.get(0), Arrays.asList(range)); + assertThat(properties).isNotNull(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void suggestCompactRange() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true)) { + final List cfDescs = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) + ); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + try { + final Range range = db.suggestCompactRange(cfHandles.get(0)); + 
assertThat(range).isNotNull(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void promoteL0() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + db.promoteL0(2); + } + } + } + + @Test + public void startTrace() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath)) { + final TraceOptions traceOptions = new TraceOptions(); + + try (final InMemoryTraceWriter traceWriter = new InMemoryTraceWriter()) { + db.startTrace(traceOptions, traceWriter); + + db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + + db.endTrace(); + + final List writes = traceWriter.getWrites(); + assertThat(writes.size()).isGreaterThan(0); + } + } + } + } + + @Test + public void setDBOptions() throws RocksDBException { + try (final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions() + .setWriteBufferSize(4096)) { + + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts)); + + // open database + final List columnFamilyHandles = new ArrayList<>(); + try (final RocksDB db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) { + try { + final MutableDBOptions mutableOptions = + MutableDBOptions.builder() + .setBytesPerSync(1024 * 1027 * 7) + .setAvoidFlushDuringShutdown(false) + .build(); + + db.setDBOptions(mutableOptions); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + private static class InMemoryTraceWriter extends AbstractTraceWriter { + private final List writes = new ArrayList<>(); + private volatile boolean closed = false; + + @Override + public void write(final Slice slice) { + if (closed) { + return; + } + final byte[] data = slice.data(); + final byte[] dataCopy = new byte[data.length]; + System.arraycopy(data, 0, dataCopy, 0, data.length); + writes.add(dataCopy); + } + + @Override + public void closeWriter() { + closed = true; + } + + @Override + public long getFileSize() { + long size = 0; + for (int i = 0; i < writes.size(); i++) { + size += writes.get(i).length; + } + return size; + } + + public List getWrites() { + return writes; + } + } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java deleted file mode 100644 index dfb7961073..0000000000 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksEnvTest.java +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -package org.rocksdb; - -import org.junit.ClassRule; -import org.junit.Test; - -import static org.assertj.core.api.Assertions.assertThat; - -public class RocksEnvTest { - - @ClassRule - public static final RocksMemoryResource rocksMemoryResource = - new RocksMemoryResource(); - - @Test - public void rocksEnv() { - try (final Env rocksEnv = RocksEnv.getDefault()) { - rocksEnv.setBackgroundThreads(5); - // default rocksenv will always return zero for flush pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). - isEqualTo(0); - rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL); - // default rocksenv will always return zero for flush pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). - isEqualTo(0); - rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL); - // default rocksenv will always return zero for compaction pool - // no matter what was set via setBackgroundThreads - assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)). - isEqualTo(0); - } - } -} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java index 982dab4fc8..45893eec11 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -53,6 +53,48 @@ public void rocksIterator() throws RocksDBException { assertThat(iterator.value()).isEqualTo("value2".getBytes()); iterator.status(); } + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seek("key0".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seek("key1".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seek("key1.5".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seek("key2".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seek("key3".getBytes()); + assertThat(iterator.isValid()).isFalse(); + } + + try (final RocksIterator iterator = db.newIterator()) { + iterator.seekForPrev("key0".getBytes()); + assertThat(iterator.isValid()).isFalse(); + + iterator.seekForPrev("key1".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seekForPrev("key1.5".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + + iterator.seekForPrev("key2".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + + iterator.seekForPrev("key3".getBytes()); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + } } } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java index 04fae2e95d..8e429d4ecb 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/RocksMemEnvTest.java @@ -33,7 +33,7 @@ public void memEnvFillAndReopen() throws RocksDBException { "baz".getBytes() }; - try 
(final Env env = new RocksMemEnv(); + try (final Env env = new RocksMemEnv(Env.getDefault()); final Options options = new Options() .setCreateIfMissing(true) .setEnv(env); @@ -107,7 +107,7 @@ public void multipleDatabaseInstances() throws RocksDBException { "baz".getBytes() }; - try (final Env env = new RocksMemEnv(); + try (final Env env = new RocksMemEnv(Env.getDefault()); final Options options = new Options() .setCreateIfMissing(true) .setEnv(env); @@ -136,7 +136,7 @@ public void multipleDatabaseInstances() throws RocksDBException { @Test(expected = RocksDBException.class) public void createIfMissingFalse() throws RocksDBException { - try (final Env env = new RocksMemEnv(); + try (final Env env = new RocksMemEnv(Env.getDefault()); final Options options = new Options() .setCreateIfMissing(false) .setEnv(env); diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java new file mode 100644 index 0000000000..2e136e8200 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/SstFileManagerTest.java @@ -0,0 +1,66 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; + +import java.util.Collections; + +import static org.assertj.core.api.Assertions.*; + +public class SstFileManagerTest { + + @Test + public void maxAllowedSpaceUsage() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + sstFileManager.setMaxAllowedSpaceUsage(1024 * 1024 * 64); + assertThat(sstFileManager.isMaxAllowedSpaceReached()).isFalse(); + assertThat(sstFileManager.isMaxAllowedSpaceReachedIncludingCompactions()).isFalse(); + } + } + + @Test + public void compactionBufferSize() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + sstFileManager.setCompactionBufferSize(1024 * 1024 * 10); + assertThat(sstFileManager.isMaxAllowedSpaceReachedIncludingCompactions()).isFalse(); + } + } + + @Test + public void totalSize() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + assertThat(sstFileManager.getTotalSize()).isEqualTo(0); + } + } + + @Test + public void trackedFiles() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + assertThat(sstFileManager.getTrackedFiles()).isEqualTo(Collections.emptyMap()); + } + } + + @Test + public void deleteRateBytesPerSecond() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + assertThat(sstFileManager.getDeleteRateBytesPerSecond()).isEqualTo(SstFileManager.RATE_BYTES_PER_SEC_DEFAULT); + final long ratePerSecond = 1024 * 1024 * 52; + sstFileManager.setDeleteRateBytesPerSecond(ratePerSecond); + assertThat(sstFileManager.getDeleteRateBytesPerSecond()).isEqualTo(ratePerSecond); + } + } + + @Test + public void maxTrashDBRatio() throws RocksDBException { + try (final SstFileManager sstFileManager = new SstFileManager(Env.getDefault())) { + assertThat(sstFileManager.getMaxTrashDBRatio()).isEqualTo(SstFileManager.MAX_TRASH_DB_RATION_DEFAULT); + final double trashRatio = 0.2; + sstFileManager.setMaxTrashDBRatio(trashRatio); + 
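+        // The trash ratio bounds how much of the total DB size may sit in
+        // ".trash" files awaiting rate-limited deletion; once that fraction
+        // is exceeded, deletions bypass the rate limiter and happen at once.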
assertThat(sstFileManager.getMaxTrashDBRatio()).isEqualTo(trashRatio); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java index 2103c2fc78..fbd255bdba 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/StatisticsTest.java @@ -96,6 +96,14 @@ public void getHistogramData() throws RocksDBException { final HistogramData histogramData = statistics.getHistogramData(HistogramType.BYTES_PER_READ); assertThat(histogramData).isNotNull(); assertThat(histogramData.getAverage()).isGreaterThan(0); + assertThat(histogramData.getMedian()).isGreaterThan(0); + assertThat(histogramData.getPercentile95()).isGreaterThan(0); + assertThat(histogramData.getPercentile99()).isGreaterThan(0); + assertThat(histogramData.getStandardDeviation()).isEqualTo(0.00); + assertThat(histogramData.getMax()).isGreaterThan(0); + assertThat(histogramData.getCount()).isGreaterThan(0); + assertThat(histogramData.getSum()).isGreaterThan(0); + assertThat(histogramData.getMin()).isGreaterThan(0); } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java new file mode 100644 index 0000000000..862696763f --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TableFilterTest.java @@ -0,0 +1,105 @@ +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +public class TableFilterTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void readOptions() throws RocksDBException { + try (final DBOptions opt = new DBOptions(). + setCreateIfMissing(true). 
+ setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions() + ) { + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new_cf_opts) + ); + + final List columnFamilyHandles = new ArrayList<>(); + + // open database + try (final RocksDB db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles)) { + + try (final CfNameCollectionTableFilter cfNameCollectingTableFilter = + new CfNameCollectionTableFilter(); + final FlushOptions flushOptions = + new FlushOptions().setWaitForFlush(true); + final ReadOptions readOptions = + new ReadOptions().setTableFilter(cfNameCollectingTableFilter)) { + + db.put(columnFamilyHandles.get(0), + "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(columnFamilyHandles.get(0), + "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(columnFamilyHandles.get(0), + "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + db.put(columnFamilyHandles.get(1), + "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(columnFamilyHandles.get(1), + "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(columnFamilyHandles.get(1), + "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + + db.flush(flushOptions, columnFamilyHandles); + + try (final RocksIterator iterator = + db.newIterator(columnFamilyHandles.get(0), readOptions)) { + iterator.seekToFirst(); + while (iterator.isValid()) { + iterator.key(); + iterator.value(); + iterator.next(); + } + } + + try (final RocksIterator iterator = + db.newIterator(columnFamilyHandles.get(1), readOptions)) { + iterator.seekToFirst(); + while (iterator.isValid()) { + iterator.key(); + iterator.value(); + iterator.next(); + } + } + + assertThat(cfNameCollectingTableFilter.cfNames.size()).isEqualTo(2); + assertThat(cfNameCollectingTableFilter.cfNames.get(0)) + .isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY); + assertThat(cfNameCollectingTableFilter.cfNames.get(1)) + .isEqualTo("new_cf".getBytes(UTF_8)); + } finally { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + columnFamilyHandle.close(); + } + } + } + } + } + + private static class CfNameCollectionTableFilter extends AbstractTableFilter { + private final List cfNames = new ArrayList<>(); + + @Override + public boolean filter(final TableProperties tableProperties) { + cfNames.add(tableProperties.getColumnFamilyName()); + return true; + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java new file mode 100644 index 0000000000..2eb5eea825 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TimedEnvTest.java @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
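Before the TimedEnv tests, one detail of the table-filter contract above is worth making explicit: returning true from filter() admits an SST file to the scan, while returning false skips the file entirely. A short sketch, assuming only the AbstractTableFilter and TableProperties API already used above, of a filter that skips empty tables:

    private static class NonEmptyTablesFilter extends AbstractTableFilter {
      @Override
      public boolean filter(final TableProperties tableProperties) {
        // false means the iterator never opens this SST file
        return tableProperties.getNumEntries() > 0;
      }
    }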
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static java.nio.charset.StandardCharsets.UTF_8; + +public class TimedEnvTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void construct() throws RocksDBException { + try (final Env env = new TimedEnv(Env.getDefault())) { + // no-op + } + } + + @Test + public void construct_integration() throws RocksDBException { + try (final Env env = new TimedEnv(Env.getDefault()); + final Options options = new Options() + .setCreateIfMissing(true) + .setEnv(env); + ) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getPath())) { + db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + } + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java new file mode 100644 index 0000000000..7eaa6b16cd --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBOptionsTest.java @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Test; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionDBOptionsTest { + + private static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void maxNumLocks() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setMaxNumLocks(longValue); + assertThat(opt.getMaxNumLocks()).isEqualTo(longValue); + } + } + + @Test + public void maxNumStripes() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setNumStripes(longValue); + assertThat(opt.getNumStripes()).isEqualTo(longValue); + } + } + + @Test + public void transactionLockTimeout() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setTransactionLockTimeout(longValue); + assertThat(opt.getTransactionLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void defaultLockTimeout() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final long longValue = rand.nextLong(); + opt.setDefaultLockTimeout(longValue); + assertThat(opt.getDefaultLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void writePolicy() { + try (final TransactionDBOptions opt = new TransactionDBOptions()) { + final TxnDBWritePolicy writePolicy = TxnDBWritePolicy.WRITE_UNPREPARED; // non-default + opt.setWritePolicy(writePolicy); + assertThat(opt.getWritePolicy()).isEqualTo(writePolicy); + } + } + +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java new file mode 100644 index 0000000000..b0ea813ff5 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionDBTest.java @@ -0,0 +1,178 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.*; + +import static org.assertj.core.api.Assertions.assertThat; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class TransactionDBTest { + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void open() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + assertThat(tdb).isNotNull(); + } + } + + @Test + public void open_columnFamilies() throws RocksDBException { + try(final DBOptions dbOptions = new DBOptions().setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions myCfOpts = new ColumnFamilyOptions()) { + + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("myCf".getBytes(), myCfOpts)); + + final List columnFamilyHandles = new ArrayList<>(); + + try (final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(dbOptions, txnDbOptions, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, columnFamilyHandles)) { + try { + assertThat(tdb).isNotNull(); + } finally { + for (final ColumnFamilyHandle handle : columnFamilyHandles) { + handle.close(); + } + } + } + } + } + + @Test + public void beginTransaction() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_transactionOptions() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions, + txnOptions)) { + assertThat(txn).isNotNull(); + } + } + } + + @Test + public void beginTransaction_withOld() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + final Transaction txnReused = tdb.beginTransaction(writeOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void beginTransaction_withOld_transactionOptions() 
+ throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions()) { + + try(final Transaction txn = tdb.beginTransaction(writeOptions)) { + final Transaction txnReused = tdb.beginTransaction(writeOptions, + txnOptions, txn); + assertThat(txnReused).isSameAs(txn); + } + } + } + + @Test + public void lockStatusData() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath()); + final WriteOptions writeOptions = new WriteOptions(); + final ReadOptions readOptions = new ReadOptions()) { + + try (final Transaction txn = tdb.beginTransaction(writeOptions)) { + + final byte key[] = "key".getBytes(UTF_8); + final byte value[] = "value".getBytes(UTF_8); + + txn.put(key, value); + assertThat(txn.getForUpdate(readOptions, key, true)).isEqualTo(value); + + final Map lockStatus = + tdb.getLockStatusData(); + + assertThat(lockStatus.size()).isEqualTo(1); + final Set> entrySet = lockStatus.entrySet(); + final Map.Entry entry = entrySet.iterator().next(); + final long columnFamilyId = entry.getKey(); + assertThat(columnFamilyId).isEqualTo(0); + final TransactionDB.KeyLockInfo keyLockInfo = entry.getValue(); + assertThat(keyLockInfo.getKey()).isEqualTo(new String(key, UTF_8)); + assertThat(keyLockInfo.getTransactionIDs().length).isEqualTo(1); + assertThat(keyLockInfo.getTransactionIDs()[0]).isEqualTo(txn.getId()); + assertThat(keyLockInfo.isExclusive()).isTrue(); + } + } + } + + @Test + public void deadlockInfoBuffer() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + + // TODO(AR) can we cause a deadlock so that we can test the output here? + assertThat(tdb.getDeadlockInfoBuffer()).isEmpty(); + } + } + + @Test + public void setDeadlockInfoBufferSize() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final TransactionDB tdb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath())) { + tdb.setDeadlockInfoBufferSize(123); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java new file mode 100644 index 0000000000..add0439e03 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionOptionsTest.java @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
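The TransactionDB tests above all build on the same commit/rollback cycle; a minimal sketch of it, assuming `tdb` is an open TransactionDB as in those tests:

    try (final WriteOptions writeOptions = new WriteOptions();
         final Transaction txn = tdb.beginTransaction(writeOptions)) {
      try {
        txn.put("key".getBytes(UTF_8), "value".getBytes(UTF_8));
        txn.commit();   // atomically publishes the buffered write
      } catch (final RocksDBException e) {
        txn.rollback(); // discards the buffered write and releases its locks
        throw e;
      }
    }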
+ +package org.rocksdb; + +import org.junit.Test; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionOptionsTest { + + private static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void snapshot() { + try (final TransactionOptions opt = new TransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setSetSnapshot(boolValue); + assertThat(opt.isSetSnapshot()).isEqualTo(boolValue); + } + } + + @Test + public void deadlockDetect() { + try (final TransactionOptions opt = new TransactionOptions()) { + final boolean boolValue = rand.nextBoolean(); + opt.setDeadlockDetect(boolValue); + assertThat(opt.isDeadlockDetect()).isEqualTo(boolValue); + } + } + + @Test + public void lockTimeout() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setLockTimeout(longValue); + assertThat(opt.getLockTimeout()).isEqualTo(longValue); + } + } + + @Test + public void expiration() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setExpiration(longValue); + assertThat(opt.getExpiration()).isEqualTo(longValue); + } + } + + @Test + public void deadlockDetectDepth() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setDeadlockDetectDepth(longValue); + assertThat(opt.getDeadlockDetectDepth()).isEqualTo(longValue); + } + } + + @Test + public void maxWriteBatchSize() { + try (final TransactionOptions opt = new TransactionOptions()) { + final long longValue = rand.nextLong(); + opt.setMaxWriteBatchSize(longValue); + assertThat(opt.getMaxWriteBatchSize()).isEqualTo(longValue); + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java new file mode 100644 index 0000000000..57a05c9e3a --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/TransactionTest.java @@ -0,0 +1,308 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
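The TransactionOptions exercised above only take effect once handed to beginTransaction. A sketch of that wiring, again assuming an open TransactionDB `tdb`; the values are illustrative, not defaults:

    try (final TransactionOptions txnOptions = new TransactionOptions()
             .setSetSnapshot(true)   // take a snapshot when the txn begins
             .setLockTimeout(1000)   // per-key lock wait, in milliseconds
             .setDeadlockDetect(true);
         final WriteOptions writeOptions = new WriteOptions();
         final Transaction txn = tdb.beginTransaction(writeOptions, txnOptions)) {
      txn.put("k".getBytes(UTF_8), "v".getBytes(UTF_8));
      txn.commit();
    }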
+ +package org.rocksdb; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.fail; + +public class TransactionTest extends AbstractTransactionTest { + + @Test + public void getForUpdate_cf_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, k1, v1); + assertThat(txn.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, testCf, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(testCf, k1, v12); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void getForUpdate_conflict() throws RocksDBException { + final byte k1[] = "key1".getBytes(UTF_8); + final byte v1[] = "value1".getBytes(UTF_8); + final byte v12[] = "value12".getBytes(UTF_8); + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(k1, v1); + assertThat(txn.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.getForUpdate(readOptions, k1, true)).isEqualTo(v1); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(k1, v12); // should cause an exception! 
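+          // txn3 still holds the lock it took via getForUpdate, so this
+          // write from txn2 waits for that lock, times out, and surfaces
+          // Status.Code.TimedOut in the catch below.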
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_cf_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List cfList = Arrays.asList(testCf, testCf); + + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(testCf, keys[0], values[0]); + txn.put(testCf, keys[1], values[1]); + assertThat(txn.multiGet(readOptions, cfList, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, cfList, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(testCf, keys[0], otherValue); // should cause an exception! + } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void multiGetForUpdate_conflict() throws RocksDBException { + final byte keys[][] = new byte[][] { + "key1".getBytes(UTF_8), + "key2".getBytes(UTF_8)}; + final byte values[][] = new byte[][] { + "value1".getBytes(UTF_8), + "value2".getBytes(UTF_8)}; + final byte[] otherValue = "otherValue".getBytes(UTF_8); + + try(final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions()) { + try(final Transaction txn = dbContainer.beginTransaction()) { + txn.put(keys[0], values[0]); + txn.put(keys[1], values[1]); + assertThat(txn.multiGet(readOptions, keys)).isEqualTo(values); + txn.commit(); + } + + try(final Transaction txn2 = dbContainer.beginTransaction()) { + try(final Transaction txn3 = dbContainer.beginTransaction()) { + assertThat(txn3.multiGetForUpdate(readOptions, keys)) + .isEqualTo(values); + + // NOTE: txn2 updates k1, during txn3 + try { + txn2.put(keys[0], otherValue); // should cause an exception! 
+ } catch(final RocksDBException e) { + assertThat(e.getStatus().getCode()).isSameAs(Status.Code.TimedOut); + return; + } + } + } + + fail("Expected an exception for put after getForUpdate from conflicting" + + "transactions"); + } + } + + @Test + public void name() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getName()).isEmpty(); + final String name = "my-transaction-" + rand.nextLong(); + txn.setName(name); + assertThat(txn.getName()).isEqualTo(name); + } + } + + @Test + public void ID() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getID()).isGreaterThan(0); + } + } + + @Test + public void deadlockDetect() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.isDeadlockDetect()).isFalse(); + } + } + + @Test + public void waitingTxns() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getWaitingTxns().getTransactionIds().length).isEqualTo(0); + } + } + + @Test + public void state() throws RocksDBException { + try(final DBContainer dbContainer = startDb()) { + + try(final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + txn.commit(); + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.COMMITED); + } + + try(final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + txn.rollback(); + assertThat(txn.getState()) + .isSameAs(Transaction.TransactionState.STARTED); + } + } + } + + @Test + public void Id() throws RocksDBException { + try(final DBContainer dbContainer = startDb(); + final Transaction txn = dbContainer.beginTransaction()) { + assertThat(txn.getId()).isNotNull(); + } + } + + @Override + public TransactionDBContainer startDb() throws RocksDBException { + final DBOptions options = new DBOptions() + .setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + final TransactionDBOptions txnDbOptions = new TransactionDBOptions(); + final ColumnFamilyOptions columnFamilyOptions = new ColumnFamilyOptions(); + final List columnFamilyDescriptors = + Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(TXN_TEST_COLUMN_FAMILY, + columnFamilyOptions)); + final List columnFamilyHandles = new ArrayList<>(); + + final TransactionDB txnDb; + try { + txnDb = TransactionDB.open(options, txnDbOptions, + dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, + columnFamilyHandles); + } catch(final RocksDBException e) { + columnFamilyOptions.close(); + txnDbOptions.close(); + options.close(); + throw e; + } + + final WriteOptions writeOptions = new WriteOptions(); + final TransactionOptions txnOptions = new TransactionOptions(); + + return new TransactionDBContainer(txnOptions, writeOptions, + columnFamilyHandles, txnDb, txnDbOptions, columnFamilyOptions, options); + } + + private static class TransactionDBContainer + extends DBContainer { + private final TransactionOptions txnOptions; + private final TransactionDB txnDb; + private final TransactionDBOptions txnDbOptions; + + public TransactionDBContainer( + final TransactionOptions txnOptions, 
final WriteOptions writeOptions, + final List columnFamilyHandles, + final TransactionDB txnDb, final TransactionDBOptions txnDbOptions, + final ColumnFamilyOptions columnFamilyOptions, + final DBOptions options) { + super(writeOptions, columnFamilyHandles, columnFamilyOptions, + options); + this.txnOptions = txnOptions; + this.txnDb = txnDb; + this.txnDbOptions = txnDbOptions; + } + + @Override + public Transaction beginTransaction() { + return txnDb.beginTransaction(writeOptions, txnOptions); + } + + @Override + public Transaction beginTransaction(final WriteOptions writeOptions) { + return txnDb.beginTransaction(writeOptions, txnOptions); + } + + @Override + public void close() { + txnOptions.close(); + writeOptions.close(); + for(final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + columnFamilyHandle.close(); + } + txnDb.close(); + txnDbOptions.close(); + options.close(); + } + } + +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java new file mode 100644 index 0000000000..aeb49165d7 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WalFilterTest.java @@ -0,0 +1,164 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.util.TestUtil.*; + +public class WalFilterTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void walFilter() throws RocksDBException { + // Create 3 batches with two keys each + final byte[][][] batchKeys = { + new byte[][] { + u("key1"), + u("key2") + }, + new byte[][] { + u("key3"), + u("key4") + }, + new byte[][] { + u("key5"), + u("key6") + } + + }; + + final List cfDescriptors = Arrays.asList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(u("pikachu")) + ); + final List cfHandles = new ArrayList<>(); + + // Test with all WAL processing options + for (final WalProcessingOption option : WalProcessingOption.values()) { + try (final Options options = optionsForLogIterTest(); + final DBOptions dbOptions = new DBOptions(options) + .setCreateMissingColumnFamilies(true); + final RocksDB db = RocksDB.open(dbOptions, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, cfHandles)) { + try (final WriteOptions writeOptions = new WriteOptions()) { + // Write given keys in given batches + for (int i = 0; i < batchKeys.length; i++) { + final WriteBatch batch = new WriteBatch(); + for (int j = 0; j < batchKeys[i].length; j++) { + batch.put(cfHandles.get(0), batchKeys[i][j], dummyString(1024)); + } + db.write(writeOptions, batch); + } + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + cfHandles.clear(); + } + } + + // Create a test filter that would apply wal_processing_option at the first + // record + final int applyOptionForRecordIndex = 1; + try (final 
TestableWalFilter walFilter = + new TestableWalFilter(option, applyOptionForRecordIndex)) { + + try (final Options options = optionsForLogIterTest(); + final DBOptions dbOptions = new DBOptions(options) + .setWalFilter(walFilter)) { + + try (final RocksDB db = RocksDB.open(dbOptions, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, cfHandles)) { + + try { + assertThat(walFilter.logNumbers).isNotEmpty(); + assertThat(walFilter.logFileNames).isNotEmpty(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + cfHandles.clear(); + } + } catch (final RocksDBException e) { + if (option != WalProcessingOption.CORRUPTED_RECORD) { + // exception is expected when CORRUPTED_RECORD! + throw e; + } + } + } + } + } + } + + + private static class TestableWalFilter extends AbstractWalFilter { + private final WalProcessingOption walProcessingOption; + private final int applyOptionForRecordIndex; + Map cfLognumber; + Map cfNameId; + final List logNumbers = new ArrayList<>(); + final List logFileNames = new ArrayList<>(); + private int currentRecordIndex = 0; + + public TestableWalFilter(final WalProcessingOption walProcessingOption, + final int applyOptionForRecordIndex) { + super(); + this.walProcessingOption = walProcessingOption; + this.applyOptionForRecordIndex = applyOptionForRecordIndex; + } + + @Override + public void columnFamilyLogNumberMap(final Map cfLognumber, + final Map cfNameId) { + this.cfLognumber = cfLognumber; + this.cfNameId = cfNameId; + } + + @Override + public LogRecordFoundResult logRecordFound( + final long logNumber, final String logFileName, final WriteBatch batch, + final WriteBatch newBatch) { + + logNumbers.add(logNumber); + logFileNames.add(logFileName); + + final WalProcessingOption optionToReturn; + if (currentRecordIndex == applyOptionForRecordIndex) { + optionToReturn = walProcessingOption; + } + else { + optionToReturn = WalProcessingOption.CONTINUE_PROCESSING; + } + + currentRecordIndex++; + + return new LogRecordFoundResult(optionToReturn, false); + } + + @Override + public String name() { + return "testable-wal-filter"; + } + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java index 646a31ce7c..0c7b0d3cad 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -5,15 +5,16 @@ package org.rocksdb; -import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.junit.ClassRule; import org.junit.Test; +import org.rocksdb.util.CapturingWriteBatchHandler; +import org.rocksdb.util.CapturingWriteBatchHandler.Event; import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.util.CapturingWriteBatchHandler.Action.*; public class WriteBatchHandlerTest { @@ -22,45 +23,37 @@ public class WriteBatchHandlerTest { new RocksMemoryResource(); @Test - public void writeBatchHandler() throws IOException, RocksDBException { + public void writeBatchHandler() throws RocksDBException { // setup test data - final List>> testEvents = Arrays.asList( - new Tuple<>(Action.DELETE, - new Tuple("k0".getBytes(), null)), - new Tuple<>(Action.PUT, - new Tuple<>("k1".getBytes(), "v1".getBytes())), - new Tuple<>(Action.PUT, - new Tuple<>("k2".getBytes(), "v2".getBytes())), - new Tuple<>(Action.PUT, - new Tuple<>("k3".getBytes(), 
"v3".getBytes())), - new Tuple<>(Action.LOG, - new Tuple(null, "log1".getBytes())), - new Tuple<>(Action.MERGE, - new Tuple<>("k2".getBytes(), "v22".getBytes())), - new Tuple<>(Action.DELETE, - new Tuple("k3".getBytes(), null)) + final List testEvents = Arrays.asList( + new Event(DELETE, "k0".getBytes(), null), + new Event(PUT, "k1".getBytes(), "v1".getBytes()), + new Event(PUT, "k2".getBytes(), "v2".getBytes()), + new Event(PUT, "k3".getBytes(), "v3".getBytes()), + new Event(LOG, null, "log1".getBytes()), + new Event(MERGE, "k2".getBytes(), "v22".getBytes()), + new Event(DELETE, "k3".getBytes(), null) ); // load test data to the write batch try (final WriteBatch batch = new WriteBatch()) { - for (final Tuple> testEvent : testEvents) { - final Tuple data = testEvent.value; - switch (testEvent.key) { + for (final Event testEvent : testEvents) { + switch (testEvent.action) { case PUT: - batch.put(data.key, data.value); + batch.put(testEvent.key, testEvent.value); break; case MERGE: - batch.merge(data.key, data.value); + batch.merge(testEvent.key, testEvent.value); break; case DELETE: - batch.remove(data.key); + batch.remove(testEvent.key); break; case LOG: - batch.putLogData(data.value); + batch.putLogData(testEvent.value); break; } } @@ -72,98 +65,12 @@ public void writeBatchHandler() throws IOException, RocksDBException { batch.iterate(handler); // compare the results to the test data - final List>> actualEvents = + final List actualEvents = handler.getEvents(); assertThat(testEvents.size()).isSameAs(actualEvents.size()); - for (int i = 0; i < testEvents.size(); i++) { - assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); - } + assertThat(testEvents).isEqualTo(actualEvents); } } } - - private static boolean equals( - final Tuple> expected, - final Tuple> actual) { - if (!expected.key.equals(actual.key)) { - return false; - } - - final Tuple expectedData = expected.value; - final Tuple actualData = actual.value; - - return equals(expectedData.key, actualData.key) - && equals(expectedData.value, actualData.value); - } - - private static boolean equals(byte[] expected, byte[] actual) { - if (expected != null) { - return Arrays.equals(expected, actual); - } else { - return actual == null; - } - } - - private static class Tuple { - public final K key; - public final V value; - - public Tuple(final K key, final V value) { - this.key = key; - this.value = value; - } - } - - /** - * Enumeration of Write Batch - * event actions - */ - private enum Action { PUT, MERGE, DELETE, DELETE_RANGE, LOG } - - /** - * A simple WriteBatch Handler which adds a record - * of each event that it receives to a list - */ - private static class CapturingWriteBatchHandler extends WriteBatch.Handler { - - private final List>> events - = new ArrayList<>(); - - /** - * Returns a copy of the current events list - * - * @return a list of the events which have happened upto now - */ - public List>> getEvents() { - return new ArrayList<>(events); - } - - @Override - public void put(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); - } - - @Override - public void merge(final byte[] key, final byte[] value) { - events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value))); - } - - @Override - public void delete(final byte[] key) { - events.add(new Tuple<>(Action.DELETE, - new Tuple(key, null))); - } - - @Override - public void deleteRange(final byte[] beginKey, final byte[] endKey) { - events.add(new Tuple<>(Action.DELETE_RANGE, new Tuple(beginKey, 
endKey))); - } - - @Override - public void logData(final byte[] blob) { - events.add(new Tuple<>(Action.LOG, - new Tuple(null, blob))); - } - } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java index 83f90c8eb4..92bec3dcf2 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -12,20 +12,17 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; - -import java.io.UnsupportedEncodingException; -import java.util.Arrays; +import org.rocksdb.util.CapturingWriteBatchHandler; +import org.rocksdb.util.CapturingWriteBatchHandler.Event; +import org.rocksdb.util.WriteBatchGetter; import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.util.CapturingWriteBatchHandler.Action.*; +import static java.nio.charset.StandardCharsets.UTF_8; /** * This class mimics the db/write_batch_test.cc * in the c++ rocksdb library. - *

<p> - * Not ported yet: - * <p>
    - * Continue(); - * PutGatherSlices(); */ public class WriteBatchTest { @ClassRule @@ -44,27 +41,45 @@ public void emptyWriteBatch() { @Test public void multipleBatchOperations() - throws UnsupportedEncodingException { - try (WriteBatch batch = new WriteBatch()) { - batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - batch.remove("box".getBytes("US-ASCII")); - batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - - WriteBatchTestInternalHelper.setSequence(batch, 100); - assertThat(WriteBatchTestInternalHelper.sequence(batch)). - isNotNull(). - isEqualTo(100); - assertThat(batch.count()).isEqualTo(3); - assertThat(new String(getContents(batch), "US-ASCII")). - isEqualTo("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100"); + throws RocksDBException { + + final byte[] foo = "foo".getBytes(UTF_8); + final byte[] bar = "bar".getBytes(UTF_8); + final byte[] box = "box".getBytes(UTF_8); + final byte[] baz = "baz".getBytes(UTF_8); + final byte[] boo = "boo".getBytes(UTF_8); + final byte[] hoo = "hoo".getBytes(UTF_8); + final byte[] hello = "hello".getBytes(UTF_8); + + try (final WriteBatch batch = new WriteBatch()) { + batch.put(foo, bar); + batch.delete(box); + batch.put(baz, boo); + batch.merge(baz, hoo); + batch.singleDelete(foo); + batch.deleteRange(baz, foo); + batch.putLogData(hello); + + try(final CapturingWriteBatchHandler handler = + new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(7); + + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, foo, bar)); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(DELETE, box, null)); + assertThat(handler.getEvents().get(2)).isEqualTo(new Event(PUT, baz, boo)); + assertThat(handler.getEvents().get(3)).isEqualTo(new Event(MERGE, baz, hoo)); + assertThat(handler.getEvents().get(4)).isEqualTo(new Event(SINGLE_DELETE, foo, null)); + assertThat(handler.getEvents().get(5)).isEqualTo(new Event(DELETE_RANGE, baz, foo)); + assertThat(handler.getEvents().get(6)).isEqualTo(new Event(LOG, null, hello)); + } } } @Test public void testAppendOperation() - throws UnsupportedEncodingException { + throws RocksDBException { try (final WriteBatch b1 = new WriteBatch(); final WriteBatch b2 = new WriteBatch()) { WriteBatchTestInternalHelper.setSequence(b1, 200); @@ -72,67 +87,66 @@ public void testAppendOperation() WriteBatchTestInternalHelper.append(b1, b2); assertThat(getContents(b1).length).isEqualTo(0); assertThat(b1.count()).isEqualTo(0); - b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + b2.put("a".getBytes(UTF_8), "va".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat("Put(a, va)@200".equals(new String(getContents(b1), - "US-ASCII"))); + UTF_8))); assertThat(b1.count()).isEqualTo(1); b2.clear(); - b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + b2.put("b".getBytes(UTF_8), "vb".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@201") - .equals(new String(getContents(b1), "US-ASCII"))); + .equals(new String(getContents(b1), UTF_8))); assertThat(b1.count()).isEqualTo(2); - b2.remove("foo".getBytes("US-ASCII")); + b2.delete("foo".getBytes(UTF_8)); WriteBatchTestInternalHelper.append(b1, b2); assertThat(("Put(a, va)@200" + "Put(b, vb)@202" + "Put(b, vb)@201" + "Delete(foo)@203") - .equals(new String(getContents(b1), "US-ASCII"))); + .equals(new String(getContents(b1), UTF_8))); assertThat(b1.count()).isEqualTo(4); } } @Test 
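  // (count() below reports 5, not 7: the two putLogData() calls record
  // log blobs, which WriteBatch does not count as write operations.)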
public void blobOperation() - throws UnsupportedEncodingException { + throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); - batch.putLogData("blob1".getBytes("US-ASCII")); - batch.remove("k2".getBytes("US-ASCII")); - batch.putLogData("blob2".getBytes("US-ASCII")); - batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8)); + batch.putLogData("blob1".getBytes(UTF_8)); + batch.delete("k2".getBytes(UTF_8)); + batch.putLogData("blob2".getBytes(UTF_8)); + batch.merge("foo".getBytes(UTF_8), "bar".getBytes(UTF_8)); assertThat(batch.count()).isEqualTo(5); assertThat(("Merge(foo, bar)@4" + "Put(k1, v1)@0" + "Delete(k2)@3" + "Put(k2, v2)@1" + "Put(k3, v3)@2") - .equals(new String(getContents(batch), "US-ASCII"))); + .equals(new String(getContents(batch), UTF_8))); } } @Test public void savePoints() - throws UnsupportedEncodingException, RocksDBException { + throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + batch.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k1")).isEqualTo("v1"); assertThat(getFromWriteBatch(batch, "k2")).isEqualTo("v2"); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3"); - batch.setSavePoint(); - batch.remove("k2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3-2".getBytes("US-ASCII")); + batch.delete("k2".getBytes(UTF_8)); + batch.put("k3".getBytes(UTF_8), "v3-2".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k2")).isNull(); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-2"); @@ -140,8 +154,8 @@ public void savePoints() batch.setSavePoint(); - batch.put("k3".getBytes("US-ASCII"), "v3-3".getBytes("US-ASCII")); - batch.put("k4".getBytes("US-ASCII"), "v4".getBytes("US-ASCII")); + batch.put("k3".getBytes(UTF_8), "v3-3".getBytes(UTF_8)); + batch.put("k4".getBytes(UTF_8), "v4".getBytes(UTF_8)); assertThat(getFromWriteBatch(batch, "k3")).isEqualTo("v3-3"); assertThat(getFromWriteBatch(batch, "k4")).isEqualTo("v4"); @@ -166,6 +180,7 @@ public void savePoints() @Test public void deleteRange() throws RocksDBException { try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final WriteBatch batch = new WriteBatch(); final WriteOptions wOpt = new WriteOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); @@ -176,9 +191,8 @@ public void deleteRange() throws RocksDBException { assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes()); assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes()); - WriteBatch batch = new WriteBatch(); batch.deleteRange("key2".getBytes(), "key4".getBytes()); - db.write(new WriteOptions(), batch); + db.write(wOpt, batch); assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes()); assertThat(db.get("key2".getBytes())).isNull(); @@ -187,6 
+201,30 @@ public void deleteRange() throws RocksDBException { } } + @Test + public void restorePoints() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + + batch.put("k1".getBytes(), "v1".getBytes()); + batch.put("k2".getBytes(), "v2".getBytes()); + + batch.setSavePoint(); + + batch.put("k1".getBytes(), "123456789".getBytes()); + batch.delete("k2".getBytes()); + + batch.rollbackToSavePoint(); + + try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(2); + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes())); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes())); + } + } + } + @Test(expected = RocksDBException.class) public void restorePoints_withoutSavePoints() throws RocksDBException { try (final WriteBatch batch = new WriteBatch()) { @@ -206,67 +244,222 @@ public void restorePoints_withoutSavePoints_nested() throws RocksDBException { } } - static byte[] getContents(final WriteBatch wb) { - return getContents(wb.nativeHandle_); + @Test + public void popSavePoint() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + + batch.put("k1".getBytes(), "v1".getBytes()); + batch.put("k2".getBytes(), "v2".getBytes()); + + batch.setSavePoint(); + + batch.put("k1".getBytes(), "123456789".getBytes()); + batch.delete("k2".getBytes()); + + batch.setSavePoint(); + + batch.popSavePoint(); + + batch.rollbackToSavePoint(); + + try(final CapturingWriteBatchHandler handler = new CapturingWriteBatchHandler()) { + batch.iterate(handler); + + assertThat(handler.getEvents().size()).isEqualTo(2); + assertThat(handler.getEvents().get(0)).isEqualTo(new Event(PUT, "k1".getBytes(), "v1".getBytes())); + assertThat(handler.getEvents().get(1)).isEqualTo(new Event(PUT, "k2".getBytes(), "v2".getBytes())); + } + } } - static String getFromWriteBatch(final WriteBatch wb, final String key) - throws RocksDBException, UnsupportedEncodingException { - final WriteBatchGetter getter = - new WriteBatchGetter(key.getBytes("US-ASCII")); - wb.iterate(getter); - if(getter.getValue() != null) { - return new String(getter.getValue(), "US-ASCII"); - } else { - return null; + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.popSavePoint(); } } - private static native byte[] getContents(final long writeBatchHandle); + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { - private static class WriteBatchGetter extends WriteBatch.Handler { + batch.setSavePoint(); + batch.popSavePoint(); + + // without previous corresponding setSavePoint + batch.popSavePoint(); + } + } - private final byte[] key; - private byte[] value; + @Test + public void maxBytes() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.setMaxBytes(19); - public WriteBatchGetter(final byte[] key) { - this.key = key; + batch.put("k1".getBytes(), "v1".getBytes()); } + } - public byte[] getValue() { - return value; + @Test(expected = RocksDBException.class) + public void maxBytes_over() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.setMaxBytes(1); + + batch.put("k1".getBytes(), "v1".getBytes()); } + } - @Override 
- public void put(final byte[] key, final byte[] value) { - if(Arrays.equals(this.key, key)) { - this.value = value; + @Test + public void data() throws RocksDBException { + try (final WriteBatch batch1 = new WriteBatch()) { + batch1.delete("k0".getBytes()); + batch1.put("k1".getBytes(), "v1".getBytes()); + batch1.put("k2".getBytes(), "v2".getBytes()); + batch1.put("k3".getBytes(), "v3".getBytes()); + batch1.putLogData("log1".getBytes()); + batch1.merge("k2".getBytes(), "v22".getBytes()); + batch1.delete("k3".getBytes()); + + final byte[] serialized = batch1.data(); + + try(final WriteBatch batch2 = new WriteBatch(serialized)) { + assertThat(batch2.count()).isEqualTo(batch1.count()); + + try(final CapturingWriteBatchHandler handler1 = new CapturingWriteBatchHandler()) { + batch1.iterate(handler1); + + try (final CapturingWriteBatchHandler handler2 = new CapturingWriteBatchHandler()) { + batch2.iterate(handler2); + + assertThat(handler1.getEvents().equals(handler2.getEvents())).isTrue(); + } + } } } + } - @Override - public void merge(final byte[] key, final byte[] value) { - if(Arrays.equals(this.key, key)) { - throw new UnsupportedOperationException(); - } + @Test + public void dataSize() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + batch.put("k1".getBytes(), "v1".getBytes()); + + assertThat(batch.getDataSize()).isEqualTo(19); } + } - @Override - public void delete(final byte[] key) { - if(Arrays.equals(this.key, key)) { - this.value = null; - } + @Test + public void hasPut() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasPut()).isFalse(); + + batch.put("k1".getBytes(), "v1".getBytes()); + + assertThat(batch.hasPut()).isTrue(); + } + } + + @Test + public void hasDelete() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasDelete()).isFalse(); + + batch.delete("k1".getBytes()); + + assertThat(batch.hasDelete()).isTrue(); + } + } + + @Test + public void hasSingleDelete() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasSingleDelete()).isFalse(); + + batch.singleDelete("k1".getBytes()); + + assertThat(batch.hasSingleDelete()).isTrue(); + } + } + + @Test + public void hasDeleteRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasDeleteRange()).isFalse(); + + batch.deleteRange("k1".getBytes(), "k2".getBytes()); + + assertThat(batch.hasDeleteRange()).isTrue(); + } + } + + @Test + public void hasBeginPrepareRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasBeginPrepare()).isFalse(); + } + } + + @Test + public void hasEndPrepareRange() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasEndPrepare()).isFalse(); } + } + + @Test + public void hasCommit() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasCommit()).isFalse(); + } + } + + @Test + public void hasRollback() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.hasRollback()).isFalse(); + } + } + + @Test + public void walTerminationPoint() throws RocksDBException { + try (final WriteBatch batch = new WriteBatch()) { + WriteBatch.SavePoint walTerminationPoint = batch.getWalTerminationPoint(); + assertThat(walTerminationPoint.isCleared()).isTrue(); + + batch.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); 
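+      // A single Put of "k1"/"v1" serializes to 19 bytes: a 12-byte batch
+      // header (8-byte sequence number + 4-byte count), a 1-byte record
+      // type, a 1-byte varint key length + 2 key bytes, and a 1-byte
+      // varint value length + 2 value bytes. The same figure is asserted
+      // below and in the dataSize() and maxBytes() tests above.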
+ + batch.markWalTerminationPoint(); + + walTerminationPoint = batch.getWalTerminationPoint(); + assertThat(walTerminationPoint.getSize()).isEqualTo(19); + assertThat(walTerminationPoint.getCount()).isEqualTo(1); + assertThat(walTerminationPoint.getContentFlags()).isEqualTo(2); + } + } - @Override - public void deleteRange(final byte[] beginKey, final byte[] endKey) { - throw new UnsupportedOperationException(); + @Test + public void getWriteBatch() { + try (final WriteBatch batch = new WriteBatch()) { + assertThat(batch.getWriteBatch()).isEqualTo(batch); } + } + + static byte[] getContents(final WriteBatch wb) { + return getContents(wb.nativeHandle_); + } - @Override - public void logData(final byte[] blob) { + static String getFromWriteBatch(final WriteBatch wb, final String key) + throws RocksDBException { + final WriteBatchGetter getter = + new WriteBatchGetter(key.getBytes(UTF_8)); + wb.iterate(getter); + if(getter.getValue() != null) { + return new String(getter.getValue(), UTF_8); + } else { + return null; } } + + private static native byte[] getContents(final long writeBatchHandle); } /** diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java index 1c5e34234e..fcef00a39f 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -14,11 +14,11 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.util.Arrays; import static org.assertj.core.api.Assertions.assertThat; +import static java.nio.charset.StandardCharsets.UTF_8; public class WriteBatchWithIndexTest { @@ -47,7 +47,6 @@ public void readYourOwnWrites() throws RocksDBException { try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); final RocksIterator base = db.newIterator(); final RocksIterator it = wbwi.newIteratorWithBase(base)) { - it.seek(k1); assertThat(it.isValid()).isTrue(); assertThat(it.key()).isEqualTo(k1); @@ -75,8 +74,8 @@ public void readYourOwnWrites() throws RocksDBException { assertThat(it.key()).isEqualTo(k2); assertThat(it.value()).isEqualTo(v2Other); - //remove k1 and make sure we can read back the write - wbwi.remove(k1); + //delete k1 and make sure we can read back the write + wbwi.delete(k1); it.seek(k1); assertThat(it.key()).isNotEqualTo(k1); @@ -87,12 +86,25 @@ public void readYourOwnWrites() throws RocksDBException { assertThat(it.isValid()).isTrue(); assertThat(it.key()).isEqualTo(k1); assertThat(it.value()).isEqualTo(v1Other); + + //single remove k3 and make sure we can read back the write + wbwi.singleDelete(k3); + it.seek(k3); + assertThat(it.isValid()).isEqualTo(false); + + //reinsert k3 and make sure we see the new value + final byte[] v3Other = "otherValue3".getBytes(); + wbwi.put(k3, v3Other); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3Other); } } } @Test - public void write_writeBatchWithIndex() throws RocksDBException { + public void writeBatchWithIndex() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { @@ -102,11 +114,12 @@ public void write_writeBatchWithIndex() throws RocksDBException { final byte[] k2 = "key2".getBytes(); final byte[] 
v2 = "value2".getBytes(); - try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(); + final WriteOptions wOpt = new WriteOptions()) { wbwi.put(k1, v1); wbwi.put(k2, v2); - db.write(new WriteOptions(), wbwi); + db.write(wOpt, wbwi); } assertThat(db.get(k1)).isEqualTo(v1); @@ -124,22 +137,39 @@ public void iterator() throws RocksDBException { final String v2 = "value2"; final String k3 = "key3"; final String v3 = "value3"; - final byte[] k1b = k1.getBytes(); - final byte[] v1b = v1.getBytes(); - final byte[] k2b = k2.getBytes(); - final byte[] v2b = v2.getBytes(); - final byte[] k3b = k3.getBytes(); - final byte[] v3b = v3.getBytes(); - - //add put records + final String k4 = "key4"; + final String k5 = "key5"; + final String k6 = "key6"; + final String k7 = "key7"; + final String v8 = "value8"; + final byte[] k1b = k1.getBytes(UTF_8); + final byte[] v1b = v1.getBytes(UTF_8); + final byte[] k2b = k2.getBytes(UTF_8); + final byte[] v2b = v2.getBytes(UTF_8); + final byte[] k3b = k3.getBytes(UTF_8); + final byte[] v3b = v3.getBytes(UTF_8); + final byte[] k4b = k4.getBytes(UTF_8); + final byte[] k5b = k5.getBytes(UTF_8); + final byte[] k6b = k6.getBytes(UTF_8); + final byte[] k7b = k7.getBytes(UTF_8); + final byte[] v8b = v8.getBytes(UTF_8); + + // add put records wbwi.put(k1b, v1b); wbwi.put(k2b, v2b); wbwi.put(k3b, v3b); - //add a deletion record - final String k4 = "key4"; - final byte[] k4b = k4.getBytes(); - wbwi.remove(k4b); + // add a deletion record + wbwi.delete(k4b); + + // add a single deletion record + wbwi.singleDelete(k5b); + + // add a delete range record + wbwi.deleteRange(k6b, k7b); + + // add a log record + wbwi.putLogData(v8b); final WBWIRocksIterator.WriteEntry[] expected = { new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, @@ -149,12 +179,16 @@ public void iterator() throws RocksDBException { new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, new DirectSlice(k3), new DirectSlice(v3)), new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, - new DirectSlice(k4), DirectSlice.NONE) + new DirectSlice(k4), DirectSlice.NONE), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.SINGLE_DELETE, + new DirectSlice(k5), DirectSlice.NONE), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE_RANGE, + new DirectSlice(k6), new DirectSlice(k7)), }; try (final WBWIRocksIterator it = wbwi.newIterator()) { //direct access - seek to key offsets - final int[] testOffsets = {2, 0, 1, 3}; + final int[] testOffsets = {2, 0, 3, 4, 1, 5}; for (int i = 0; i < testOffsets.length; i++) { final int testOffset = testOffsets[i]; @@ -164,26 +198,26 @@ public void iterator() throws RocksDBException { assertThat(it.isValid()).isTrue(); final WBWIRocksIterator.WriteEntry entry = it.entry(); - assertThat(entry.equals(expected[testOffset])).isTrue(); + assertThat(entry).isEqualTo(expected[testOffset]); } //forward iterative access int i = 0; for (it.seekToFirst(); it.isValid(); it.next()) { - assertThat(it.entry().equals(expected[i++])).isTrue(); + assertThat(it.entry()).isEqualTo(expected[i++]); } //reverse iterative access i = expected.length - 1; for (it.seekToLast(); it.isValid(); it.prev()) { - assertThat(it.entry().equals(expected[i--])).isTrue(); + assertThat(it.entry()).isEqualTo(expected[i--]); } } } } @Test - public void zeroByteTests() { + public void zeroByteTests() throws RocksDBException { try (final WriteBatchWithIndex wbwi = new 
WriteBatchWithIndex(true)) { final byte[] zeroByteValue = new byte[]{0, 0}; //add zero byte value @@ -207,8 +241,7 @@ public void zeroByteTests() { } @Test - public void savePoints() - throws UnsupportedEncodingException, RocksDBException { + public void savePoints() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { @@ -228,7 +261,7 @@ public void savePoints() wbwi.setSavePoint(); - wbwi.remove("k2".getBytes()); + wbwi.delete("k2".getBytes()); wbwi.put("k3".getBytes(), "v3-2".getBytes()); assertThat(getFromWriteBatchWithIndex(db, readOptions, wbwi, "k2")) @@ -272,6 +305,27 @@ public void savePoints() } } + @Test + public void restorePoints() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.put("k1".getBytes(UTF_8), "v1".getBytes(UTF_8)); + wbwi.put("k2".getBytes(UTF_8), "v2".getBytes(UTF_8)); + + wbwi.setSavePoint(); + + wbwi.put("k1".getBytes(UTF_8), "123456789".getBytes(UTF_8)); + wbwi.delete("k2".getBytes(UTF_8)); + + wbwi.rollbackToSavePoint(); + + try(final DBOptions options = new DBOptions()) { + assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes()); + assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes()); + } + } + } + @Test(expected = RocksDBException.class) public void restorePoints_withoutSavePoints() throws RocksDBException { try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { @@ -291,12 +345,84 @@ public void restorePoints_withoutSavePoints_nested() throws RocksDBException { } } + @Test + public void popSavePoint() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.put("k1".getBytes(), "v1".getBytes()); + wbwi.put("k2".getBytes(), "v2".getBytes()); + + wbwi.setSavePoint(); + + wbwi.put("k1".getBytes(), "123456789".getBytes()); + wbwi.delete("k2".getBytes()); + + wbwi.setSavePoint(); + + wbwi.popSavePoint(); + + wbwi.rollbackToSavePoint(); + + try(final DBOptions options = new DBOptions()) { + assertThat(wbwi.getFromBatch(options,"k1".getBytes(UTF_8))).isEqualTo("v1".getBytes()); + assertThat(wbwi.getFromBatch(options,"k2".getBytes(UTF_8))).isEqualTo("v2".getBytes()); + } + } + } + + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.popSavePoint(); + } + } + + @Test(expected = RocksDBException.class) + public void popSavePoint_withoutSavePoints_nested() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + wbwi.setSavePoint(); + wbwi.popSavePoint(); + + // without previous corresponding setSavePoint + wbwi.popSavePoint(); + } + } + + @Test + public void maxBytes() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.setMaxBytes(19); + + wbwi.put("k1".getBytes(), "v1".getBytes()); + } + } + + @Test(expected = RocksDBException.class) + public void maxBytes_over() throws RocksDBException { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + wbwi.setMaxBytes(1); + + wbwi.put("k1".getBytes(), "v1".getBytes()); + } + } + + @Test + public void getWriteBatch() { + try (final WriteBatchWithIndex wbwi = new WriteBatchWithIndex()) { + + final WriteBatch wb = wbwi.getWriteBatch(); + assertThat(wb).isNotNull(); + 
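+      // The returned WriteBatch is a view backed by the
+      // WriteBatchWithIndex's own native object, so the Java wrapper does
+      // not own the underlying handle — hence the check below.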
assertThat(wb.isOwningHandle()).isFalse(); + } + } + private static String getFromWriteBatchWithIndex(final RocksDB db, final ReadOptions readOptions, final WriteBatchWithIndex wbwi, final String skey) { final byte[] key = skey.getBytes(); - try(final RocksIterator baseIterator = db.newIterator(readOptions); - final RocksIterator iterator = wbwi.newIteratorWithBase(baseIterator)) { + try (final RocksIterator baseIterator = db.newIterator(readOptions); + final RocksIterator iterator = wbwi.newIteratorWithBase(baseIterator)) { iterator.seek(key); // Arrays.equals(key, iterator.key()) ensures an exact match in Rocks, @@ -329,7 +455,7 @@ public void getFromBatch() throws RocksDBException { assertThat(wbwi.getFromBatch(dbOptions, k3)).isEqualTo(v3); assertThat(wbwi.getFromBatch(dbOptions, k4)).isNull(); - wbwi.remove(k2); + wbwi.delete(k2); assertThat(wbwi.getFromBatch(dbOptions, k2)).isNull(); } @@ -372,7 +498,7 @@ public void getFromBatchAndDB() throws RocksDBException { assertThat(wbwi.getFromBatchAndDB(db, readOptions, k3)).isEqualTo(v3); assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isEqualTo(v4); - wbwi.remove(k4); + wbwi.delete(k4); assertThat(wbwi.getFromBatchAndDB(db, readOptions, k4)).isNull(); } @@ -387,6 +513,7 @@ private byte[] toArray(final ByteBuffer buf) { @Test public void deleteRange() throws RocksDBException { try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + final WriteBatch batch = new WriteBatch(); final WriteOptions wOpt = new WriteOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); @@ -397,9 +524,8 @@ public void deleteRange() throws RocksDBException { assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes()); assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes()); - WriteBatch batch = new WriteBatch(); batch.deleteRange("key2".getBytes(), "key4".getBytes()); - db.write(new WriteOptions(), batch); + db.write(wOpt, batch); assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes()); assertThat(db.get("key2".getBytes())).isNull(); diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java index 72a0687866..00c1d72396 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -8,6 +8,8 @@ import org.junit.ClassRule; import org.junit.Test; +import java.util.Random; + import static org.assertj.core.api.Assertions.assertThat; public class WriteOptionsTest { @@ -16,6 +18,9 @@ public class WriteOptionsTest { public static final RocksMemoryResource rocksMemoryResource = new RocksMemoryResource(); + public static final Random rand = PlatformRandomHelper. 
+ getPlatformSpecificRandomFactory(); + @Test public void writeOptions() { try (final WriteOptions writeOptions = new WriteOptions()) { @@ -40,6 +45,25 @@ public void writeOptions() { assertThat(writeOptions.noSlowdown()).isTrue(); writeOptions.setNoSlowdown(false); assertThat(writeOptions.noSlowdown()).isFalse(); + + writeOptions.setLowPri(true); + assertThat(writeOptions.lowPri()).isTrue(); + writeOptions.setLowPri(false); + assertThat(writeOptions.lowPri()).isFalse(); } } + + @Test + public void copyConstructor() { + WriteOptions origOpts = new WriteOptions(); + origOpts.setDisableWAL(rand.nextBoolean()); + origOpts.setIgnoreMissingColumnFamilies(rand.nextBoolean()); + origOpts.setSync(rand.nextBoolean()); + WriteOptions copyOpts = new WriteOptions(origOpts); + assertThat(origOpts.disableWAL()).isEqualTo(copyOpts.disableWAL()); + assertThat(origOpts.ignoreMissingColumnFamilies()).isEqualTo( + copyOpts.ignoreMissingColumnFamilies()); + assertThat(origOpts.sync()).isEqualTo(copyOpts.sync()); + } + } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java new file mode 100644 index 0000000000..11ffedf312 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java @@ -0,0 +1,20 @@ +package org.rocksdb.test; + +import org.rocksdb.AbstractCompactionFilter; +import org.rocksdb.AbstractCompactionFilterFactory; +import org.rocksdb.RemoveEmptyValueCompactionFilter; + +/** + * Simple CompactionFilterFactory class used in tests. Generates RemoveEmptyValueCompactionFilters. + */ +public class RemoveEmptyValueCompactionFilterFactory extends AbstractCompactionFilterFactory { + @Override + public RemoveEmptyValueCompactionFilter createCompactionFilter(final AbstractCompactionFilter.Context context) { + return new RemoveEmptyValueCompactionFilter(); + } + + @Override + public String name() { + return "RemoveEmptyValueCompactionFilterFactory"; + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java index 02ad0380ee..42d3148ef2 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java @@ -10,10 +10,17 @@ import org.junit.runner.Description; import org.junit.runner.JUnitCore; import org.junit.runner.Result; +import org.junit.runner.notification.Failure; +import org.rocksdb.RocksDB; +import java.io.PrintStream; +import java.text.DecimalFormat; +import java.text.NumberFormat; import java.util.ArrayList; import java.util.List; +import static org.rocksdb.test.RocksJunitRunner.RocksJunitListener.Status.*; + /** * Custom Junit Runner to print also Test classes * and executed methods to command prompt. 
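(Illustrative only: judging from the "Running: %s", "\t%s OK" and "Tests run: ..." format strings added in the next hunk, the listener's per-class output takes roughly this shape:

    Running: org.rocksdb.WriteBatchTest
        emptyWriteBatch OK
        multipleBatchOperations OK
    Tests run: 2, Failures: 0, Errors: 0, Ignored: 0, Time elapsed: 0.042 sec)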
@@ -26,20 +33,117 @@ public class RocksJunitRunner { */ static class RocksJunitListener extends TextListener { + private final static NumberFormat secsFormat = + new DecimalFormat("###,###.###"); + + private final PrintStream writer; + + private String currentClassName = null; + private String currentMethodName = null; + private Status currentStatus = null; + private long currentTestsStartTime; + private int currentTestsCount = 0; + private int currentTestsIgnoredCount = 0; + private int currentTestsFailureCount = 0; + private int currentTestsErrorCount = 0; + + enum Status { + IGNORED, + FAILURE, + ERROR, + OK + } + /** * RocksJunitListener constructor * * @param system JUnitSystem */ public RocksJunitListener(final JUnitSystem system) { - super(system); + this(system.out()); + } + + public RocksJunitListener(final PrintStream writer) { + super(writer); + this.writer = writer; + } + + @Override + public void testRunStarted(final Description description) { + writer.format("Starting RocksJava Tests...%n"); + } @Override public void testStarted(final Description description) { - System.out.format("Run: %s testing now -> %s \n", - description.getClassName(), - description.getMethodName()); + if(currentClassName == null + || !currentClassName.equals(description.getClassName())) { + if(currentClassName != null) { + printTestsSummary(); + } else { + currentTestsStartTime = System.currentTimeMillis(); + } + writer.format("%nRunning: %s%n", description.getClassName()); + currentClassName = description.getClassName(); + } + currentMethodName = description.getMethodName(); + currentStatus = OK; + currentTestsCount++; + } + + private void printTestsSummary() { + // print summary of last test set + writer.format("Tests run: %d, Failures: %d, Errors: %d, Ignored: %d, Time elapsed: %s sec%n", + currentTestsCount, + currentTestsFailureCount, + currentTestsErrorCount, + currentTestsIgnoredCount, + formatSecs(System.currentTimeMillis() - currentTestsStartTime)); + + // reset counters + currentTestsCount = 0; + currentTestsFailureCount = 0; + currentTestsErrorCount = 0; + currentTestsIgnoredCount = 0; + currentTestsStartTime = System.currentTimeMillis(); + } + + private static String formatSecs(final double milliseconds) { + final double seconds = milliseconds / 1000; + return secsFormat.format(seconds); + } + + @Override + public void testFailure(final Failure failure) { + if (failure.getException() != null + && failure.getException() instanceof AssertionError) { + currentStatus = FAILURE; + currentTestsFailureCount++; + } else { + currentStatus = ERROR; + currentTestsErrorCount++; + } + } + + @Override + public void testIgnored(final Description description) { + currentStatus = IGNORED; + currentTestsIgnoredCount++; + } + + @Override + public void testFinished(final Description description) { + if(currentStatus == OK) { + writer.format("\t%s OK%n",currentMethodName); + } else { + writer.format(" [%s] %s%n", currentStatus.name(), currentMethodName); + } + } + + @Override + public void testRunFinished(final Result result) { + printTestsSummary(); + super.testRunFinished(result); } } diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java index 42508bc118..8149a4800a 100644 --- a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/BytewiseComparatorTest.java @@ -27,6 +27,9 @@ */ public class 
BytewiseComparatorTest { + private List source_strings = Arrays.asList("b", "d", "f", "h", "j", "l"); + private List interleaving_strings = Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"); + /** * Open the database using the C++ BytewiseComparatorImpl * and test the results against our Java BytewiseComparator @@ -38,14 +41,18 @@ public void java_vs_cpp_bytewiseComparator() final Path dbDir = Files.createTempDirectory("comparator_db_test"); try(final RocksDB db = openDatabase(dbDir, BuiltinComparator.BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - toJavaComparator(new BytewiseComparator(new ComparatorOptions())), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final Comparator comparator2 = new BytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -61,16 +68,21 @@ public void java_vs_java_bytewiseComparator() throws IOException, RocksDBException { for(int rand_seed = 301; rand_seed < 306; rand_seed++) { final Path dbDir = Files.createTempDirectory("comparator_db_test"); - try(final RocksDB db = openDatabase(dbDir, new BytewiseComparator( - new ComparatorOptions()))) { + try(final ComparatorOptions copt = new ComparatorOptions(); + final Comparator comparator = new BytewiseComparator(copt); + final RocksDB db = openDatabase(dbDir, comparator)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - toJavaComparator(new BytewiseComparator(new ComparatorOptions())), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final Comparator comparator2 = new BytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -88,16 +100,18 @@ public void java_vs_cpp_directBytewiseComparator() final Path dbDir = Files.createTempDirectory("comparator_db_test"); try(final RocksDB db = openDatabase(dbDir, BuiltinComparator.BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - toJavaComparator(new DirectBytewiseComparator( - new ComparatorOptions()) - ), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final DirectComparator comparator2 = new DirectBytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -113,18 +127,21 @@ public void java_vs_java_directBytewiseComparator() throws IOException, RocksDBException { for(int rand_seed = 301; rand_seed < 306; rand_seed++) { final Path dbDir = Files.createTempDirectory("comparator_db_test"); - try(final RocksDB db = openDatabase(dbDir, new DirectBytewiseComparator( - new ComparatorOptions()))) { + try (final ComparatorOptions copt = new ComparatorOptions(); + final DirectComparator comparator = new DirectBytewiseComparator(copt); + final RocksDB db = openDatabase(dbDir, comparator)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - 
toJavaComparator(new DirectBytewiseComparator( - new ComparatorOptions()) - ), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final DirectComparator comparator2 = new DirectBytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -142,16 +159,18 @@ public void java_vs_cpp_reverseBytewiseComparator() final Path dbDir = Files.createTempDirectory("comparator_db_test"); try(final RocksDB db = openDatabase(dbDir, BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - toJavaComparator( - new ReverseBytewiseComparator(new ComparatorOptions()) - ), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final Comparator comparator2 = new ReverseBytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -165,21 +184,23 @@ public void java_vs_cpp_reverseBytewiseComparator() @Test public void java_vs_java_reverseBytewiseComparator() throws IOException, RocksDBException { - for(int rand_seed = 301; rand_seed < 306; rand_seed++) { final Path dbDir = Files.createTempDirectory("comparator_db_test"); - try(final RocksDB db = openDatabase(dbDir, new ReverseBytewiseComparator( - new ComparatorOptions()))) { + try (final ComparatorOptions copt = new ComparatorOptions(); + final Comparator comparator = new ReverseBytewiseComparator(copt); + final RocksDB db = openDatabase(dbDir, comparator)) { + final Random rnd = new Random(rand_seed); - doRandomIterationTest( - db, - toJavaComparator( - new ReverseBytewiseComparator(new ComparatorOptions()) - ), - Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h", "i"), - rnd, - 8, 100, 3 - ); + try(final ComparatorOptions copt2 = new ComparatorOptions(); + final Comparator comparator2 = new ReverseBytewiseComparator(copt2)) { + final java.util.Comparator jComparator = toJavaComparator(comparator2); + doRandomIterationTest( + db, + jComparator, + rnd, + 8, 100, 3 + ); + } } finally { removeData(dbDir); } @@ -188,7 +209,7 @@ public void java_vs_java_reverseBytewiseComparator() private void doRandomIterationTest( final RocksDB db, final java.util.Comparator javaComparator, - final List source_strings, final Random rnd, + final Random rnd, final int num_writes, final int num_iter_ops, final int num_trigger_flush) throws RocksDBException { @@ -228,7 +249,7 @@ private void doRandomIterationTest( for (int i = 0; i < num_iter_ops; i++) { // Random walk and make sure iter and result_iter returns the // same key and value - final int type = rnd.nextInt(6); + final int type = rnd.nextInt(7); iter.status(); switch (type) { case 0: @@ -242,14 +263,22 @@ private void doRandomIterationTest( result_iter.seekToLast(); break; case 2: { - // Seek to random key - final int key_idx = rnd.nextInt(source_strings.size()); - final String key = source_strings.get(key_idx); + // Seek to random (existing or non-existing) key + final int key_idx = rnd.nextInt(interleaving_strings.size()); + final String key = interleaving_strings.get(key_idx); iter.seek(bytes(key)); result_iter.seek(bytes(key)); break; } - case 3: + case 3: { + 
// SeekForPrev to random (existing or non-existing) key + final int key_idx = rnd.nextInt(interleaving_strings.size()); + final String key = interleaving_strings.get(key_idx); + iter.seekForPrev(bytes(key)); + result_iter.seekForPrev(bytes(key)); + break; + } + case 4: // Next if (is_valid) { iter.next(); @@ -258,7 +287,7 @@ private void doRandomIterationTest( continue; } break; - case 4: + case 5: // Prev if (is_valid) { iter.prev(); @@ -268,7 +297,7 @@ private void doRandomIterationTest( } break; default: { - assert (type == 5); + assert (type == 6); final int key_idx = rnd.nextInt(source_strings.size()); final String key = source_strings.get(key_idx); final byte[] result = db.get(new ReadOptions(), bytes(key)); @@ -413,6 +442,16 @@ public void seek(final byte[] target) { } } + @Override + public void seekForPrev(final byte[] target) { + for(offset = entries.size()-1; offset >= 0; offset--) { + if(comparator.compare(entries.get(offset).getKey(), + (K)new String(target, StandardCharsets.UTF_8)) <= 0) { + return; + } + } + } + /** * Is `a` a prefix of `b` * diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java new file mode 100644 index 0000000000..83ac5d3d27 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java @@ -0,0 +1,171 @@ +package org.rocksdb.util; + +import org.rocksdb.RocksDBException; +import org.rocksdb.WriteBatch; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +/** + * A simple WriteBatch Handler which adds a record + * of each event that it receives to a list + */ +public class CapturingWriteBatchHandler extends WriteBatch.Handler { + + private final List events = new ArrayList<>(); + + /** + * Returns a copy of the current events list + * + * @return a list of the events which have happened upto now + */ + public List getEvents() { + return new ArrayList<>(events); + } + + @Override + public void put(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.PUT, columnFamilyId, key, value)); + } + + @Override + public void put(final byte[] key, final byte[] value) { + events.add(new Event(Action.PUT, key, value)); + } + + @Override + public void merge(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.MERGE, columnFamilyId, key, value)); + } + + @Override + public void merge(final byte[] key, final byte[] value) { + events.add(new Event(Action.MERGE, key, value)); + } + + @Override + public void delete(final int columnFamilyId, final byte[] key) { + events.add(new Event(Action.DELETE, columnFamilyId, key, (byte[])null)); + } + + @Override + public void delete(final byte[] key) { + events.add(new Event(Action.DELETE, key, (byte[])null)); + } + + @Override + public void singleDelete(final int columnFamilyId, final byte[] key) { + events.add(new Event(Action.SINGLE_DELETE, + columnFamilyId, key, (byte[])null)); + } + + @Override + public void singleDelete(final byte[] key) { + events.add(new Event(Action.SINGLE_DELETE, key, (byte[])null)); + } + + @Override + public void deleteRange(final int columnFamilyId, final byte[] beginKey, + final byte[] endKey) { + events.add(new Event(Action.DELETE_RANGE, columnFamilyId, beginKey, + endKey)); + } + + @Override + public void deleteRange(final byte[] beginKey, final byte[] endKey) { + events.add(new 
Event(Action.DELETE_RANGE, beginKey, endKey)); + } + + @Override + public void logData(final byte[] blob) { + events.add(new Event(Action.LOG, (byte[])null, blob)); + } + + @Override + public void putBlobIndex(final int columnFamilyId, final byte[] key, + final byte[] value) { + events.add(new Event(Action.PUT_BLOB_INDEX, key, value)); + } + + @Override + public void markBeginPrepare() throws RocksDBException { + events.add(new Event(Action.MARK_BEGIN_PREPARE, (byte[])null, + (byte[])null)); + } + + @Override + public void markEndPrepare(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_END_PREPARE, (byte[])null, + (byte[])null)); + } + + @Override + public void markNoop(final boolean emptyBatch) throws RocksDBException { + events.add(new Event(Action.MARK_NOOP, (byte[])null, (byte[])null)); + } + + @Override + public void markRollback(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_ROLLBACK, (byte[])null, (byte[])null)); + } + + @Override + public void markCommit(final byte[] xid) throws RocksDBException { + events.add(new Event(Action.MARK_COMMIT, (byte[])null, (byte[])null)); + } + + public static class Event { + public final Action action; + public final int columnFamilyId; + public final byte[] key; + public final byte[] value; + + public Event(final Action action, final byte[] key, final byte[] value) { + this(action, 0, key, value); + } + + public Event(final Action action, final int columnFamilyId, final byte[] key, + final byte[] value) { + this.action = action; + this.columnFamilyId = columnFamilyId; + this.key = key; + this.value = value; + } + + @Override + public boolean equals(final Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + final Event event = (Event) o; + return columnFamilyId == event.columnFamilyId && + action == event.action && + ((key == null && event.key == null) + || Arrays.equals(key, event.key)) && + ((value == null && event.value == null) + || Arrays.equals(value, event.value)); + } + + @Override + public int hashCode() { + + return Objects.hash(action, columnFamilyId, key, value); + } + } + + /** + * Enumeration of Write Batch + * event actions + */ + public enum Action { + PUT, MERGE, DELETE, SINGLE_DELETE, DELETE_RANGE, LOG, PUT_BLOB_INDEX, + MARK_BEGIN_PREPARE, MARK_END_PREPARE, MARK_NOOP, MARK_COMMIT, + MARK_ROLLBACK } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java new file mode 100644 index 0000000000..12b3bbbbdc --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/TestUtil.java @@ -0,0 +1,72 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb.util; + +import org.rocksdb.CompactionPriority; +import org.rocksdb.Options; +import org.rocksdb.WALRecoveryMode; + +import java.util.Random; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * General test utilities. + */ +public class TestUtil { + + /** + * Get the options for log iteration tests. + * + * @return the options + */ + public static Options optionsForLogIterTest() { + return defaultOptions() + .setCreateIfMissing(true) + .setWalTtlSeconds(1000); + } + + /** + * Get the default options. 
+ * + * @return the options + */ + public static Options defaultOptions() { + return new Options() + .setWriteBufferSize(4090 * 4096) + .setTargetFileSizeBase(2 * 1024 * 1024) + .setMaxBytesForLevelBase(10 * 1024 * 1024) + .setMaxOpenFiles(5000) + .setWalRecoveryMode(WALRecoveryMode.TolerateCorruptedTailRecords) + .setCompactionPriority(CompactionPriority.ByCompensatedSize); + } + + private static final Random random = new Random(); + + /** + * Generate a random string of bytes. + * + * @param len the length of the string to generate. + * + * @return the random string of bytes + */ + public static byte[] dummyString(final int len) { + final byte[] str = new byte[len]; + random.nextBytes(str); + return str; + } + + /** + * Convert a UTF-8 String to a byte array. + * + * @param str the string + * + * @return the byte array. + */ + public static byte[] u(final String str) { + return str.getBytes(UTF_8); + } +} diff --git a/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java new file mode 100644 index 0000000000..a0d8d669d2 --- /dev/null +++ b/thirdparty/rocksdb/java/src/test/java/org/rocksdb/util/WriteBatchGetter.java @@ -0,0 +1,133 @@ +package org.rocksdb.util; + +import org.rocksdb.RocksDBException; +import org.rocksdb.WriteBatch; + +import java.util.Arrays; + +public class WriteBatchGetter extends WriteBatch.Handler { + + private int columnFamilyId = -1; + private final byte[] key; + private byte[] value; + + public WriteBatchGetter(final byte[] key) { + this.key = key; + } + + public byte[] getValue() { + return value; + } + + @Override + public void put(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = value; + } + } + + @Override + public void put(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.value = value; + } + } + + @Override + public void merge(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = value; + } + } + + @Override + public void merge(final byte[] key, final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.value = value; + } + } + + @Override + public void delete(final int columnFamilyId, final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = null; + } + } + + @Override + public void delete(final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.value = null; + } + } + + @Override + public void singleDelete(final int columnFamilyId, final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = columnFamilyId; + this.value = null; + } + } + + @Override + public void singleDelete(final byte[] key) { + if(Arrays.equals(this.key, key)) { + this.value = null; + } + } + + @Override + public void deleteRange(final int columnFamilyId, final byte[] beginKey, + final byte[] endKey) { + throw new UnsupportedOperationException(); + } + + @Override + public void deleteRange(final byte[] beginKey, final byte[] endKey) { + throw new UnsupportedOperationException(); + } + + @Override + public void logData(final byte[] blob) { + throw new UnsupportedOperationException(); + } + + @Override + public void putBlobIndex(final int columnFamilyId, final byte[] key, + final byte[] value) { + if(Arrays.equals(this.key, key)) { + this.columnFamilyId = 
columnFamilyId; + this.value = value; + } + } + + @Override + public void markBeginPrepare() throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markEndPrepare(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markNoop(final boolean emptyBatch) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markRollback(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } + + @Override + public void markCommit(final byte[] xid) throws RocksDBException { + throw new UnsupportedOperationException(); + } +} diff --git a/thirdparty/rocksdb/memtable/alloc_tracker.cc b/thirdparty/rocksdb/memtable/alloc_tracker.cc index 9889cc4230..a1fa4938c5 100644 --- a/thirdparty/rocksdb/memtable/alloc_tracker.cc +++ b/thirdparty/rocksdb/memtable/alloc_tracker.cc @@ -24,7 +24,8 @@ AllocTracker::~AllocTracker() { FreeMem(); } void AllocTracker::Allocate(size_t bytes) { assert(write_buffer_manager_ != nullptr); - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { bytes_allocated_.fetch_add(bytes, std::memory_order_relaxed); write_buffer_manager_->ReserveMem(bytes); } @@ -32,7 +33,8 @@ void AllocTracker::Allocate(size_t bytes) { void AllocTracker::DoneAllocating() { if (write_buffer_manager_ != nullptr && !done_allocating_) { - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { write_buffer_manager_->ScheduleFreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { @@ -47,7 +49,8 @@ void AllocTracker::FreeMem() { DoneAllocating(); } if (write_buffer_manager_ != nullptr && !freed_) { - if (write_buffer_manager_->enabled()) { + if (write_buffer_manager_->enabled() || + write_buffer_manager_->cost_to_cache()) { write_buffer_manager_->FreeMem( bytes_allocated_.load(std::memory_order_relaxed)); } else { diff --git a/thirdparty/rocksdb/memtable/hash_cuckoo_rep.cc b/thirdparty/rocksdb/memtable/hash_cuckoo_rep.cc deleted file mode 100644 index 034bf5858b..0000000000 --- a/thirdparty/rocksdb/memtable/hash_cuckoo_rep.cc +++ /dev/null @@ -1,660 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// - -#ifndef ROCKSDB_LITE -#include "memtable/hash_cuckoo_rep.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "db/memtable.h" -#include "memtable/skiplist.h" -#include "memtable/stl_wrappers.h" -#include "port/port.h" -#include "rocksdb/memtablerep.h" -#include "util/murmurhash.h" - -namespace rocksdb { -namespace { - -// the default maximum size of the cuckoo path searching queue -static const int kCuckooPathMaxSearchSteps = 100; - -struct CuckooStep { - static const int kNullStep = -1; - // the bucket id in the cuckoo array. - int bucket_id_; - // index of cuckoo-step array that points to its previous step, - // -1 if it the beginning step. - int prev_step_id_; - // the depth of the current step. 
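// For orientation: each CuckooStep below is one node of a BFS over
// buckets, and prev_step_id_ threads the steps into a backwards-linked
// tree, so a discovered path can be replayed from the vacant bucket back
// to its origin. A rough sketch of that back-walk, where "steps" stands
// for the step buffer used later in this file:
//
//   std::vector<int> path;
//   for (int id = found_step_id; id != CuckooStep::kNullStep;
//        id = steps[id].prev_step_id_) {
//     path.push_back(steps[id].bucket_id_);  // vacant bucket ends up first
//   }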
- unsigned int depth_; - - CuckooStep() : bucket_id_(-1), prev_step_id_(kNullStep), depth_(1) {} - - CuckooStep(CuckooStep&& o) = default; - - CuckooStep& operator=(CuckooStep&& rhs) { - bucket_id_ = std::move(rhs.bucket_id_); - prev_step_id_ = std::move(rhs.prev_step_id_); - depth_ = std::move(rhs.depth_); - return *this; - } - - CuckooStep(const CuckooStep&) = delete; - CuckooStep& operator=(const CuckooStep&) = delete; - - CuckooStep(int bucket_id, int prev_step_id, int depth) - : bucket_id_(bucket_id), prev_step_id_(prev_step_id), depth_(depth) {} -}; - -class HashCuckooRep : public MemTableRep { - public: - explicit HashCuckooRep(const MemTableRep::KeyComparator& compare, - Allocator* allocator, const size_t bucket_count, - const unsigned int hash_func_count, - const size_t approximate_entry_size) - : MemTableRep(allocator), - compare_(compare), - allocator_(allocator), - bucket_count_(bucket_count), - approximate_entry_size_(approximate_entry_size), - cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth), - occupied_count_(0), - hash_function_count_(hash_func_count), - backup_table_(nullptr) { - char* mem = reinterpret_cast( - allocator_->Allocate(sizeof(std::atomic) * bucket_count_)); - cuckoo_array_ = new (mem) std::atomic[bucket_count_]; - for (unsigned int bid = 0; bid < bucket_count_; ++bid) { - cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed); - } - - cuckoo_path_ = reinterpret_cast( - allocator_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 1))); - is_nearly_full_ = false; - } - - // return false, indicating HashCuckooRep does not support merge operator. - virtual bool IsMergeOperatorSupported() const override { return false; } - - // return false, indicating HashCuckooRep does not support snapshot. - virtual bool IsSnapshotSupported() const override { return false; } - - // Returns true iff an entry that compares equal to key is in the collection. - virtual bool Contains(const char* internal_key) const override; - - virtual ~HashCuckooRep() override {} - - // Insert the specified key (internal_key) into the mem-table. Assertion - // fails if - // the current mem-table already contains the specified key. - virtual void Insert(KeyHandle handle) override; - - // This function returns bucket_count_ * approximate_entry_size_ when any - // of the followings happen to disallow further write operations: - // 1. when the fullness reaches kMaxFullnes. - // 2. when the backup_table_ is used. - // - // otherwise, this function will always return 0. - virtual size_t ApproximateMemoryUsage() override { - if (is_nearly_full_) { - return bucket_count_ * approximate_entry_size_; - } - return 0; - } - - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override; - - class Iterator : public MemTableRep::Iterator { - std::shared_ptr> bucket_; - std::vector::const_iterator mutable cit_; - const KeyComparator& compare_; - std::string tmp_; // For passing to EncodeKey - bool mutable sorted_; - void DoSort() const; - - public: - explicit Iterator(std::shared_ptr> bucket, - const KeyComparator& compare); - - // Initialize an iterator over the specified collection. - // The returned iterator is not valid. - // explicit Iterator(const MemTableRep* collection); - virtual ~Iterator() override{}; - - // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override; - - // Returns the key at the current position. 
- // REQUIRES: Valid() - virtual const char* key() const override; - - // Advances to the next position. - // REQUIRES: Valid() - virtual void Next() override; - - // Advances to the previous position. - // REQUIRES: Valid() - virtual void Prev() override; - - // Advance to the first entry with a key >= target - virtual void Seek(const Slice& user_key, const char* memtable_key) override; - - // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& user_key, - const char* memtable_key) override; - - // Position at the first entry in collection. - // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override; - - // Position at the last entry in collection. - // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override; - }; - - struct CuckooStepBuffer { - CuckooStepBuffer() : write_index_(0), read_index_(0) {} - ~CuckooStepBuffer() {} - - int write_index_; - int read_index_; - CuckooStep steps_[kCuckooPathMaxSearchSteps]; - - CuckooStep& NextWriteBuffer() { return steps_[write_index_++]; } - - inline const CuckooStep& ReadNext() { return steps_[read_index_++]; } - - inline bool HasNewWrite() { return write_index_ > read_index_; } - - inline void reset() { - write_index_ = 0; - read_index_ = 0; - } - - inline bool IsFull() { return write_index_ >= kCuckooPathMaxSearchSteps; } - - // returns the number of steps that has been read - inline int ReadCount() { return read_index_; } - - // returns the number of steps that has been written to the buffer. - inline int WriteCount() { return write_index_; } - }; - - private: - const MemTableRep::KeyComparator& compare_; - // the pointer to Allocator to allocate memory, immutable after construction. - Allocator* const allocator_; - // the number of hash bucket in the hash table. - const size_t bucket_count_; - // approximate size of each entry - const size_t approximate_entry_size_; - // the maxinum depth of the cuckoo path. - const unsigned int cuckoo_path_max_depth_; - // the current number of entries in cuckoo_array_ which has been occupied. - size_t occupied_count_; - // the current number of hash functions used in the cuckoo hash. - unsigned int hash_function_count_; - // the backup MemTableRep to handle the case where cuckoo hash cannot find - // a vacant bucket for inserting the key of a put request. - std::shared_ptr backup_table_; - // the array to store pointers, pointing to the actual data. - std::atomic* cuckoo_array_; - // a buffer to store cuckoo path - int* cuckoo_path_; - // a boolean flag indicating whether the fullness of bucket array - // reaches the point to make the current memtable immutable. - bool is_nearly_full_; - - // the default maximum depth of the cuckoo path. - static const unsigned int kDefaultCuckooPathMaxDepth = 10; - - CuckooStepBuffer step_buffer_; - - // returns the bucket id assogied to the input slice based on the - unsigned int GetHash(const Slice& slice, const int hash_func_id) const { - // the seeds used in the Murmur hash to produce different hash functions. - static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = { - 545609244, 1769731426, 763324157, 13099088, 592422103, - 1899789565, 248369300, 1984183468, 1613664382, 1491157517}; - return static_cast( - MurmurHash(slice.data(), static_cast(slice.size()), - kMurmurHashSeeds[hash_func_id]) % - bucket_count_); - } - - // A cuckoo path is a sequence of bucket ids, where each id points to a - // location of cuckoo_array_. 
This path describes the displacement sequence - // of entries in order to store the desired data specified by the input user - // key. The path starts from one of the locations associated with the - // specified user key and ends at a vacant space in the cuckoo array. This - // function will update the cuckoo_path. - // - // @return true if it found a cuckoo path. - bool FindCuckooPath(const char* internal_key, const Slice& user_key, - int* cuckoo_path, size_t* cuckoo_path_length, - int initial_hash_id = 0); - - // Perform quick insert by checking whether there is a vacant bucket in one - // of the possible locations of the input key. If so, then the function will - // return true and the key will be stored in that vacant bucket. - // - // This function is a helper function of FindCuckooPath that discovers the - // first possible steps of a cuckoo path. It begins by first computing - // the possible locations of the input keys (and stores them in bucket_ids.) - // Then, if one of its possible locations is vacant, then the input key will - // be stored in that vacant space and the function will return true. - // Otherwise, the function will return false indicating a complete search - // of cuckoo-path is needed. - bool QuickInsert(const char* internal_key, const Slice& user_key, - int bucket_ids[], const int initial_hash_id); - - // Returns the pointer to the internal iterator to the buckets where buckets - // are sorted according to the user specified KeyComparator. Note that - // any insert after this function call may affect the sorted nature of - // the returned iterator. - virtual MemTableRep::Iterator* GetIterator(Arena* arena) override { - std::vector compact_buckets; - for (unsigned int bid = 0; bid < bucket_count_; ++bid) { - const char* bucket = cuckoo_array_[bid].load(std::memory_order_relaxed); - if (bucket != nullptr) { - compact_buckets.push_back(bucket); - } - } - MemTableRep* backup_table = backup_table_.get(); - if (backup_table != nullptr) { - std::unique_ptr iter(backup_table->GetIterator()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - compact_buckets.push_back(iter->key()); - } - } - if (arena == nullptr) { - return new Iterator( - std::shared_ptr>( - new std::vector(std::move(compact_buckets))), - compare_); - } else { - auto mem = arena->AllocateAligned(sizeof(Iterator)); - return new (mem) Iterator( - std::shared_ptr>( - new std::vector(std::move(compact_buckets))), - compare_); - } - } -}; - -void HashCuckooRep::Get(const LookupKey& key, void* callback_args, - bool (*callback_func)(void* arg, const char* entry)) { - Slice user_key = key.user_key(); - for (unsigned int hid = 0; hid < hash_function_count_; ++hid) { - const char* bucket = - cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire); - if (bucket != nullptr) { - Slice bucket_user_key = UserKey(bucket); - if (user_key == bucket_user_key) { - callback_func(callback_args, bucket); - break; - } - } else { - // as Put() always stores at the vacant bucket located by the - // hash function with the smallest possible id, when we first - // find a vacant bucket in Get(), that means a miss. 
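// Put differently, lookup probes the hash functions in ascending id order
// and may stop at the first empty slot, because Put() always fills the
// vacant location with the smallest hash id. A condensed sketch of the
// probe loop this comment refers to:
//
//   for (unsigned int hid = 0; hid < hash_function_count_; ++hid) {
//     const char* bucket =
//         cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire);
//     if (bucket == nullptr) break;        // first vacancy: a definite miss
//     if (user_key == UserKey(bucket)) {   // hit: report it and stop
//       callback_func(callback_args, bucket);
//       break;
//     }
//   }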
- break; - } - } - MemTableRep* backup_table = backup_table_.get(); - if (backup_table != nullptr) { - backup_table->Get(key, callback_args, callback_func); - } -} - -void HashCuckooRep::Insert(KeyHandle handle) { - static const float kMaxFullness = 0.90f; - - auto* key = static_cast(handle); - int initial_hash_id = 0; - size_t cuckoo_path_length = 0; - auto user_key = UserKey(key); - // find cuckoo path - if (FindCuckooPath(key, user_key, cuckoo_path_, &cuckoo_path_length, - initial_hash_id) == false) { - // if true, then we can't find a vacant bucket for this key even we - // have used up all the hash functions. Then use a backup memtable to - // store such key, which will further make this mem-table become - // immutable. - if (backup_table_.get() == nullptr) { - VectorRepFactory factory(10); - backup_table_.reset( - factory.CreateMemTableRep(compare_, allocator_, nullptr, nullptr)); - is_nearly_full_ = true; - } - backup_table_->Insert(key); - return; - } - // when reaching this point, means the insert can be done successfully. - occupied_count_++; - if (occupied_count_ >= bucket_count_ * kMaxFullness) { - is_nearly_full_ = true; - } - - // perform kickout process if the length of cuckoo path > 1. - if (cuckoo_path_length == 0) return; - - // the cuckoo path stores the kickout path in reverse order. - // so the kickout or displacement is actually performed - // in reverse order, which avoids false-negatives on read - // by moving each key involved in the cuckoo path to the new - // location before replacing it. - for (size_t i = 1; i < cuckoo_path_length; ++i) { - int kicked_out_bid = cuckoo_path_[i - 1]; - int current_bid = cuckoo_path_[i]; - // since we only allow one writer at a time, it is safe to do relaxed read. - cuckoo_array_[kicked_out_bid] - .store(cuckoo_array_[current_bid].load(std::memory_order_relaxed), - std::memory_order_release); - } - int insert_key_bid = cuckoo_path_[cuckoo_path_length - 1]; - cuckoo_array_[insert_key_bid].store(key, std::memory_order_release); -} - -bool HashCuckooRep::Contains(const char* internal_key) const { - auto user_key = UserKey(internal_key); - for (unsigned int hid = 0; hid < hash_function_count_; ++hid) { - const char* stored_key = - cuckoo_array_[GetHash(user_key, hid)].load(std::memory_order_acquire); - if (stored_key != nullptr) { - if (compare_(internal_key, stored_key) == 0) { - return true; - } - } - } - return false; -} - -bool HashCuckooRep::QuickInsert(const char* internal_key, const Slice& user_key, - int bucket_ids[], const int initial_hash_id) { - int cuckoo_bucket_id = -1; - - // Below does the followings: - // 0. Calculate all possible locations of the input key. - // 1. Check if there is a bucket having same user_key as the input does. - // 2. If there exists such bucket, then replace this bucket by the newly - // insert data and return. This step also performs duplication check. - // 3. If no such bucket exists but exists a vacant bucket, then insert the - // input data into it. - // 4. If step 1 to 3 all fail, then return false. - for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) { - bucket_ids[hid] = GetHash(user_key, hid); - // since only one PUT is allowed at a time, and this is part of the PUT - // operation, so we can safely perform relaxed load. 
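// The memory-ordering contract used throughout this rep, in brief: the
// single writer may read buckets with std::memory_order_relaxed, but every
// store that publishes or moves a key uses std::memory_order_release, so
// concurrent readers (which load with acquire in Get()) always observe
// fully written entries. The displacement step in Insert() above is the
// canonical pair:
//
//   cuckoo_array_[kicked_out_bid].store(
//       cuckoo_array_[current_bid].load(std::memory_order_relaxed),
//       std::memory_order_release);  // publish before the old slot is reused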
- const char* stored_key = - cuckoo_array_[bucket_ids[hid]].load(std::memory_order_relaxed); - if (stored_key == nullptr) { - if (cuckoo_bucket_id == -1) { - cuckoo_bucket_id = bucket_ids[hid]; - } - } else { - const auto bucket_user_key = UserKey(stored_key); - if (bucket_user_key.compare(user_key) == 0) { - cuckoo_bucket_id = bucket_ids[hid]; - break; - } - } - } - - if (cuckoo_bucket_id != -1) { - cuckoo_array_[cuckoo_bucket_id].store(const_cast(internal_key), - std::memory_order_release); - return true; - } - - return false; -} - -// Perform pre-check and find the shortest cuckoo path. A cuckoo path -// is a displacement sequence for inserting the specified input key. -// -// @return true if it successfully found a vacant space or cuckoo-path. -// If the return value is true but the length of cuckoo_path is zero, -// then it indicates that a vacant bucket or an bucket with matched user -// key with the input is found, and a quick insertion is done. -bool HashCuckooRep::FindCuckooPath(const char* internal_key, - const Slice& user_key, int* cuckoo_path, - size_t* cuckoo_path_length, - const int initial_hash_id) { - int bucket_ids[HashCuckooRepFactory::kMaxHashCount]; - *cuckoo_path_length = 0; - - if (QuickInsert(internal_key, user_key, bucket_ids, initial_hash_id)) { - return true; - } - // If this step is reached, then it means: - // 1. no vacant bucket in any of the possible locations of the input key. - // 2. none of the possible locations of the input key has the same user - // key as the input `internal_key`. - - // the front and back indices for the step_queue_ - step_buffer_.reset(); - - for (unsigned int hid = initial_hash_id; hid < hash_function_count_; ++hid) { - /// CuckooStep& current_step = step_queue_[front_pos++]; - CuckooStep& current_step = step_buffer_.NextWriteBuffer(); - current_step.bucket_id_ = bucket_ids[hid]; - current_step.prev_step_id_ = CuckooStep::kNullStep; - current_step.depth_ = 1; - } - - while (step_buffer_.HasNewWrite()) { - int step_id = step_buffer_.read_index_; - const CuckooStep& step = step_buffer_.ReadNext(); - // Since it's a BFS process, then the first step with its depth deeper - // than the maximum allowed depth indicates all the remaining steps - // in the step buffer queue will all exceed the maximum depth. - // Return false immediately indicating we can't find a vacant bucket - // for the input key before the maximum allowed depth. - if (step.depth_ >= cuckoo_path_max_depth_) { - return false; - } - // again, we can perform no barrier load safely here as the current - // thread is the only writer. - Slice bucket_user_key = - UserKey(cuckoo_array_[step.bucket_id_].load(std::memory_order_relaxed)); - if (step.prev_step_id_ != CuckooStep::kNullStep) { - if (bucket_user_key == user_key) { - // then there is a loop in the current path, stop discovering this path. - continue; - } - } - // if the current bucket stores at its nth location, then we only consider - // its mth location where m > n. This property makes sure that all reads - // will not miss if we do have data associated to the query key. - // - // The n and m in the above statement is the start_hid and hid in the code. - unsigned int start_hid = hash_function_count_; - for (unsigned int hid = 0; hid < hash_function_count_; ++hid) { - bucket_ids[hid] = GetHash(bucket_user_key, hid); - if (step.bucket_id_ == bucket_ids[hid]) { - start_hid = hid; - } - } - // must found a bucket which is its current "home". 
- assert(start_hid != hash_function_count_); - - // explore all possible next steps from the current step. - for (unsigned int hid = start_hid + 1; hid < hash_function_count_; ++hid) { - CuckooStep& next_step = step_buffer_.NextWriteBuffer(); - next_step.bucket_id_ = bucket_ids[hid]; - next_step.prev_step_id_ = step_id; - next_step.depth_ = step.depth_ + 1; - // once a vacant bucket is found, trace back all its previous steps - // to generate a cuckoo path. - if (cuckoo_array_[next_step.bucket_id_].load(std::memory_order_relaxed) == - nullptr) { - // store the last step in the cuckoo path. Note that cuckoo_path - // stores steps in reverse order. This allows us to move keys along - // the cuckoo path by storing each key to the new place first before - // removing it from the old place. This property ensures reads will - // not missed due to moving keys along the cuckoo path. - cuckoo_path[(*cuckoo_path_length)++] = next_step.bucket_id_; - int depth; - for (depth = step.depth_; depth > 0 && step_id != CuckooStep::kNullStep; - depth--) { - const CuckooStep& prev_step = step_buffer_.steps_[step_id]; - cuckoo_path[(*cuckoo_path_length)++] = prev_step.bucket_id_; - step_id = prev_step.prev_step_id_; - } - assert(depth == 0 && step_id == CuckooStep::kNullStep); - return true; - } - if (step_buffer_.IsFull()) { - // if true, then it reaches maxinum number of cuckoo search steps. - return false; - } - } - } - - // tried all possible paths but still not unable to find a cuckoo path - // which path leads to a vacant bucket. - return false; -} - -HashCuckooRep::Iterator::Iterator( - std::shared_ptr> bucket, - const KeyComparator& compare) - : bucket_(bucket), - cit_(bucket_->end()), - compare_(compare), - sorted_(false) {} - -void HashCuckooRep::Iterator::DoSort() const { - if (!sorted_) { - std::sort(bucket_->begin(), bucket_->end(), - stl_wrappers::Compare(compare_)); - cit_ = bucket_->begin(); - sorted_ = true; - } -} - -// Returns true iff the iterator is positioned at a valid node. -bool HashCuckooRep::Iterator::Valid() const { - DoSort(); - return cit_ != bucket_->end(); -} - -// Returns the key at the current position. -// REQUIRES: Valid() -const char* HashCuckooRep::Iterator::key() const { - assert(Valid()); - return *cit_; -} - -// Advances to the next position. -// REQUIRES: Valid() -void HashCuckooRep::Iterator::Next() { - assert(Valid()); - if (cit_ == bucket_->end()) { - return; - } - ++cit_; -} - -// Advances to the previous position. -// REQUIRES: Valid() -void HashCuckooRep::Iterator::Prev() { - assert(Valid()); - if (cit_ == bucket_->begin()) { - // If you try to go back from the first element, the iterator should be - // invalidated. So we set it to past-the-end. This means that you can - // treat the container circularly. - cit_ = bucket_->end(); - } else { - --cit_; - } -} - -// Advance to the first entry with a key >= target -void HashCuckooRep::Iterator::Seek(const Slice& user_key, - const char* memtable_key) { - DoSort(); - // Do binary search to find first value not less than the target - const char* encoded_key = - (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, user_key); - cit_ = std::equal_range(bucket_->begin(), bucket_->end(), encoded_key, - [this](const char* a, const char* b) { - return compare_(a, b) < 0; - }).first; -} - -// Retreat to the last entry with a key <= target -void HashCuckooRep::Iterator::SeekForPrev(const Slice& user_key, - const char* memtable_key) { - assert(false); -} - -// Position at the first entry in collection. 
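// The iterator above is a snapshot-and-sort design: GetIterator() copies
// the occupied bucket pointers into a plain vector, and DoSort() defers
// the O(n log n) sort until the first read, so constructing an iterator
// that is never positioned stays cheap. Seek() then binary-searches the
// sorted vector; its equal_range(...).first is equivalent to:
//
//   cit_ = std::lower_bound(bucket_->begin(), bucket_->end(), encoded_key,
//                           [this](const char* a, const char* b) {
//                             return compare_(a, b) < 0;
//                           });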
-// Final state of iterator is Valid() iff collection is not empty. -void HashCuckooRep::Iterator::SeekToFirst() { - DoSort(); - cit_ = bucket_->begin(); -} - -// Position at the last entry in collection. -// Final state of iterator is Valid() iff collection is not empty. -void HashCuckooRep::Iterator::SeekToLast() { - DoSort(); - cit_ = bucket_->end(); - if (bucket_->size() != 0) { - --cit_; - } -} - -} // anom namespace - -MemTableRep* HashCuckooRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { - // The estimated average fullness. The write performance of any close hash - // degrades as the fullness of the mem-table increases. Setting kFullness - // to a value around 0.7 can better avoid write performance degradation while - // keeping efficient memory usage. - static const float kFullness = 0.7f; - size_t pointer_size = sizeof(std::atomic); - assert(write_buffer_size_ >= (average_data_size_ + pointer_size)); - size_t bucket_count = - static_cast( - (write_buffer_size_ / (average_data_size_ + pointer_size)) / kFullness + - 1); - unsigned int hash_function_count = hash_function_count_; - if (hash_function_count < 2) { - hash_function_count = 2; - } - if (hash_function_count > kMaxHashCount) { - hash_function_count = kMaxHashCount; - } - return new HashCuckooRep(compare, allocator, bucket_count, - hash_function_count, - static_cast( - (average_data_size_ + pointer_size) / kFullness) - ); -} - -MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size, - size_t average_data_size, - unsigned int hash_function_count) { - return new HashCuckooRepFactory(write_buffer_size, average_data_size, - hash_function_count); -} - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/memtable/hash_cuckoo_rep.h b/thirdparty/rocksdb/memtable/hash_cuckoo_rep.h deleted file mode 100644 index 800696e931..0000000000 --- a/thirdparty/rocksdb/memtable/hash_cuckoo_rep.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#ifndef ROCKSDB_LITE -#include "port/port.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/memtablerep.h" - -namespace rocksdb { - -class HashCuckooRepFactory : public MemTableRepFactory { - public: - // maxinum number of hash functions used in the cuckoo hash. 
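// A worked example of the sizing formula in CreateMemTableRep above, with
// illustrative numbers: for write_buffer_size = 64 MiB, an average entry
// of 256 bytes and an 8-byte bucket pointer,
//
//   bucket_count = (67108864 / (256 + 8)) / 0.7 + 1  ~= 363,143
//
// i.e. the table is provisioned so that it sits near the 70% (kFullness)
// occupancy target when the write buffer fills.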
- static const unsigned int kMaxHashCount = 10; - - explicit HashCuckooRepFactory(size_t write_buffer_size, - size_t average_data_size, - unsigned int hash_function_count) - : write_buffer_size_(write_buffer_size), - average_data_size_(average_data_size), - hash_function_count_(hash_function_count) {} - - virtual ~HashCuckooRepFactory() {} - - using MemTableRepFactory::CreateMemTableRep; - virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) override; - - virtual const char* Name() const override { return "HashCuckooRepFactory"; } - - private: - size_t write_buffer_size_; - size_t average_data_size_; - const unsigned int hash_function_count_; -}; -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/memtable/hash_linklist_rep.cc b/thirdparty/rocksdb/memtable/hash_linklist_rep.cc index 932b62a346..878d233835 100644 --- a/thirdparty/rocksdb/memtable/hash_linklist_rep.cc +++ b/thirdparty/rocksdb/memtable/hash_linklist_rep.cc @@ -17,7 +17,7 @@ #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "util/arena.h" -#include "util/murmurhash.h" +#include "util/hash.h" namespace rocksdb { namespace { @@ -168,24 +168,23 @@ class HashLinkListRep : public MemTableRep { int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash); - virtual KeyHandle Allocate(const size_t len, char** buf) override; + KeyHandle Allocate(const size_t len, char** buf) override; - virtual void Insert(KeyHandle handle) override; + void Insert(KeyHandle handle) override; - virtual bool Contains(const char* key) const override; + bool Contains(const char* key) const override; - virtual size_t ApproximateMemoryUsage() override; + size_t ApproximateMemoryUsage() override; - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override; + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override; - virtual ~HashLinkListRep(); + ~HashLinkListRep() override; - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; - virtual MemTableRep::Iterator* GetDynamicPrefixIterator( - Arena* arena = nullptr) override; + MemTableRep::Iterator* GetDynamicPrefixIterator( + Arena* arena = nullptr) override; private: friend class DynamicIterator; @@ -219,7 +218,7 @@ class HashLinkListRep : public MemTableRep { } size_t GetHash(const Slice& slice) const { - return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + return NPHash64(slice.data(), static_cast(slice.size()), 0) % bucket_size_; } @@ -265,36 +264,34 @@ class HashLinkListRep : public MemTableRep { explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) : iter_(list), full_list_(list), allocator_(allocator) {} - virtual ~FullListIterator() { - } + ~FullListIterator() override {} // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override { return iter_.Valid(); } + bool Valid() const override { return iter_.Valid(); } // Returns the key at the current position. // REQUIRES: Valid() - virtual const char* key() const override { + const char* key() const override { assert(Valid()); return iter_.key(); } // Advances to the next position. 
// REQUIRES: Valid() - virtual void Next() override { + void Next() override { assert(Valid()); iter_.Next(); } // Advances to the previous position. // REQUIRES: Valid() - virtual void Prev() override { + void Prev() override { assert(Valid()); iter_.Prev(); } // Advance to the first entry with a key >= target - virtual void Seek(const Slice& internal_key, - const char* memtable_key) override { + void Seek(const Slice& internal_key, const char* memtable_key) override { const char* encoded_key = (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, internal_key); @@ -302,8 +299,8 @@ class HashLinkListRep : public MemTableRep { } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { const char* encoded_key = (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, internal_key); @@ -312,11 +309,12 @@ class HashLinkListRep : public MemTableRep { // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override { iter_.SeekToFirst(); } + void SeekToFirst() override { iter_.SeekToFirst(); } // Position at the last entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override { iter_.SeekToLast(); } + void SeekToLast() override { iter_.SeekToLast(); } + private: MemtableSkipList::Iterator iter_; // To destruct with the iterator. @@ -333,43 +331,43 @@ class HashLinkListRep : public MemTableRep { head_(head), node_(nullptr) {} - virtual ~LinkListIterator() {} + ~LinkListIterator() override {} // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override { return node_ != nullptr; } + bool Valid() const override { return node_ != nullptr; } // Returns the key at the current position. // REQUIRES: Valid() - virtual const char* key() const override { + const char* key() const override { assert(Valid()); return node_->key; } // Advances to the next position. // REQUIRES: Valid() - virtual void Next() override { + void Next() override { assert(Valid()); node_ = node_->Next(); } // Advances to the previous position. // REQUIRES: Valid() - virtual void Prev() override { + void Prev() override { // Prefix iterator does not support total order. // We simply set the iterator to invalid state Reset(nullptr); } // Advance to the first entry with a key >= target - virtual void Seek(const Slice& internal_key, - const char* memtable_key) override { + void Seek(const Slice& internal_key, + const char* /*memtable_key*/) override { node_ = hash_link_list_rep_->FindGreaterOrEqualInBucket(head_, internal_key); } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override { // Since we do not support Prev() // We simply do not support SeekForPrev Reset(nullptr); @@ -377,7 +375,7 @@ class HashLinkListRep : public MemTableRep { // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override { + void SeekToFirst() override { // Prefix iterator does not support total order. 
// We simply set the iterator to invalid state Reset(nullptr); @@ -385,7 +383,7 @@ class HashLinkListRep : public MemTableRep { // Position at the last entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override { + void SeekToLast() override { // Prefix iterator does not support total order. // We simply set the iterator to invalid state Reset(nullptr); @@ -414,7 +412,7 @@ class HashLinkListRep : public MemTableRep { memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target - virtual void Seek(const Slice& k, const char* memtable_key) override { + void Seek(const Slice& k, const char* memtable_key) override { auto transformed = memtable_rep_.GetPrefix(k); auto* bucket = memtable_rep_.GetBucket(transformed); @@ -443,21 +441,21 @@ class HashLinkListRep : public MemTableRep { } } - virtual bool Valid() const override { + bool Valid() const override { if (skip_list_iter_) { return skip_list_iter_->Valid(); } return HashLinkListRep::LinkListIterator::Valid(); } - virtual const char* key() const override { + const char* key() const override { if (skip_list_iter_) { return skip_list_iter_->key(); } return HashLinkListRep::LinkListIterator::key(); } - virtual void Next() override { + void Next() override { if (skip_list_iter_) { skip_list_iter_->Next(); } else { @@ -476,19 +474,19 @@ class HashLinkListRep : public MemTableRep { // instantiating an empty bucket over which to iterate. public: EmptyIterator() { } - virtual bool Valid() const override { return false; } - virtual const char* key() const override { + bool Valid() const override { return false; } + const char* key() const override { assert(false); return nullptr; } - virtual void Next() override {} - virtual void Prev() override {} - virtual void Seek(const Slice& user_key, - const char* memtable_key) override {} - virtual void SeekForPrev(const Slice& user_key, - const char* memtable_key) override {} - virtual void SeekToFirst() override {} - virtual void SeekToLast() override {} + void Next() override {} + void Prev() override {} + void Seek(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} + void SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} private: }; diff --git a/thirdparty/rocksdb/memtable/hash_skiplist_rep.cc b/thirdparty/rocksdb/memtable/hash_skiplist_rep.cc index e34743eb2c..d02919cd4e 100644 --- a/thirdparty/rocksdb/memtable/hash_skiplist_rep.cc +++ b/thirdparty/rocksdb/memtable/hash_skiplist_rep.cc @@ -28,21 +28,20 @@ class HashSkipListRep : public MemTableRep { size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor); - virtual void Insert(KeyHandle handle) override; + void Insert(KeyHandle handle) override; - virtual bool Contains(const char* key) const override; + bool Contains(const char* key) const override; - virtual size_t ApproximateMemoryUsage() override; + size_t ApproximateMemoryUsage() override; - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override; + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override; - virtual ~HashSkipListRep(); + ~HashSkipListRep() override; - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; - virtual MemTableRep::Iterator* 
GetDynamicPrefixIterator( + MemTableRep::Iterator* GetDynamicPrefixIterator( Arena* arena = nullptr) override; private: @@ -85,7 +84,7 @@ class HashSkipListRep : public MemTableRep { Arena* arena = nullptr) : list_(list), iter_(list), own_list_(own_list), arena_(arena) {} - virtual ~Iterator() { + ~Iterator() override { // if we own the list, we should also delete it if (own_list_) { assert(list_ != nullptr); @@ -94,34 +93,31 @@ class HashSkipListRep : public MemTableRep { } // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override { - return list_ != nullptr && iter_.Valid(); - } + bool Valid() const override { return list_ != nullptr && iter_.Valid(); } // Returns the key at the current position. // REQUIRES: Valid() - virtual const char* key() const override { + const char* key() const override { assert(Valid()); return iter_.key(); } // Advances to the next position. // REQUIRES: Valid() - virtual void Next() override { + void Next() override { assert(Valid()); iter_.Next(); } // Advances to the previous position. // REQUIRES: Valid() - virtual void Prev() override { + void Prev() override { assert(Valid()); iter_.Prev(); } // Advance to the first entry with a key >= target - virtual void Seek(const Slice& internal_key, - const char* memtable_key) override { + void Seek(const Slice& internal_key, const char* memtable_key) override { if (list_ != nullptr) { const char* encoded_key = (memtable_key != nullptr) ? @@ -131,15 +127,15 @@ class HashSkipListRep : public MemTableRep { } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override { // not supported assert(false); } // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override { + void SeekToFirst() override { if (list_ != nullptr) { iter_.SeekToFirst(); } @@ -147,11 +143,12 @@ class HashSkipListRep : public MemTableRep { // Position at the last entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override { + void SeekToLast() override { if (list_ != nullptr) { iter_.SeekToLast(); } } + protected: void Reset(Bucket* list) { if (own_list_) { @@ -168,7 +165,7 @@ class HashSkipListRep : public MemTableRep { Bucket* list_; Bucket::Iterator iter_; // here we track if we own list_. If we own it, we are also - // responsible for it's cleaning. This is a poor man's shared_ptr + // responsible for it's cleaning. This is a poor man's std::shared_ptr bool own_list_; std::unique_ptr arena_; std::string tmp_; // For passing to EncodeKey @@ -181,7 +178,7 @@ class HashSkipListRep : public MemTableRep { memtable_rep_(memtable_rep) {} // Advance to the first entry with a key >= target - virtual void Seek(const Slice& k, const char* memtable_key) override { + void Seek(const Slice& k, const char* memtable_key) override { auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k)); Reset(memtable_rep_.GetBucket(transformed)); HashSkipListRep::Iterator::Seek(k, memtable_key); @@ -189,7 +186,7 @@ class HashSkipListRep : public MemTableRep { // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override { + void SeekToFirst() override { // Prefix iterator does not support total order. 
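// The asymmetry above is deliberate: a dynamic prefix iterator can honor
// Seek(k) -- transform the user key, locate that prefix's bucket, then
// seek within the bucket's skip list -- but SeekToFirst()/SeekToLast()
// would require a total order across hash buckets, which this layout
// cannot provide, so they simply invalidate the iterator. The Seek() path,
// condensed from DynamicIterator::Seek above:
//
//   auto transformed = memtable_rep_.transform_->Transform(ExtractUserKey(k));
//   Reset(memtable_rep_.GetBucket(transformed));  // nullptr -> stays invalid
//   HashSkipListRep::Iterator::Seek(k, memtable_key);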
// We simply set the iterator to invalid state Reset(nullptr); @@ -197,11 +194,12 @@ class HashSkipListRep : public MemTableRep { // Position at the last entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override { + void SeekToLast() override { // Prefix iterator does not support total order. // We simply set the iterator to invalid state Reset(nullptr); } + private: // the underlying memtable const HashSkipListRep& memtable_rep_; @@ -212,19 +210,19 @@ class HashSkipListRep : public MemTableRep { // instantiating an empty bucket over which to iterate. public: EmptyIterator() { } - virtual bool Valid() const override { return false; } - virtual const char* key() const override { + bool Valid() const override { return false; } + const char* key() const override { assert(false); return nullptr; } - virtual void Next() override {} - virtual void Prev() override {} - virtual void Seek(const Slice& internal_key, - const char* memtable_key) override {} - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override {} - virtual void SeekToFirst() override {} - virtual void SeekToLast() override {} + void Next() override {} + void Prev() override {} + void Seek(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override {} + void SeekForPrev(const Slice& /*internal_key*/, + const char* /*memtable_key*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} private: }; @@ -335,7 +333,7 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) { MemTableRep* HashSkipListRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { + const SliceTransform* transform, Logger* /*logger*/) { return new HashSkipListRep(compare, allocator, transform, bucket_count_, skiplist_height_, skiplist_branching_factor_); } diff --git a/thirdparty/rocksdb/memtable/inlineskiplist.h b/thirdparty/rocksdb/memtable/inlineskiplist.h index 5cf6c57d57..1ef8f2b6db 100644 --- a/thirdparty/rocksdb/memtable/inlineskiplist.h +++ b/thirdparty/rocksdb/memtable/inlineskiplist.h @@ -45,8 +45,12 @@ #include #include #include +#include +#include "port/likely.h" #include "port/port.h" +#include "rocksdb/slice.h" #include "util/allocator.h" +#include "util/coding.h" #include "util/random.h" namespace rocksdb { @@ -58,6 +62,9 @@ class InlineSkipList { struct Splice; public: + using DecodedKey = \ + typename std::remove_reference::type::DecodedType; + static const uint16_t kMaxPossibleHeight = 32; // Create a new InlineSkipList object that will use "cmp" for comparing @@ -81,7 +88,7 @@ class InlineSkipList { // // REQUIRES: nothing that compares equal to key is currently in the list. // REQUIRES: no concurrent calls to any of inserts. - void Insert(const char* key); + bool Insert(const char* key); // Inserts a key allocated by AllocateKey with a hint of last insert // position in the skip-list. If hint points to nullptr, a new hint will be @@ -93,10 +100,10 @@ class InlineSkipList { // // REQUIRES: nothing that compares equal to key is currently in the list. // REQUIRES: no concurrent calls to any of inserts. - void InsertWithHint(const char* key, void** hint); + bool InsertWithHint(const char* key, void** hint); // Like Insert, but external synchronization is not required. - void InsertConcurrently(const char* key); + bool InsertConcurrently(const char* key); // Inserts a node into the skip list. 
key must have been allocated by // AllocateKey and then filled in by the caller. If UseCAS is true, @@ -114,7 +121,7 @@ class InlineSkipList { // false has worse running time for the non-sequential case O(log N), // but a better constant factor. template - void Insert(const char* key, Splice* splice, bool allow_partial_splice_fix); + bool Insert(const char* key, Splice* splice, bool allow_partial_splice_fix); // Returns true iff an entry that compares equal to key is in the list. bool Contains(const char* key) const; @@ -177,10 +184,9 @@ class InlineSkipList { const uint16_t kBranching_; const uint32_t kScaledInverseBranching_; + Allocator* const allocator_; // Allocator used for allocations of nodes // Immutable after construction Comparator const compare_; - Allocator* const allocator_; // Allocator used for allocations of nodes - Node* const head_; // Modified only by Insert(). Read racily by readers, but stale @@ -211,6 +217,7 @@ class InlineSkipList { // Return true if key is greater than the data stored in "n". Null n // is considered infinite. n should not be head_. bool KeyIsAfterNode(const char* key, Node* n) const; + bool KeyIsAfterNode(const DecodedKey& key, Node* n) const; // Returns the earliest node with a key >= key. // Return nullptr if there is no such node. @@ -239,12 +246,13 @@ class InlineSkipList { // point to a node that is before the key, and after should point to // a node that is after the key. after should be nullptr if a good after // node isn't conveniently available. - void FindSpliceForLevel(const char* key, Node* before, Node* after, int level, + template + void FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, int level, Node** out_prev, Node** out_next); // Recomputes Splice levels from highest_level (inclusive) down to // lowest_level (inclusive). - void RecomputeSpliceLevels(const char* key, Splice* splice, + void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level); // No copying allowed @@ -278,7 +286,7 @@ struct InlineSkipList::Node { // next_[0]. This is used for passing data from AllocateKey to Insert. void StashHeight(const int height) { assert(sizeof(int) <= sizeof(next_[0])); - memcpy(&next_[0], &height, sizeof(int)); + memcpy(static_cast(&next_[0]), &height, sizeof(int)); } // Retrieves the value passed to StashHeight. Undefined after a call @@ -298,30 +306,30 @@ struct InlineSkipList::Node { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return (next_[-n].load(std::memory_order_acquire)); + return ((&next_[0] - n)->load(std::memory_order_acquire)); } void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_[-n].store(x, std::memory_order_release); + (&next_[0] - n)->store(x, std::memory_order_release); } bool CASNext(int n, Node* expected, Node* x) { assert(n >= 0); - return next_[-n].compare_exchange_strong(expected, x); + return (&next_[0] - n)->compare_exchange_strong(expected, x); } // No-barrier variants that can be safely used in a few locations. 
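// Layout reminder for the pointer arithmetic in this hunk: a node of
// height h is allocated as h-1 "upper" links, then next_[0], then the key
// bytes, roughly
//
//   [ link h-1 ] ... [ link 1 ][ next_[0] ][ key bytes ... ]
//                               ^-- the Node* points here
//
// so link n sits below next_[0] in memory and is reached as
// (&next_[0] - n). The rewrite from next_[-n] computes the same address
// while avoiding a negative subscript on an array declared with a single
// element, which sanitizers tend to flag.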
Node* NoBarrier_Next(int n) { assert(n >= 0); - return next_[-n].load(std::memory_order_relaxed); + return (&next_[0] - n)->load(std::memory_order_relaxed); } void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - next_[-n].store(x, std::memory_order_relaxed); + (&next_[0] - n)->store(x, std::memory_order_relaxed); } // Insert node after prev on specific level. @@ -433,6 +441,14 @@ bool InlineSkipList::KeyIsAfterNode(const char* key, return (n != nullptr) && (compare_(n->Key(), key) < 0); } +template +bool InlineSkipList::KeyIsAfterNode(const DecodedKey& key, + Node* n) const { + // nullptr n is considered infinite + assert(n != head_); + return (n != nullptr) && (compare_(n->Key(), key) < 0); +} + template typename InlineSkipList::Node* InlineSkipList::FindGreaterOrEqual(const char* key) const { @@ -444,15 +460,19 @@ InlineSkipList::FindGreaterOrEqual(const char* key) const { Node* x = head_; int level = GetMaxHeight() - 1; Node* last_bigger = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { Node* next = x->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } // Make sure the lists are sorted assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); // Make sure we haven't overshot during our search - assert(x == head_ || KeyIsAfterNode(key, x)); + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); int cmp = (next == nullptr || next == last_bigger) ? 1 - : compare_(next->Key(), key); + : compare_(next->Key(), key_decoded); if (cmp == 0 || (cmp > 0 && level == 0)) { return next; } else if (cmp < 0) { @@ -482,12 +502,18 @@ InlineSkipList::FindLessThan(const char* key, Node** prev, Node* x = root; // KeyIsAfter(key, last_not_after) is definitely false Node* last_not_after = nullptr; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { + assert(x != nullptr); Node* next = x->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x)); - assert(x == head_ || KeyIsAfterNode(key, x)); - if (next != last_not_after && KeyIsAfterNode(key, next)) { + assert(x == head_ || KeyIsAfterNode(key_decoded, x)); + if (next != last_not_after && KeyIsAfterNode(key_decoded, next)) { // Keep searching in this list + assert(next != nullptr); x = next; } else { if (prev != nullptr) { @@ -530,10 +556,14 @@ uint64_t InlineSkipList::EstimateCount(const char* key) const { Node* x = head_; int level = GetMaxHeight() - 1; + const DecodedKey key_decoded = compare_.decode_key(key); while (true) { - assert(x == head_ || compare_(x->Key(), key) < 0); + assert(x == head_ || compare_(x->Key(), key_decoded) < 0); Node* next = x->Next(level); - if (next == nullptr || compare_(next->Key(), key) >= 0) { + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) { if (level == 0) { return count; } else { @@ -553,11 +583,11 @@ InlineSkipList::InlineSkipList(const Comparator cmp, Allocator* allocator, int32_t max_height, int32_t branching_factor) - : kMaxHeight_(max_height), - kBranching_(branching_factor), + : kMaxHeight_(static_cast(max_height)), + kBranching_(static_cast(branching_factor)), kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), - compare_(cmp), allocator_(allocator), + compare_(cmp), head_(AllocateNode(0, max_height)), max_height_(1), seq_splice_(AllocateSplice()) { @@ -614,38 +644,47 @@ InlineSkipList::AllocateSplice() { } template -void 
InlineSkipList::Insert(const char* key) { - Insert(key, seq_splice_, false); +bool InlineSkipList::Insert(const char* key) { + return Insert(key, seq_splice_, false); } template -void InlineSkipList::InsertConcurrently(const char* key) { +bool InlineSkipList::InsertConcurrently(const char* key) { Node* prev[kMaxPossibleHeight]; Node* next[kMaxPossibleHeight]; Splice splice; splice.prev_ = prev; splice.next_ = next; - Insert(key, &splice, false); + return Insert(key, &splice, false); } template -void InlineSkipList::InsertWithHint(const char* key, void** hint) { +bool InlineSkipList::InsertWithHint(const char* key, void** hint) { assert(hint != nullptr); Splice* splice = reinterpret_cast(*hint); if (splice == nullptr) { splice = AllocateSplice(); *hint = reinterpret_cast(splice); } - Insert(key, splice, true); + return Insert(key, splice, true); } template -void InlineSkipList::FindSpliceForLevel(const char* key, +template +void InlineSkipList::FindSpliceForLevel(const DecodedKey& key, Node* before, Node* after, int level, Node** out_prev, Node** out_next) { while (true) { Node* next = before->Next(level); + if (next != nullptr) { + PREFETCH(next->Next(level), 0, 1); + } + if (prefetch_before == true) { + if (next != nullptr && level>0) { + PREFETCH(next->Next(level-1), 0, 1); + } + } assert(before == head_ || next == nullptr || KeyIsAfterNode(next->Key(), before)); assert(before == head_ || KeyIsAfterNode(key, before)); @@ -660,22 +699,23 @@ void InlineSkipList::FindSpliceForLevel(const char* key, } template -void InlineSkipList::RecomputeSpliceLevels(const char* key, +void InlineSkipList::RecomputeSpliceLevels(const DecodedKey& key, Splice* splice, int recompute_level) { assert(recompute_level > 0); assert(recompute_level <= splice->height_); for (int i = recompute_level - 1; i >= 0; --i) { - FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, + FindSpliceForLevel(key, splice->prev_[i + 1], splice->next_[i + 1], i, &splice->prev_[i], &splice->next_[i]); } } template template -void InlineSkipList::Insert(const char* key, Splice* splice, +bool InlineSkipList::Insert(const char* key, Splice* splice, bool allow_partial_splice_fix) { Node* x = reinterpret_cast(const_cast(key)) - 1; + const DecodedKey key_decoded = compare_.decode_key(key); int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); @@ -743,7 +783,8 @@ void InlineSkipList::Insert(const char* key, Splice* splice, // our chances of success. 
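// Context for the signature change running through this hunk: Insert(),
// InsertWithHint() and InsertConcurrently() now return bool rather than
// void, and a key comparing equal to an existing one is reported by
// returning false from the level-0 splice checks below (level 0 alone is
// sufficient, since every node is linked there). A caller sketch, with
// illustrative names:
//
//   if (!list.Insert(encoded_key)) {
//     // duplicate internal key -- report corruption instead of asserting
//   }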
++recompute_height; } else if (splice->prev_[recompute_height] != head_ && - !KeyIsAfterNode(key, splice->prev_[recompute_height])) { + !KeyIsAfterNode(key_decoded, + splice->prev_[recompute_height])) { // key is from before splice if (allow_partial_splice_fix) { // skip all levels with the same node without more comparisons @@ -755,7 +796,8 @@ void InlineSkipList::Insert(const char* key, Splice* splice, // we're pessimistic, recompute everything recompute_height = max_height; } - } else if (KeyIsAfterNode(key, splice->next_[recompute_height])) { + } else if (KeyIsAfterNode(key_decoded, + splice->next_[recompute_height])) { // key is from after splice if (allow_partial_splice_fix) { Node* bad = splice->next_[recompute_height]; @@ -773,13 +815,24 @@ void InlineSkipList::Insert(const char* key, Splice* splice, } assert(recompute_height <= max_height); if (recompute_height > 0) { - RecomputeSpliceLevels(key, splice, recompute_height); + RecomputeSpliceLevels(key_decoded, splice, recompute_height); } bool splice_is_valid = true; if (UseCAS) { for (int i = 0; i < height; ++i) { while (true) { + // Checking for duplicate keys on the level 0 is sufficient + if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && + compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + // duplicate key + return false; + } + if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && + compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + // duplicate key + return false; + } assert(splice->next_[i] == nullptr || compare_(x->Key(), splice->next_[i]->Key()) < 0); assert(splice->prev_[i] == head_ || @@ -794,8 +847,8 @@ void InlineSkipList::Insert(const char* key, Splice* splice, // search, because it should be unlikely that lots of nodes have // been inserted between prev[i] and next[i]. No point in using // next[i] as the after hint, because we know it is stale. - FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], - &splice->next_[i]); + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); // Since we've narrowed the bracket for level i, we might have // violated the Splice constraint between i and i-1. Make sure @@ -809,8 +862,19 @@ void InlineSkipList::Insert(const char* key, Splice* splice, for (int i = 0; i < height; ++i) { if (i >= recompute_height && splice->prev_[i]->Next(i) != splice->next_[i]) { - FindSpliceForLevel(key, splice->prev_[i], nullptr, i, &splice->prev_[i], - &splice->next_[i]); + FindSpliceForLevel(key_decoded, splice->prev_[i], nullptr, i, + &splice->prev_[i], &splice->next_[i]); + } + // Checking for duplicate keys on the level 0 is sufficient + if (UNLIKELY(i == 0 && splice->next_[i] != nullptr && + compare_(x->Key(), splice->next_[i]->Key()) >= 0)) { + // duplicate key + return false; + } + if (UNLIKELY(i == 0 && splice->prev_[i] != head_ && + compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) { + // duplicate key + return false; } assert(splice->next_[i] == nullptr || compare_(x->Key(), splice->next_[i]->Key()) < 0); @@ -844,6 +908,7 @@ void InlineSkipList::Insert(const char* key, Splice* splice, } else { splice->height_ = 0; } + return true; } template @@ -863,6 +928,7 @@ void InlineSkipList::TEST_Validate() const { // levels. 
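// The hunks above make three related changes: the search key is decoded once
// via compare_.decode_key and reused for every comparison, the next node's
// pointer array is prefetched while descending, and Insert returns false on a
// duplicate instead of corrupting the list. A minimal, self-contained sketch
// of all three ideas on a plain sorted singly linked list follows; Node, the
// uint64_t key encoding, and Decode are illustrative assumptions, not
// RocksDB's API, and __builtin_prefetch is the GCC/Clang builtin that the
// real code hides behind the PREFETCH macro.
#include <cstdint>
#include <cstring>

struct Node {
  uint64_t key;  // stands in for the decoded form of the char* key
  Node* next;
};

// Decode once, compare many times: mirrors DecodedKey/decode_key above.
inline uint64_t Decode(const char* b) {
  uint64_t v;
  std::memcpy(&v, b, sizeof(v));
  return v;
}

// Returns false on a duplicate key, like the new bool-returning Insert.
bool SortedListInsert(Node** head, Node* x, const char* encoded_key) {
  const uint64_t key = Decode(encoded_key);  // single decode per insert
  Node** pos = head;
  while (*pos != nullptr && (*pos)->key < key) {
    // Hint the cache line we are about to touch, like PREFETCH above.
    if ((*pos)->next != nullptr) __builtin_prefetch((*pos)->next, 0, 1);
    pos = &(*pos)->next;
  }
  if (*pos != nullptr && (*pos)->key == key) {
    return false;  // duplicate key: reject and let the caller decide
  }
  x->key = key;
  x->next = *pos;
  *pos = x;
  return true;
}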
Node* nodes[kMaxPossibleHeight]; int max_height = GetMaxHeight(); + assert(max_height > 0); for (int i = 0; i < max_height; i++) { nodes[i] = head_; } @@ -892,7 +958,7 @@ void InlineSkipList::TEST_Validate() const { } } for (int i = 1; i < max_height; i++) { - assert(nodes[i]->Next(i) == nullptr); + assert(nodes[i] != nullptr && nodes[i]->Next(i) == nullptr); } } diff --git a/thirdparty/rocksdb/memtable/inlineskiplist_test.cc b/thirdparty/rocksdb/memtable/inlineskiplist_test.cc index 5803e5b0f5..b416ef7c55 100644 --- a/thirdparty/rocksdb/memtable/inlineskiplist_test.cc +++ b/thirdparty/rocksdb/memtable/inlineskiplist_test.cc @@ -32,6 +32,12 @@ static Key Decode(const char* key) { } struct TestComparator { + typedef Key DecodedType; + + static DecodedType decode_key(const char* b) { + return Decode(b); + } + int operator()(const char* a, const char* b) const { if (Decode(a) < Decode(b)) { return -1; @@ -41,6 +47,16 @@ struct TestComparator { return 0; } } + + int operator()(const char* a, const DecodedType b) const { + if (Decode(a) < b) { + return -1; + } else if (Decode(a) > b) { + return +1; + } else { + return 0; + } + } }; typedef InlineSkipList TestInlineSkipList; @@ -54,11 +70,12 @@ class InlineSkipTest : public testing::Test { keys_.insert(key); } - void InsertWithHint(TestInlineSkipList* list, Key key, void** hint) { + bool InsertWithHint(TestInlineSkipList* list, Key key, void** hint) { char* buf = list->AllocateKey(sizeof(Key)); memcpy(buf, &key, sizeof(Key)); - list->InsertWithHint(buf, hint); + bool res = list->InsertWithHint(buf, hint); keys_.insert(key); + return res; } void Validate(TestInlineSkipList* list) { @@ -292,6 +309,7 @@ TEST_F(InlineSkipTest, InsertWithHint_CompatibleWithInsertWithoutHint) { Validate(&list); } +#ifndef ROCKSDB_VALGRIND_RUN // We want to make sure that with a single writer and multiple // concurrent readers (with no synchronization other than when a // reader's iterator is created), the reader always observes all the @@ -618,6 +636,7 @@ TEST_F(InlineSkipTest, ConcurrentInsert1) { RunConcurrentInsert(1); } TEST_F(InlineSkipTest, ConcurrentInsert2) { RunConcurrentInsert(2); } TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } +#endif // ROCKSDB_VALGRIND_RUN } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/memtable/memtablerep_bench.cc b/thirdparty/rocksdb/memtable/memtablerep_bench.cc index 63a0201ce8..51ff11a015 100644 --- a/thirdparty/rocksdb/memtable/memtablerep_bench.cc +++ b/thirdparty/rocksdb/memtable/memtablerep_bench.cc @@ -19,8 +19,6 @@ int main() { } #else -#include - #include #include #include @@ -38,13 +36,14 @@ int main() { #include "rocksdb/slice_transform.h" #include "rocksdb/write_buffer_manager.h" #include "util/arena.h" +#include "util/gflags_compat.h" #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/testutil.h" -using GFLAGS::ParseCommandLineFlags; -using GFLAGS::RegisterFlagValidator; -using GFLAGS::SetUsageMessage; +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::RegisterFlagValidator; +using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(benchmarks, "fillrandom", "Comma-separated list of benchmarks to run. 
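// The test change above threads the bool result of InsertWithHint back to the
// caller. The hint itself is an opaque void* owned by the caller and reused
// across inserts that target the same neighborhood. A hedged, self-contained
// sketch of that contract over a sorted vector, where the "hint" caches the
// last insert position and only falls back to binary search when stale:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct HintedSortedVector {
  std::vector<long> v;

  // Returns false on duplicate; *hint starts as nullptr and is updated here.
  bool InsertWithHint(long key, void** hint) {
    size_t pos = static_cast<size_t>(reinterpret_cast<std::uintptr_t>(*hint));
    bool hint_ok = pos <= v.size() && (pos == 0 || v[pos - 1] < key) &&
                   (pos == v.size() || v[pos] >= key);
    if (!hint_ok) {  // stale hint: pay for a full binary search
      pos = static_cast<size_t>(
          std::lower_bound(v.begin(), v.end(), key) - v.begin());
    }
    if (pos < v.size() && v[pos] == key) return false;  // duplicate key
    v.insert(v.begin() + pos, key);
    *hint = reinterpret_cast<void*>(
        static_cast<std::uintptr_t>(pos + 1));  // where the next key likely goes
    return true;
  }
};
// Sequential ascending inserts hit the cheap hint path every time, which is
// the access pattern the skip list's InsertWithHint is designed for.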
Options:\n" @@ -96,17 +95,8 @@ DEFINE_int32( threshold_use_skiplist, 256, "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory"); -DEFINE_int64( - write_buffer_size, 256, - "write_buffer_size parameter to pass into NewHashCuckooRepFactory"); - -DEFINE_int64( - average_data_size, 64, - "average_data_size parameter to pass into NewHashCuckooRepFactory"); - -DEFINE_int64( - hash_function_count, 4, - "hash_function_count parameter to pass into NewHashCuckooRepFactory"); +DEFINE_int64(write_buffer_size, 256, + "write_buffer_size parameter to pass into WriteBufferManager"); DEFINE_int32( num_threads, 1, @@ -480,8 +470,8 @@ class FillBenchmark : public Benchmark { num_write_ops_per_thread_ = FLAGS_num_operations; } - void RunThreads(std::vector* threads, uint64_t* bytes_written, - uint64_t* bytes_read, bool write, + void RunThreads(std::vector* /*threads*/, uint64_t* bytes_written, + uint64_t* bytes_read, bool /*write*/, uint64_t* read_hits) override { FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_, num_write_ops_per_thread_, read_hits)(); @@ -497,7 +487,7 @@ class ReadBenchmark : public Benchmark { } void RunThreads(std::vector* threads, uint64_t* bytes_written, - uint64_t* bytes_read, bool write, + uint64_t* bytes_read, bool /*write*/, uint64_t* read_hits) override { for (int i = 0; i < FLAGS_num_threads; ++i) { threads->emplace_back( @@ -521,7 +511,7 @@ class SeqReadBenchmark : public Benchmark { } void RunThreads(std::vector* threads, uint64_t* bytes_written, - uint64_t* bytes_read, bool write, + uint64_t* bytes_read, bool /*write*/, uint64_t* read_hits) override { for (int i = 0; i < FLAGS_num_threads; ++i) { threads->emplace_back(SeqReadBenchmarkThread( @@ -548,7 +538,7 @@ class ReadWriteBenchmark : public Benchmark { } void RunThreads(std::vector* threads, uint64_t* bytes_written, - uint64_t* bytes_read, bool write, + uint64_t* bytes_read, bool /*write*/, uint64_t* read_hits) override { std::atomic_int threads_done; threads_done.store(0); @@ -608,12 +598,6 @@ int main(int argc, char** argv) { FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist)); options.prefix_extractor.reset( rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); - } else if (FLAGS_memtablerep == "cuckoo") { - factory.reset(rocksdb::NewHashCuckooRepFactory( - FLAGS_write_buffer_size, FLAGS_average_data_size, - static_cast(FLAGS_hash_function_count))); - options.prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); #endif // ROCKSDB_LITE } else { fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); diff --git a/thirdparty/rocksdb/memtable/skiplist.h b/thirdparty/rocksdb/memtable/skiplist.h index 0162dccb78..47a89034eb 100644 --- a/thirdparty/rocksdb/memtable/skiplist.h +++ b/thirdparty/rocksdb/memtable/skiplist.h @@ -310,6 +310,7 @@ typename SkipList::Node* SkipList:: int level = GetMaxHeight() - 1; Node* last_bigger = nullptr; while (true) { + assert(x != nullptr); Node* next = x->Next(level); // Make sure the lists are sorted assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); @@ -338,6 +339,7 @@ SkipList::FindLessThan(const Key& key, Node** prev) const { // KeyIsAfter(key, last_not_after) is definitely false Node* last_not_after = nullptr; while (true) { + assert(x != nullptr); Node* next = x->Next(level); assert(x == head_ || next == nullptr || KeyIsAfterNode(next->key, x)); assert(x == head_ || KeyIsAfterNode(key, x)); @@ -407,8 +409,8 @@ template SkipList::SkipList(const Comparator cmp, 
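// The memtablerep_bench changes above route gflags through RocksDB's
// util/gflags_compat.h shim, which maps GFLAGS_NAMESPACE to whatever
// namespace the installed gflags library uses ("google" in old releases,
// "gflags" in newer ones). A minimal direct-gflags program showing the
// DEFINE_*/ParseCommandLineFlags pattern the benchmark relies on, assuming
// gflags >= 2.1 where the namespace is gflags:
#include <cstdio>
#include <gflags/gflags.h>

DEFINE_int64(write_buffer_size, 256,
             "write_buffer_size parameter to pass into WriteBufferManager");

int main(int argc, char** argv) {
  gflags::SetUsageMessage("memtable benchmark sketch");
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  std::printf("write_buffer_size = %lld\n",
              static_cast<long long>(FLAGS_write_buffer_size));
  return 0;
}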
Allocator* allocator, int32_t max_height, int32_t branching_factor) - : kMaxHeight_(max_height), - kBranching_(branching_factor), + : kMaxHeight_(static_cast(max_height)), + kBranching_(static_cast(branching_factor)), kScaledInverseBranching_((Random::kMaxNext + 1) / kBranching_), compare_(cmp), allocator_(allocator), diff --git a/thirdparty/rocksdb/memtable/skiplistrep.cc b/thirdparty/rocksdb/memtable/skiplistrep.cc index f56be5dcb6..32870b127d 100644 --- a/thirdparty/rocksdb/memtable/skiplistrep.cc +++ b/thirdparty/rocksdb/memtable/skiplistrep.cc @@ -27,45 +27,55 @@ class SkipListRep : public MemTableRep { transform_(transform), lookahead_(lookahead) {} - virtual KeyHandle Allocate(const size_t len, char** buf) override { + KeyHandle Allocate(const size_t len, char** buf) override { *buf = skip_list_.AllocateKey(len); return static_cast(*buf); - } + } // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. - virtual void Insert(KeyHandle handle) override { - skip_list_.Insert(static_cast(handle)); - } + void Insert(KeyHandle handle) override { + skip_list_.Insert(static_cast(handle)); + } - virtual void InsertWithHint(KeyHandle handle, void** hint) override { - skip_list_.InsertWithHint(static_cast(handle), hint); - } + bool InsertKey(KeyHandle handle) override { + return skip_list_.Insert(static_cast(handle)); + } - virtual void InsertConcurrently(KeyHandle handle) override { - skip_list_.InsertConcurrently(static_cast(handle)); - } + void InsertWithHint(KeyHandle handle, void** hint) override { + skip_list_.InsertWithHint(static_cast(handle), hint); + } - // Returns true iff an entry that compares equal to key is in the list. - virtual bool Contains(const char* key) const override { - return skip_list_.Contains(key); - } + bool InsertKeyWithHint(KeyHandle handle, void** hint) override { + return skip_list_.InsertWithHint(static_cast(handle), hint); + } - virtual size_t ApproximateMemoryUsage() override { - // All memory is allocated through allocator; nothing to report here - return 0; - } + void InsertConcurrently(KeyHandle handle) override { + skip_list_.InsertConcurrently(static_cast(handle)); + } - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override { - SkipListRep::Iterator iter(&skip_list_); - Slice dummy_slice; - for (iter.Seek(dummy_slice, k.memtable_key().data()); - iter.Valid() && callback_func(callback_args, iter.key()); - iter.Next()) { - } - } + bool InsertKeyConcurrently(KeyHandle handle) override { + return skip_list_.InsertConcurrently(static_cast(handle)); + } + + // Returns true iff an entry that compares equal to key is in the list. + bool Contains(const char* key) const override { + return skip_list_.Contains(key); + } + + size_t ApproximateMemoryUsage() override { + // All memory is allocated through allocator; nothing to report here + return 0; + } + + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override { + SkipListRep::Iterator iter(&skip_list_); + Slice dummy_slice; + for (iter.Seek(dummy_slice, k.memtable_key().data()); + iter.Valid() && callback_func(callback_args, iter.key()); iter.Next()) { + } + } uint64_t ApproximateNumEntries(const Slice& start_ikey, const Slice& end_ikey) override { @@ -76,7 +86,7 @@ class SkipListRep : public MemTableRep { return (end_count >= start_count) ? 
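// SkipListRep above keeps the void Insert()/InsertWithHint() entry points for
// callers that assume success, and adds bool InsertKey*() variants that
// surface the duplicate-key result from the underlying list. A sketch of that
// API-evolution pattern; the Rep and handle types here are simplified
// stand-ins, not the MemTableRep interface itself:
#include <cassert>

struct ListStandIn {
  bool Insert(const char* key);  // returns false on duplicate; defined elsewhere
};

class RepStandIn {
 public:
  // Legacy contract: REQUIRES no equal key present, so failure is a bug.
  void Insert(const char* handle) {
    bool ok = list_.Insert(handle);
    assert(ok);
    (void)ok;  // keep release builds (-DNDEBUG) warning-free
  }
  // New contract: report duplicates to the caller instead of asserting.
  bool InsertKey(const char* handle) { return list_.Insert(handle); }

 private:
  ListStandIn list_;
};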
(end_count - start_count) : 0; } - virtual ~SkipListRep() override { } + ~SkipListRep() override {} // Iteration over the contents of a skip list class Iterator : public MemTableRep::Iterator { @@ -89,34 +99,25 @@ class SkipListRep : public MemTableRep { const InlineSkipList* list) : iter_(list) {} - virtual ~Iterator() override { } + ~Iterator() override {} // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override { - return iter_.Valid(); - } + bool Valid() const override { return iter_.Valid(); } // Returns the key at the current position. // REQUIRES: Valid() - virtual const char* key() const override { - return iter_.key(); - } + const char* key() const override { return iter_.key(); } // Advances to the next position. // REQUIRES: Valid() - virtual void Next() override { - iter_.Next(); - } + void Next() override { iter_.Next(); } // Advances to the previous position. // REQUIRES: Valid() - virtual void Prev() override { - iter_.Prev(); - } + void Prev() override { iter_.Prev(); } // Advance to the first entry with a key >= target - virtual void Seek(const Slice& user_key, const char* memtable_key) - override { + void Seek(const Slice& user_key, const char* memtable_key) override { if (memtable_key != nullptr) { iter_.Seek(memtable_key); } else { @@ -125,8 +126,7 @@ class SkipListRep : public MemTableRep { } // Retreat to the last entry with a key <= target - virtual void SeekForPrev(const Slice& user_key, - const char* memtable_key) override { + void SeekForPrev(const Slice& user_key, const char* memtable_key) override { if (memtable_key != nullptr) { iter_.SeekForPrev(memtable_key); } else { @@ -136,15 +136,12 @@ class SkipListRep : public MemTableRep { // Position at the first entry in list. // Final state of iterator is Valid() iff list is not empty. - virtual void SeekToFirst() override { - iter_.SeekToFirst(); - } + void SeekToFirst() override { iter_.SeekToFirst(); } // Position at the last entry in list. // Final state of iterator is Valid() iff list is not empty. - virtual void SeekToLast() override { - iter_.SeekToLast(); - } + void SeekToLast() override { iter_.SeekToLast(); } + protected: std::string tmp_; // For passing to EncodeKey }; @@ -158,18 +155,16 @@ class SkipListRep : public MemTableRep { explicit LookaheadIterator(const SkipListRep& rep) : rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} - virtual ~LookaheadIterator() override {} + ~LookaheadIterator() override {} - virtual bool Valid() const override { - return iter_.Valid(); - } + bool Valid() const override { return iter_.Valid(); } - virtual const char *key() const override { + const char* key() const override { assert(Valid()); return iter_.key(); } - virtual void Next() override { + void Next() override { assert(Valid()); bool advance_prev = true; @@ -194,14 +189,13 @@ class SkipListRep : public MemTableRep { iter_.Next(); } - virtual void Prev() override { + void Prev() override { assert(Valid()); iter_.Prev(); prev_ = iter_; } - virtual void Seek(const Slice& internal_key, const char *memtable_key) - override { + void Seek(const Slice& internal_key, const char* memtable_key) override { const char *encoded_key = (memtable_key != nullptr) ? 
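// The iterator cleanup above drops the redundant "virtual" keyword wherever
// "override" already appears. "override" alone both documents the intent and
// makes the compiler verify that the signature really matches a base-class
// virtual, as this tiny example shows:
struct Base {
  virtual ~Base() {}
  virtual const char* key() const = 0;
};
struct Derived : Base {
  // "override" fails to compile if key()'s signature drifts from Base's.
  const char* key() const override { return "k"; }
};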
memtable_key : EncodeKey(&tmp_, internal_key); @@ -224,8 +218,8 @@ class SkipListRep : public MemTableRep { prev_ = iter_; } - virtual void SeekForPrev(const Slice& internal_key, - const char* memtable_key) override { + void SeekForPrev(const Slice& internal_key, + const char* memtable_key) override { const char* encoded_key = (memtable_key != nullptr) ? memtable_key : EncodeKey(&tmp_, internal_key); @@ -233,12 +227,12 @@ class SkipListRep : public MemTableRep { prev_ = iter_; } - virtual void SeekToFirst() override { + void SeekToFirst() override { iter_.SeekToFirst(); prev_ = iter_; } - virtual void SeekToLast() override { + void SeekToLast() override { iter_.SeekToLast(); prev_ = iter_; } @@ -252,7 +246,7 @@ class SkipListRep : public MemTableRep { InlineSkipList::Iterator prev_; }; - virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { + MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { if (lookahead_ > 0) { void *mem = arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) @@ -270,7 +264,7 @@ class SkipListRep : public MemTableRep { MemTableRep* SkipListFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform* transform, Logger* logger) { + const SliceTransform* transform, Logger* /*logger*/) { return new SkipListRep(compare, allocator, transform, lookahead_); } diff --git a/thirdparty/rocksdb/memtable/stl_wrappers.h b/thirdparty/rocksdb/memtable/stl_wrappers.h index 19fa151488..0287f4f8fe 100644 --- a/thirdparty/rocksdb/memtable/stl_wrappers.h +++ b/thirdparty/rocksdb/memtable/stl_wrappers.h @@ -11,7 +11,6 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/slice.h" #include "util/coding.h" -#include "util/murmurhash.h" namespace rocksdb { namespace stl_wrappers { diff --git a/thirdparty/rocksdb/memtable/vectorrep.cc b/thirdparty/rocksdb/memtable/vectorrep.cc index e54025c2d3..827ab8a5d2 100644 --- a/thirdparty/rocksdb/memtable/vectorrep.cc +++ b/thirdparty/rocksdb/memtable/vectorrep.cc @@ -31,20 +31,19 @@ class VectorRep : public MemTableRep { // single buffer and pass that in as the parameter to Insert) // REQUIRES: nothing that compares equal to key is currently in the // collection. - virtual void Insert(KeyHandle handle) override; + void Insert(KeyHandle handle) override; // Returns true iff an entry that compares equal to key is in the collection. - virtual bool Contains(const char* key) const override; + bool Contains(const char* key) const override; - virtual void MarkReadOnly() override; + void MarkReadOnly() override; - virtual size_t ApproximateMemoryUsage() override; + size_t ApproximateMemoryUsage() override; - virtual void Get(const LookupKey& k, void* callback_args, - bool (*callback_func)(void* arg, - const char* entry)) override; + void Get(const LookupKey& k, void* callback_args, + bool (*callback_func)(void* arg, const char* entry)) override; - virtual ~VectorRep() override { } + ~VectorRep() override {} class Iterator : public MemTableRep::Iterator { class VectorRep* vrep_; @@ -62,41 +61,40 @@ class VectorRep : public MemTableRep { // Initialize an iterator over the specified collection. // The returned iterator is not valid. // explicit Iterator(const MemTableRep* collection); - virtual ~Iterator() override { }; + ~Iterator() override{}; // Returns true iff the iterator is positioned at a valid node. - virtual bool Valid() const override; + bool Valid() const override; // Returns the key at the current position. 
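// The LookaheadIterator reformatted above implements the lookahead_
// optimization: when a Seek target is likely near the previous position, scan
// forward a bounded number of steps from prev_ before paying for a full
// skip-list Seek. A self-contained sketch of the same idea over a sorted
// vector; kLookahead is an illustrative bound, not RocksDB's tuning:
#include <algorithm>
#include <cstddef>
#include <vector>

class LookaheadSeeker {
 public:
  explicit LookaheadSeeker(const std::vector<int>& v) : v_(v) {}

  // Position at first element >= target; returns v_.size() when none exists.
  size_t Seek(int target) {
    static const size_t kLookahead = 8;  // assumed small scan budget
    if (pos_ < v_.size() && v_[pos_] <= target) {
      // Cheap path: linear scan forward from the previous position.
      for (size_t i = 0; i < kLookahead && pos_ < v_.size(); ++i, ++pos_) {
        if (v_[pos_] >= target) return pos_;
      }
    }
    // Fallback: full binary search, mirroring iter_.Seek(encoded_key).
    pos_ = static_cast<size_t>(
        std::lower_bound(v_.begin(), v_.end(), target) - v_.begin());
    return pos_;
  }

 private:
  const std::vector<int>& v_;
  size_t pos_ = 0;  // plays the role of prev_
};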
// REQUIRES: Valid() - virtual const char* key() const override; + const char* key() const override; // Advances to the next position. // REQUIRES: Valid() - virtual void Next() override; + void Next() override; // Advances to the previous position. // REQUIRES: Valid() - virtual void Prev() override; + void Prev() override; // Advance to the first entry with a key >= target - virtual void Seek(const Slice& user_key, const char* memtable_key) override; + void Seek(const Slice& user_key, const char* memtable_key) override; // Advance to the first entry with a key <= target - virtual void SeekForPrev(const Slice& user_key, - const char* memtable_key) override; + void SeekForPrev(const Slice& user_key, const char* memtable_key) override; // Position at the first entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToFirst() override; + void SeekToFirst() override; // Position at the last entry in collection. // Final state of iterator is Valid() iff collection is not empty. - virtual void SeekToLast() override; + void SeekToLast() override; }; // Return an iterator over the keys in this representation. - virtual MemTableRep::Iterator* GetIterator(Arena* arena) override; + MemTableRep::Iterator* GetIterator(Arena* arena) override; private: friend class Iterator; @@ -227,8 +225,8 @@ void VectorRep::Iterator::Seek(const Slice& user_key, } // Advance to the first entry with a key <= target -void VectorRep::Iterator::SeekForPrev(const Slice& user_key, - const char* memtable_key) { +void VectorRep::Iterator::SeekForPrev(const Slice& /*user_key*/, + const char* /*memtable_key*/) { assert(false); } @@ -296,7 +294,7 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { MemTableRep* VectorRepFactory::CreateMemTableRep( const MemTableRep::KeyComparator& compare, Allocator* allocator, - const SliceTransform*, Logger* logger) { + const SliceTransform*, Logger* /*logger*/) { return new VectorRep(compare, allocator, count_); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/memtable/write_buffer_manager.cc b/thirdparty/rocksdb/memtable/write_buffer_manager.cc index bac0fdd8fb..7f2e664ab5 100644 --- a/thirdparty/rocksdb/memtable/write_buffer_manager.cc +++ b/thirdparty/rocksdb/memtable/write_buffer_manager.cc @@ -60,6 +60,8 @@ WriteBufferManager::WriteBufferManager(size_t _buffer_size, // Construct the cache key using the pointer to this. cache_rep_.reset(new CacheRep(cache)); } +#else + (void)cache; #endif // ROCKSDB_LITE } @@ -77,7 +79,7 @@ WriteBufferManager::~WriteBufferManager() { void WriteBufferManager::ReserveMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); - // Use a mutex to protect various data structures. Can be optimzied to a + // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. std::lock_guard lock(cache_rep_->cache_mutex_); @@ -92,20 +94,22 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { cache_rep_->dummy_handles_.push_back(handle); cache_rep_->cache_allocated_size_ += kSizeDummyEntry; } +#else + (void)mem; #endif // ROCKSDB_LITE } void WriteBufferManager::FreeMemWithCache(size_t mem) { #ifndef ROCKSDB_LITE assert(cache_rep_ != nullptr); - // Use a mutex to protect various data structures. Can be optimzied to a + // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. 
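// ReserveMemWithCache/FreeMemWithCache here charge memtable memory to the
// block cache in fixed-size "dummy entry" chunks, and only release them
// lazily once actual usage drops under 3/4 of the reservation. A sketch of
// that accounting with the cache reduced to a counter; kSizeDummyEntry
// conveys the idea and is not necessarily the real constant:
#include <cstddef>

class CacheChargeSketch {
  static const size_t kSizeDummyEntry = 256 * 1024;  // assumed chunk size
  size_t used_ = 0;      // actual memtable memory
  size_t reserved_ = 0;  // what we have charged to the cache

 public:
  void Reserve(size_t mem) {
    used_ += mem;
    while (reserved_ < used_) {
      reserved_ += kSizeDummyEntry;  // insert one more dummy handle
    }
  }
  void Free(size_t mem) {
    used_ -= mem;
    // Shrink lazily: keep headroom until usage < 3/4 of the reservation, so
    // a temporary memtable shrink does not thrash the cache.
    while (used_ < reserved_ * 3 / 4 && reserved_ >= kSizeDummyEntry) {
      reserved_ -= kSizeDummyEntry;  // release one dummy handle
    }
  }
  size_t reserved() const { return reserved_; }
};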
std::lock_guard lock(cache_rep_->cache_mutex_); size_t new_mem_used = memory_used_.load(std::memory_order_relaxed) - mem; memory_used_.store(new_mem_used, std::memory_order_relaxed); // Gradually shrink memory costed in the block cache if the actual // usage is less than 3/4 of what we reserve from the block cache. - // We do this becausse: + // We do this because: // 1. we don't pay the cost of the block cache immediately a memtable is // freed, as block cache insert is expensive; // 2. eventually, if we walk away from a temporary memtable size increase, @@ -119,6 +123,8 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { cache_rep_->dummy_handles_.pop_back(); cache_rep_->cache_allocated_size_ -= kSizeDummyEntry; } +#else + (void)mem; #endif // ROCKSDB_LITE } } // namespace rocksdb diff --git a/thirdparty/rocksdb/monitoring/histogram.cc b/thirdparty/rocksdb/monitoring/histogram.cc index b3c01a78e0..4bc7139d30 100644 --- a/thirdparty/rocksdb/monitoring/histogram.cc +++ b/thirdparty/rocksdb/monitoring/histogram.cc @@ -202,6 +202,7 @@ std::string HistogramStat::ToString() const { Percentile(99.99)); r.append(buf); r.append("------------------------------------------------------\n"); + if (cur_num == 0) return r; // all buckets are empty const double mult = 100.0 / cur_num; uint64_t cumulative_sum = 0; for (unsigned int b = 0; b < num_buckets_; b++) { @@ -209,7 +210,8 @@ std::string HistogramStat::ToString() const { if (bucket_value <= 0.0) continue; cumulative_sum += bucket_value; snprintf(buf, sizeof(buf), - "[ %7" PRIu64 ", %7" PRIu64 " ) %8" PRIu64 " %7.3f%% %7.3f%% ", + "%c %7" PRIu64 ", %7" PRIu64 " ] %8" PRIu64 " %7.3f%% %7.3f%% ", + (b == 0) ? '[' : '(', (b == 0) ? 0 : bucketMapper.BucketLimit(b-1), // left bucketMapper.BucketLimit(b), // right bucket_value, // count @@ -233,6 +235,9 @@ void HistogramStat::Data(HistogramData * const data) const { data->max = static_cast(max()); data->average = Average(); data->standard_deviation = StandardDeviation(); + data->count = num(); + data->sum = sum(); + data->min = static_cast(min()); } void HistogramImpl::Clear() { diff --git a/thirdparty/rocksdb/monitoring/histogram_test.cc b/thirdparty/rocksdb/monitoring/histogram_test.cc index b4e3c981c8..df58822fc2 100644 --- a/thirdparty/rocksdb/monitoring/histogram_test.cc +++ b/thirdparty/rocksdb/monitoring/histogram_test.cc @@ -85,6 +85,19 @@ TEST_F(HistogramTest, BasicOperation) { BasicOperation(histogramWindowing); } +TEST_F(HistogramTest, BoundaryValue) { + HistogramImpl histogram; + // - both should be in [0, 1] bucket because we place values on bucket + // boundaries in the lower bucket. + // - all points are in [0, 1] bucket, so p50 will be 0.5 + // - the test cannot be written with a single point since histogram won't + // report percentiles lower than the min or greater than the max. 
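// The ToString change above prints buckets as (prev, limit] instead of
// [prev, limit), matching a histogram that places values lying exactly on a
// bucket boundary into the lower bucket. A sketch of that indexing rule with
// std::lower_bound over sorted bucket limits (the limits are illustrative):
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Index of the bucket whose (exclusive lower, inclusive upper] range holds v.
size_t BucketFor(const std::vector<uint64_t>& limits, uint64_t v) {
  // lower_bound returns the first limit >= v, so a value equal to a limit
  // selects that bucket, i.e. boundary values fall into the lower bucket.
  return static_cast<size_t>(
      std::lower_bound(limits.begin(), limits.end(), v) - limits.begin());
}
// Example: with limits {1, 2, 5, 10}, BucketFor(limits, 0) == 0 and
// BucketFor(limits, 1) == 0, while BucketFor(limits, 2) == 1; that is why the
// new BoundaryValue test expects both 0 and 1 to land in the first bucket.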
+  histogram.Add(0);
+  histogram.Add(1);
+
+  ASSERT_LE(fabs(histogram.Percentile(50.0) - 0.5), kIota);
+}
+
 TEST_F(HistogramTest, MergeHistogram) {
  HistogramImpl histogram;
  HistogramImpl other;
diff --git a/thirdparty/rocksdb/monitoring/histogram_windowing.cc b/thirdparty/rocksdb/monitoring/histogram_windowing.cc
index 28d8265f26..ecd6f090a5 100644
--- a/thirdparty/rocksdb/monitoring/histogram_windowing.cc
+++ b/thirdparty/rocksdb/monitoring/histogram_windowing.cc
@@ -17,7 +17,7 @@ namespace rocksdb {
 
 HistogramWindowingImpl::HistogramWindowingImpl() {
  env_ = Env::Default();
-  window_stats_.reset(new HistogramStat[num_windows_]);
+  window_stats_.reset(new HistogramStat[static_cast<size_t>(num_windows_)]);
  Clear();
 }
 
@@ -29,7 +29,7 @@ HistogramWindowingImpl::HistogramWindowingImpl(
    micros_per_window_(micros_per_window),
    min_num_per_window_(min_num_per_window) {
  env_ = Env::Default();
-  window_stats_.reset(new HistogramStat[num_windows_]);
+  window_stats_.reset(new HistogramStat[static_cast<size_t>(num_windows_)]);
  Clear();
 }
 
@@ -60,7 +60,7 @@ void HistogramWindowingImpl::Add(uint64_t value){
  stats_.Add(value);
 
  // Current window update
-  window_stats_[current_window()].Add(value);
+  window_stats_[static_cast<size_t>(current_window())].Add(value);
 }
 
 void HistogramWindowingImpl::Merge(const Histogram& other) {
@@ -89,8 +89,11 @@ void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) {
        (cur_window + num_windows_ - i) % num_windows_;
    uint64_t other_window_index =
        (other_cur_window + other.num_windows_ - i) % other.num_windows_;
+    size_t windex = static_cast<size_t>(window_index);
+    size_t other_windex = static_cast<size_t>(other_window_index);
 
-    window_stats_[window_index].Merge(other.window_stats_[other_window_index]);
+    window_stats_[windex].Merge(
+        other.window_stats_[other_windex]);
  }
 }
 
@@ -129,8 +132,9 @@ void HistogramWindowingImpl::Data(HistogramData * const data) const {
 
 void HistogramWindowingImpl::TimerTick() {
  uint64_t curr_time = env_->NowMicros();
+  size_t curr_window_ = static_cast<size_t>(current_window());
  if (curr_time - last_swap_time() > micros_per_window_ &&
-      window_stats_[current_window()].num() >= min_num_per_window_) {
+      window_stats_[curr_window_].num() >= min_num_per_window_) {
    SwapHistoryBucket();
  }
 }
 
@@ -149,7 +153,8 @@ void HistogramWindowingImpl::SwapHistoryBucket() {
                          0 : curr_window + 1;
 
    // subtract next buckets from totals and swap to next buckets
-    HistogramStat& stats_to_drop = window_stats_[next_window];
+    HistogramStat& stats_to_drop =
+        window_stats_[static_cast<size_t>(next_window)];
 
    if (!stats_to_drop.Empty()) {
      for (size_t b = 0; b < stats_.num_buckets_; b++){
diff --git a/thirdparty/rocksdb/monitoring/histogram_windowing.h b/thirdparty/rocksdb/monitoring/histogram_windowing.h
index 2a6d0dd158..6532aa248e 100644
--- a/thirdparty/rocksdb/monitoring/histogram_windowing.h
+++ b/thirdparty/rocksdb/monitoring/histogram_windowing.h
@@ -22,8 +22,8 @@ class HistogramWindowingImpl : public Histogram
                         uint64_t micros_per_window,
                         uint64_t min_num_per_window);
 
-  HistogramWindowingImpl(const HistogramImpl&) = delete;
-  HistogramWindowingImpl& operator=(const HistogramImpl&) = delete;
+  HistogramWindowingImpl(const HistogramWindowingImpl&) = delete;
+  HistogramWindowingImpl& operator=(const HistogramWindowingImpl&) = delete;
 
  ~HistogramWindowingImpl();
 
@@ -77,4 +77,4 @@ class HistogramWindowingImpl : public Histogram
  uint64_t min_num_per_window_ = 0;
 };
 
-}  // namespace rocksdb
\ No newline at end of file
+}  // namespace rocksdb
diff --git a/thirdparty/rocksdb/monitoring/instrumented_mutex.cc
b/thirdparty/rocksdb/monitoring/instrumented_mutex.cc index c07a5a17a8..7b61bcf4fb 100644 --- a/thirdparty/rocksdb/monitoring/instrumented_mutex.cc +++ b/thirdparty/rocksdb/monitoring/instrumented_mutex.cc @@ -10,25 +10,21 @@ namespace rocksdb { namespace { -bool ShouldReportToStats(Env* env, Statistics* stats) { - return env != nullptr && stats != nullptr && - stats->stats_level_ > kExceptTimeForMutex; +Statistics* stats_for_report(Env* env, Statistics* stats) { + if (env != nullptr && stats != nullptr && + stats->get_stats_level() > kExceptTimeForMutex) { + return stats; + } else { + return nullptr; + } } } // namespace void InstrumentedMutex::Lock() { - PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_mutex_lock_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); - uint64_t wait_time_micros = 0; - if (ShouldReportToStats(env_, stats_)) { - { - StopWatch sw(env_, nullptr, 0, &wait_time_micros); - LockInternal(); - } - RecordTick(stats_, stats_code_, wait_time_micros); - } else { - LockInternal(); - } + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_mutex_lock_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(env_, stats_), stats_code_); + LockInternal(); } void InstrumentedMutex::LockInternal() { @@ -39,18 +35,10 @@ void InstrumentedMutex::LockInternal() { } void InstrumentedCondVar::Wait() { - PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_condition_wait_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); - uint64_t wait_time_micros = 0; - if (ShouldReportToStats(env_, stats_)) { - { - StopWatch sw(env_, nullptr, 0, &wait_time_micros); - WaitInternal(); - } - RecordTick(stats_, stats_code_, wait_time_micros); - } else { - WaitInternal(); - } + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(env_, stats_), stats_code_); + WaitInternal(); } void InstrumentedCondVar::WaitInternal() { @@ -61,20 +49,10 @@ void InstrumentedCondVar::WaitInternal() { } bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { - PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_condition_wait_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); - uint64_t wait_time_micros = 0; - bool result = false; - if (ShouldReportToStats(env_, stats_)) { - { - StopWatch sw(env_, nullptr, 0, &wait_time_micros); - result = TimedWaitInternal(abs_time_us); - } - RecordTick(stats_, stats_code_, wait_time_micros); - } else { - result = TimedWaitInternal(abs_time_us); - } - return result; + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD( + db_condition_wait_nanos, stats_code_ == DB_MUTEX_WAIT_MICROS, + stats_for_report(env_, stats_), stats_code_); + return TimedWaitInternal(abs_time_us); } bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) { diff --git a/thirdparty/rocksdb/monitoring/iostats_context.cc b/thirdparty/rocksdb/monitoring/iostats_context.cc index 8aa131a704..3d102f9120 100644 --- a/thirdparty/rocksdb/monitoring/iostats_context.cc +++ b/thirdparty/rocksdb/monitoring/iostats_context.cc @@ -6,7 +6,6 @@ #include #include "monitoring/iostats_context_imp.h" #include "rocksdb/env.h" -#include "util/thread_local.h" namespace rocksdb { diff --git a/thirdparty/rocksdb/monitoring/iostats_context_imp.h b/thirdparty/rocksdb/monitoring/iostats_context_imp.h index 88538297a6..23c2088cab 100644 --- a/thirdparty/rocksdb/monitoring/iostats_context_imp.h +++ b/thirdparty/rocksdb/monitoring/iostats_context_imp.h @@ -8,35 +8,40 @@ #include "rocksdb/iostats_context.h" #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL +namespace rocksdb { +extern __thread IOStatsContext iostats_context; +} // 
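// The instrumented_mutex rewrite above collapses an if/else timing path into
// a single timer guard that is handed either a real Statistics* or nullptr,
// so the guard itself decides whether anything is recorded. A self-contained
// sketch of that "gate by returning nullptr" pattern; StatsSink and the
// chrono clock are stand-ins for Statistics and Env::NowNanos:
#include <chrono>
#include <cstdint>
#include <cstdio>

struct StatsSink {
  void RecordTick(uint64_t nanos) {
    std::printf("waited %llu ns\n", static_cast<unsigned long long>(nanos));
  }
};

// Mirrors stats_for_report: only expose the sink when reporting is enabled.
inline StatsSink* StatsForReport(StatsSink* stats, bool enabled) {
  return (stats != nullptr && enabled) ? stats : nullptr;
}

class ScopedWaitTimer {
 public:
  explicit ScopedWaitTimer(StatsSink* sink)
      : sink_(sink), start_(std::chrono::steady_clock::now()) {}
  ~ScopedWaitTimer() {
    if (sink_ != nullptr) {  // nullptr means "measure nothing"
      auto d = std::chrono::steady_clock::now() - start_;
      sink_->RecordTick(static_cast<uint64_t>(
          std::chrono::duration_cast<std::chrono::nanoseconds>(d).count()));
    }
  }

 private:
  StatsSink* sink_;
  std::chrono::steady_clock::time_point start_;
};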
namespace rocksdb // increment a specific counter by the specified value -#define IOSTATS_ADD(metric, value) \ - (get_iostats_context()->metric += value) +#define IOSTATS_ADD(metric, value) (iostats_context.metric += value) // Increase metric value only when it is positive #define IOSTATS_ADD_IF_POSITIVE(metric, value) \ if (value > 0) { IOSTATS_ADD(metric, value); } // reset a specific counter to zero -#define IOSTATS_RESET(metric) \ - (get_iostats_context()->metric = 0) +#define IOSTATS_RESET(metric) (iostats_context.metric = 0) // reset all counters to zero -#define IOSTATS_RESET_ALL() \ - (get_iostats_context()->Reset()) +#define IOSTATS_RESET_ALL() (iostats_context.Reset()) -#define IOSTATS_SET_THREAD_POOL_ID(value) \ - (get_iostats_context()->thread_pool_id = value) +#define IOSTATS_SET_THREAD_POOL_ID(value) \ + (iostats_context.thread_pool_id = value) -#define IOSTATS_THREAD_POOL_ID() \ - (get_iostats_context()->thread_pool_id) +#define IOSTATS_THREAD_POOL_ID() (iostats_context.thread_pool_id) -#define IOSTATS(metric) \ - (get_iostats_context()->metric) +#define IOSTATS(metric) (iostats_context.metric) // Declare and set start time of the timer -#define IOSTATS_TIMER_GUARD(metric) \ - PerfStepTimer iostats_step_timer_##metric(&(get_iostats_context()->metric)); \ +#define IOSTATS_TIMER_GUARD(metric) \ + PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric)); \ + iostats_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define IOSTATS_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer iostats_step_timer_##metric( \ + &(iostats_context.metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); #else // ROCKSDB_SUPPORT_THREAD_LOCAL @@ -50,5 +55,6 @@ #define IOSTATS(metric) 0 #define IOSTATS_TIMER_GUARD(metric) +#define IOSTATS_CPU_TIMER_GUARD(metric, env) #endif // ROCKSDB_SUPPORT_THREAD_LOCAL diff --git a/thirdparty/rocksdb/monitoring/perf_context.cc b/thirdparty/rocksdb/monitoring/perf_context.cc index 791f4bdbe4..40b0b215c4 100644 --- a/thirdparty/rocksdb/monitoring/perf_context.cc +++ b/thirdparty/rocksdb/monitoring/perf_context.cc @@ -15,7 +15,7 @@ PerfContext perf_context; #if defined(OS_SOLARIS) __thread PerfContext perf_context_; #else -__thread PerfContext perf_context; +thread_local PerfContext perf_context; #endif #endif @@ -31,6 +31,300 @@ PerfContext* get_perf_context() { #endif } +PerfContext::~PerfContext() { +#if !defined(NPERF_CONTEXT) && defined(ROCKSDB_SUPPORT_THREAD_LOCAL) && !defined(OS_SOLARIS) + ClearPerLevelPerfContext(); +#endif +} + +PerfContext::PerfContext(const PerfContext& other) { +#ifndef NPERF_CONTEXT + user_key_comparison_count = other.user_key_comparison_count; + block_cache_hit_count = other.block_cache_hit_count; + block_read_count = other.block_read_count; + block_read_byte = other.block_read_byte; + block_read_time = other.block_read_time; + block_cache_index_hit_count = other.block_cache_index_hit_count; + index_block_read_count = other.index_block_read_count; + block_cache_filter_hit_count = other.block_cache_filter_hit_count; + filter_block_read_count = other.filter_block_read_count; + compression_dict_block_read_count = other.compression_dict_block_read_count; + block_checksum_time = other.block_checksum_time; + block_decompress_time = other.block_decompress_time; + get_read_bytes = other.get_read_bytes; + multiget_read_bytes = other.multiget_read_bytes; + iter_read_bytes = other.iter_read_bytes; + internal_key_skipped_count = 
other.internal_key_skipped_count; + internal_delete_skipped_count = other.internal_delete_skipped_count; + internal_recent_skipped_count = other.internal_recent_skipped_count; + internal_merge_count = other.internal_merge_count; + write_wal_time = other.write_wal_time; + get_snapshot_time = other.get_snapshot_time; + get_from_memtable_time = other.get_from_memtable_time; + get_from_memtable_count = other.get_from_memtable_count; + get_post_process_time = other.get_post_process_time; + get_from_output_files_time = other.get_from_output_files_time; + seek_on_memtable_time = other.seek_on_memtable_time; + seek_on_memtable_count = other.seek_on_memtable_count; + next_on_memtable_count = other.next_on_memtable_count; + prev_on_memtable_count = other.prev_on_memtable_count; + seek_child_seek_time = other.seek_child_seek_time; + seek_child_seek_count = other.seek_child_seek_count; + seek_min_heap_time = other.seek_min_heap_time; + seek_internal_seek_time = other.seek_internal_seek_time; + find_next_user_entry_time = other.find_next_user_entry_time; + write_pre_and_post_process_time = other.write_pre_and_post_process_time; + write_memtable_time = other.write_memtable_time; + write_delay_time = other.write_delay_time; + write_thread_wait_nanos = other.write_thread_wait_nanos; + write_scheduling_flushes_compactions_time = + other.write_scheduling_flushes_compactions_time; + db_mutex_lock_nanos = other.db_mutex_lock_nanos; + db_condition_wait_nanos = other.db_condition_wait_nanos; + merge_operator_time_nanos = other.merge_operator_time_nanos; + read_index_block_nanos = other.read_index_block_nanos; + read_filter_block_nanos = other.read_filter_block_nanos; + new_table_block_iter_nanos = other.new_table_block_iter_nanos; + new_table_iterator_nanos = other.new_table_iterator_nanos; + block_seek_nanos = other.block_seek_nanos; + find_table_nanos = other.find_table_nanos; + bloom_memtable_hit_count = other.bloom_memtable_hit_count; + bloom_memtable_miss_count = other.bloom_memtable_miss_count; + bloom_sst_hit_count = other.bloom_sst_hit_count; + bloom_sst_miss_count = other.bloom_sst_miss_count; + key_lock_wait_time = other.key_lock_wait_time; + key_lock_wait_count = other.key_lock_wait_count; + + env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; + env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; + env_new_writable_file_nanos = other.env_new_writable_file_nanos; + env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; + env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; + env_new_directory_nanos = other.env_new_directory_nanos; + env_file_exists_nanos = other.env_file_exists_nanos; + env_get_children_nanos = other.env_get_children_nanos; + env_get_children_file_attributes_nanos = + other.env_get_children_file_attributes_nanos; + env_delete_file_nanos = other.env_delete_file_nanos; + env_create_dir_nanos = other.env_create_dir_nanos; + env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; + env_delete_dir_nanos = other.env_delete_dir_nanos; + env_get_file_size_nanos = other.env_get_file_size_nanos; + env_get_file_modification_time_nanos = + other.env_get_file_modification_time_nanos; + env_rename_file_nanos = other.env_rename_file_nanos; + env_link_file_nanos = other.env_link_file_nanos; + env_lock_file_nanos = other.env_lock_file_nanos; + env_unlock_file_nanos = other.env_unlock_file_nanos; + env_new_logger_nanos = other.env_new_logger_nanos; + get_cpu_nanos = other.get_cpu_nanos; + iter_next_cpu_nanos = 
other.iter_next_cpu_nanos; + iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; + iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; + if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { + ClearPerLevelPerfContext(); + } + if (other.level_to_perf_context != nullptr) { + level_to_perf_context = new std::map(); + *level_to_perf_context = *other.level_to_perf_context; + } + per_level_perf_context_enabled = other.per_level_perf_context_enabled; +#endif +} + +PerfContext::PerfContext(PerfContext&& other) noexcept { +#ifndef NPERF_CONTEXT + user_key_comparison_count = other.user_key_comparison_count; + block_cache_hit_count = other.block_cache_hit_count; + block_read_count = other.block_read_count; + block_read_byte = other.block_read_byte; + block_read_time = other.block_read_time; + block_cache_index_hit_count = other.block_cache_index_hit_count; + index_block_read_count = other.index_block_read_count; + block_cache_filter_hit_count = other.block_cache_filter_hit_count; + filter_block_read_count = other.filter_block_read_count; + compression_dict_block_read_count = other.compression_dict_block_read_count; + block_checksum_time = other.block_checksum_time; + block_decompress_time = other.block_decompress_time; + get_read_bytes = other.get_read_bytes; + multiget_read_bytes = other.multiget_read_bytes; + iter_read_bytes = other.iter_read_bytes; + internal_key_skipped_count = other.internal_key_skipped_count; + internal_delete_skipped_count = other.internal_delete_skipped_count; + internal_recent_skipped_count = other.internal_recent_skipped_count; + internal_merge_count = other.internal_merge_count; + write_wal_time = other.write_wal_time; + get_snapshot_time = other.get_snapshot_time; + get_from_memtable_time = other.get_from_memtable_time; + get_from_memtable_count = other.get_from_memtable_count; + get_post_process_time = other.get_post_process_time; + get_from_output_files_time = other.get_from_output_files_time; + seek_on_memtable_time = other.seek_on_memtable_time; + seek_on_memtable_count = other.seek_on_memtable_count; + next_on_memtable_count = other.next_on_memtable_count; + prev_on_memtable_count = other.prev_on_memtable_count; + seek_child_seek_time = other.seek_child_seek_time; + seek_child_seek_count = other.seek_child_seek_count; + seek_min_heap_time = other.seek_min_heap_time; + seek_internal_seek_time = other.seek_internal_seek_time; + find_next_user_entry_time = other.find_next_user_entry_time; + write_pre_and_post_process_time = other.write_pre_and_post_process_time; + write_memtable_time = other.write_memtable_time; + write_delay_time = other.write_delay_time; + write_thread_wait_nanos = other.write_thread_wait_nanos; + write_scheduling_flushes_compactions_time = + other.write_scheduling_flushes_compactions_time; + db_mutex_lock_nanos = other.db_mutex_lock_nanos; + db_condition_wait_nanos = other.db_condition_wait_nanos; + merge_operator_time_nanos = other.merge_operator_time_nanos; + read_index_block_nanos = other.read_index_block_nanos; + read_filter_block_nanos = other.read_filter_block_nanos; + new_table_block_iter_nanos = other.new_table_block_iter_nanos; + new_table_iterator_nanos = other.new_table_iterator_nanos; + block_seek_nanos = other.block_seek_nanos; + find_table_nanos = other.find_table_nanos; + bloom_memtable_hit_count = other.bloom_memtable_hit_count; + bloom_memtable_miss_count = other.bloom_memtable_miss_count; + bloom_sst_hit_count = other.bloom_sst_hit_count; + bloom_sst_miss_count = other.bloom_sst_miss_count; + key_lock_wait_time = 
other.key_lock_wait_time; + key_lock_wait_count = other.key_lock_wait_count; + + env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; + env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; + env_new_writable_file_nanos = other.env_new_writable_file_nanos; + env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; + env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; + env_new_directory_nanos = other.env_new_directory_nanos; + env_file_exists_nanos = other.env_file_exists_nanos; + env_get_children_nanos = other.env_get_children_nanos; + env_get_children_file_attributes_nanos = + other.env_get_children_file_attributes_nanos; + env_delete_file_nanos = other.env_delete_file_nanos; + env_create_dir_nanos = other.env_create_dir_nanos; + env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; + env_delete_dir_nanos = other.env_delete_dir_nanos; + env_get_file_size_nanos = other.env_get_file_size_nanos; + env_get_file_modification_time_nanos = + other.env_get_file_modification_time_nanos; + env_rename_file_nanos = other.env_rename_file_nanos; + env_link_file_nanos = other.env_link_file_nanos; + env_lock_file_nanos = other.env_lock_file_nanos; + env_unlock_file_nanos = other.env_unlock_file_nanos; + env_new_logger_nanos = other.env_new_logger_nanos; + get_cpu_nanos = other.get_cpu_nanos; + iter_next_cpu_nanos = other.iter_next_cpu_nanos; + iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; + iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; + if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { + ClearPerLevelPerfContext(); + } + if (other.level_to_perf_context != nullptr) { + level_to_perf_context = other.level_to_perf_context; + other.level_to_perf_context = nullptr; + } + per_level_perf_context_enabled = other.per_level_perf_context_enabled; +#endif +} + +// TODO(Zhongyi): reduce code duplication between copy constructor and +// assignment operator +PerfContext& PerfContext::operator=(const PerfContext& other) { +#ifndef NPERF_CONTEXT + user_key_comparison_count = other.user_key_comparison_count; + block_cache_hit_count = other.block_cache_hit_count; + block_read_count = other.block_read_count; + block_read_byte = other.block_read_byte; + block_read_time = other.block_read_time; + block_cache_index_hit_count = other.block_cache_index_hit_count; + index_block_read_count = other.index_block_read_count; + block_cache_filter_hit_count = other.block_cache_filter_hit_count; + filter_block_read_count = other.filter_block_read_count; + compression_dict_block_read_count = other.compression_dict_block_read_count; + block_checksum_time = other.block_checksum_time; + block_decompress_time = other.block_decompress_time; + get_read_bytes = other.get_read_bytes; + multiget_read_bytes = other.multiget_read_bytes; + iter_read_bytes = other.iter_read_bytes; + internal_key_skipped_count = other.internal_key_skipped_count; + internal_delete_skipped_count = other.internal_delete_skipped_count; + internal_recent_skipped_count = other.internal_recent_skipped_count; + internal_merge_count = other.internal_merge_count; + write_wal_time = other.write_wal_time; + get_snapshot_time = other.get_snapshot_time; + get_from_memtable_time = other.get_from_memtable_time; + get_from_memtable_count = other.get_from_memtable_count; + get_post_process_time = other.get_post_process_time; + get_from_output_files_time = other.get_from_output_files_time; + seek_on_memtable_time = other.seek_on_memtable_time; + seek_on_memtable_count = 
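// The TODO above acknowledges the heavy duplication between the copy
// constructor, move constructor, and operator=. One standard way to collapse
// such boilerplate is copy-and-swap, sketched here on a reduced struct;
// PerfContext itself cannot use a defaulted copy because of the owned
// level_to_perf_context pointer, which is exactly what the swap handles:
#include <cstdint>
#include <map>
#include <utility>

class MiniPerfContext {
 public:
  MiniPerfContext() = default;
  MiniPerfContext(const MiniPerfContext& o)
      : user_key_comparison_count(o.user_key_comparison_count),
        levels_(o.levels_ ? new std::map<uint32_t, uint64_t>(*o.levels_)
                          : nullptr) {}
  MiniPerfContext(MiniPerfContext&& o) noexcept { swap(*this, o); }
  // One assignment handles both copy and move: the by-value parameter is
  // built by the matching constructor, then its guts are swapped in.
  MiniPerfContext& operator=(MiniPerfContext o) noexcept {
    swap(*this, o);
    return *this;
  }
  ~MiniPerfContext() { delete levels_; }

  friend void swap(MiniPerfContext& a, MiniPerfContext& b) noexcept {
    std::swap(a.user_key_comparison_count, b.user_key_comparison_count);
    std::swap(a.levels_, b.levels_);
  }

 private:
  uint64_t user_key_comparison_count = 0;
  std::map<uint32_t, uint64_t>* levels_ = nullptr;  // owned, like level_to_perf_context
};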
other.seek_on_memtable_count; + next_on_memtable_count = other.next_on_memtable_count; + prev_on_memtable_count = other.prev_on_memtable_count; + seek_child_seek_time = other.seek_child_seek_time; + seek_child_seek_count = other.seek_child_seek_count; + seek_min_heap_time = other.seek_min_heap_time; + seek_internal_seek_time = other.seek_internal_seek_time; + find_next_user_entry_time = other.find_next_user_entry_time; + write_pre_and_post_process_time = other.write_pre_and_post_process_time; + write_memtable_time = other.write_memtable_time; + write_delay_time = other.write_delay_time; + write_thread_wait_nanos = other.write_thread_wait_nanos; + write_scheduling_flushes_compactions_time = + other.write_scheduling_flushes_compactions_time; + db_mutex_lock_nanos = other.db_mutex_lock_nanos; + db_condition_wait_nanos = other.db_condition_wait_nanos; + merge_operator_time_nanos = other.merge_operator_time_nanos; + read_index_block_nanos = other.read_index_block_nanos; + read_filter_block_nanos = other.read_filter_block_nanos; + new_table_block_iter_nanos = other.new_table_block_iter_nanos; + new_table_iterator_nanos = other.new_table_iterator_nanos; + block_seek_nanos = other.block_seek_nanos; + find_table_nanos = other.find_table_nanos; + bloom_memtable_hit_count = other.bloom_memtable_hit_count; + bloom_memtable_miss_count = other.bloom_memtable_miss_count; + bloom_sst_hit_count = other.bloom_sst_hit_count; + bloom_sst_miss_count = other.bloom_sst_miss_count; + key_lock_wait_time = other.key_lock_wait_time; + key_lock_wait_count = other.key_lock_wait_count; + + env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; + env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; + env_new_writable_file_nanos = other.env_new_writable_file_nanos; + env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; + env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; + env_new_directory_nanos = other.env_new_directory_nanos; + env_file_exists_nanos = other.env_file_exists_nanos; + env_get_children_nanos = other.env_get_children_nanos; + env_get_children_file_attributes_nanos = + other.env_get_children_file_attributes_nanos; + env_delete_file_nanos = other.env_delete_file_nanos; + env_create_dir_nanos = other.env_create_dir_nanos; + env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; + env_delete_dir_nanos = other.env_delete_dir_nanos; + env_get_file_size_nanos = other.env_get_file_size_nanos; + env_get_file_modification_time_nanos = + other.env_get_file_modification_time_nanos; + env_rename_file_nanos = other.env_rename_file_nanos; + env_link_file_nanos = other.env_link_file_nanos; + env_lock_file_nanos = other.env_lock_file_nanos; + env_unlock_file_nanos = other.env_unlock_file_nanos; + env_new_logger_nanos = other.env_new_logger_nanos; + get_cpu_nanos = other.get_cpu_nanos; + iter_next_cpu_nanos = other.iter_next_cpu_nanos; + iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; + iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; + if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { + ClearPerLevelPerfContext(); + } + if (other.level_to_perf_context != nullptr) { + level_to_perf_context = new std::map(); + *level_to_perf_context = *other.level_to_perf_context; + } + per_level_perf_context_enabled = other.per_level_perf_context_enabled; +#endif + return *this; +} + void PerfContext::Reset() { #ifndef NPERF_CONTEXT user_key_comparison_count = 0; @@ -38,6 +332,11 @@ void PerfContext::Reset() { block_read_count = 0; 
block_read_byte = 0; block_read_time = 0; + block_cache_index_hit_count = 0; + index_block_read_count = 0; + block_cache_filter_hit_count = 0; + filter_block_read_count = 0; + compression_dict_block_read_count = 0; block_checksum_time = 0; block_decompress_time = 0; get_read_bytes = 0; @@ -66,6 +365,8 @@ void PerfContext::Reset() { write_pre_and_post_process_time = 0; write_memtable_time = 0; write_delay_time = 0; + write_thread_wait_nanos = 0; + write_scheduling_flushes_compactions_time = 0; db_mutex_lock_nanos = 0; db_condition_wait_nanos = 0; merge_operator_time_nanos = 0; @@ -79,6 +380,8 @@ void PerfContext::Reset() { bloom_memtable_miss_count = 0; bloom_sst_hit_count = 0; bloom_sst_miss_count = 0; + key_lock_wait_time = 0; + key_lock_wait_count = 0; env_new_sequential_file_nanos = 0; env_new_random_access_file_nanos = 0; @@ -100,6 +403,15 @@ void PerfContext::Reset() { env_lock_file_nanos = 0; env_unlock_file_nanos = 0; env_new_logger_nanos = 0; + get_cpu_nanos = 0; + iter_next_cpu_nanos = 0; + iter_prev_cpu_nanos = 0; + iter_seek_cpu_nanos = 0; + if (per_level_perf_context_enabled && level_to_perf_context) { + for (auto& kv : *level_to_perf_context) { + kv.second.Reset(); + } + } #endif } @@ -108,6 +420,27 @@ void PerfContext::Reset() { ss << #counter << " = " << counter << ", "; \ } +#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ + if (per_level_perf_context_enabled && \ + level_to_perf_context) { \ + ss << #counter << " = "; \ + for (auto& kv : *level_to_perf_context) { \ + if (!exclude_zero_counters || (kv.second.counter > 0)) { \ + ss << kv.second.counter << "@level" << kv.first << ", "; \ + } \ + } \ + } + +void PerfContextByLevel::Reset() { +#ifndef NPERF_CONTEXT + bloom_filter_useful = 0; + bloom_filter_full_positive = 0; + bloom_filter_full_true_positive = 0; + block_cache_hit_count = 0; + block_cache_miss_count = 0; +#endif +} + std::string PerfContext::ToString(bool exclude_zero_counters) const { #ifdef NPERF_CONTEXT return ""; @@ -118,6 +451,11 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(block_read_count); PERF_CONTEXT_OUTPUT(block_read_byte); PERF_CONTEXT_OUTPUT(block_read_time); + PERF_CONTEXT_OUTPUT(block_cache_index_hit_count); + PERF_CONTEXT_OUTPUT(index_block_read_count); + PERF_CONTEXT_OUTPUT(block_cache_filter_hit_count); + PERF_CONTEXT_OUTPUT(filter_block_read_count); + PERF_CONTEXT_OUTPUT(compression_dict_block_read_count); PERF_CONTEXT_OUTPUT(block_checksum_time); PERF_CONTEXT_OUTPUT(block_decompress_time); PERF_CONTEXT_OUTPUT(get_read_bytes); @@ -144,6 +482,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(find_next_user_entry_time); PERF_CONTEXT_OUTPUT(write_pre_and_post_process_time); PERF_CONTEXT_OUTPUT(write_memtable_time); + PERF_CONTEXT_OUTPUT(write_thread_wait_nanos); + PERF_CONTEXT_OUTPUT(write_scheduling_flushes_compactions_time); PERF_CONTEXT_OUTPUT(db_mutex_lock_nanos); PERF_CONTEXT_OUTPUT(db_condition_wait_nanos); PERF_CONTEXT_OUTPUT(merge_operator_time_nanos); @@ -158,6 +498,8 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count); PERF_CONTEXT_OUTPUT(bloom_sst_hit_count); PERF_CONTEXT_OUTPUT(bloom_sst_miss_count); + PERF_CONTEXT_OUTPUT(key_lock_wait_time); + PERF_CONTEXT_OUTPUT(key_lock_wait_count); PERF_CONTEXT_OUTPUT(env_new_sequential_file_nanos); PERF_CONTEXT_OUTPUT(env_new_random_access_file_nanos); PERF_CONTEXT_OUTPUT(env_new_writable_file_nanos); @@ -178,8 +520,37 @@ 
std::string PerfContext::ToString(bool exclude_zero_counters) const { PERF_CONTEXT_OUTPUT(env_lock_file_nanos); PERF_CONTEXT_OUTPUT(env_unlock_file_nanos); PERF_CONTEXT_OUTPUT(env_new_logger_nanos); + PERF_CONTEXT_OUTPUT(get_cpu_nanos); + PERF_CONTEXT_OUTPUT(iter_next_cpu_nanos); + PERF_CONTEXT_OUTPUT(iter_prev_cpu_nanos); + PERF_CONTEXT_OUTPUT(iter_seek_cpu_nanos); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_hit_count); + PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_miss_count); return ss.str(); #endif } +void PerfContext::EnablePerLevelPerfContext() { + if (level_to_perf_context == nullptr) { + level_to_perf_context = new std::map(); + } + per_level_perf_context_enabled = true; +} + +void PerfContext::DisablePerLevelPerfContext(){ + per_level_perf_context_enabled = false; +} + +void PerfContext::ClearPerLevelPerfContext(){ + if (level_to_perf_context != nullptr) { + level_to_perf_context->clear(); + delete level_to_perf_context; + level_to_perf_context = nullptr; + } + per_level_perf_context_enabled = false; +} + } diff --git a/thirdparty/rocksdb/monitoring/perf_context_imp.h b/thirdparty/rocksdb/monitoring/perf_context_imp.h index 421a8cea15..e0ff8afc58 100644 --- a/thirdparty/rocksdb/monitoring/perf_context_imp.h +++ b/thirdparty/rocksdb/monitoring/perf_context_imp.h @@ -9,6 +9,16 @@ #include "util/stop_watch.h" namespace rocksdb { +#if defined(NPERF_CONTEXT) || !defined(ROCKSDB_SUPPORT_THREAD_LOCAL) +extern PerfContext perf_context; +#else +#if defined(OS_SOLARIS) +extern __thread PerfContext perf_context_; +#define perf_context (*get_perf_context()) +#else +extern thread_local PerfContext perf_context; +#endif +#endif #if defined(NPERF_CONTEXT) @@ -27,14 +37,29 @@ namespace rocksdb { #define PERF_TIMER_START(metric) perf_step_timer_##metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_GUARD(metric) \ - PerfStepTimer perf_step_timer_##metric(&(get_perf_context()->metric)); \ +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ + perf_step_timer_##metric.Start(); + +// Declare and set start time of the timer +#define PERF_CPU_TIMER_GUARD(metric, env) \ + PerfStepTimer perf_step_timer_##metric( \ + &(perf_context.metric), env, true, \ + PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) \ - PerfStepTimer perf_step_timer_##metric(&(get_perf_context()->metric), true); \ - if ((condition)) { \ - perf_step_timer_##metric.Start(); \ +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + false, PerfLevel::kEnableTime, stats, \ + ticker_type); \ + if (condition) { \ + perf_step_timer_##metric.Start(); \ } // Update metric with time elapsed since last START. 
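// EnablePerLevelPerfContext/ClearPerLevelPerfContext above manage
// level_to_perf_context as a lazily allocated, owned map. Worth noting that
// the find/else dance in PERF_COUNTER_BY_LEVEL_ADD (further below) can lean
// on std::map::operator[], which value-initializes a missing entry; a sketch
// of the same lifecycle with that simplification, on a reduced counter set:
#include <cstdint>
#include <map>

struct LevelCounters {
  uint64_t bloom_filter_useful = 0;
};

class PerLevelSketch {
 public:
  PerLevelSketch() = default;
  PerLevelSketch(const PerLevelSketch&) = delete;
  PerLevelSketch& operator=(const PerLevelSketch&) = delete;
  ~PerLevelSketch() { Clear(); }

  void Enable() {
    if (levels_ == nullptr) levels_ = new std::map<int, LevelCounters>();
    enabled_ = true;
  }
  void Add(int level, uint64_t value) {
    if (!enabled_ || levels_ == nullptr) return;
    // operator[] default-constructs the entry the first time a level is seen.
    (*levels_)[level].bloom_filter_useful += value;
  }
  void Clear() {
    delete levels_;
    levels_ = nullptr;
    enabled_ = false;
  }

 private:
  std::map<int, LevelCounters>* levels_ = nullptr;  // owned
  bool enabled_ = false;
};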
start time is reset @@ -44,9 +69,25 @@ namespace rocksdb { // Increase metric value #define PERF_COUNTER_ADD(metric, value) \ if (perf_level >= PerfLevel::kEnableCount) { \ - get_perf_context()->metric += value; \ + perf_context.metric += value; \ } +// Increase metric value +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ + if (perf_level >= PerfLevel::kEnableCount && \ + perf_context.per_level_perf_context_enabled && \ + perf_context.level_to_perf_context) { \ + if ((*(perf_context.level_to_perf_context)).find(level) != \ + (*(perf_context.level_to_perf_context)).end()) { \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + else { \ + PerfContextByLevel empty_context; \ + (*(perf_context.level_to_perf_context))[level] = empty_context; \ + (*(perf_context.level_to_perf_context))[level].metric += value; \ + } \ + } \ + #endif } diff --git a/thirdparty/rocksdb/monitoring/perf_step_timer.h b/thirdparty/rocksdb/monitoring/perf_step_timer.h index 4cb48b1256..6501bd54ab 100644 --- a/thirdparty/rocksdb/monitoring/perf_step_timer.h +++ b/thirdparty/rocksdb/monitoring/perf_step_timer.h @@ -12,26 +12,41 @@ namespace rocksdb { class PerfStepTimer { public: - explicit PerfStepTimer(uint64_t* metric, bool for_mutex = false) - : enabled_(perf_level >= PerfLevel::kEnableTime || - (!for_mutex && perf_level >= kEnableTimeExceptForMutex)), - env_(enabled_ ? Env::Default() : nullptr), + explicit PerfStepTimer( + uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, + PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, + Statistics* statistics = nullptr, uint32_t ticker_type = 0) + : perf_counter_enabled_(perf_level >= enable_level), + use_cpu_time_(use_cpu_time), + env_((perf_counter_enabled_ || statistics != nullptr) + ? ((env != nullptr) ? 
env : Env::Default()) + : nullptr), start_(0), - metric_(metric) {} + metric_(metric), + statistics_(statistics), + ticker_type_(ticker_type) {} ~PerfStepTimer() { Stop(); } void Start() { - if (enabled_) { - start_ = env_->NowNanos(); + if (perf_counter_enabled_ || statistics_ != nullptr) { + start_ = time_now(); + } + } + + uint64_t time_now() { + if (!use_cpu_time_) { + return env_->NowNanos(); + } else { + return env_->NowCPUNanos(); } } void Measure() { if (start_) { - uint64_t now = env_->NowNanos(); + uint64_t now = time_now(); *metric_ += now - start_; start_ = now; } @@ -39,16 +54,26 @@ class PerfStepTimer { void Stop() { if (start_) { - *metric_ += env_->NowNanos() - start_; + uint64_t duration = time_now() - start_; + if (perf_counter_enabled_) { + *metric_ += duration; + } + + if (statistics_ != nullptr) { + RecordTick(statistics_, ticker_type_, duration); + } start_ = 0; } } private: - const bool enabled_; + const bool perf_counter_enabled_; + const bool use_cpu_time_; Env* const env_; uint64_t start_; uint64_t* metric_; + Statistics* statistics_; + uint32_t ticker_type_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/monitoring/statistics.cc b/thirdparty/rocksdb/monitoring/statistics.cc index 9387043127..adb8cbfed8 100644 --- a/thirdparty/rocksdb/monitoring/statistics.cc +++ b/thirdparty/rocksdb/monitoring/statistics.cc @@ -17,13 +17,225 @@ namespace rocksdb { +// The order of items listed in Tickers should be the same as +// the order listed in TickersNameMap +const std::vector<std::pair<Tickers, std::string>> TickersNameMap = { + {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, + {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, + {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, + {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, + {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, + {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"}, + {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, + {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, + {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, + {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, + {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"}, + {BLOCK_CACHE_FILTER_BYTES_INSERT, + "rocksdb.block.cache.filter.bytes.insert"}, + {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, + {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, + {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, + {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"}, + {BLOCK_CACHE_DATA_BYTES_INSERT, "rocksdb.block.cache.data.bytes.insert"}, + {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"}, + {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"}, + {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, + {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"}, + {BLOOM_FILTER_FULL_TRUE_POSITIVE, + "rocksdb.bloom.filter.full.true.positive"}, + {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, + {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, + {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, + {SIM_BLOCK_CACHE_MISS, "rocksdb.sim.block.cache.miss"}, + {MEMTABLE_HIT, "rocksdb.memtable.hit"}, + {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {GET_HIT_L0, "rocksdb.l0.hit"}, + {GET_HIT_L1, "rocksdb.l1.hit"}, + {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, + {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, +
{COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, + {COMPACTION_KEY_DROP_RANGE_DEL, "rocksdb.compaction.key.drop.range_del"}, + {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, + {COMPACTION_RANGE_DEL_DROP_OBSOLETE, + "rocksdb.compaction.range_del.drop.obsolete"}, + {COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE, + "rocksdb.compaction.optimized.del.drop.obsolete"}, + {COMPACTION_CANCELLED, "rocksdb.compaction.cancelled"}, + {NUMBER_KEYS_WRITTEN, "rocksdb.number.keys.written"}, + {NUMBER_KEYS_READ, "rocksdb.number.keys.read"}, + {NUMBER_KEYS_UPDATED, "rocksdb.number.keys.updated"}, + {BYTES_WRITTEN, "rocksdb.bytes.written"}, + {BYTES_READ, "rocksdb.bytes.read"}, + {NUMBER_DB_SEEK, "rocksdb.number.db.seek"}, + {NUMBER_DB_NEXT, "rocksdb.number.db.next"}, + {NUMBER_DB_PREV, "rocksdb.number.db.prev"}, + {NUMBER_DB_SEEK_FOUND, "rocksdb.number.db.seek.found"}, + {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"}, + {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, + {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, + {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, + {NO_FILE_OPENS, "rocksdb.no.file.opens"}, + {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, + {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, + {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, + {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {STALL_MICROS, "rocksdb.stall.micros"}, + {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, + {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, + {NO_ITERATORS, "rocksdb.num.iterators"}, + {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, + {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, + {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, + {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, + {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, + {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, + {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, + {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, + {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, + {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, + {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, + {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + "rocksdb.block.cachecompressed.add.failures"}, + {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, + {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, + {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, + {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, + {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, + {WRITE_WITH_WAL, "rocksdb.write.wal"}, + {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, + {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, + {FLUSH_WRITE_BYTES, "rocksdb.flush.write.bytes"}, + {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, + "rocksdb.number.direct.load.table.properties"}, + {NUMBER_SUPERVERSION_ACQUIRES, "rocksdb.number.superversion_acquires"}, + {NUMBER_SUPERVERSION_RELEASES, "rocksdb.number.superversion_releases"}, + {NUMBER_SUPERVERSION_CLEANUPS, "rocksdb.number.superversion_cleanups"}, + {NUMBER_BLOCK_COMPRESSED, "rocksdb.number.block.compressed"}, + {NUMBER_BLOCK_DECOMPRESSED, "rocksdb.number.block.decompressed"}, + {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, + {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, + {FILTER_OPERATION_TOTAL_TIME, 
"rocksdb.filter.operation.time.nanos"}, + {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, + {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, + {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, + {READ_AMP_TOTAL_READ_BYTES, "rocksdb.read.amp.total.read.bytes"}, + {NUMBER_RATE_LIMITER_DRAINS, "rocksdb.number.rate_limiter.drains"}, + {NUMBER_ITER_SKIP, "rocksdb.number.iter.skip"}, + {BLOB_DB_NUM_PUT, "rocksdb.blobdb.num.put"}, + {BLOB_DB_NUM_WRITE, "rocksdb.blobdb.num.write"}, + {BLOB_DB_NUM_GET, "rocksdb.blobdb.num.get"}, + {BLOB_DB_NUM_MULTIGET, "rocksdb.blobdb.num.multiget"}, + {BLOB_DB_NUM_SEEK, "rocksdb.blobdb.num.seek"}, + {BLOB_DB_NUM_NEXT, "rocksdb.blobdb.num.next"}, + {BLOB_DB_NUM_PREV, "rocksdb.blobdb.num.prev"}, + {BLOB_DB_NUM_KEYS_WRITTEN, "rocksdb.blobdb.num.keys.written"}, + {BLOB_DB_NUM_KEYS_READ, "rocksdb.blobdb.num.keys.read"}, + {BLOB_DB_BYTES_WRITTEN, "rocksdb.blobdb.bytes.written"}, + {BLOB_DB_BYTES_READ, "rocksdb.blobdb.bytes.read"}, + {BLOB_DB_WRITE_INLINED, "rocksdb.blobdb.write.inlined"}, + {BLOB_DB_WRITE_INLINED_TTL, "rocksdb.blobdb.write.inlined.ttl"}, + {BLOB_DB_WRITE_BLOB, "rocksdb.blobdb.write.blob"}, + {BLOB_DB_WRITE_BLOB_TTL, "rocksdb.blobdb.write.blob.ttl"}, + {BLOB_DB_BLOB_FILE_BYTES_WRITTEN, "rocksdb.blobdb.blob.file.bytes.written"}, + {BLOB_DB_BLOB_FILE_BYTES_READ, "rocksdb.blobdb.blob.file.bytes.read"}, + {BLOB_DB_BLOB_FILE_SYNCED, "rocksdb.blobdb.blob.file.synced"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_COUNT, + "rocksdb.blobdb.blob.index.expired.count"}, + {BLOB_DB_BLOB_INDEX_EXPIRED_SIZE, "rocksdb.blobdb.blob.index.expired.size"}, + {BLOB_DB_BLOB_INDEX_EVICTED_COUNT, + "rocksdb.blobdb.blob.index.evicted.count"}, + {BLOB_DB_BLOB_INDEX_EVICTED_SIZE, "rocksdb.blobdb.blob.index.evicted.size"}, + {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, + {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, + {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, + {BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"}, + {BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"}, + {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"}, + {BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"}, + {BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"}, + {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"}, + {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, + {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, + {BLOB_DB_FIFO_BYTES_EVICTED, "rocksdb.blobdb.fifo.bytes.evicted"}, + {TXN_PREPARE_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.prepare"}, + {TXN_OLD_COMMIT_MAP_MUTEX_OVERHEAD, + "rocksdb.txn.overhead.mutex.old.commit.map"}, + {TXN_DUPLICATE_KEY_OVERHEAD, "rocksdb.txn.overhead.duplicate.key"}, + {TXN_SNAPSHOT_MUTEX_OVERHEAD, "rocksdb.txn.overhead.mutex.snapshot"}, + {NUMBER_MULTIGET_KEYS_FOUND, "rocksdb.number.multiget.keys.found"}, + {NO_ITERATOR_CREATED, "rocksdb.num.iterator.created"}, + {NO_ITERATOR_DELETED, "rocksdb.num.iterator.deleted"}, + {BLOCK_CACHE_COMPRESSION_DICT_MISS, + "rocksdb.block.cache.compression.dict.miss"}, + {BLOCK_CACHE_COMPRESSION_DICT_HIT, + "rocksdb.block.cache.compression.dict.hit"}, + {BLOCK_CACHE_COMPRESSION_DICT_ADD, + "rocksdb.block.cache.compression.dict.add"}, + {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + "rocksdb.block.cache.compression.dict.bytes.insert"}, + {BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + "rocksdb.block.cache.compression.dict.bytes.evict"}, +}; + +const 
std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = { + {DB_GET, "rocksdb.db.get.micros"}, + {DB_WRITE, "rocksdb.db.write.micros"}, + {COMPACTION_TIME, "rocksdb.compaction.times.micros"}, + {COMPACTION_CPU_TIME, "rocksdb.compaction.times.cpu_micros"}, + {SUBCOMPACTION_SETUP_TIME, "rocksdb.subcompaction.setup.times.micros"}, + {TABLE_SYNC_MICROS, "rocksdb.table.sync.micros"}, + {COMPACTION_OUTFILE_SYNC_MICROS, "rocksdb.compaction.outfile.sync.micros"}, + {WAL_FILE_SYNC_MICROS, "rocksdb.wal.file.sync.micros"}, + {MANIFEST_FILE_SYNC_MICROS, "rocksdb.manifest.file.sync.micros"}, + {TABLE_OPEN_IO_MICROS, "rocksdb.table.open.io.micros"}, + {DB_MULTIGET, "rocksdb.db.multiget.micros"}, + {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, + {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, + {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, + {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, + {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, + {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, + {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, + {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, + {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, + {DB_SEEK, "rocksdb.db.seek.micros"}, + {WRITE_STALL, "rocksdb.db.write.stall"}, + {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "rocksdb.bytes.per.read"}, + {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, + {BYTES_COMPRESSED, "rocksdb.bytes.compressed"}, + {BYTES_DECOMPRESSED, "rocksdb.bytes.decompressed"}, + {COMPRESSION_TIMES_NANOS, "rocksdb.compression.times.nanos"}, + {DECOMPRESSION_TIMES_NANOS, "rocksdb.decompression.times.nanos"}, + {READ_NUM_MERGE_OPERANDS, "rocksdb.read.num.merge_operands"}, + {BLOB_DB_KEY_SIZE, "rocksdb.blobdb.key.size"}, + {BLOB_DB_VALUE_SIZE, "rocksdb.blobdb.value.size"}, + {BLOB_DB_WRITE_MICROS, "rocksdb.blobdb.write.micros"}, + {BLOB_DB_GET_MICROS, "rocksdb.blobdb.get.micros"}, + {BLOB_DB_MULTIGET_MICROS, "rocksdb.blobdb.multiget.micros"}, + {BLOB_DB_SEEK_MICROS, "rocksdb.blobdb.seek.micros"}, + {BLOB_DB_NEXT_MICROS, "rocksdb.blobdb.next.micros"}, + {BLOB_DB_PREV_MICROS, "rocksdb.blobdb.prev.micros"}, + {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"}, + {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"}, + {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"}, + {BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"}, + {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"}, + {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, + {FLUSH_TIME, "rocksdb.db.flush.micros"}, +}; + std::shared_ptr<Statistics> CreateDBStatistics() { - return std::make_shared<StatisticsImpl>(nullptr, false); + return std::make_shared<StatisticsImpl>(nullptr); } -StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats, - bool enable_internal_stats) - : stats_(std::move(stats)), enable_internal_stats_(enable_internal_stats) {} +StatisticsImpl::StatisticsImpl(std::shared_ptr<Statistics> stats) + : stats_(std::move(stats)) {} StatisticsImpl::~StatisticsImpl() {}
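For orientation, a short sketch of how these pieces are reached from the public API: CreateDBStatistics() hands back the StatisticsImpl above, and each ticker read later sums the per-core shards. The path and workload are placeholders:

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

rocksdb::Options options;
options.create_if_missing = true;
options.statistics = rocksdb::CreateDBStatistics();  // StatisticsImpl(nullptr)
rocksdb::DB* db = nullptr;
rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/statsdb", &db);
// ... reads and writes ...
uint64_t misses =
    options.statistics->getTickerCount(rocksdb::BLOCK_CACHE_MISS);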
@@ -33,10 +245,7 @@ uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const { } uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const { - assert( - enable_internal_stats_ ? - tickerType < INTERNAL_TICKER_ENUM_MAX : - tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); uint64_t res = 0; for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType]; @@ -52,10 +261,7 @@ void StatisticsImpl::histogramData(uint32_t histogramType, std::unique_ptr<HistogramImpl> StatisticsImpl::getHistogramImplLocked( uint32_t histogramType) const { - assert( - enable_internal_stats_ ? - histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : - histogramType < HISTOGRAM_ENUM_MAX); + assert(histogramType < HISTOGRAM_ENUM_MAX); std::unique_ptr<HistogramImpl> res_hist(new HistogramImpl()); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { res_hist->Merge( @@ -80,8 +286,7 @@ void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) { } void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) { - assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX - : tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { if (core_idx == 0) { per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count; @@ -95,8 +300,7 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { uint64_t sum = 0; { MutexLock lock(&aggregate_lock_); - assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX - : tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { sum += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange( @@ -110,10 +314,7 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { } void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { - assert( - enable_internal_stats_ ? - tickerType < INTERNAL_TICKER_ENUM_MAX : - tickerType < TICKER_ENUM_MAX); + assert(tickerType < TICKER_ENUM_MAX); per_core_stats_.Access()->tickers_[tickerType].fetch_add( count, std::memory_order_relaxed); if (stats_ && tickerType < TICKER_ENUM_MAX) { @@ -121,14 +322,14 @@ void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { } }
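Because recordTick() only does a relaxed fetch_add on the caller's core-local slot, a coherent total always goes through the locked aggregation above. A hedged sketch of the read-and-reset pattern this enables; stats is assumed to come from CreateDBStatistics(), and the exporter is hypothetical:

// getAndResetTickerCount() sums every core-local counter and zeroes each
// one via exchange(0), so ticks are reported at most once per export cycle.
uint64_t written = stats->getAndResetTickerCount(rocksdb::BYTES_WRITTEN);
metrics_exporter.Emit("rocksdb.bytes.written", written);  // hypothetical sink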
-void StatisticsImpl::measureTime(uint32_t histogramType, uint64_t value) { - assert( - enable_internal_stats_ ? - histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : - histogramType < HISTOGRAM_ENUM_MAX); +void StatisticsImpl::recordInHistogram(uint32_t histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + if (get_stats_level() <= StatsLevel::kExceptHistogramOrTimers) { + return; + } per_core_stats_.Access()->histograms_[histogramType].Add(value); if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) { - stats_->measureTime(histogramType, value); + stats_->recordInHistogram(histogramType, value); } } @@ -157,35 +358,50 @@ std::string StatisticsImpl::ToString() const { std::string res; res.reserve(20000); for (const auto& t : TickersNameMap) { - if (t.first < TICKER_ENUM_MAX || enable_internal_stats_) { - char buffer[kTmpStrBufferSize]; - snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", - t.second.c_str(), getTickerCountLocked(t.first)); - res.append(buffer); - } + assert(t.first < TICKER_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", + t.second.c_str(), getTickerCountLocked(t.first)); + res.append(buffer); } for (const auto& h : HistogramsNameMap) { - if (h.first < HISTOGRAM_ENUM_MAX || enable_internal_stats_) { - char buffer[kTmpStrBufferSize]; - HistogramData hData; - getHistogramImplLocked(h.first)->Data(&hData); - snprintf( - buffer, kTmpStrBufferSize, - "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f 100 : %f\n", - h.second.c_str(), hData.median, hData.percentile95, - hData.percentile99, hData.max); - res.append(buffer); + assert(h.first < HISTOGRAM_ENUM_MAX); + char buffer[kTmpStrBufferSize]; + HistogramData hData; + getHistogramImplLocked(h.first)->Data(&hData); + // don't handle failures - buffer should always be big enough and arguments + // should be provided correctly + int ret = + snprintf(buffer, kTmpStrBufferSize, + "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64 " SUM : %" PRIu64 "\n", + h.second.c_str(), hData.median, hData.percentile95, + hData.percentile99, hData.max, hData.count, hData.sum); + if (ret < 0 || ret >= kTmpStrBufferSize) { + assert(false); + continue; } + res.append(buffer); } res.shrink_to_fit(); return res; } -bool StatisticsImpl::HistEnabledForType(uint32_t type) const { - if (LIKELY(!enable_internal_stats_)) { - return type < HISTOGRAM_ENUM_MAX; +bool StatisticsImpl::getTickerMap( + std::map<std::string, uint64_t>* stats_map) const { + assert(stats_map); + if (!stats_map) return false; + stats_map->clear(); + MutexLock lock(&aggregate_lock_); + for (const auto& t : TickersNameMap) { + assert(t.first < TICKER_ENUM_MAX); + (*stats_map)[t.second.c_str()] = getTickerCountLocked(t.first); } return true; } +bool StatisticsImpl::HistEnabledForType(uint32_t type) const { + return type < HISTOGRAM_ENUM_MAX; +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/monitoring/statistics.h b/thirdparty/rocksdb/monitoring/statistics.h index 6e915215de..952bf8cb41 100644 --- a/thirdparty/rocksdb/monitoring/statistics.h +++ b/thirdparty/rocksdb/monitoring/statistics.h @@ -6,9 +6,10 @@ #pragma once #include "rocksdb/statistics.h" -#include <vector> #include <atomic> +#include <map> #include <string> +#include <vector> #include "monitoring/histogram.h" #include "port/likely.h" @@ -22,6 +23,11 @@ #define ROCKSDB_FIELD_UNUSED #endif // __clang__ +#ifndef STRINGIFY +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) +#endif + namespace rocksdb { enum TickersInternal : uint32_t { @@ -34,11 +40,9 @@ enum HistogramsInternal : uint32_t { INTERNAL_HISTOGRAM_ENUM_MAX }; -
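The STRINGIFY/TOSTRING pair added above is the usual two-level stringification idiom: the extra indirection forces a macro argument to expand before '#' stringifies it. A self-contained illustration (CACHE_LINE_SIZE value is illustrative):

#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define CACHE_LINE_SIZE 64
// TOSTRING(CACHE_LINE_SIZE) -> STRINGIFY(64) -> "64", so the message below
// reads "Expected 64-byte aligned". Calling STRINGIFY(CACHE_LINE_SIZE)
// directly would produce the literal "CACHE_LINE_SIZE" instead.
static_assert(true, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned");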
class StatisticsImpl : public Statistics { public: - StatisticsImpl(std::shared_ptr<Statistics> stats, - bool enable_internal_stats); + StatisticsImpl(std::shared_ptr<Statistics> stats); virtual ~StatisticsImpl(); virtual uint64_t getTickerCount(uint32_t ticker_type) const override; @@ -49,17 +53,24 @@ class StatisticsImpl : public Statistics { virtual void setTickerCount(uint32_t ticker_type, uint64_t count) override; virtual uint64_t getAndResetTickerCount(uint32_t ticker_type) override; virtual void recordTick(uint32_t ticker_type, uint64_t count) override; - virtual void measureTime(uint32_t histogram_type, uint64_t value) override; + // The function is implemented for now for backward compatibility reason. + // In case a user explicitly calls it, for example, they may have a wrapped + // Statistics object, passing the call to recordTick() into here, nothing + // will break. + void measureTime(uint32_t histogramType, uint64_t time) override { + recordInHistogram(histogramType, time); + } + virtual void recordInHistogram(uint32_t histogram_type, + uint64_t value) override; virtual Status Reset() override; virtual std::string ToString() const override; + virtual bool getTickerMap(std::map<std::string, uint64_t>*) const override; virtual bool HistEnabledForType(uint32_t type) const override; private: // If non-nullptr, forwards updates to the object pointed to by `stats_`. std::shared_ptr<Statistics> stats_; - // TODO(ajkr): clean this up since there are no internal stats anymore - bool enable_internal_stats_; // Synchronizes anything that operates across other cores' local data, // such that operations like Reset() can be performed atomically. mutable port::Mutex aggregate_lock_; @@ -69,18 +80,23 @@ class StatisticsImpl : public Statistics { // cores can never share the same cache line. // // Alignment attributes expand to nothing depending on the platform - struct StatisticsData { + struct ALIGN_AS(CACHE_LINE_SIZE) StatisticsData { std::atomic_uint_fast64_t tickers_[INTERNAL_TICKER_ENUM_MAX] = {{0}}; HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX]; +#ifndef HAVE_ALIGNED_NEW char padding[(CACHE_LINE_SIZE - (INTERNAL_TICKER_ENUM_MAX * sizeof(std::atomic_uint_fast64_t) + INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) % - CACHE_LINE_SIZE) % - CACHE_LINE_SIZE] ROCKSDB_FIELD_UNUSED; + CACHE_LINE_SIZE)] ROCKSDB_FIELD_UNUSED; +#endif + void *operator new(size_t s) { return port::cacheline_aligned_alloc(s); } + void *operator new[](size_t s) { return port::cacheline_aligned_alloc(s); } + void operator delete(void *p) { port::cacheline_aligned_free(p); } + void operator delete[](void *p) { port::cacheline_aligned_free(p); } }; - static_assert(sizeof(StatisticsData) % 64 == 0, "Expected 64-byte aligned"); + static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); CoreLocalArray<StatisticsData> per_core_stats_; @@ -91,10 +107,17 @@ class StatisticsImpl : public Statistics { }; // Utility functions -inline void MeasureTime(Statistics* statistics, uint32_t histogram_type, - uint64_t value) { +inline void RecordInHistogram(Statistics* statistics, uint32_t histogram_type, + uint64_t value) { + if (statistics) { + statistics->recordInHistogram(histogram_type, value); + } +} + +inline void RecordTimeToHistogram(Statistics* statistics, + uint32_t histogram_type, uint64_t value) { if (statistics) { - statistics->measureTime(histogram_type, value); + statistics->reportTimeToHistogram(histogram_type, value); } }
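A quick consumer-side sketch of the new getTickerMap() override declared above; stats stands in for any Statistics object:

std::map<std::string, uint64_t> counters;
if (stats->getTickerMap(&counters)) {
  for (const auto& kv : counters) {
    // e.g. "rocksdb.block.cache.miss = 42"
    printf("%s = %llu\n", kv.first.c_str(),
           static_cast<unsigned long long>(kv.second));
  }
}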
diff --git a/thirdparty/rocksdb/monitoring/statistics_test.cc b/thirdparty/rocksdb/monitoring/statistics_test.cc index 43aacde9c1..a77022bfb3 100644 --- a/thirdparty/rocksdb/monitoring/statistics_test.cc +++ b/thirdparty/rocksdb/monitoring/statistics_test.cc @@ -16,7 +16,7 @@ class StatisticsTest : public testing::Test {}; // Sanity check to make sure that contents and order of TickersNameMap // match Tickers enum -TEST_F(StatisticsTest, Sanity) { +TEST_F(StatisticsTest, SanityTickers) { EXPECT_EQ(static_cast<size_t>(Tickers::TICKER_ENUM_MAX), TickersNameMap.size()); @@ -26,6 +26,18 @@ TEST_F(StatisticsTest, Sanity) { } } +// Sanity check to make sure that contents and order of HistogramsNameMap +// match Histograms enum +TEST_F(StatisticsTest, SanityHistograms) { + EXPECT_EQ(static_cast<size_t>(Histograms::HISTOGRAM_ENUM_MAX), + HistogramsNameMap.size()); + + for (uint32_t h = 0; h < Histograms::HISTOGRAM_ENUM_MAX; h++) { + auto pair = HistogramsNameMap[static_cast<size_t>(h)]; + ASSERT_EQ(pair.first, h) << "Mismatch at " << pair.second; + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/monitoring/thread_status_impl.cc b/thirdparty/rocksdb/monitoring/thread_status_impl.cc index e263ce661e..f4531ce75c 100644 --- a/thirdparty/rocksdb/monitoring/thread_status_impl.cc +++ b/thirdparty/rocksdb/monitoring/thread_status_impl.cc @@ -14,14 +14,21 @@ namespace rocksdb { #ifdef ROCKSDB_USING_THREAD_STATUS -const std::string& ThreadStatus::GetThreadTypeName( +std::string ThreadStatus::GetThreadTypeName( ThreadStatus::ThreadType thread_type) { - static std::string thread_type_names[NUM_THREAD_TYPES + 1] = { - "High Pri", "Low Pri", "User", "Unknown"}; - if (thread_type < 0 || thread_type >= NUM_THREAD_TYPES) { - return thread_type_names[NUM_THREAD_TYPES]; // "Unknown" + switch (thread_type) { + case ThreadStatus::ThreadType::HIGH_PRIORITY: + return "High Pri"; + case ThreadStatus::ThreadType::LOW_PRIORITY: + return "Low Pri"; + case ThreadStatus::ThreadType::USER: + return "User"; + case ThreadStatus::ThreadType::BOTTOM_PRIORITY: + return "Bottom Pri"; + case ThreadStatus::ThreadType::NUM_THREAD_TYPES: + assert(false); } - return thread_type_names[thread_type]; + return "Unknown"; }
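GetThreadTypeName() above deliberately switches over every enumerator with no default case: with -Wswitch (or -Werror=switch), adding a new ThreadType without extending the switch becomes a compile-time diagnostic instead of a silent "Unknown". The same pattern in miniature, with an illustrative enum that is not part of the patch:

enum class Priority { kHigh, kLow };
const char* Name(Priority p) {
  switch (p) {            // no default: compiler flags unhandled enumerators
    case Priority::kHigh: return "High";
    case Priority::kLow:  return "Low";
  }
  return "Unknown";       // reached only for out-of-range values
}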
const std::string& ThreadStatus::GetOperationName( @@ -77,10 +84,8 @@ const std::string& ThreadStatus::GetOperationPropertyName( } } -std::map<std::string, uint64_t> - ThreadStatus::InterpretOperationProperties( - ThreadStatus::OperationType op_type, - const uint64_t* op_properties) { +std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties( + ThreadStatus::OperationType op_type, const uint64_t* op_properties) { int num_properties; switch (op_type) { case OP_COMPACTION: @@ -95,20 +100,14 @@ std::map<std::string, uint64_t> std::map<std::string, uint64_t> property_map; for (int i = 0; i < num_properties; ++i) { - if (op_type == OP_COMPACTION && - i == COMPACTION_INPUT_OUTPUT_LEVEL) { - property_map.insert( - {"BaseInputLevel", op_properties[i] >> 32}); + if (op_type == OP_COMPACTION && i == COMPACTION_INPUT_OUTPUT_LEVEL) { + property_map.insert({"BaseInputLevel", op_properties[i] >> 32}); property_map.insert( {"OutputLevel", op_properties[i] % (uint64_t(1) << 32U)}); - } else if (op_type == OP_COMPACTION && - i == COMPACTION_PROP_FLAGS) { - property_map.insert( - {"IsManual", ((op_properties[i] & 2) >> 1)}); - property_map.insert( - {"IsDeletion", ((op_properties[i] & 4) >> 2)}); - property_map.insert( - {"IsTrivialMove", ((op_properties[i] & 8) >> 3)}); + } else if (op_type == OP_COMPACTION && i == COMPACTION_PROP_FLAGS) { + property_map.insert({"IsManual", ((op_properties[i] & 2) >> 1)}); + property_map.insert({"IsDeletion", ((op_properties[i] & 4) >> 2)}); + property_map.insert({"IsTrivialMove", ((op_properties[i] & 8) >> 3)}); } else { property_map.insert( {GetOperationPropertyName(op_type, i), op_properties[i]}); @@ -117,49 +116,46 @@ std::map<std::string, uint64_t> return property_map; }
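A worked example of the packing that InterpretOperationProperties() undoes (values are illustrative; the enum names are the ones used above): a compaction from base input level 1 into output level 2, flagged as manual and trivial-move.

uint64_t props[ThreadStatus::kNumOperationProperties] = {0};
props[ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL] =
    (uint64_t(1) << 32) | 2;             // high 32 bits = 1, low 32 bits = 2
props[ThreadStatus::COMPACTION_PROP_FLAGS] = 2 | 8;  // bit 1 manual, bit 3 trivial
// InterpretOperationProperties(ThreadStatus::OP_COMPACTION, props) yields:
//   BaseInputLevel = 1, OutputLevel = 2,
//   IsManual = 1, IsDeletion = 0, IsTrivialMove = 1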
- #else -const std::string& ThreadStatus::GetThreadTypeName( - ThreadStatus::ThreadType thread_type) { +std::string ThreadStatus::GetThreadTypeName( + ThreadStatus::ThreadType /*thread_type*/) { static std::string dummy_str = ""; return dummy_str; } const std::string& ThreadStatus::GetOperationName( - ThreadStatus::OperationType op_type) { + ThreadStatus::OperationType /*op_type*/) { static std::string dummy_str = ""; return dummy_str; } const std::string& ThreadStatus::GetOperationStageName( - ThreadStatus::OperationStage stage) { + ThreadStatus::OperationStage /*stage*/) { static std::string dummy_str = ""; return dummy_str; } const std::string& ThreadStatus::GetStateName( - ThreadStatus::StateType state_type) { + ThreadStatus::StateType /*state_type*/) { static std::string dummy_str = ""; return dummy_str; } -const std::string ThreadStatus::MicrosToString( - uint64_t op_elapsed_time) { +const std::string ThreadStatus::MicrosToString(uint64_t /*op_elapsed_time*/) { static std::string dummy_str = ""; return dummy_str; } const std::string& ThreadStatus::GetOperationPropertyName( - ThreadStatus::OperationType op_type, int i) { + ThreadStatus::OperationType /*op_type*/, int /*i*/) { static std::string dummy_str = ""; return dummy_str; } -std::map<std::string, uint64_t> - ThreadStatus::InterpretOperationProperties( - ThreadStatus::OperationType op_type, - const uint64_t* op_properties) { +std::map<std::string, uint64_t> ThreadStatus::InterpretOperationProperties( + ThreadStatus::OperationType /*op_type*/, + const uint64_t* /*op_properties*/) { return std::map<std::string, uint64_t>(); } diff --git a/thirdparty/rocksdb/monitoring/thread_status_updater.cc b/thirdparty/rocksdb/monitoring/thread_status_updater.cc index 7441c35f8b..cde44928b6 100644 --- a/thirdparty/rocksdb/monitoring/thread_status_updater.cc +++ b/thirdparty/rocksdb/monitoring/thread_status_updater.cc @@ -15,8 +15,8 @@ namespace rocksdb { __thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr; -void ThreadStatusUpdater::RegisterThread( - ThreadStatus::ThreadType ttype, uint64_t thread_id) { +void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType ttype, + uint64_t thread_id) { if (UNLIKELY(thread_status_data_ == nullptr)) { thread_status_data_ = new ThreadStatusData(); thread_status_data_->thread_type = ttype; @@ -43,8 +43,7 @@ void ThreadStatusUpdater::ResetThreadStatus() { SetColumnFamilyInfoKey(nullptr); } -void ThreadStatusUpdater::SetColumnFamilyInfoKey( - const void* cf_key) { +void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) { auto* data = Get(); if (data == nullptr) { return; @@ -78,13 +77,12 @@ void ThreadStatusUpdater::SetThreadOperation( data->operation_type.store(type, std::memory_order_release); if (type == ThreadStatus::OP_UNKNOWN) { data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN, - std::memory_order_relaxed); + std::memory_order_relaxed); ClearThreadOperationProperties(); } } -void ThreadStatusUpdater::SetThreadOperationProperty( - int i, uint64_t value) { +void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { return; @@ -92,8 +90,8 @@ void ThreadStatusUpdater::SetThreadOperationProperty( data->op_properties[i].store(value, std::memory_order_relaxed); } -void ThreadStatusUpdater::IncreaseThreadOperationProperty( - int i, uint64_t delta) { +void ThreadStatusUpdater::IncreaseThreadOperationProperty(int i, + uint64_t delta) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { return; @@ -115,9 +113,9 @@ void ThreadStatusUpdater::ClearThreadOperation() { return; } data->operation_stage.store(ThreadStatus::STAGE_UNKNOWN, - std::memory_order_relaxed); - data->operation_type.store( - ThreadStatus::OP_UNKNOWN, std::memory_order_relaxed); + std::memory_order_relaxed); + data->operation_type.store(ThreadStatus::OP_UNKNOWN, + std::memory_order_relaxed); ClearThreadOperationProperties(); } @@ -137,12 +135,10 @@ ThreadStatus::OperationStage ThreadStatusUpdater::SetThreadOperationStage( if (data == nullptr) { return ThreadStatus::STAGE_UNKNOWN; } - return data->operation_stage.exchange( - stage, std::memory_order_relaxed); + return data->operation_stage.exchange(stage, std::memory_order_relaxed); } -void ThreadStatusUpdater::SetThreadState( - const ThreadStatus::StateType type) { +void ThreadStatusUpdater::SetThreadState(const ThreadStatus::StateType type) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { return; @@ -155,8 +151,8 @@ void ThreadStatusUpdater::ClearThreadState() { if (data == nullptr) { return; } - data->state_type.store( - ThreadStatus::STATE_UNKNOWN, std::memory_order_relaxed); + data->state_type.store(ThreadStatus::STATE_UNKNOWN, + std::memory_order_relaxed); } Status ThreadStatusUpdater::GetThreadList( @@ -168,50 +164,40 @@ Status ThreadStatusUpdater::GetThreadList( std::lock_guard<std::mutex> lck(thread_list_mutex_); for (auto* thread_data : thread_data_set_) { assert(thread_data); - auto thread_id = thread_data->thread_id.load( - std::memory_order_relaxed); - auto thread_type = thread_data->thread_type.load( - std::memory_order_relaxed); + auto thread_id = thread_data->thread_id.load(std::memory_order_relaxed); + auto thread_type = thread_data->thread_type.load(std::memory_order_relaxed); // Since any change to cf_info_map requires thread_list_mutex, // which is currently held by GetThreadList(), here we can safely // use "memory_order_relaxed" to load the cf_key. - auto cf_key = thread_data->cf_key.load( - std::memory_order_relaxed); - auto iter = cf_info_map_.find(cf_key); - auto* cf_info = iter != cf_info_map_.end() ? - iter->second.get() : nullptr; - const std::string* db_name = nullptr; - const std::string* cf_name = nullptr; + auto cf_key = thread_data->cf_key.load(std::memory_order_relaxed); + ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN; ThreadStatus::OperationStage op_stage = ThreadStatus::STAGE_UNKNOWN; ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN; uint64_t op_elapsed_micros = 0; uint64_t op_props[ThreadStatus::kNumOperationProperties] = {0}; - if (cf_info != nullptr) { - db_name = &cf_info->db_name; - cf_name = &cf_info->cf_name; - op_type = thread_data->operation_type.load( - std::memory_order_acquire); + + auto iter = cf_info_map_.find(cf_key); + if (iter != cf_info_map_.end()) { + op_type = thread_data->operation_type.load(std::memory_order_acquire); // display lower-level info only when higher-level info is available.
if (op_type != ThreadStatus::OP_UNKNOWN) { op_elapsed_micros = now_micros - thread_data->op_start_time.load( - std::memory_order_relaxed); - op_stage = thread_data->operation_stage.load( - std::memory_order_relaxed); - state_type = thread_data->state_type.load( - std::memory_order_relaxed); + std::memory_order_relaxed); + op_stage = thread_data->operation_stage.load(std::memory_order_relaxed); + state_type = thread_data->state_type.load(std::memory_order_relaxed); for (int i = 0; i < ThreadStatus::kNumOperationProperties; ++i) { - op_props[i] = thread_data->op_properties[i].load( - std::memory_order_relaxed); + op_props[i] = + thread_data->op_properties[i].load(std::memory_order_relaxed); } } } + thread_list->emplace_back( thread_id, thread_type, - db_name ? *db_name : "", - cf_name ? *cf_name : "", - op_type, op_elapsed_micros, op_stage, op_props, - state_type); + iter != cf_info_map_.end() ? iter->second.db_name : "", + iter != cf_info_map_.end() ? iter->second.cf_name : "", op_type, + op_elapsed_micros, op_stage, op_props, state_type); } return Status::OK(); @@ -222,22 +208,23 @@ ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() { return nullptr; } if (!thread_status_data_->enable_tracking) { - assert(thread_status_data_->cf_key.load( - std::memory_order_relaxed) == nullptr); + assert(thread_status_data_->cf_key.load(std::memory_order_relaxed) == + nullptr); return nullptr; } return thread_status_data_; } -void ThreadStatusUpdater::NewColumnFamilyInfo( - const void* db_key, const std::string& db_name, - const void* cf_key, const std::string& cf_name) { +void ThreadStatusUpdater::NewColumnFamilyInfo(const void* db_key, + const std::string& db_name, + const void* cf_key, + const std::string& cf_name) { // Acquiring same lock as GetThreadList() to guarantee // a consistent view of global column family table (cf_info_map). std::lock_guard<std::mutex> lck(thread_list_mutex_); - cf_info_map_[cf_key].reset( - new ConstantColumnFamilyInfo(db_key, db_name, cf_name)); + cf_info_map_.emplace(std::piecewise_construct, std::make_tuple(cf_key), + std::make_tuple(db_key, db_name, cf_name)); db_key_map_[db_key].insert(cf_key); } @@ -245,25 +232,20 @@ void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { // Acquiring same lock as GetThreadList() to guarantee // a consistent view of global column family table (cf_info_map). std::lock_guard<std::mutex> lck(thread_list_mutex_); + auto cf_pair = cf_info_map_.find(cf_key); - if (cf_pair == cf_info_map_.end()) { - return; + if (cf_pair != cf_info_map_.end()) { + // Remove its entry from db_key_map_ by the following steps: + // 1. Obtain the entry in db_key_map_ whose set contains cf_key + // 2. Remove it from the set. + ConstantColumnFamilyInfo& cf_info = cf_pair->second; + auto db_pair = db_key_map_.find(cf_info.db_key); + assert(db_pair != db_key_map_.end()); + size_t result __attribute__((__unused__)); + result = db_pair->second.erase(cf_key); + assert(result); + cf_info_map_.erase(cf_pair); }
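NewColumnFamilyInfo() above switches from storing unique_ptrs to constructing map values in place. The emplace(std::piecewise_construct, ...) form matters because the mapped value is built directly from the second tuple's arguments, so the value type needs no copy or move constructor. A stripped-down illustration, with Info standing in for ConstantColumnFamilyInfo:

#include <string>
#include <tuple>
#include <unordered_map>

struct Info {
  Info(const void* db, std::string db_n, std::string cf_n)
      : db_key(db), db_name(std::move(db_n)), cf_name(std::move(cf_n)) {}
  const void* db_key;
  std::string db_name;
  std::string cf_name;
};

std::unordered_map<const void*, Info> infos;
const void* cf_key = &infos;  // any stable address works as a key
infos.emplace(std::piecewise_construct, std::make_tuple(cf_key),
              std::make_tuple(nullptr, std::string("db"), std::string("cf")));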
- - auto* cf_info = cf_pair->second.get(); - assert(cf_info); - - // Remove its entry from db_key_map_ by the following steps: - // 1. Obtain the entry in db_key_map_ whose set contains cf_key - // 2. Remove it from the set. - auto db_pair = db_key_map_.find(cf_info->db_key); - assert(db_pair != db_key_map_.end()); - size_t result __attribute__((unused)) = db_pair->second.erase(cf_key); - assert(result); - - cf_pair->second.reset(); - result = cf_info_map_.erase(cf_key); - assert(result); } void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { @@ -277,73 +259,56 @@ void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { return; } - size_t result __attribute__((unused)) = 0; for (auto cf_key : db_pair->second) { auto cf_pair = cf_info_map_.find(cf_key); - if (cf_pair == cf_info_map_.end()) { - continue; + if (cf_pair != cf_info_map_.end()) { + cf_info_map_.erase(cf_pair); } - cf_pair->second.reset(); - result = cf_info_map_.erase(cf_key); - assert(result); } db_key_map_.erase(db_key); } #else -void ThreadStatusUpdater::RegisterThread( - ThreadStatus::ThreadType ttype, uint64_t thread_id) { -} +void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType /*ttype*/, + uint64_t /*thread_id*/) {} -void ThreadStatusUpdater::UnregisterThread() { -} +void ThreadStatusUpdater::UnregisterThread() {} -void ThreadStatusUpdater::ResetThreadStatus() { -} +void ThreadStatusUpdater::ResetThreadStatus() {} -void ThreadStatusUpdater::SetColumnFamilyInfoKey( - const void* cf_key) { -} +void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* /*cf_key*/) {} void ThreadStatusUpdater::SetThreadOperation( - const ThreadStatus::OperationType type) { -} + const ThreadStatus::OperationType /*type*/) {} -void ThreadStatusUpdater::ClearThreadOperation() { -} +void ThreadStatusUpdater::ClearThreadOperation() {} void ThreadStatusUpdater::SetThreadState( - const ThreadStatus::StateType type) { -} + const ThreadStatus::StateType /*type*/) {} -void ThreadStatusUpdater::ClearThreadState() { -} +void ThreadStatusUpdater::ClearThreadState() {} Status ThreadStatusUpdater::GetThreadList( - std::vector<ThreadStatus>* thread_list) { + std::vector<ThreadStatus>* /*thread_list*/) { return Status::NotSupported( "GetThreadList is not supported in the current running environment."); } -void ThreadStatusUpdater::NewColumnFamilyInfo( - const void* db_key, const std::string& db_name, - const void* cf_key, const std::string& cf_name) { -} +void ThreadStatusUpdater::NewColumnFamilyInfo(const void* /*db_key*/, + const std::string& /*db_name*/, + const void* /*cf_key*/, + const std::string& /*cf_name*/) {} -void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { -} +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* /*cf_key*/) {} -void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { -} +void ThreadStatusUpdater::EraseDatabaseInfo(const void* /*db_key*/) {} -void ThreadStatusUpdater::SetThreadOperationProperty( - int i, uint64_t value) { -} +void ThreadStatusUpdater::SetThreadOperationProperty(int /*i*/, + uint64_t /*value*/) {} -void ThreadStatusUpdater::IncreaseThreadOperationProperty( - int i, uint64_t delta) { -} +void ThreadStatusUpdater::IncreaseThreadOperationProperty(int /*i*/, + uint64_t /*delta*/) {} #endif // ROCKSDB_USING_THREAD_STATUS } // namespace rocksdb
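The no-op branch above comments out parameter names rather than deleting them: the signatures stay identical to the ROCKSDB_USING_THREAD_STATUS build while -Wunused-parameter stays quiet. In miniature:

// Same signature as the real implementation; the commented-out names
// document intent without triggering -Wunused-parameter.
void RegisterThread(int /*thread_type*/, uint64_t /*thread_id*/) {}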
diff --git a/thirdparty/rocksdb/monitoring/thread_status_updater.h b/thirdparty/rocksdb/monitoring/thread_status_updater.h index 69b4d4f7ec..6706d159df 100644 --- a/thirdparty/rocksdb/monitoring/thread_status_updater.h +++ b/thirdparty/rocksdb/monitoring/thread_status_updater.h @@ -218,8 +218,7 @@ class ThreadStatusUpdater { // globally instead of inside DB is to avoid the situation where DB is // closing while the GetThreadList function has already gotten the pointer to its // ConstantColumnFamilyInfo. - std::unordered_map< - const void*, std::unique_ptr<ConstantColumnFamilyInfo>> cf_info_map_; + std::unordered_map<const void*, ConstantColumnFamilyInfo> cf_info_map_; // A db_key to cf_key map that allows erasing elements in cf_info_map // associated to the same db_key faster. diff --git a/thirdparty/rocksdb/monitoring/thread_status_updater_debug.cc b/thirdparty/rocksdb/monitoring/thread_status_updater_debug.cc index eec52e1887..8dc0fe6fd9 100644 --- a/thirdparty/rocksdb/monitoring/thread_status_updater_debug.cc +++ b/thirdparty/rocksdb/monitoring/thread_status_updater_debug.cc @@ -13,19 +13,17 @@ namespace rocksdb { #ifndef NDEBUG #ifdef ROCKSDB_USING_THREAD_STATUS void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( - const std::vector<ColumnFamilyHandle*>& handles, - bool check_exist) { + const std::vector<ColumnFamilyHandle*>& handles, bool check_exist) { std::unique_lock<std::mutex> lock(thread_list_mutex_); if (check_exist) { assert(cf_info_map_.size() == handles.size()); } for (auto* handle : handles) { auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(handle)->cfd(); - auto iter __attribute__((unused)) = cf_info_map_.find(cfd); + auto iter __attribute__((__unused__)) = cf_info_map_.find(cfd); if (check_exist) { assert(iter != cf_info_map_.end()); - assert(iter->second); - assert(iter->second->cf_name == cfd->GetName()); + assert(iter->second.cf_name == cfd->GetName()); } else { assert(iter == cf_info_map_.end()); } @@ -35,12 +33,10 @@ void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( #else void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( - const std::vector<ColumnFamilyHandle*>& handles, - bool check_exist) { + const std::vector<ColumnFamilyHandle*>& /*handles*/, bool /*check_exist*/) { } #endif // ROCKSDB_USING_THREAD_STATUS #endif // !NDEBUG - } // namespace rocksdb diff --git a/thirdparty/rocksdb/monitoring/thread_status_util.cc b/thirdparty/rocksdb/monitoring/thread_status_util.cc index 50692dfe55..c2af0a5745 100644 --- a/thirdparty/rocksdb/monitoring/thread_status_util.cc +++ b/thirdparty/rocksdb/monitoring/thread_status_util.cc @@ -10,20 +10,18 @@ namespace rocksdb { - #ifdef ROCKSDB_USING_THREAD_STATUS -__thread ThreadStatusUpdater* - ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +__thread ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = + nullptr; __thread bool ThreadStatusUtil::thread_updater_initialized_ = false; -void ThreadStatusUtil::RegisterThread( - const Env* env, ThreadStatus::ThreadType thread_type) { +void ThreadStatusUtil::RegisterThread(const Env* env, + ThreadStatus::ThreadType thread_type) { if (!MaybeInitThreadLocalUpdater(env)) { return; } assert(thread_updater_local_cache_); - thread_updater_local_cache_->RegisterThread( - thread_type, env->GetThreadID()); + thread_updater_local_cache_->RegisterThread(thread_type, env->GetThreadID()); } void ThreadStatusUtil::UnregisterThread() { @@ -80,28 +78,25 @@ ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage( return thread_updater_local_cache_->SetThreadOperationStage(stage); } -void ThreadStatusUtil::SetThreadOperationProperty( - int code, uint64_t value) { +void ThreadStatusUtil::SetThreadOperationProperty(int code, uint64_t value) { if (thread_updater_local_cache_ == nullptr) { // thread_updater_local_cache_ must be set in SetColumnFamily // or other ThreadStatusUtil functions.
return; } - thread_updater_local_cache_->SetThreadOperationProperty( - code, value); + thread_updater_local_cache_->SetThreadOperationProperty(code, value); } -void ThreadStatusUtil::IncreaseThreadOperationProperty( - int code, uint64_t delta) { +void ThreadStatusUtil::IncreaseThreadOperationProperty(int code, + uint64_t delta) { if (thread_updater_local_cache_ == nullptr) { // thread_updater_local_cache_ must be set in SetColumnFamily // or other ThreadStatusUtil functions. return; } - thread_updater_local_cache_->IncreaseThreadOperationProperty( - code, delta); + thread_updater_local_cache_->IncreaseThreadOperationProperty(code, delta); } void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { @@ -135,8 +130,7 @@ void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db, } } -void ThreadStatusUtil::EraseColumnFamilyInfo( - const ColumnFamilyData* cfd) { +void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* cfd) { if (thread_updater_local_cache_ == nullptr) { return; } @@ -173,49 +167,39 @@ AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() { ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; bool ThreadStatusUtil::thread_updater_initialized_ = false; -bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { return false; } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, - const Env* env, - bool enable_thread_tracking) {} +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/, + const Env* /*env*/, + bool /*enable_thread_tracking*/) {} -void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { -} +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} -void ThreadStatusUtil::SetThreadOperationProperty( - int code, uint64_t value) { -} +void ThreadStatusUtil::SetThreadOperationProperty(int /*code*/, + uint64_t /*value*/) {} -void ThreadStatusUtil::IncreaseThreadOperationProperty( - int code, uint64_t delta) { -} +void ThreadStatusUtil::IncreaseThreadOperationProperty(int /*code*/, + uint64_t /*delta*/) {} -void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { -} +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} -void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db, - const ColumnFamilyData* cfd, - const std::string& cf_name, - const Env* env) {} +void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, + const ColumnFamilyData* /*cfd*/, + const std::string& /*cf_name*/, + const Env* /*env*/) {} -void ThreadStatusUtil::EraseColumnFamilyInfo( - const ColumnFamilyData* cfd) { -} +void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} -void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { -} +void ThreadStatusUtil::EraseDatabaseInfo(const DB* /*db*/) {} -void ThreadStatusUtil::ResetThreadStatus() { -} +void ThreadStatusUtil::ResetThreadStatus() {} AutoThreadOperationStageUpdater::AutoThreadOperationStageUpdater( - ThreadStatus::OperationStage stage) { -} + ThreadStatus::OperationStage /*stage*/) {} -AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() { -} +AutoThreadOperationStageUpdater::~AutoThreadOperationStageUpdater() {} #endif // ROCKSDB_USING_THREAD_STATUS diff --git a/thirdparty/rocksdb/options/cf_options.cc b/thirdparty/rocksdb/options/cf_options.cc index 67cbef68f6..6957e150f1 100644 --- a/thirdparty/rocksdb/options/cf_options.cc +++ 
b/thirdparty/rocksdb/options/cf_options.cc @@ -17,6 +17,7 @@ #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "rocksdb/concurrent_task_limiter.h" namespace rocksdb { @@ -27,9 +28,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& cf_options) : compaction_style(cf_options.compaction_style), compaction_pri(cf_options.compaction_pri), - compaction_options_universal(cf_options.compaction_options_universal), - compaction_options_fifo(cf_options.compaction_options_fifo), - prefix_extractor(cf_options.prefix_extractor.get()), user_comparator(cf_options.comparator), internal_comparator(InternalKeyComparator(cf_options.comparator)), merge_operator(cf_options.merge_operator.get()), @@ -44,6 +42,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, info_log(db_options.info_log.get()), statistics(db_options.statistics.get()), rate_limiter(db_options.rate_limiter.get()), + info_log_level(db_options.info_log_level), env(db_options.env), allow_mmap_reads(db_options.allow_mmap_reads), allow_mmap_writes(db_options.allow_mmap_writes), @@ -59,6 +58,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, use_fsync(db_options.use_fsync), compression_per_level(cf_options.compression_per_level), bottommost_compression(cf_options.bottommost_compression), + bottommost_compression_opts(cf_options.bottommost_compression_opts), compression_opts(cf_options.compression_opts), level_compaction_dynamic_level_bytes( cf_options.level_compaction_dynamic_level_bytes), @@ -66,16 +66,18 @@ ImmutableCFOptions::ImmutableCFOptions(const ImmutableDBOptions& db_options, db_options.access_hint_on_compaction_start), new_table_reader_for_compaction_inputs( db_options.new_table_reader_for_compaction_inputs), - compaction_readahead_size(db_options.compaction_readahead_size), num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), allow_ingest_behind(db_options.allow_ingest_behind), + preserve_deletes(db_options.preserve_deletes), listeners(db_options.listeners), row_cache(db_options.row_cache), max_subcompactions(db_options.max_subcompactions), memtable_insert_with_hint_prefix_extractor( - cf_options.memtable_insert_with_hint_prefix_extractor.get()) {} + cf_options.memtable_insert_with_hint_prefix_extractor.get()), + cf_paths(cf_options.cf_paths), + compaction_thread_limiter(cf_options.compaction_thread_limiter) {} // Multiply two operands. If they overflow, return op1. uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { @@ -88,6 +90,24 @@ uint64_t MultiplyCheckOverflow(uint64_t op1, double op2) { return static_cast<uint64_t>(op1 * op2); } +// when level_compaction_dynamic_level_bytes is true and leveled compaction +// is used, the base level is not always L1, so precomputed max_file_size can +// no longer be used. Recompute file_size_for_level from base level.
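+// Worked example (editor's sketch, values illustrative): with
+// level_compaction_dynamic_level_bytes on, leveled compaction, base_level = 4
+// and num_levels = 7, data lives in L4..L6, so a query for level 5 returns
+// max_file_size[5 - 4] = max_file_size[1]; with the option off, the same
+// query falls through to the precomputed max_file_size[5].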
+uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, + int level, CompactionStyle compaction_style, int base_level, + bool level_compaction_dynamic_level_bytes) { + if (!level_compaction_dynamic_level_bytes || level < base_level || + compaction_style != kCompactionStyleLevel) { + assert(level >= 0); + assert(level < (int)cf_options.max_file_size.size()); + return cf_options.max_file_size[level]; + } else { + assert(level >= 0 && base_level >= 0); + assert(level - base_level < (int)cf_options.max_file_size.size()); + return cf_options.max_file_size[level - base_level]; + } +} + void MutableCFOptions::RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style) { max_file_size.resize(num_levels); @@ -103,12 +123,6 @@ void MutableCFOptions::RefreshDerivedOptions(int num_levels, } } -uint64_t MutableCFOptions::MaxFileSizeForLevel(int level) const { - assert(level >= 0); - assert(level < (int)max_file_size.size()); - return max_file_size[level]; -} - void MutableCFOptions::Dump(Logger* log) const { // Memtable related options ROCKS_LOG_INFO(log, @@ -121,6 +135,8 @@ void MutableCFOptions::Dump(Logger* log) const { arena_block_size); ROCKS_LOG_INFO(log, " memtable_prefix_bloom_ratio: %f", memtable_prefix_bloom_size_ratio); + ROCKS_LOG_INFO(log, " memtable_whole_key_filtering: %d", + memtable_whole_key_filtering); ROCKS_LOG_INFO(log, " memtable_huge_page_size: %" ROCKSDB_PRIszt, memtable_huge_page_size); @@ -130,6 +146,9 @@ void MutableCFOptions::Dump(Logger* log) const { ROCKS_LOG_INFO(log, " inplace_update_num_locks: %" ROCKSDB_PRIszt, inplace_update_num_locks); + ROCKS_LOG_INFO( + log, " prefix_extractor: %s", + prefix_extractor == nullptr ? "nullptr" : prefix_extractor->Name()); ROCKS_LOG_INFO(log, " disable_auto_compactions: %d", disable_auto_compactions); ROCKS_LOG_INFO(log, " soft_pending_compaction_bytes_limit: %" PRIu64, @@ -152,6 +171,8 @@ void MutableCFOptions::Dump(Logger* log) const { max_bytes_for_level_base); ROCKS_LOG_INFO(log, " max_bytes_for_level_multiplier: %f", max_bytes_for_level_multiplier); + ROCKS_LOG_INFO(log, " ttl: %" PRIu64, + ttl); std::string result; char buf[10]; for (const auto m : max_bytes_for_level_multiplier_additional) { @@ -174,6 +195,34 @@ void MutableCFOptions::Dump(Logger* log) const { report_bg_io_stats); ROCKS_LOG_INFO(log, " compression: %d", static_cast(compression)); + + // Universal Compaction Options + ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d", + compaction_options_universal.size_ratio); + ROCKS_LOG_INFO(log, "compaction_options_universal.min_merge_width : %d", + compaction_options_universal.min_merge_width); + ROCKS_LOG_INFO(log, "compaction_options_universal.max_merge_width : %d", + compaction_options_universal.max_merge_width); + ROCKS_LOG_INFO( + log, "compaction_options_universal.max_size_amplification_percent : %d", + compaction_options_universal.max_size_amplification_percent); + ROCKS_LOG_INFO(log, + "compaction_options_universal.compression_size_percent : %d", + compaction_options_universal.compression_size_percent); + ROCKS_LOG_INFO(log, "compaction_options_universal.stop_style : %d", + compaction_options_universal.stop_style); + ROCKS_LOG_INFO( + log, "compaction_options_universal.allow_trivial_move : %d", + static_cast(compaction_options_universal.allow_trivial_move)); + + // FIFO Compaction Options + ROCKS_LOG_INFO(log, "compaction_options_fifo.max_table_files_size : %" PRIu64, + compaction_options_fifo.max_table_files_size); + ROCKS_LOG_INFO(log, "compaction_options_fifo.allow_compaction 
: %d", + compaction_options_fifo.allow_compaction); } +MutableCFOptions::MutableCFOptions(const Options& options) + : MutableCFOptions(ColumnFamilyOptions(options)) {} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/options/cf_options.h b/thirdparty/rocksdb/options/cf_options.h index f376729f85..fed144e4c3 100644 --- a/thirdparty/rocksdb/options/cf_options.h +++ b/thirdparty/rocksdb/options/cf_options.h @@ -18,7 +18,7 @@ namespace rocksdb { // ImmutableCFOptions is a data struct used by RocksDB internal. It contains a // subset of Options that should not be changed during the entire lifetime // of DB. Raw pointers defined in this struct do not have ownership to the data -// they point to. Options contains shared_ptr to these data. +// they point to. Options contains std::shared_ptr to these data. struct ImmutableCFOptions { ImmutableCFOptions(); explicit ImmutableCFOptions(const Options& options); @@ -30,11 +30,6 @@ struct ImmutableCFOptions { CompactionPri compaction_pri; - CompactionOptionsUniversal compaction_options_universal; - CompactionOptionsFIFO compaction_options_fifo; - - const SliceTransform* prefix_extractor; - const Comparator* user_comparator; InternalKeyComparator internal_comparator; @@ -94,6 +89,8 @@ struct ImmutableCFOptions { CompressionType bottommost_compression; + CompressionOptions bottommost_compression_opts; + CompressionOptions compression_opts; bool level_compaction_dynamic_level_bytes; @@ -102,8 +99,6 @@ struct ImmutableCFOptions { bool new_table_reader_for_compaction_inputs; - size_t compaction_readahead_size; - int num_levels; bool optimize_filters_for_hits; @@ -112,7 +107,9 @@ struct ImmutableCFOptions { bool allow_ingest_behind; - // A vector of EventListeners which call-back functions will be called + bool preserve_deletes; + + // A vector of EventListeners which callback functions will be called // when specific RocksDB event happens. 
struct MutableCFOptions {
@@ -130,9 +131,11 @@ struct MutableCFOptions {
        arena_block_size(options.arena_block_size),
        memtable_prefix_bloom_size_ratio(
            options.memtable_prefix_bloom_size_ratio),
+        memtable_whole_key_filtering(options.memtable_whole_key_filtering),
        memtable_huge_page_size(options.memtable_huge_page_size),
        max_successive_merges(options.max_successive_merges),
        inplace_update_num_locks(options.inplace_update_num_locks),
+        prefix_extractor(options.prefix_extractor),
        disable_auto_compactions(options.disable_auto_compactions),
        soft_pending_compaction_bytes_limit(
            options.soft_pending_compaction_bytes_limit),
@@ -147,13 +150,17 @@ struct MutableCFOptions {
        target_file_size_multiplier(options.target_file_size_multiplier),
        max_bytes_for_level_base(options.max_bytes_for_level_base),
        max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier),
+        ttl(options.ttl),
        max_bytes_for_level_multiplier_additional(
            options.max_bytes_for_level_multiplier_additional),
+        compaction_options_fifo(options.compaction_options_fifo),
+        compaction_options_universal(options.compaction_options_universal),
        max_sequential_skip_in_iterations(
            options.max_sequential_skip_in_iterations),
        paranoid_file_checks(options.paranoid_file_checks),
        report_bg_io_stats(options.report_bg_io_stats),
-        compression(options.compression) {
+        compression(options.compression),
+        sample_for_compression(options.sample_for_compression) {
    RefreshDerivedOptions(options.num_levels, options.compaction_style);
  }
@@ -162,9 +169,11 @@ struct MutableCFOptions {
        max_write_buffer_number(0),
        arena_block_size(0),
        memtable_prefix_bloom_size_ratio(0),
+        memtable_whole_key_filtering(false),
        memtable_huge_page_size(0),
        max_successive_merges(0),
        inplace_update_num_locks(0),
+        prefix_extractor(nullptr),
        disable_auto_compactions(false),
        soft_pending_compaction_bytes_limit(0),
        hard_pending_compaction_bytes_limit(0),
@@ -176,10 +185,15 @@ struct MutableCFOptions {
        target_file_size_multiplier(0),
        max_bytes_for_level_base(0),
        max_bytes_for_level_multiplier(0),
+        ttl(0),
+        compaction_options_fifo(),
        max_sequential_skip_in_iterations(0),
        paranoid_file_checks(false),
        report_bg_io_stats(false),
-        compression(Snappy_Supported() ? kSnappyCompression : kNoCompression) {}
+        compression(Snappy_Supported() ? kSnappyCompression : kNoCompression),
+        sample_for_compression(0) {}
+
+  explicit MutableCFOptions(const Options& options);

  // Must be called after any change to MutableCFOptions
  void RefreshDerivedOptions(int num_levels, CompactionStyle compaction_style);
@@ -188,8 +202,6 @@ struct MutableCFOptions {
    RefreshDerivedOptions(ioptions.num_levels, ioptions.compaction_style);
  }

-  // Get the max file size in a given level.
-  uint64_t MaxFileSizeForLevel(int level) const;

  int MaxBytesMultiplerAdditional(int level) const {
    if (level >=
        static_cast<int>(max_bytes_for_level_multiplier_additional.size())) {
@@ -205,9 +217,11 @@ struct MutableCFOptions {
  int max_write_buffer_number;
  size_t arena_block_size;
  double memtable_prefix_bloom_size_ratio;
+  bool memtable_whole_key_filtering;
  size_t memtable_huge_page_size;
  size_t max_successive_merges;
  size_t inplace_update_num_locks;
+  std::shared_ptr<const SliceTransform> prefix_extractor;

  // Compaction related options
  bool disable_auto_compactions;
@@ -221,13 +235,17 @@ struct MutableCFOptions {
  int target_file_size_multiplier;
  uint64_t max_bytes_for_level_base;
  double max_bytes_for_level_multiplier;
+  uint64_t ttl;
  std::vector<int> max_bytes_for_level_multiplier_additional;
+  CompactionOptionsFIFO compaction_options_fifo;
+  CompactionOptionsUniversal compaction_options_universal;

  // Misc options
  uint64_t max_sequential_skip_in_iterations;
  bool paranoid_file_checks;
  bool report_bg_io_stats;
  CompressionType compression;
+  uint64_t sample_for_compression;

  // Derived options
  // Per-level target file size.
@@ -236,4 +254,8 @@ struct MutableCFOptions {
uint64_t MultiplyCheckOverflow(uint64_t op1, double op2);

+// Get the max file size in a given level.
+uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options,
+    int level, CompactionStyle compaction_style, int base_level = 1,
+    bool level_compaction_dynamic_level_bytes = false);

} // namespace rocksdb
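The db_options.cc and db_options.h hunks that follow perform the same migration at the database level: bytes_per_sync, wal_bytes_per_sync, compaction_readahead_size, and writable_file_max_buffer_size leave ImmutableDBOptions for MutableDBOptions. A sketch of what that enables through DB::SetDBOptions (the values below are arbitrary):

#include <cassert>
#include "rocksdb/db.h"

void TuneDBWideOptions(rocksdb::DB* db) {
  // These options become adjustable at runtime once they live in
  // MutableDBOptions.
  rocksdb::Status s = db->SetDBOptions({
      {"bytes_per_sync", "1048576"},
      {"compaction_readahead_size", "2097152"}
  });
  assert(s.ok());
}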
diff --git a/thirdparty/rocksdb/options/db_options.cc b/thirdparty/rocksdb/options/db_options.cc
index 61775757d5..f24705cb75 100644
--- a/thirdparty/rocksdb/options/db_options.cc
+++ b/thirdparty/rocksdb/options/db_options.cc
@@ -62,12 +62,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
      new_table_reader_for_compaction_inputs(
          options.new_table_reader_for_compaction_inputs),
-      compaction_readahead_size(options.compaction_readahead_size),
      random_access_max_buffer_size(options.random_access_max_buffer_size),
-      writable_file_max_buffer_size(options.writable_file_max_buffer_size),
      use_adaptive_mutex(options.use_adaptive_mutex),
-      bytes_per_sync(options.bytes_per_sync),
-      wal_bytes_per_sync(options.wal_bytes_per_sync),
      listeners(options.listeners),
      enable_thread_tracking(options.enable_thread_tracking),
      enable_pipelined_write(options.enable_pipelined_write),
@@ -87,8 +83,11 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
      dump_malloc_stats(options.dump_malloc_stats),
      avoid_flush_during_recovery(options.avoid_flush_during_recovery),
      allow_ingest_behind(options.allow_ingest_behind),
-      concurrent_prepare(options.concurrent_prepare),
-      manual_wal_flush(options.manual_wal_flush) {
+      preserve_deletes(options.preserve_deletes),
+      two_write_queues(options.two_write_queues),
+      manual_wal_flush(options.manual_wal_flush),
+      atomic_flush(options.atomic_flush),
+      avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) {
}

void ImmutableDBOptions::Dump(Logger* log) const {
@@ -104,6 +103,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                   info_log.get());
  ROCKS_LOG_HEADER(log, " Options.max_file_opening_threads: %d",
                   max_file_opening_threads);
+  ROCKS_LOG_HEADER(log, " Options.statistics: %p",
+                   statistics.get());
  ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync);
  ROCKS_LOG_HEADER(
@@ -168,15 +169,9 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                   static_cast<int>(access_hint_on_compaction_start));
  ROCKS_LOG_HEADER(log, " Options.new_table_reader_for_compaction_inputs: %d",
                   new_table_reader_for_compaction_inputs);
-  ROCKS_LOG_HEADER(
-      log, " Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
-      compaction_readahead_size);
  ROCKS_LOG_HEADER(
      log, " Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt,
      random_access_max_buffer_size);
-  ROCKS_LOG_HEADER(
-      log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
-      writable_file_max_buffer_size);
  ROCKS_LOG_HEADER(log, " Options.use_adaptive_mutex: %d",
                   use_adaptive_mutex);
  ROCKS_LOG_HEADER(log, " Options.rate_limiter: %p",
@@ -184,14 +179,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
  Header(
      log, " Options.sst_file_manager.rate_bytes_per_sec: %" PRIi64,
      sst_file_manager ? sst_file_manager->GetDeleteRateBytesPerSecond() : 0);
-  ROCKS_LOG_HEADER(log,
-                   " Options.bytes_per_sync: %" PRIu64,
-                   bytes_per_sync);
-  ROCKS_LOG_HEADER(log,
-                   " Options.wal_bytes_per_sync: %" PRIu64,
-                   wal_bytes_per_sync);
  ROCKS_LOG_HEADER(log, " Options.wal_recovery_mode: %d",
-                   wal_recovery_mode);
+                   static_cast<int>(wal_recovery_mode));
  ROCKS_LOG_HEADER(log, " Options.enable_thread_tracking: %d",
                   enable_thread_tracking);
  ROCKS_LOG_HEADER(log, " Options.enable_pipelined_write: %d",
@@ -208,7 +197,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                   write_thread_slow_yield_usec);
  if (row_cache) {
    ROCKS_LOG_HEADER(
-        log, " Options.row_cache: %" PRIu64,
+        log,
+        " Options.row_cache: %" ROCKSDB_PRIszt,
        row_cache->GetCapacity());
  } else {
    ROCKS_LOG_HEADER(log,
@@ -223,10 +213,16 @@ void ImmutableDBOptions::Dump(Logger* log) const {
                   avoid_flush_during_recovery);
  ROCKS_LOG_HEADER(log, " Options.allow_ingest_behind: %d",
                   allow_ingest_behind);
-  ROCKS_LOG_HEADER(log, " Options.concurrent_prepare: %d",
-                   concurrent_prepare);
+  ROCKS_LOG_HEADER(log, " Options.preserve_deletes: %d",
+                   preserve_deletes);
+  ROCKS_LOG_HEADER(log, " Options.two_write_queues: %d",
+                   two_write_queues);
  ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d",
                   manual_wal_flush);
+  ROCKS_LOG_HEADER(log, " Options.atomic_flush: %d", atomic_flush);
+  ROCKS_LOG_HEADER(log,
+                   " Options.avoid_unnecessary_blocking_io: %d",
+                   avoid_unnecessary_blocking_io);
}

MutableDBOptions::MutableDBOptions()
@@ -234,23 +230,35 @@ MutableDBOptions::MutableDBOptions()
      base_background_compactions(-1),
      max_background_compactions(-1),
      avoid_flush_during_shutdown(false),
+      writable_file_max_buffer_size(1024 * 1024),
      delayed_write_rate(2 * 1024U * 1024U),
      max_total_wal_size(0),
      delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000),
      stats_dump_period_sec(600),
-      max_open_files(-1) {}
+      stats_persist_period_sec(600),
+      stats_history_buffer_size(1024 * 1024),
+      max_open_files(-1),
+      bytes_per_sync(0),
+      wal_bytes_per_sync(0),
+      compaction_readahead_size(0) {}

MutableDBOptions::MutableDBOptions(const DBOptions& options)
    : max_background_jobs(options.max_background_jobs),
      base_background_compactions(options.base_background_compactions),
      max_background_compactions(options.max_background_compactions),
      avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
+      writable_file_max_buffer_size(options.writable_file_max_buffer_size),
      delayed_write_rate(options.delayed_write_rate),
      max_total_wal_size(options.max_total_wal_size),
      delete_obsolete_files_period_micros(
          options.delete_obsolete_files_period_micros),
      stats_dump_period_sec(options.stats_dump_period_sec),
-      max_open_files(options.max_open_files) {}
+      stats_persist_period_sec(options.stats_persist_period_sec),
+      stats_history_buffer_size(options.stats_history_buffer_size),
+      max_open_files(options.max_open_files),
+      bytes_per_sync(options.bytes_per_sync),
+      wal_bytes_per_sync(options.wal_bytes_per_sync),
+      compaction_readahead_size(options.compaction_readahead_size) {}

void MutableDBOptions::Dump(Logger* log) const {
  ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d",
@@ -259,6 +267,9 @@ void MutableDBOptions::Dump(Logger* log) const {
                   max_background_compactions);
  ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_shutdown: %d",
                   avoid_flush_during_shutdown);
+  ROCKS_LOG_HEADER(
+      log, " Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
+      writable_file_max_buffer_size);
  ROCKS_LOG_HEADER(log, " Options.delayed_write_rate : %" PRIu64,
                   delayed_write_rate);
  ROCKS_LOG_HEADER(log, " Options.max_total_wal_size: %" PRIu64,
@@ -268,8 +279,23 @@ void MutableDBOptions::Dump(Logger* log) const {
                   delete_obsolete_files_period_micros);
  ROCKS_LOG_HEADER(log, " Options.stats_dump_period_sec: %u",
                   stats_dump_period_sec);
+  ROCKS_LOG_HEADER(log, " Options.stats_persist_period_sec: %d",
+                   stats_persist_period_sec);
+  ROCKS_LOG_HEADER(
+      log,
+      " Options.stats_history_buffer_size: %" ROCKSDB_PRIszt,
+      stats_history_buffer_size);
  ROCKS_LOG_HEADER(log, " Options.max_open_files: %d",
                   max_open_files);
+  ROCKS_LOG_HEADER(log,
+                   " Options.bytes_per_sync: %" PRIu64,
+                   bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   " Options.wal_bytes_per_sync: %" PRIu64,
+                   wal_bytes_per_sync);
+  ROCKS_LOG_HEADER(log,
+                   " Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
+                   compaction_readahead_size);
}

} // namespace rocksdb
diff --git a/thirdparty/rocksdb/options/db_options.h b/thirdparty/rocksdb/options/db_options.h
index 18d1a5fb67..283cf7d352 100644
--- a/thirdparty/rocksdb/options/db_options.h
+++ b/thirdparty/rocksdb/options/db_options.h
@@ -55,12 +55,8 @@ struct ImmutableDBOptions {
  std::shared_ptr<WriteBufferManager> write_buffer_manager;
  DBOptions::AccessHint access_hint_on_compaction_start;
  bool new_table_reader_for_compaction_inputs;
-  size_t compaction_readahead_size;
  size_t random_access_max_buffer_size;
-  size_t writable_file_max_buffer_size;
  bool use_adaptive_mutex;
-  uint64_t bytes_per_sync;
-  uint64_t wal_bytes_per_sync;
  std::vector<std::shared_ptr<EventListener>> listeners;
  bool enable_thread_tracking;
  bool enable_pipelined_write;
@@ -79,8 +75,11 @@ struct ImmutableDBOptions {
  bool dump_malloc_stats;
  bool avoid_flush_during_recovery;
  bool allow_ingest_behind;
-  bool concurrent_prepare;
+  bool preserve_deletes;
+  bool two_write_queues;
  bool manual_wal_flush;
+  bool atomic_flush;
+  bool avoid_unnecessary_blocking_io;
};

struct MutableDBOptions {
@@ -94,11 +93,17 @@ struct MutableDBOptions {
  int base_background_compactions;
  int max_background_compactions;
  bool avoid_flush_during_shutdown;
+  size_t writable_file_max_buffer_size;
  uint64_t delayed_write_rate;
  uint64_t max_total_wal_size;
  uint64_t delete_obsolete_files_period_micros;
  unsigned int stats_dump_period_sec;
+  unsigned int stats_persist_period_sec;
+  size_t stats_history_buffer_size;
  int max_open_files;
+  uint64_t bytes_per_sync;
+  uint64_t wal_bytes_per_sync;
+  size_t compaction_readahead_size;
};

} // namespace rocksdb
diff --git a/thirdparty/rocksdb/options/options.cc b/thirdparty/rocksdb/options/options.cc
index 7bd2c9582f..2c99545811 100644
--- a/thirdparty/rocksdb/options/options.cc
+++ b/thirdparty/rocksdb/options/options.cc
@@ -51,6 +51,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
      inplace_callback(options.inplace_callback),
      memtable_prefix_bloom_size_ratio(
          options.memtable_prefix_bloom_size_ratio),
+      memtable_whole_key_filtering(options.memtable_whole_key_filtering),
      memtable_huge_page_size(options.memtable_huge_page_size),
      memtable_insert_with_hint_prefix_extractor(
          options.memtable_insert_with_hint_prefix_extractor),
@@ -85,7 +86,9 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options)
      optimize_filters_for_hits(options.optimize_filters_for_hits),
      paranoid_file_checks(options.paranoid_file_checks),
      force_consistency_checks(options.force_consistency_checks),
-      report_bg_io_stats(options.report_bg_io_stats) {
+      report_bg_io_stats(options.report_bg_io_stats),
+      ttl(options.ttl),
+      sample_for_compression(options.sample_for_compression) {
  assert(memtable_factory.get() != nullptr);
  if (max_bytes_for_level_multiplier_additional.size() <
      static_cast<unsigned int>(num_levels)) {
@@ -99,100 +102,11 @@ ColumnFamilyOptions::ColumnFamilyOptions()
          std::shared_ptr<TableFactory>(new BlockBasedTableFactory())) {}

ColumnFamilyOptions::ColumnFamilyOptions(const Options& options)
-    : AdvancedColumnFamilyOptions(options),
-      comparator(options.comparator),
-      merge_operator(options.merge_operator),
-      compaction_filter(options.compaction_filter),
-      compaction_filter_factory(options.compaction_filter_factory),
-      write_buffer_size(options.write_buffer_size),
-      compression(options.compression),
-      bottommost_compression(options.bottommost_compression),
-      compression_opts(options.compression_opts),
-      level0_file_num_compaction_trigger(
-          options.level0_file_num_compaction_trigger),
-      prefix_extractor(options.prefix_extractor),
-      max_bytes_for_level_base(options.max_bytes_for_level_base),
-      disable_auto_compactions(options.disable_auto_compactions),
-      table_factory(options.table_factory) {}
+    : ColumnFamilyOptions(*static_cast<const ColumnFamilyOptions*>(&options)) {}

DBOptions::DBOptions() {}
-
DBOptions::DBOptions(const Options& options)
-    : create_if_missing(options.create_if_missing),
-      create_missing_column_families(options.create_missing_column_families),
-      error_if_exists(options.error_if_exists),
-      paranoid_checks(options.paranoid_checks),
-      env(options.env),
-      rate_limiter(options.rate_limiter),
-      sst_file_manager(options.sst_file_manager),
-      info_log(options.info_log),
-      info_log_level(options.info_log_level),
-      max_open_files(options.max_open_files),
-      max_file_opening_threads(options.max_file_opening_threads),
-      max_total_wal_size(options.max_total_wal_size),
-      statistics(options.statistics),
-      use_fsync(options.use_fsync),
-      db_paths(options.db_paths),
-      db_log_dir(options.db_log_dir),
-      wal_dir(options.wal_dir),
-      delete_obsolete_files_period_micros(
-          options.delete_obsolete_files_period_micros),
-      max_background_jobs(options.max_background_jobs),
-      base_background_compactions(options.base_background_compactions),
-      max_background_compactions(options.max_background_compactions),
-      max_subcompactions(options.max_subcompactions),
-      max_background_flushes(options.max_background_flushes),
-      max_log_file_size(options.max_log_file_size),
-      log_file_time_to_roll(options.log_file_time_to_roll),
-      keep_log_file_num(options.keep_log_file_num),
-      recycle_log_file_num(options.recycle_log_file_num),
-      max_manifest_file_size(options.max_manifest_file_size),
-      table_cache_numshardbits(options.table_cache_numshardbits),
-      WAL_ttl_seconds(options.WAL_ttl_seconds),
-      WAL_size_limit_MB(options.WAL_size_limit_MB),
-      manifest_preallocation_size(options.manifest_preallocation_size),
-      allow_mmap_reads(options.allow_mmap_reads),
-      allow_mmap_writes(options.allow_mmap_writes),
-      use_direct_reads(options.use_direct_reads),
-      use_direct_io_for_flush_and_compaction(
-          options.use_direct_io_for_flush_and_compaction),
-      allow_fallocate(options.allow_fallocate),
-      is_fd_close_on_exec(options.is_fd_close_on_exec),
-      skip_log_error_on_recovery(options.skip_log_error_on_recovery),
-      stats_dump_period_sec(options.stats_dump_period_sec),
-      advise_random_on_open(options.advise_random_on_open),
-      db_write_buffer_size(options.db_write_buffer_size),
-      write_buffer_manager(options.write_buffer_manager),
-      access_hint_on_compaction_start(options.access_hint_on_compaction_start),
-      new_table_reader_for_compaction_inputs(
-          options.new_table_reader_for_compaction_inputs),
-      compaction_readahead_size(options.compaction_readahead_size),
-      random_access_max_buffer_size(options.random_access_max_buffer_size),
-      writable_file_max_buffer_size(options.writable_file_max_buffer_size),
-      use_adaptive_mutex(options.use_adaptive_mutex),
-      bytes_per_sync(options.bytes_per_sync),
-      wal_bytes_per_sync(options.wal_bytes_per_sync),
-      listeners(options.listeners),
-      enable_thread_tracking(options.enable_thread_tracking),
-      delayed_write_rate(options.delayed_write_rate),
-      enable_pipelined_write(options.enable_pipelined_write),
-      allow_concurrent_memtable_write(options.allow_concurrent_memtable_write),
-      enable_write_thread_adaptive_yield(
-          options.enable_write_thread_adaptive_yield),
-      write_thread_max_yield_usec(options.write_thread_max_yield_usec),
-      write_thread_slow_yield_usec(options.write_thread_slow_yield_usec),
-      skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
-      wal_recovery_mode(options.wal_recovery_mode),
-      row_cache(options.row_cache),
-#ifndef ROCKSDB_LITE
-      wal_filter(options.wal_filter),
-#endif  // ROCKSDB_LITE
-      fail_if_options_file_error(options.fail_if_options_file_error),
-      dump_malloc_stats(options.dump_malloc_stats),
-      avoid_flush_during_recovery(options.avoid_flush_during_recovery),
-      avoid_flush_during_shutdown(options.avoid_flush_during_shutdown),
-      allow_ingest_behind(options.allow_ingest_behind) {
-}
+    : DBOptions(*static_cast<const DBOptions*>(&options)) {}
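The rewritten converting constructors above replace a hand-maintained member-by-member copy with a delegation through an upcast: Options inherits from both DBOptions and ColumnFamilyOptions, so copying the sliced base subobject picks up every field, including ones added later. The idea in miniature (toy types, not RocksDB code):

// Opts plays the role of rocksdb::Options, which derives from both option
// structs; copying the sliced base picks up all inherited members, so newly
// added fields can no longer be forgotten in a hand-written initializer list.
struct DBOpts { int max_open_files = -1; };
struct CFOpts { unsigned long write_buffer_size = 64ul << 20; };
struct Opts : DBOpts, CFOpts {};

DBOpts ExtractDBOpts(const Opts& o) {
  return *static_cast<const DBOpts*>(&o);  // slice: copies only DBOpts fields
}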
"true" : "false"); ROCKS_LOG_HEADER(log, " Options.compression_opts.window_bits: %d", compression_opts.window_bits); ROCKS_LOG_HEADER(log, " Options.compression_opts.level: %d", @@ -255,8 +191,15 @@ void ColumnFamilyOptions::Dump(Logger* log) const { compression_opts.strategy); ROCKS_LOG_HEADER( log, - " Options.compression_opts.max_dict_bytes: %" ROCKSDB_PRIszt, + " Options.compression_opts.max_dict_bytes: %" PRIu32, compression_opts.max_dict_bytes); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.zstd_max_train_bytes: " + "%" PRIu32, + compression_opts.zstd_max_train_bytes); + ROCKS_LOG_HEADER(log, + " Options.compression_opts.enabled: %s", + compression_opts.enabled ? "true" : "false"); ROCKS_LOG_HEADER(log, " Options.level0_file_num_compaction_trigger: %d", level0_file_num_compaction_trigger); ROCKS_LOG_HEADER(log, " Options.level0_slowdown_writes_trigger: %d", @@ -365,8 +308,6 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, "Options.compaction_options_fifo.allow_compaction: %d", compaction_options_fifo.allow_compaction); - ROCKS_LOG_HEADER(log, "Options.compaction_options_fifo.ttl: %" PRIu64, - compaction_options_fifo.ttl); std::string collector_names; for (const auto& collector_factory : table_properties_collector_factories) { collector_names.append(collector_factory->Name()); @@ -386,6 +327,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.memtable_prefix_bloom_size_ratio: %f", memtable_prefix_bloom_size_ratio); + ROCKS_LOG_HEADER(log, + " Options.memtable_whole_key_filtering: %d", + memtable_whole_key_filtering); ROCKS_LOG_HEADER(log, " Options.memtable_huge_page_size: %" ROCKSDB_PRIszt, memtable_huge_page_size); @@ -406,6 +350,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const { force_consistency_checks); ROCKS_LOG_HEADER(log, " Options.report_bg_io_stats: %d", report_bg_io_stats); + ROCKS_LOG_HEADER(log, " Options.ttl: %" PRIu64, + ttl); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { @@ -497,6 +443,10 @@ DBOptions* DBOptions::OldDefaults(int rocksdb_major_version, ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults( int rocksdb_major_version, int rocksdb_minor_version) { + if (rocksdb_major_version < 5 || + (rocksdb_major_version == 5 && rocksdb_minor_version <= 18)) { + compaction_pri = CompactionPri::kByCompensatedSize; + } if (rocksdb_major_version < 4 || (rocksdb_major_version == 4 && rocksdb_minor_version < 7)) { write_buffer_size = 4 << 20; @@ -510,7 +460,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults( } else if (rocksdb_major_version == 5 && rocksdb_minor_version < 2) { level0_stop_writes_trigger = 30; } - compaction_pri = CompactionPri::kByCompensatedSize; return this; } @@ -537,6 +486,9 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( prefix_extractor.reset(NewNoopTransform()); BlockBasedTableOptions block_based_options; block_based_options.index_type = BlockBasedTableOptions::kHashSearch; + block_based_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + block_based_options.data_block_hash_table_util_ratio = 0.75; block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); block_based_options.block_cache = NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); @@ -592,8 +544,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction( } DBOptions* DBOptions::IncreaseParallelism(int total_threads) { - max_background_compactions = total_threads - 1; - max_background_flushes 
@@ -603,6 +554,7 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) {

ReadOptions::ReadOptions()
    : snapshot(nullptr),
+      iterate_lower_bound(nullptr),
      iterate_upper_bound(nullptr),
      readahead_size(0),
      max_skippable_internal_keys(0),
@@ -615,10 +567,12 @@ ReadOptions::ReadOptions()
      prefix_same_as_start(false),
      pin_data(false),
      background_purge_on_iterator_cleanup(false),
-      ignore_range_deletions(false) {}
+      ignore_range_deletions(false),
+      iter_start_seqnum(0) {}

ReadOptions::ReadOptions(bool cksum, bool cache)
    : snapshot(nullptr),
+      iterate_lower_bound(nullptr),
      iterate_upper_bound(nullptr),
      readahead_size(0),
      max_skippable_internal_keys(0),
@@ -631,6 +585,7 @@ ReadOptions::ReadOptions(bool cksum, bool cache)
      prefix_same_as_start(false),
      pin_data(false),
      background_purge_on_iterator_cleanup(false),
-      ignore_range_deletions(false) {}
+      ignore_range_deletions(false),
+      iter_start_seqnum(0) {}

} // namespace rocksdb
diff --git a/thirdparty/rocksdb/options/options_helper.cc b/thirdparty/rocksdb/options/options_helper.cc
index 5cf548fb9e..9facf6e946 100644
--- a/thirdparty/rocksdb/options/options_helper.cc
+++ b/thirdparty/rocksdb/options/options_helper.cc
@@ -19,6 +19,7 @@
#include "rocksdb/rate_limiter.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/table.h"
+#include "rocksdb/utilities/object_registry.h"
#include "table/block_based_table_factory.h"
#include "table/plain_table_factory.h"
#include "util/cast_util.h"
@@ -56,6 +57,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
      mutable_db_options.base_background_compactions;
  options.max_background_compactions =
      mutable_db_options.max_background_compactions;
+  options.bytes_per_sync = mutable_db_options.bytes_per_sync;
+  options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync;
  options.max_subcompactions = immutable_db_options.max_subcompactions;
  options.max_background_flushes = immutable_db_options.max_background_flushes;
  options.max_log_file_size = immutable_db_options.max_log_file_size;
@@ -77,6 +80,10 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
  options.allow_fallocate = immutable_db_options.allow_fallocate;
  options.is_fd_close_on_exec = immutable_db_options.is_fd_close_on_exec;
  options.stats_dump_period_sec = mutable_db_options.stats_dump_period_sec;
+  options.stats_persist_period_sec =
+      mutable_db_options.stats_persist_period_sec;
+  options.stats_history_buffer_size =
+      mutable_db_options.stats_history_buffer_size;
  options.advise_random_on_open = immutable_db_options.advise_random_on_open;
  options.db_write_buffer_size = immutable_db_options.db_write_buffer_size;
  options.write_buffer_manager = immutable_db_options.write_buffer_manager;
@@ -85,17 +92,16 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
  options.new_table_reader_for_compaction_inputs =
      immutable_db_options.new_table_reader_for_compaction_inputs;
  options.compaction_readahead_size =
-      immutable_db_options.compaction_readahead_size;
+      mutable_db_options.compaction_readahead_size;
  options.random_access_max_buffer_size =
      immutable_db_options.random_access_max_buffer_size;
  options.writable_file_max_buffer_size =
-      immutable_db_options.writable_file_max_buffer_size;
+      mutable_db_options.writable_file_max_buffer_size;
  options.use_adaptive_mutex = immutable_db_options.use_adaptive_mutex;
-  options.bytes_per_sync = immutable_db_options.bytes_per_sync;
-  options.wal_bytes_per_sync = immutable_db_options.wal_bytes_per_sync;
  options.listeners = immutable_db_options.listeners;
  options.enable_thread_tracking = immutable_db_options.enable_thread_tracking;
  options.delayed_write_rate = mutable_db_options.delayed_write_rate;
+  options.enable_pipelined_write = immutable_db_options.enable_pipelined_write;
  options.allow_concurrent_memtable_write =
      immutable_db_options.allow_concurrent_memtable_write;
  options.enable_write_thread_adaptive_yield =
@@ -121,6 +127,13 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
      mutable_db_options.avoid_flush_during_shutdown;
  options.allow_ingest_behind =
      immutable_db_options.allow_ingest_behind;
+  options.preserve_deletes =
+      immutable_db_options.preserve_deletes;
+  options.two_write_queues = immutable_db_options.two_write_queues;
+  options.manual_wal_flush = immutable_db_options.manual_wal_flush;
+  options.atomic_flush = immutable_db_options.atomic_flush;
+  options.avoid_unnecessary_blocking_io =
+      immutable_db_options.avoid_unnecessary_blocking_io;
  return options;
}

@@ -136,14 +149,21 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
  cf_opts.arena_block_size = mutable_cf_options.arena_block_size;
  cf_opts.memtable_prefix_bloom_size_ratio =
      mutable_cf_options.memtable_prefix_bloom_size_ratio;
+  cf_opts.memtable_whole_key_filtering =
+      mutable_cf_options.memtable_whole_key_filtering;
  cf_opts.memtable_huge_page_size = mutable_cf_options.memtable_huge_page_size;
  cf_opts.max_successive_merges = mutable_cf_options.max_successive_merges;
  cf_opts.inplace_update_num_locks =
      mutable_cf_options.inplace_update_num_locks;
+  cf_opts.prefix_extractor = mutable_cf_options.prefix_extractor;

  // Compaction related options
  cf_opts.disable_auto_compactions =
      mutable_cf_options.disable_auto_compactions;
+  cf_opts.soft_pending_compaction_bytes_limit =
+      mutable_cf_options.soft_pending_compaction_bytes_limit;
+  cf_opts.hard_pending_compaction_bytes_limit =
+      mutable_cf_options.hard_pending_compaction_bytes_limit;
  cf_opts.level0_file_num_compaction_trigger =
      mutable_cf_options.level0_file_num_compaction_trigger;
  cf_opts.level0_slowdown_writes_trigger =
@@ -158,6 +178,7 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
      mutable_cf_options.max_bytes_for_level_base;
  cf_opts.max_bytes_for_level_multiplier =
      mutable_cf_options.max_bytes_for_level_multiplier;
+  cf_opts.ttl = mutable_cf_options.ttl;

  cf_opts.max_bytes_for_level_multiplier_additional.clear();
  for (auto value :
@@ -165,12 +186,17 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
    cf_opts.max_bytes_for_level_multiplier_additional.emplace_back(value);
  }

+  cf_opts.compaction_options_fifo = mutable_cf_options.compaction_options_fifo;
+  cf_opts.compaction_options_universal =
+      mutable_cf_options.compaction_options_universal;
+
  // Misc options
  cf_opts.max_sequential_skip_in_iterations =
      mutable_cf_options.max_sequential_skip_in_iterations;
  cf_opts.paranoid_file_checks = mutable_cf_options.paranoid_file_checks;
  cf_opts.report_bg_io_stats = mutable_cf_options.report_bg_io_stats;
  cf_opts.compression = mutable_cf_options.compression;
+  cf_opts.sample_for_compression = mutable_cf_options.sample_for_compression;
  cf_opts.table_factory = options.table_factory;
  // TODO(yhchiang): find some way to handle the following derived options
@@ -179,8 +205,53 @@ ColumnFamilyOptions BuildColumnFamilyOptions(
  return cf_opts;
}

+std::map<CompactionStyle, std::string>
+    OptionsHelper::compaction_style_to_string = {
+        {kCompactionStyleLevel, "kCompactionStyleLevel"},
+        {kCompactionStyleUniversal, "kCompactionStyleUniversal"},
+        {kCompactionStyleFIFO, "kCompactionStyleFIFO"},
+        {kCompactionStyleNone, "kCompactionStyleNone"}};
+
+std::map<CompactionPri, std::string> OptionsHelper::compaction_pri_to_string = {
+    {kByCompensatedSize, "kByCompensatedSize"},
+    {kOldestLargestSeqFirst, "kOldestLargestSeqFirst"},
+    {kOldestSmallestSeqFirst, "kOldestSmallestSeqFirst"},
+    {kMinOverlappingRatio, "kMinOverlappingRatio"}};
+
+std::map<CompactionStopStyle, std::string>
+    OptionsHelper::compaction_stop_style_to_string = {
+        {kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"},
+        {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}};
+
+std::unordered_map<std::string, ChecksumType>
+    OptionsHelper::checksum_type_string_map = {{"kNoChecksum", kNoChecksum},
+                                               {"kCRC32c", kCRC32c},
+                                               {"kxxHash", kxxHash},
+                                               {"kxxHash64", kxxHash64}};
+
+std::unordered_map<std::string, CompressionType>
+    OptionsHelper::compression_type_string_map = {
+        {"kNoCompression", kNoCompression},
+        {"kSnappyCompression", kSnappyCompression},
+        {"kZlibCompression", kZlibCompression},
+        {"kBZip2Compression", kBZip2Compression},
+        {"kLZ4Compression", kLZ4Compression},
+        {"kLZ4HCCompression", kLZ4HCCompression},
+        {"kXpressCompression", kXpressCompression},
+        {"kZSTD", kZSTD},
+        {"kZSTDNotFinalCompression", kZSTDNotFinalCompression},
+        {"kDisableCompressionOption", kDisableCompressionOption}};
#ifndef ROCKSDB_LITE

+const std::string kNameComparator = "comparator";
+const std::string kNameMergeOperator = "merge_operator";
+
+template <typename T>
+Status GetStringFromStruct(
+    std::string* opt_string, const T& options,
+    const std::unordered_map<std::string, OptionTypeInfo> type_info,
+    const std::string& delimiter);
+
namespace {
template <typename T>
bool ParseEnum(const std::unordered_map<std::string, T>& type_map,
@@ -255,11 +326,83 @@ bool ParseVectorCompressionType(
  return true;
}

+// This is to handle backward compatibility, where compaction_options_fifo
+// could be assigned a single scalar value, say, like "23", which would be
+// assigned to max_table_files_size.
+bool FIFOCompactionOptionsSpecialCase(const std::string& opt_str,
+                                      CompactionOptionsFIFO* options) {
+  if (opt_str.find("=") != std::string::npos) {
+    // New format. Go do your new parsing using ParseStructOptions.
+    return false;
+  }
+
+  // Old format. Parse just a single uint64_t value.
+  options->max_table_files_size = ParseUint64(opt_str);
+  return true;
+}
+
+template <typename T>
+bool SerializeStruct(
+    const T& options, std::string* value,
+    std::unordered_map<std::string, OptionTypeInfo> type_info_map) {
+  std::string opt_str;
+  Status s = GetStringFromStruct(&opt_str, options, type_info_map, ";");
+  if (!s.ok()) {
+    return false;
+  }
+  *value = "{" + opt_str + "}";
+  return true;
+}
+
+template <typename T>
+bool ParseSingleStructOption(
+    const std::string& opt_val_str, T* options,
+    std::unordered_map<std::string, OptionTypeInfo> type_info_map) {
+  size_t end = opt_val_str.find('=');
+  std::string key = opt_val_str.substr(0, end);
+  std::string value = opt_val_str.substr(end + 1);
+  auto iter = type_info_map.find(key);
+  if (iter == type_info_map.end()) {
+    return false;
+  }
+  const auto& opt_info = iter->second;
+  return ParseOptionHelper(
+      reinterpret_cast<char*>(options) + opt_info.mutable_offset, opt_info.type,
+      value);
+}
+
+template <typename T>
+bool ParseStructOptions(
+    const std::string& opt_str, T* options,
+    std::unordered_map<std::string, OptionTypeInfo> type_info_map) {
+  assert(!opt_str.empty());
+
+  size_t start = 0;
+  if (opt_str[0] == '{') {
+    start++;
+  }
+  while ((start != std::string::npos) && (start < opt_str.size())) {
+    if (opt_str[start] == '}') {
+      break;
+    }
+    size_t end = opt_str.find(';', start);
+    size_t len = (end == std::string::npos) ? end : end - start;
+    if (!ParseSingleStructOption(opt_str.substr(start, len), options,
+                                 type_info_map)) {
+      return false;
+    }
+    start = (end == std::string::npos) ? end : end + 1;
+  }
+  return true;
+}
+} // anonymous namespace
+
bool ParseSliceTransformHelper(
    const std::string& kFixedPrefixName, const std::string& kCappedPrefixName,
    const std::string& value,
    std::shared_ptr<const SliceTransform>* slice_transform) {
-
+  const char* no_op_name = "rocksdb.Noop";
+  size_t no_op_length = strlen(no_op_name);
  auto& pe_value = value;
  if (pe_value.size() > kFixedPrefixName.size() &&
      pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == 0) {
@@ -271,6 +414,10 @@ bool ParseSliceTransformHelper(
    int prefix_length =
        ParseInt(trim(pe_value.substr(kCappedPrefixName.size())));
    slice_transform->reset(NewCappedPrefixTransform(prefix_length));
+  } else if (pe_value.size() == no_op_length &&
+             pe_value.compare(0, no_op_length, no_op_name) == 0) {
+    const SliceTransform* no_op_transform = NewNoopTransform();
+    slice_transform->reset(no_op_transform);
  } else if (value == kNullptrString) {
    slice_transform->reset();
  } else {
@@ -304,7 +451,6 @@ bool ParseSliceTransform(
  // SliceTransforms here.
  return false;
}
-} // anonymouse namespace

bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
                       const std::string& value) {
@@ -315,6 +461,12 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
    case OptionType::kInt:
      *reinterpret_cast<int*>(opt_address) = ParseInt(value);
      break;
+    case OptionType::kInt32T:
+      *reinterpret_cast<int32_t*>(opt_address) = ParseInt32(value);
+      break;
+    case OptionType::kInt64T:
+      PutUnaligned(reinterpret_cast<int64_t*>(opt_address), ParseInt64(value));
+      break;
    case OptionType::kVectorInt:
      *reinterpret_cast<std::vector<int>*>(opt_address) = ParseVectorInt(value);
      break;
@@ -363,6 +515,11 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
      return ParseEnum(
          block_base_table_index_type_string_map, value,
          reinterpret_cast<BlockBasedTableOptions::IndexType*>(opt_address));
+    case OptionType::kBlockBasedTableDataBlockIndexType:
+      return ParseEnum(
+          block_base_table_data_block_index_type_string_map, value,
+          reinterpret_cast<BlockBasedTableOptions::DataBlockIndexType*>(
+              opt_address));
    case OptionType::kEncodingType:
      return ParseEnum(
          encoding_type_string_map, value,
          reinterpret_cast<EncodingType*>(opt_address));
@@ -379,6 +536,28 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type,
      return ParseEnum(
          info_log_level_string_map, value,
          reinterpret_cast<InfoLogLevel*>(opt_address));
+    case OptionType::kCompactionOptionsFIFO: {
+      if (!FIFOCompactionOptionsSpecialCase(
+              value, reinterpret_cast<CompactionOptionsFIFO*>(opt_address))) {
+        return ParseStructOptions(
+            value, reinterpret_cast<CompactionOptionsFIFO*>(opt_address),
+            fifo_compaction_options_type_info);
+      }
+      return true;
+    }
+    case OptionType::kLRUCacheOptions: {
+      return ParseStructOptions(value,
+          reinterpret_cast<LRUCacheOptions*>(opt_address),
+          lru_cache_options_type_info);
+    }
+    case OptionType::kCompactionOptionsUniversal:
+      return ParseStructOptions(
+          value, reinterpret_cast<CompactionOptionsUniversal*>(opt_address),
+          universal_compaction_options_type_info);
+    case OptionType::kCompactionStopStyle:
+      return ParseEnum(
+          compaction_stop_style_string_map, value,
+          reinterpret_cast<CompactionStopStyle*>(opt_address));
    default:
      return false;
  }
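ParseOptionHelper above is driven entirely by a table of byte offsets and type tags: the caller computes the address of a field as struct-base plus offsetof, and the helper writes the parsed value through a cast pointer. A stripped-down, self-contained version of the same technique (all names here are invented):

#include <cstddef>
#include <string>
#include <unordered_map>

struct Opts { int level0_trigger; bool paranoid; };

enum class Ty { kInt, kBool };
struct Info { size_t offset; Ty type; };

// Option table: string key -> byte offset into Opts plus a type tag.
const std::unordered_map<std::string, Info> kTable = {
    {"level0_trigger", {offsetof(Opts, level0_trigger), Ty::kInt}},
    {"paranoid", {offsetof(Opts, paranoid), Ty::kBool}},
};

bool Set(Opts* o, const std::string& k, const std::string& v) {
  auto it = kTable.find(k);
  if (it == kTable.end()) return false;
  // Write through a typed pointer at struct-base + offset.
  char* addr = reinterpret_cast<char*>(o) + it->second.offset;
  switch (it->second.type) {
    case Ty::kInt:  *reinterpret_cast<int*>(addr) = std::stoi(v); return true;
    case Ty::kBool: *reinterpret_cast<bool*>(addr) = (v == "true"); return true;
  }
  return false;
}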
@@ -397,6 +576,16 @@ bool SerializeSingleOptionHelper(const char* opt_address,
    case OptionType::kInt:
      *value = ToString(*(reinterpret_cast<const int*>(opt_address)));
      break;
+    case OptionType::kInt32T:
+      *value = ToString(*(reinterpret_cast<const int32_t*>(opt_address)));
+      break;
+    case OptionType::kInt64T:
+      {
+        int64_t v;
+        GetUnaligned(reinterpret_cast<const int64_t*>(opt_address), &v);
+        *value = ToString(v);
+      }
+      break;
    case OptionType::kVectorInt:
      return SerializeIntVector(
          *reinterpret_cast<const std::vector<int>*>(opt_address), value);
@@ -520,6 +709,12 @@ bool SerializeSingleOptionHelper(const char* opt_address,
          *reinterpret_cast<const BlockBasedTableOptions::IndexType*>(
              opt_address),
          value);
+    case OptionType::kBlockBasedTableDataBlockIndexType:
+      return SerializeEnum(
+          block_base_table_data_block_index_type_string_map,
+          *reinterpret_cast<const BlockBasedTableOptions::DataBlockIndexType*>(
+              opt_address),
+          value);
    case OptionType::kFlushBlockPolicyFactory: {
      const auto* ptr =
          reinterpret_cast<const std::shared_ptr<FlushBlockPolicyFactory>*>(
              opt_address);
@@ -543,6 +738,18 @@ bool SerializeSingleOptionHelper(const char* opt_address,
      return SerializeEnum(
          info_log_level_string_map,
          *reinterpret_cast<const InfoLogLevel*>(opt_address), value);
+    case OptionType::kCompactionOptionsFIFO:
+      return SerializeStruct(
+          *reinterpret_cast<const CompactionOptionsFIFO*>(opt_address), value,
+          fifo_compaction_options_type_info);
+    case OptionType::kCompactionOptionsUniversal:
+      return SerializeStruct(
+          *reinterpret_cast<const CompactionOptionsUniversal*>(opt_address),
+          value, universal_compaction_options_type_info);
+    case OptionType::kCompactionStopStyle:
+      return SerializeEnum(
+          compaction_stop_style_string_map,
+          *reinterpret_cast<const CompactionStopStyle*>(opt_address), value);
    default:
      return false;
  }
@@ -552,7 +759,7 @@ Status GetMutableOptionsFromStrings(
    const MutableCFOptions& base_options,
    const std::unordered_map<std::string, std::string>& options_map,
-    MutableCFOptions* new_options) {
+    Logger* info_log, MutableCFOptions* new_options) {
  assert(new_options);
  *new_options = base_options;
  for (const auto& o : options_map) {
@@ -565,6 +772,13 @@ Status GetMutableOptionsFromStrings(
      if (!opt_info.is_mutable) {
        return Status::InvalidArgument("Option not changeable: " + o.first);
      }
+      if (opt_info.verification == OptionVerificationType::kDeprecated) {
+        // log warning when user tries to set a deprecated option but don't fail
+        // the call for compatibility.
+        ROCKS_LOG_WARN(info_log, "%s is a deprecated option and cannot be set",
+                       o.first.c_str());
+        continue;
+      }
      bool is_ok = ParseOptionHelper(
          reinterpret_cast<char*>(new_options) + opt_info.mutable_offset,
          opt_info.type, o.second);
@@ -685,6 +899,65 @@ Status StringToMap(const std::string& opts_str,
  return Status::OK();
}

+Status ParseCompressionOptions(const std::string& value, const std::string& name,
+                               CompressionOptions& compression_opts) {
+  size_t start = 0;
+  size_t end = value.find(':');
+  if (end == std::string::npos) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  compression_opts.window_bits = ParseInt(value.substr(start, end - start));
+  start = end + 1;
+  end = value.find(':', start);
+  if (end == std::string::npos) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  compression_opts.level = ParseInt(value.substr(start, end - start));
+  start = end + 1;
+  if (start >= value.size()) {
+    return Status::InvalidArgument("unable to parse the specified CF option " +
+                                   name);
+  }
+  end = value.find(':', start);
+  compression_opts.strategy =
+      ParseInt(value.substr(start, value.size() - start));
+  // max_dict_bytes is optional for backwards compatibility
+  if (end != std::string::npos) {
+    start = end + 1;
+    if (start >= value.size()) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.max_dict_bytes =
+        ParseInt(value.substr(start, value.size() - start));
+    end = value.find(':', start);
+  }
+  // zstd_max_train_bytes is optional for backwards compatibility
+  if (end != std::string::npos) {
+    start = end + 1;
+    if (start >= value.size()) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.zstd_max_train_bytes =
+        ParseInt(value.substr(start, value.size() - start));
+    end = value.find(':', start);
+  }
+  // enabled is optional for backwards compatibility
+  if (end != std::string::npos) {
+    start = end + 1;
+    if (start >= value.size()) {
+      return Status::InvalidArgument(
+          "unable to parse the specified CF option " + name);
+    }
+    compression_opts.enabled =
+        ParseBoolean("", value.substr(start, value.size() - start));
+  }
+  return Status::OK();
+}
+
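ParseCompressionOptions above accepts the colon-separated form window_bits:level:strategy[:max_dict_bytes[:zstd_max_train_bytes[:enabled]]]; the trailing fields remain optional so pre-existing three-field strings keep parsing. A sketch of how such a string typically reaches it, via the public GetColumnFamilyOptionsFromString helper (assuming its declaration lives in rocksdb/convenience.h as in contemporary releases; the numeric values are arbitrary):

#include <cassert>
#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

void ParseCompressionOptsExample() {
  rocksdb::ColumnFamilyOptions base, out;
  // Old-style three-field string still works:
  rocksdb::Status s = rocksdb::GetColumnFamilyOptionsFromString(
      base, "compression_opts=4:5:6", &out);
  assert(s.ok());
  // Extended form with the optional trailing fields:
  s = rocksdb::GetColumnFamilyOptionsFromString(
      base, "compression_opts=4:5:6:65536:0:true", &out);
  assert(s.ok());
}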
Status ParseColumnFamilyOption(const std::string& name,
                               const std::string& org_value,
                               ColumnFamilyOptions* new_options,
@@ -733,45 +1006,39 @@ Status ParseColumnFamilyOption(const std::string& name,
          "unable to parse the specified CF option " + name);
    }
    new_options->memtable_factory.reset(new_mem_factory.release());
-  } else if (name == "compression_opts") {
-    size_t start = 0;
-    size_t end = value.find(':');
-    if (end == std::string::npos) {
-      return Status::InvalidArgument(
-          "unable to parse the specified CF option " + name);
-    }
-    new_options->compression_opts.window_bits =
-        ParseInt(value.substr(start, end - start));
-    start = end + 1;
-    end = value.find(':', start);
-    if (end == std::string::npos) {
-      return Status::InvalidArgument(
-          "unable to parse the specified CF option " + name);
+  } else if (name == "bottommost_compression_opts") {
+    Status s = ParseCompressionOptions(
+        value, name, new_options->bottommost_compression_opts);
+    if (!s.ok()) {
+      return s;
    }
-    new_options->compression_opts.level =
-        ParseInt(value.substr(start, end - start));
-    start = end + 1;
-    if (start >= value.size()) {
-      return Status::InvalidArgument(
-          "unable to parse the specified CF option " + name);
+  } else if (name == "compression_opts") {
+    Status s =
+        ParseCompressionOptions(value, name, new_options->compression_opts);
+    if (!s.ok()) {
+      return s;
    }
-    end = value.find(':', start);
-    new_options->compression_opts.strategy =
-        ParseInt(value.substr(start, value.size() - start));
-    // max_dict_bytes is optional for backwards compatibility
-    if (end != std::string::npos) {
-      start = end + 1;
-      if (start >= value.size()) {
-        return Status::InvalidArgument(
-            "unable to parse the specified CF option " + name);
+  } else {
+    if (name == kNameComparator) {
+      // Try to get comparator from object registry first.
+      std::unique_ptr<const Comparator> comp_guard;
+      const Comparator* comp =
+          NewCustomObject(value, &comp_guard);
+      // Only support static comparator for now.
+      if (comp != nullptr && !comp_guard) {
+        new_options->comparator = comp;
+      }
+    } else if (name == kNameMergeOperator) {
+      // Try to get merge operator from object registry first.
+      std::unique_ptr<std::shared_ptr<MergeOperator>> mo_guard;
+      std::shared_ptr<MergeOperator>* mo =
+          NewCustomObject<std::shared_ptr<MergeOperator>>(value, &mo_guard);
+      // Only support static merge operator for now.
+      if (mo != nullptr) {
+        new_options->merge_operator = *mo;
      }
-      new_options->compression_opts.max_dict_bytes =
-          ParseInt(value.substr(start, value.size() - start));
    }
-  } else if (name == "compaction_options_fifo") {
-    new_options->compaction_options_fifo.max_table_files_size =
-        ParseUint64(value);
-  } else {
+
    auto iter = cf_options_type_info.find(name);
    if (iter == cf_options_type_info.end()) {
      return Status::InvalidArgument(
@@ -787,6 +1054,7 @@ Status ParseColumnFamilyOption(const std::string& name,
      switch (opt_info.verification) {
        case OptionVerificationType::kByName:
        case OptionVerificationType::kByNameAllowNull:
+        case OptionVerificationType::kByNameAllowFromNull:
          return Status::NotSupported(
              "Deserializing the specified CF option " + name +
              " is not supported");
@@ -804,17 +1072,18 @@ Status ParseColumnFamilyOption(const std::string& name,
  return Status::OK();
}

-bool SerializeSingleDBOption(std::string* opt_string,
-                             const DBOptions& db_options,
-                             const std::string& name,
-                             const std::string& delimiter) {
-  auto iter = db_options_type_info.find(name);
-  if (iter == db_options_type_info.end()) {
+template <typename T>
+bool SerializeSingleStructOption(
+    std::string* opt_string, const T& options,
+    const std::unordered_map<std::string, OptionTypeInfo> type_info,
+    const std::string& name, const std::string& delimiter) {
+  auto iter = type_info.find(name);
+  if (iter == type_info.end()) {
    return false;
  }
  auto& opt_info = iter->second;
  const char* opt_address =
-      reinterpret_cast<const char*>(&db_options) + opt_info.offset;
+      reinterpret_cast<const char*>(&options) + opt_info.offset;
  std::string value;
  bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value);
  if (result) {
@@ -823,72 +1092,45 @@ bool SerializeSingleDBOption(std::string* opt_string,
  return result;
}

-Status GetStringFromDBOptions(std::string* opt_string,
-                              const DBOptions& db_options,
-                              const std::string& delimiter) {
+template <typename T>
+Status GetStringFromStruct(
+    std::string* opt_string, const T& options,
+    const std::unordered_map<std::string, OptionTypeInfo> type_info,
+    const std::string& delimiter) {
  assert(opt_string);
  opt_string->clear();
-  for (auto iter = db_options_type_info.begin();
-       iter != db_options_type_info.end(); ++iter) {
+  for (auto iter = type_info.begin(); iter != type_info.end(); ++iter) {
    if (iter->second.verification == OptionVerificationType::kDeprecated) {
      // If the option is no longer used in rocksdb and marked as deprecated,
      // we skip it in the serialization.
      continue;
    }
    std::string single_output;
-    bool result = SerializeSingleDBOption(&single_output, db_options,
-                                          iter->first, delimiter);
-    assert(result);
+    bool result = SerializeSingleStructOption(
+        &single_output, options, type_info, iter->first, delimiter);
    if (result) {
      opt_string->append(single_output);
+    } else {
+      return Status::InvalidArgument("failed to serialize %s\n",
+                                     iter->first.c_str());
    }
+    assert(result);
  }
  return Status::OK();
}

-bool SerializeSingleColumnFamilyOption(std::string* opt_string,
-                                       const ColumnFamilyOptions& cf_options,
-                                       const std::string& name,
-                                       const std::string& delimiter) {
-  auto iter = cf_options_type_info.find(name);
-  if (iter == cf_options_type_info.end()) {
-    return false;
-  }
-  auto& opt_info = iter->second;
-  const char* opt_address =
-      reinterpret_cast<const char*>(&cf_options) + opt_info.offset;
-  std::string value;
-  bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value);
-  if (result) {
-    *opt_string = name + "=" + value + delimiter;
-  }
-  return result;
+Status GetStringFromDBOptions(std::string* opt_string,
+                              const DBOptions& db_options,
+                              const std::string& delimiter) {
+  return GetStringFromStruct(opt_string, db_options,
+                             db_options_type_info, delimiter);
}

Status GetStringFromColumnFamilyOptions(std::string* opt_string,
                                        const ColumnFamilyOptions& cf_options,
                                        const std::string& delimiter) {
-  assert(opt_string);
-  opt_string->clear();
-  for (auto iter = cf_options_type_info.begin();
-       iter != cf_options_type_info.end(); ++iter) {
-    if (iter->second.verification == OptionVerificationType::kDeprecated) {
-      // If the option is no longer used in rocksdb and marked as deprecated,
-      // we skip it in the serialization.
-      continue;
-    }
-    std::string single_output;
-    bool result = SerializeSingleColumnFamilyOption(&single_output, cf_options,
-                                                    iter->first, delimiter);
-    if (result) {
-      opt_string->append(single_output);
-    } else {
-      return Status::InvalidArgument("failed to serialize %s\n",
-                                     iter->first.c_str());
-    }
-    assert(result);
-  }
-  return Status::OK();
+  return GetStringFromStruct(
+      opt_string, cf_options, cf_options_type_info, delimiter);
}

Status GetStringFromCompressionType(std::string* compression_str,
@@ -1128,6 +1370,674 @@ Status GetTableFactoryFromMap(
  return Status::OK();
}

+std::unordered_map<std::string, OptionTypeInfo>
+    OptionsHelper::db_options_type_info = {
+        /*
+         // not yet supported
+          Env* env;
+          std::shared_ptr<Cache> row_cache;
+          std::shared_ptr<DeleteScheduler> delete_scheduler;
+          std::shared_ptr<Logger> info_log;
+          std::shared_ptr<RateLimiter> rate_limiter;
+          std::shared_ptr<Statistics> statistics;
+          std::vector<DbPath> db_paths;
+          std::vector<std::shared_ptr<EventListener>> listeners;
+         */
+        {"advise_random_on_open",
+         {offsetof(struct DBOptions, advise_random_on_open),
+          OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}},
+        {"allow_mmap_reads",
+         {offsetof(struct DBOptions, allow_mmap_reads), OptionType::kBoolean,
+          OptionVerificationType::kNormal, false, 0}},
+        {"allow_fallocate",
+         {offsetof(struct DBOptions, allow_fallocate), OptionType::kBoolean,
+          OptionVerificationType::kNormal, false, 0}},
+        {"allow_mmap_writes",
+         {offsetof(struct DBOptions, allow_mmap_writes), OptionType::kBoolean,
+          OptionVerificationType::kNormal, false, 0}},
+        {"use_direct_reads",
+         {offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean,
+          OptionVerificationType::kNormal, false, 0}},
+        {"use_direct_writes",
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false,
+          0}},
+        {"use_direct_io_for_flush_and_compaction",
+         {offsetof(struct DBOptions, use_direct_io_for_flush_and_compaction),
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"allow_2pc", + {offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"allow_os_buffer", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, + 0}}, + {"create_if_missing", + {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"create_missing_column_families", + {offsetof(struct DBOptions, create_missing_column_families), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"disableDataSync", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"disable_data_sync", // for compatibility + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"enable_thread_tracking", + {offsetof(struct DBOptions, enable_thread_tracking), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"error_if_exists", + {offsetof(struct DBOptions, error_if_exists), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"is_fd_close_on_exec", + {offsetof(struct DBOptions, is_fd_close_on_exec), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"paranoid_checks", + {offsetof(struct DBOptions, paranoid_checks), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"skip_log_error_on_recovery", + {offsetof(struct DBOptions, skip_log_error_on_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"skip_stats_update_on_db_open", + {offsetof(struct DBOptions, skip_stats_update_on_db_open), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"new_table_reader_for_compaction_inputs", + {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"compaction_readahead_size", + {offsetof(struct DBOptions, compaction_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, compaction_readahead_size)}}, + {"random_access_max_buffer_size", + {offsetof(struct DBOptions, random_access_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"use_adaptive_mutex", + {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"use_fsync", + {offsetof(struct DBOptions, use_fsync), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"max_background_jobs", + {offsetof(struct DBOptions, max_background_jobs), OptionType::kInt, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, max_background_jobs)}}, + {"max_background_compactions", + {offsetof(struct DBOptions, max_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, max_background_compactions)}}, + {"base_background_compactions", + {offsetof(struct DBOptions, base_background_compactions), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, base_background_compactions)}}, + {"max_background_flushes", + {offsetof(struct DBOptions, max_background_flushes), OptionType::kInt, + OptionVerificationType::kNormal, false, 0}}, + {"max_file_opening_threads", + {offsetof(struct DBOptions, max_file_opening_threads), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"max_open_files", + {offsetof(struct 
DBOptions, max_open_files), OptionType::kInt, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, max_open_files)}}, + {"table_cache_numshardbits", + {offsetof(struct DBOptions, table_cache_numshardbits), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"db_write_buffer_size", + {offsetof(struct DBOptions, db_write_buffer_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"keep_log_file_num", + {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"recycle_log_file_num", + {offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"log_file_time_to_roll", + {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"manifest_preallocation_size", + {offsetof(struct DBOptions, manifest_preallocation_size), + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"max_log_file_size", + {offsetof(struct DBOptions, max_log_file_size), OptionType::kSizeT, + OptionVerificationType::kNormal, false, 0}}, + {"db_log_dir", + {offsetof(struct DBOptions, db_log_dir), OptionType::kString, + OptionVerificationType::kNormal, false, 0}}, + {"wal_dir", + {offsetof(struct DBOptions, wal_dir), OptionType::kString, + OptionVerificationType::kNormal, false, 0}}, + {"max_subcompactions", + {offsetof(struct DBOptions, max_subcompactions), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, + {"WAL_size_limit_MB", + {offsetof(struct DBOptions, WAL_size_limit_MB), OptionType::kUInt64T, + OptionVerificationType::kNormal, false, 0}}, + {"WAL_ttl_seconds", + {offsetof(struct DBOptions, WAL_ttl_seconds), OptionType::kUInt64T, + OptionVerificationType::kNormal, false, 0}}, + {"bytes_per_sync", + {offsetof(struct DBOptions, bytes_per_sync), OptionType::kUInt64T, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, bytes_per_sync)}}, + {"delayed_write_rate", + {offsetof(struct DBOptions, delayed_write_rate), OptionType::kUInt64T, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, delayed_write_rate)}}, + {"delete_obsolete_files_period_micros", + {offsetof(struct DBOptions, delete_obsolete_files_period_micros), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, + delete_obsolete_files_period_micros)}}, + {"max_manifest_file_size", + {offsetof(struct DBOptions, max_manifest_file_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"max_total_wal_size", + {offsetof(struct DBOptions, max_total_wal_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, max_total_wal_size)}}, + {"wal_bytes_per_sync", + {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, wal_bytes_per_sync)}}, + {"stats_dump_period_sec", + {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt, + OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, stats_dump_period_sec)}}, + {"stats_persist_period_sec", + {offsetof(struct DBOptions, stats_persist_period_sec), + OptionType::kUInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, stats_persist_period_sec)}}, + {"stats_history_buffer_size", + {offsetof(struct DBOptions, stats_history_buffer_size), + 
OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, stats_history_buffer_size)}}, + {"fail_if_options_file_error", + {offsetof(struct DBOptions, fail_if_options_file_error), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"enable_pipelined_write", + {offsetof(struct DBOptions, enable_pipelined_write), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"allow_concurrent_memtable_write", + {offsetof(struct DBOptions, allow_concurrent_memtable_write), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"wal_recovery_mode", + {offsetof(struct DBOptions, wal_recovery_mode), + OptionType::kWALRecoveryMode, OptionVerificationType::kNormal, false, + 0}}, + {"enable_write_thread_adaptive_yield", + {offsetof(struct DBOptions, enable_write_thread_adaptive_yield), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"write_thread_slow_yield_usec", + {offsetof(struct DBOptions, write_thread_slow_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"write_thread_max_yield_usec", + {offsetof(struct DBOptions, write_thread_max_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, + {"access_hint_on_compaction_start", + {offsetof(struct DBOptions, access_hint_on_compaction_start), + OptionType::kAccessHint, OptionVerificationType::kNormal, false, 0}}, + {"info_log_level", + {offsetof(struct DBOptions, info_log_level), OptionType::kInfoLogLevel, + OptionVerificationType::kNormal, false, 0}}, + {"dump_malloc_stats", + {offsetof(struct DBOptions, dump_malloc_stats), OptionType::kBoolean, + OptionVerificationType::kNormal, false, 0}}, + {"avoid_flush_during_recovery", + {offsetof(struct DBOptions, avoid_flush_during_recovery), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"avoid_flush_during_shutdown", + {offsetof(struct DBOptions, avoid_flush_during_shutdown), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}, + {"writable_file_max_buffer_size", + {offsetof(struct DBOptions, writable_file_max_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableDBOptions, writable_file_max_buffer_size)}}, + {"allow_ingest_behind", + {offsetof(struct DBOptions, allow_ingest_behind), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}, + {"preserve_deletes", + {offsetof(struct DBOptions, preserve_deletes), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, preserve_deletes)}}, + {"concurrent_prepare", // Deprecated by two_write_queues + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"two_write_queues", + {offsetof(struct DBOptions, two_write_queues), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, two_write_queues)}}, + {"manual_wal_flush", + {offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, manual_wal_flush)}}, + {"seq_per_batch", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"atomic_flush", + {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, + OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, atomic_flush)}}, + 
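// [Editorial sketch -- illustration only, not part of the applied patch.
//  The names Opts, TypeInfo, kOptsInfo and SetOpt below are hypothetical.]
// Each entry in the db_options_type_info table above pairs an option name
// with the byte offset of the matching DBOptions field, so one generic
// routine (ParseOptionHelper() in this file) can assign any option from a
// string without per-field code. A minimal standalone model of that
// mechanism, under those assumptions:

#include <cstddef>
#include <string>
#include <unordered_map>

struct Opts {
  bool create_if_missing = false;
  int max_open_files = -1;
};

struct TypeInfo {
  std::size_t offset;  // where the field lives inside Opts
  char type;           // 'b' = bool, 'i' = int
};

static const std::unordered_map<std::string, TypeInfo> kOptsInfo = {
    {"create_if_missing", {offsetof(Opts, create_if_missing), 'b'}},
    {"max_open_files", {offsetof(Opts, max_open_files), 'i'}}};

inline void SetOpt(Opts* o, const std::string& name, const std::string& v) {
  const TypeInfo& ti = kOptsInfo.at(name);  // throws on an unknown name
  char* addr = reinterpret_cast<char*>(o) + ti.offset;
  if (ti.type == 'b') {
    *reinterpret_cast<bool*>(addr) = (v == "true");
  } else {
    *reinterpret_cast<int*>(addr) = std::stoi(v);
  }
}

// Usage: SetOpt(&opts, "max_open_files", "512");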
{"avoid_unnecessary_blocking_io", + {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}} + }; + +std::unordered_map + OptionsHelper::block_base_table_index_type_string_map = { + {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, + {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, + {"kTwoLevelIndexSearch", + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; + +std::unordered_map + OptionsHelper::block_base_table_data_block_index_type_string_map = { + {"kDataBlockBinarySearch", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch}, + {"kDataBlockBinaryAndHash", + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash}}; + +std::unordered_map + OptionsHelper::encoding_type_string_map = {{"kPlain", kPlain}, + {"kPrefix", kPrefix}}; + +std::unordered_map + OptionsHelper::compaction_style_string_map = { + {"kCompactionStyleLevel", kCompactionStyleLevel}, + {"kCompactionStyleUniversal", kCompactionStyleUniversal}, + {"kCompactionStyleFIFO", kCompactionStyleFIFO}, + {"kCompactionStyleNone", kCompactionStyleNone}}; + +std::unordered_map + OptionsHelper::compaction_pri_string_map = { + {"kByCompensatedSize", kByCompensatedSize}, + {"kOldestLargestSeqFirst", kOldestLargestSeqFirst}, + {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst}, + {"kMinOverlappingRatio", kMinOverlappingRatio}}; + +std::unordered_map + OptionsHelper::wal_recovery_mode_string_map = { + {"kTolerateCorruptedTailRecords", + WALRecoveryMode::kTolerateCorruptedTailRecords}, + {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, + {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, + {"kSkipAnyCorruptedRecords", + WALRecoveryMode::kSkipAnyCorruptedRecords}}; + +std::unordered_map + OptionsHelper::access_hint_string_map = { + {"NONE", DBOptions::AccessHint::NONE}, + {"NORMAL", DBOptions::AccessHint::NORMAL}, + {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, + {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; + +std::unordered_map + OptionsHelper::info_log_level_string_map = { + {"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, + {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, + {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, + {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, + {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, + {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; + +ColumnFamilyOptions OptionsHelper::dummy_cf_options; +CompactionOptionsFIFO OptionsHelper::dummy_comp_options; +LRUCacheOptions OptionsHelper::dummy_lru_cache_options; +CompactionOptionsUniversal OptionsHelper::dummy_comp_options_universal; + +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyOptions::num_levels) +// This call will return the offset of num_levels in ColumnFamilyOptions class +// +// This is the same as offsetof() but allow us to work with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +template +int offset_of(T1 ColumnFamilyOptions::*member) { + return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - + size_t(&OptionsHelper::dummy_cf_options)); +} +template +int offset_of(T1 AdvancedColumnFamilyOptions::*member) { + return int(size_t(&(OptionsHelper::dummy_cf_options.*member)) - + size_t(&OptionsHelper::dummy_cf_options)); +} +template +int 
offset_of(T1 CompactionOptionsFIFO::*member) { + return int(size_t(&(OptionsHelper::dummy_comp_options.*member)) - + size_t(&OptionsHelper::dummy_comp_options)); +} +template +int offset_of(T1 LRUCacheOptions::*member) { + return int(size_t(&(OptionsHelper::dummy_lru_cache_options.*member)) - + size_t(&OptionsHelper::dummy_lru_cache_options)); +} +template +int offset_of(T1 CompactionOptionsUniversal::*member) { + return int(size_t(&(OptionsHelper::dummy_comp_options_universal.*member)) - + size_t(&OptionsHelper::dummy_comp_options_universal)); +} + +std::unordered_map + OptionsHelper::cf_options_type_info = { + /* not yet supported + CompressionOptions compression_opts; + TablePropertiesCollectorFactories table_properties_collector_factories; + typedef std::vector> + TablePropertiesCollectorFactories; + UpdateStatus (*inplace_callback)(char* existing_value, + uint34_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + std::vector cf_paths; + */ + {"report_bg_io_stats", + {offset_of(&ColumnFamilyOptions::report_bg_io_stats), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, report_bg_io_stats)}}, + {"compaction_measure_io_stats", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, + 0}}, + {"disable_auto_compactions", + {offset_of(&ColumnFamilyOptions::disable_auto_compactions), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, disable_auto_compactions)}}, + {"filter_deletes", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, + 0}}, + {"inplace_update_support", + {offset_of(&ColumnFamilyOptions::inplace_update_support), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"level_compaction_dynamic_level_bytes", + {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"optimize_filters_for_hits", + {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"paranoid_file_checks", + {offset_of(&ColumnFamilyOptions::paranoid_file_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, paranoid_file_checks)}}, + {"force_consistency_checks", + {offset_of(&ColumnFamilyOptions::force_consistency_checks), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"purge_redundant_kvs_while_flush", + {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), + OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, + {"verify_checksums_in_compaction", + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, + 0}}, + {"soft_pending_compaction_bytes_limit", + {offset_of(&ColumnFamilyOptions::soft_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, + soft_pending_compaction_bytes_limit)}}, + {"hard_pending_compaction_bytes_limit", + {offset_of(&ColumnFamilyOptions::hard_pending_compaction_bytes_limit), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, + hard_pending_compaction_bytes_limit)}}, + {"hard_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, + 0}}, + {"soft_rate_limit", + {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, + 0}}, + {"max_compaction_bytes", + 
{offset_of(&ColumnFamilyOptions::max_compaction_bytes), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, max_compaction_bytes)}}, + {"expanded_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, + {"level0_file_num_compaction_trigger", + {offset_of(&ColumnFamilyOptions::level0_file_num_compaction_trigger), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, + level0_file_num_compaction_trigger)}}, + {"level0_slowdown_writes_trigger", + {offset_of(&ColumnFamilyOptions::level0_slowdown_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger)}}, + {"level0_stop_writes_trigger", + {offset_of(&ColumnFamilyOptions::level0_stop_writes_trigger), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, level0_stop_writes_trigger)}}, + {"max_grandparent_overlap_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, + {"max_mem_compaction_level", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, false, 0}}, + {"max_write_buffer_number", + {offset_of(&ColumnFamilyOptions::max_write_buffer_number), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, max_write_buffer_number)}}, + {"max_write_buffer_number_to_maintain", + {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"min_write_buffer_number_to_merge", + {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), + OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, + {"num_levels", + {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, + OptionVerificationType::kNormal, false, 0}}, + {"source_compaction_factor", + {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, + {"target_file_size_multiplier", + {offset_of(&ColumnFamilyOptions::target_file_size_multiplier), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, target_file_size_multiplier)}}, + {"arena_block_size", + {offset_of(&ColumnFamilyOptions::arena_block_size), OptionType::kSizeT, + OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, arena_block_size)}}, + {"inplace_update_num_locks", + {offset_of(&ColumnFamilyOptions::inplace_update_num_locks), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, inplace_update_num_locks)}}, + {"max_successive_merges", + {offset_of(&ColumnFamilyOptions::max_successive_merges), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, max_successive_merges)}}, + {"memtable_huge_page_size", + {offset_of(&ColumnFamilyOptions::memtable_huge_page_size), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, memtable_huge_page_size)}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, true, 0}}, + {"write_buffer_size", + {offset_of(&ColumnFamilyOptions::write_buffer_size), + OptionType::kSizeT, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, write_buffer_size)}}, + {"bloom_locality", + {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, + 
{"memtable_prefix_bloom_bits", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, + 0}}, + {"memtable_prefix_bloom_size_ratio", + {offset_of(&ColumnFamilyOptions::memtable_prefix_bloom_size_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio)}}, + {"memtable_prefix_bloom_probes", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, + 0}}, + {"memtable_whole_key_filtering", + {offset_of(&ColumnFamilyOptions::memtable_whole_key_filtering), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, memtable_whole_key_filtering)}}, + {"min_partial_merge_operands", + {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, + 0}}, + {"max_bytes_for_level_base", + {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, + {"max_bytes_for_level_multiplier", + {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), + OptionType::kDouble, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier)}}, + {"max_bytes_for_level_multiplier_additional", + {offset_of( + &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), + OptionType::kVectorInt, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, + max_bytes_for_level_multiplier_additional)}}, + {"max_sequential_skip_in_iterations", + {offset_of(&ColumnFamilyOptions::max_sequential_skip_in_iterations), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, + max_sequential_skip_in_iterations)}}, + {"target_file_size_base", + {offset_of(&ColumnFamilyOptions::target_file_size_base), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, target_file_size_base)}}, + {"rate_limit_delay_max_milliseconds", + {0, OptionType::kUInt, OptionVerificationType::kDeprecated, false, 0}}, + {"compression", + {offset_of(&ColumnFamilyOptions::compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, compression)}}, + {"compression_per_level", + {offset_of(&ColumnFamilyOptions::compression_per_level), + OptionType::kVectorCompressionType, OptionVerificationType::kNormal, + false, 0}}, + {"bottommost_compression", + {offset_of(&ColumnFamilyOptions::bottommost_compression), + OptionType::kCompressionType, OptionVerificationType::kNormal, false, + 0}}, + {kNameComparator, + {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, + OptionVerificationType::kByName, false, 0}}, + {"prefix_extractor", + {offset_of(&ColumnFamilyOptions::prefix_extractor), + OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, + true, offsetof(struct MutableCFOptions, prefix_extractor)}}, + {"memtable_insert_with_hint_prefix_extractor", + {offset_of( + &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), + OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, + false, 0}}, + {"memtable_factory", + {offset_of(&ColumnFamilyOptions::memtable_factory), + OptionType::kMemTableRepFactory, OptionVerificationType::kByName, + false, 0}}, + {"table_factory", + {offset_of(&ColumnFamilyOptions::table_factory), + OptionType::kTableFactory, OptionVerificationType::kByName, false, + 0}}, + 
{"compaction_filter", + {offset_of(&ColumnFamilyOptions::compaction_filter), + OptionType::kCompactionFilter, OptionVerificationType::kByName, false, + 0}}, + {"compaction_filter_factory", + {offset_of(&ColumnFamilyOptions::compaction_filter_factory), + OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, + false, 0}}, + {kNameMergeOperator, + {offset_of(&ColumnFamilyOptions::merge_operator), + OptionType::kMergeOperator, + OptionVerificationType::kByNameAllowFromNull, false, 0}}, + {"compaction_style", + {offset_of(&ColumnFamilyOptions::compaction_style), + OptionType::kCompactionStyle, OptionVerificationType::kNormal, false, + 0}}, + {"compaction_pri", + {offset_of(&ColumnFamilyOptions::compaction_pri), + OptionType::kCompactionPri, OptionVerificationType::kNormal, false, + 0}}, + {"compaction_options_fifo", + {offset_of(&ColumnFamilyOptions::compaction_options_fifo), + OptionType::kCompactionOptionsFIFO, OptionVerificationType::kNormal, + true, offsetof(struct MutableCFOptions, compaction_options_fifo)}}, + {"compaction_options_universal", + {offset_of(&ColumnFamilyOptions::compaction_options_universal), + OptionType::kCompactionOptionsUniversal, + OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, compaction_options_universal)}}, + {"ttl", + {offset_of(&ColumnFamilyOptions::ttl), OptionType::kUInt64T, + OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, ttl)}}, + {"sample_for_compression", + {offset_of(&ColumnFamilyOptions::sample_for_compression), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct MutableCFOptions, sample_for_compression)}}}; + +std::unordered_map + OptionsHelper::fifo_compaction_options_type_info = { + {"max_table_files_size", + {offset_of(&CompactionOptionsFIFO::max_table_files_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, true, + offsetof(struct CompactionOptionsFIFO, max_table_files_size)}}, + {"ttl", + {0, OptionType::kUInt64T, + OptionVerificationType::kDeprecated, false, + 0}}, + {"allow_compaction", + {offset_of(&CompactionOptionsFIFO::allow_compaction), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct CompactionOptionsFIFO, allow_compaction)}}}; + +std::unordered_map + OptionsHelper::universal_compaction_options_type_info = { + {"size_ratio", + {offset_of(&CompactionOptionsUniversal::size_ratio), OptionType::kUInt, + OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, size_ratio)}}, + {"min_merge_width", + {offset_of(&CompactionOptionsUniversal::min_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, min_merge_width)}}, + {"max_merge_width", + {offset_of(&CompactionOptionsUniversal::max_merge_width), + OptionType::kUInt, OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, max_merge_width)}}, + {"max_size_amplification_percent", + {offset_of( + &CompactionOptionsUniversal::max_size_amplification_percent), + OptionType::kUInt, OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, + max_size_amplification_percent)}}, + {"compression_size_percent", + {offset_of(&CompactionOptionsUniversal::compression_size_percent), + OptionType::kInt, OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, + compression_size_percent)}}, + {"stop_style", + {offset_of(&CompactionOptionsUniversal::stop_style), + OptionType::kCompactionStopStyle, 
OptionVerificationType::kNormal, + true, offsetof(class CompactionOptionsUniversal, stop_style)}}, + {"allow_trivial_move", + {offset_of(&CompactionOptionsUniversal::allow_trivial_move), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(class CompactionOptionsUniversal, allow_trivial_move)}}}; + +std::unordered_map + OptionsHelper::compaction_stop_style_string_map = { + {"kCompactionStopStyleSimilarSize", kCompactionStopStyleSimilarSize}, + {"kCompactionStopStyleTotalSize", kCompactionStopStyleTotalSize}}; + +std::unordered_map + OptionsHelper::lru_cache_options_type_info = { + {"capacity", + {offset_of(&LRUCacheOptions::capacity), OptionType::kSizeT, + OptionVerificationType::kNormal, true, + offsetof(struct LRUCacheOptions, capacity)}}, + {"num_shard_bits", + {offset_of(&LRUCacheOptions::num_shard_bits), OptionType::kInt, + OptionVerificationType::kNormal, true, + offsetof(struct LRUCacheOptions, num_shard_bits)}}, + {"strict_capacity_limit", + {offset_of(&LRUCacheOptions::strict_capacity_limit), + OptionType::kBoolean, OptionVerificationType::kNormal, true, + offsetof(struct LRUCacheOptions, strict_capacity_limit)}}, + {"high_pri_pool_ratio", + {offset_of(&LRUCacheOptions::high_pri_pool_ratio), OptionType::kDouble, + OptionVerificationType::kNormal, true, + offsetof(struct LRUCacheOptions, high_pri_pool_ratio)}}}; + #endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/options/options_helper.h b/thirdparty/rocksdb/options/options_helper.h index 67b04271ff..1d3d880a62 100644 --- a/thirdparty/rocksdb/options/options_helper.h +++ b/thirdparty/rocksdb/options/options_helper.h @@ -15,6 +15,7 @@ #include "rocksdb/options.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "rocksdb/universal_compaction.h" namespace rocksdb { @@ -25,32 +26,12 @@ ColumnFamilyOptions BuildColumnFamilyOptions( const ColumnFamilyOptions& ioptions, const MutableCFOptions& mutable_cf_options); -static std::map compaction_style_to_string = { - {kCompactionStyleLevel, "kCompactionStyleLevel"}, - {kCompactionStyleUniversal, "kCompactionStyleUniversal"}, - {kCompactionStyleFIFO, "kCompactionStyleFIFO"}, - {kCompactionStyleNone, "kCompactionStyleNone"}}; - -static std::map compaction_pri_to_string = { - {kByCompensatedSize, "kByCompensatedSize"}, - {kOldestLargestSeqFirst, "kOldestLargestSeqFirst"}, - {kOldestSmallestSeqFirst, "kOldestSmallestSeqFirst"}, - {kMinOverlappingRatio, "kMinOverlappingRatio"}}; - -static std::map - compaction_stop_style_to_string = { - {kCompactionStopStyleSimilarSize, "kCompactionStopStyleSimilarSize"}, - {kCompactionStopStyleTotalSize, "kCompactionStopStyleTotalSize"}}; - -static std::unordered_map checksum_type_string_map = - {{"kNoChecksum", kNoChecksum}, {"kCRC32c", kCRC32c}, {"kxxHash", kxxHash}}; - #ifndef ROCKSDB_LITE Status GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, - MutableCFOptions* new_options); + Logger* info_log, MutableCFOptions* new_options); Status GetMutableDBOptionsFromStrings( const MutableDBOptions& base_options, @@ -66,6 +47,8 @@ Status GetTableFactoryFromMap( enum class OptionType { kBoolean, kInt, + kInt32T, + kInt64T, kVectorInt, kUInt, kUInt32T, @@ -82,9 +65,13 @@ enum class OptionType { kComparator, kCompactionFilter, kCompactionFilterFactory, + kCompactionOptionsFIFO, + kCompactionOptionsUniversal, + kCompactionStopStyle, kMergeOperator, kMemTableRepFactory, kBlockBasedTableIndexType, + kBlockBasedTableDataBlockIndexType, kFilterPolicy, 
kFlushBlockPolicyFactory, kChecksumType, @@ -92,20 +79,23 @@ enum class OptionType { kWALRecoveryMode, kAccessHint, kInfoLogLevel, + kLRUCacheOptions, kUnknown }; enum class OptionVerificationType { kNormal, - kByName, // The option is pointer typed so we can only verify - // based on it's name. - kByNameAllowNull, // Same as kByName, but it also allows the case - // where one of them is a nullptr. - kDeprecated // The option is no longer used in rocksdb. The RocksDB - // OptionsParser will still accept this option if it - // happen to exists in some Options file. However, the - // parser will not include it in serialization and - // verification processes. + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kByNameAllowFromNull, // Same as kByName, but it also allows the case + // where the old option is nullptr. + kDeprecated // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, + // the parser will not include it in serialization + // and verification processes. }; // A struct for storing constant option information such as option name, @@ -143,501 +133,9 @@ Status GetColumnFamilyOptionsFromMapInternal( std::vector* unsupported_options_names = nullptr, bool ignore_unknown_options = false); -static std::unordered_map db_options_type_info = { - /* - // not yet supported - Env* env; - std::shared_ptr row_cache; - std::shared_ptr delete_scheduler; - std::shared_ptr info_log; - std::shared_ptr rate_limiter; - std::shared_ptr statistics; - std::vector db_paths; - std::vector> listeners; - */ - {"advise_random_on_open", - {offsetof(struct DBOptions, advise_random_on_open), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_reads", - {offsetof(struct DBOptions, allow_mmap_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_fallocate", - {offsetof(struct DBOptions, allow_fallocate), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_mmap_writes", - {offsetof(struct DBOptions, allow_mmap_writes), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_reads", - {offsetof(struct DBOptions, use_direct_reads), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_direct_writes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - {"use_direct_io_for_flush_and_compaction", - {offsetof(struct DBOptions, use_direct_io_for_flush_and_compaction), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"allow_2pc", - {offsetof(struct DBOptions, allow_2pc), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_os_buffer", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, - {"create_if_missing", - {offsetof(struct DBOptions, create_if_missing), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"create_missing_column_families", - {offsetof(struct DBOptions, create_missing_column_families), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"disableDataSync", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - {"disable_data_sync", // for compatibility - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - 
{"enable_thread_tracking", - {offsetof(struct DBOptions, enable_thread_tracking), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"error_if_exists", - {offsetof(struct DBOptions, error_if_exists), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"is_fd_close_on_exec", - {offsetof(struct DBOptions, is_fd_close_on_exec), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"paranoid_checks", - {offsetof(struct DBOptions, paranoid_checks), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"skip_log_error_on_recovery", - {offsetof(struct DBOptions, skip_log_error_on_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"skip_stats_update_on_db_open", - {offsetof(struct DBOptions, skip_stats_update_on_db_open), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"new_table_reader_for_compaction_inputs", - {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"compaction_readahead_size", - {offsetof(struct DBOptions, compaction_readahead_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"random_access_max_buffer_size", - {offsetof(struct DBOptions, random_access_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"writable_file_max_buffer_size", - {offsetof(struct DBOptions, writable_file_max_buffer_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"use_adaptive_mutex", - {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"use_fsync", - {offsetof(struct DBOptions, use_fsync), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"max_background_jobs", - {offsetof(struct DBOptions, max_background_jobs), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_jobs)}}, - {"max_background_compactions", - {offsetof(struct DBOptions, max_background_compactions), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_background_compactions)}}, - {"base_background_compactions", - {offsetof(struct DBOptions, base_background_compactions), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, base_background_compactions)}}, - {"max_background_flushes", - {offsetof(struct DBOptions, max_background_flushes), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"max_file_opening_threads", - {offsetof(struct DBOptions, max_file_opening_threads), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"max_open_files", - {offsetof(struct DBOptions, max_open_files), OptionType::kInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_open_files)}}, - {"table_cache_numshardbits", - {offsetof(struct DBOptions, table_cache_numshardbits), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"db_write_buffer_size", - {offsetof(struct DBOptions, db_write_buffer_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"keep_log_file_num", - {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"recycle_log_file_num", - {offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, 
- {"log_file_time_to_roll", - {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"manifest_preallocation_size", - {offsetof(struct DBOptions, manifest_preallocation_size), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, - {"max_log_file_size", - {offsetof(struct DBOptions, max_log_file_size), OptionType::kSizeT, - OptionVerificationType::kNormal, false, 0}}, - {"db_log_dir", - {offsetof(struct DBOptions, db_log_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"wal_dir", - {offsetof(struct DBOptions, wal_dir), OptionType::kString, - OptionVerificationType::kNormal, false, 0}}, - {"max_subcompactions", - {offsetof(struct DBOptions, max_subcompactions), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_size_limit_MB", - {offsetof(struct DBOptions, WAL_size_limit_MB), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"WAL_ttl_seconds", - {offsetof(struct DBOptions, WAL_ttl_seconds), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"bytes_per_sync", - {offsetof(struct DBOptions, bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"delayed_write_rate", - {offsetof(struct DBOptions, delayed_write_rate), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, delayed_write_rate)}}, - {"delete_obsolete_files_period_micros", - {offsetof(struct DBOptions, delete_obsolete_files_period_micros), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, delete_obsolete_files_period_micros)}}, - {"max_manifest_file_size", - {offsetof(struct DBOptions, max_manifest_file_size), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"max_total_wal_size", - {offsetof(struct DBOptions, max_total_wal_size), OptionType::kUInt64T, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, max_total_wal_size)}}, - {"wal_bytes_per_sync", - {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T, - OptionVerificationType::kNormal, false, 0}}, - {"stats_dump_period_sec", - {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt, - OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, stats_dump_period_sec)}}, - {"fail_if_options_file_error", - {offsetof(struct DBOptions, fail_if_options_file_error), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"enable_pipelined_write", - {offsetof(struct DBOptions, enable_pipelined_write), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"allow_concurrent_memtable_write", - {offsetof(struct DBOptions, allow_concurrent_memtable_write), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"wal_recovery_mode", - {offsetof(struct DBOptions, wal_recovery_mode), - OptionType::kWALRecoveryMode, OptionVerificationType::kNormal, false, 0}}, - {"enable_write_thread_adaptive_yield", - {offsetof(struct DBOptions, enable_write_thread_adaptive_yield), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_slow_yield_usec", - {offsetof(struct DBOptions, write_thread_slow_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - {"write_thread_max_yield_usec", - {offsetof(struct DBOptions, write_thread_max_yield_usec), - OptionType::kUInt64T, OptionVerificationType::kNormal, false, 0}}, - 
{"access_hint_on_compaction_start", - {offsetof(struct DBOptions, access_hint_on_compaction_start), - OptionType::kAccessHint, OptionVerificationType::kNormal, false, 0}}, - {"info_log_level", - {offsetof(struct DBOptions, info_log_level), OptionType::kInfoLogLevel, - OptionVerificationType::kNormal, false, 0}}, - {"dump_malloc_stats", - {offsetof(struct DBOptions, dump_malloc_stats), OptionType::kBoolean, - OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_recovery", - {offsetof(struct DBOptions, avoid_flush_during_recovery), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"avoid_flush_during_shutdown", - {offsetof(struct DBOptions, avoid_flush_during_shutdown), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}, - {"allow_ingest_behind", - {offsetof(struct DBOptions, allow_ingest_behind), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, allow_ingest_behind)}}, - {"concurrent_prepare", - {offsetof(struct DBOptions, concurrent_prepare), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, concurrent_prepare)}}, - {"manual_wal_flush", - {offsetof(struct DBOptions, manual_wal_flush), OptionType::kBoolean, - OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, manual_wal_flush)}}}; - -// offset_of is used to get the offset of a class data member -// ex: offset_of(&ColumnFamilyOptions::num_levels) -// This call will return the offset of num_levels in ColumnFamilyOptions class -// -// This is the same as offsetof() but allow us to work with non standard-layout -// classes and structures -// refs: -// http://en.cppreference.com/w/cpp/concept/StandardLayoutType -// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 -template -inline int offset_of(T1 T2::*member) { - static T2 obj; - return int(size_t(&(obj.*member)) - size_t(&obj)); -} - -static std::unordered_map cf_options_type_info = { - /* not yet supported - CompactionOptionsFIFO compaction_options_fifo; - CompactionOptionsUniversal compaction_options_universal; - CompressionOptions compression_opts; - TablePropertiesCollectorFactories table_properties_collector_factories; - typedef std::vector> - TablePropertiesCollectorFactories; - UpdateStatus (*inplace_callback)(char* existing_value, - uint34_t* existing_value_size, - Slice delta_value, - std::string* merged_value); - */ - {"report_bg_io_stats", - {offset_of(&ColumnFamilyOptions::report_bg_io_stats), OptionType::kBoolean, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, report_bg_io_stats)}}, - {"compaction_measure_io_stats", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, - {"disable_auto_compactions", - {offset_of(&ColumnFamilyOptions::disable_auto_compactions), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, disable_auto_compactions)}}, - {"filter_deletes", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, - {"inplace_update_support", - {offset_of(&ColumnFamilyOptions::inplace_update_support), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"level_compaction_dynamic_level_bytes", - {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"optimize_filters_for_hits", - 
{offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"paranoid_file_checks", - {offset_of(&ColumnFamilyOptions::paranoid_file_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, paranoid_file_checks)}}, - {"force_consistency_checks", - {offset_of(&ColumnFamilyOptions::force_consistency_checks), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"purge_redundant_kvs_while_flush", - {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), - OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, - {"verify_checksums_in_compaction", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, - {"soft_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::soft_pending_compaction_bytes_limit), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, soft_pending_compaction_bytes_limit)}}, - {"hard_pending_compaction_bytes_limit", - {offset_of(&ColumnFamilyOptions::hard_pending_compaction_bytes_limit), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, hard_pending_compaction_bytes_limit)}}, - {"hard_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, 0}}, - {"soft_rate_limit", - {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, 0}}, - {"max_compaction_bytes", - {offset_of(&ColumnFamilyOptions::max_compaction_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_compaction_bytes)}}, - {"expanded_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"level0_file_num_compaction_trigger", - {offset_of(&ColumnFamilyOptions::level0_file_num_compaction_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger)}}, - {"level0_slowdown_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_slowdown_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger)}}, - {"level0_stop_writes_trigger", - {offset_of(&ColumnFamilyOptions::level0_stop_writes_trigger), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, level0_stop_writes_trigger)}}, - {"max_grandparent_overlap_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - {"max_mem_compaction_level", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, false, 0}}, - {"max_write_buffer_number", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_write_buffer_number)}}, - {"max_write_buffer_number_to_maintain", - {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"min_write_buffer_number_to_merge", - {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), - OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, - {"num_levels", - {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, - OptionVerificationType::kNormal, false, 0}}, - {"source_compaction_factor", - {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, - 
{"target_file_size_multiplier", - {offset_of(&ColumnFamilyOptions::target_file_size_multiplier), - OptionType::kInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_multiplier)}}, - {"arena_block_size", - {offset_of(&ColumnFamilyOptions::arena_block_size), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, arena_block_size)}}, - {"inplace_update_num_locks", - {offset_of(&ColumnFamilyOptions::inplace_update_num_locks), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, inplace_update_num_locks)}}, - {"max_successive_merges", - {offset_of(&ColumnFamilyOptions::max_successive_merges), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_successive_merges)}}, - {"memtable_huge_page_size", - {offset_of(&ColumnFamilyOptions::memtable_huge_page_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_huge_page_size)}}, - {"memtable_prefix_bloom_huge_page_tlb_size", - {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, true, 0}}, - {"write_buffer_size", - {offset_of(&ColumnFamilyOptions::write_buffer_size), OptionType::kSizeT, - OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, write_buffer_size)}}, - {"bloom_locality", - {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, - OptionVerificationType::kNormal, false, 0}}, - {"memtable_prefix_bloom_bits", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, 0}}, - {"memtable_prefix_bloom_size_ratio", - {offset_of(&ColumnFamilyOptions::memtable_prefix_bloom_size_ratio), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio)}}, - {"memtable_prefix_bloom_probes", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, 0}}, - {"min_partial_merge_operands", - {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, 0}}, - {"max_bytes_for_level_base", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, - {"max_bytes_for_level_multiplier", - {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), - OptionType::kDouble, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier)}}, - {"max_bytes_for_level_multiplier_additional", - {offset_of( - &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), - OptionType::kVectorInt, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, - max_bytes_for_level_multiplier_additional)}}, - {"max_sequential_skip_in_iterations", - {offset_of(&ColumnFamilyOptions::max_sequential_skip_in_iterations), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations)}}, - {"target_file_size_base", - {offset_of(&ColumnFamilyOptions::target_file_size_base), - OptionType::kUInt64T, OptionVerificationType::kNormal, true, - offsetof(struct MutableCFOptions, target_file_size_base)}}, - {"rate_limit_delay_max_milliseconds", - {0, OptionType::kUInt, OptionVerificationType::kDeprecated, false, 0}}, - {"compression", - {offset_of(&ColumnFamilyOptions::compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, true, - 
offsetof(struct MutableCFOptions, compression)}}, - {"compression_per_level", - {offset_of(&ColumnFamilyOptions::compression_per_level), - OptionType::kVectorCompressionType, OptionVerificationType::kNormal, - false, 0}}, - {"bottommost_compression", - {offset_of(&ColumnFamilyOptions::bottommost_compression), - OptionType::kCompressionType, OptionVerificationType::kNormal, false, 0}}, - {"comparator", - {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, - OptionVerificationType::kByName, false, 0}}, - {"prefix_extractor", - {offset_of(&ColumnFamilyOptions::prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - false, 0}}, - {"memtable_insert_with_hint_prefix_extractor", - {offset_of( - &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, - false, 0}}, - {"memtable_factory", - {offset_of(&ColumnFamilyOptions::memtable_factory), - OptionType::kMemTableRepFactory, OptionVerificationType::kByName, false, - 0}}, - {"table_factory", - {offset_of(&ColumnFamilyOptions::table_factory), OptionType::kTableFactory, - OptionVerificationType::kByName, false, 0}}, - {"compaction_filter", - {offset_of(&ColumnFamilyOptions::compaction_filter), - OptionType::kCompactionFilter, OptionVerificationType::kByName, false, - 0}}, - {"compaction_filter_factory", - {offset_of(&ColumnFamilyOptions::compaction_filter_factory), - OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, - false, 0}}, - {"merge_operator", - {offset_of(&ColumnFamilyOptions::merge_operator), - OptionType::kMergeOperator, OptionVerificationType::kByName, false, 0}}, - {"compaction_style", - {offset_of(&ColumnFamilyOptions::compaction_style), - OptionType::kCompactionStyle, OptionVerificationType::kNormal, false, 0}}, - {"compaction_pri", - {offset_of(&ColumnFamilyOptions::compaction_pri), - OptionType::kCompactionPri, OptionVerificationType::kNormal, false, 0}}}; - -static std::unordered_map - compression_type_string_map = { - {"kNoCompression", kNoCompression}, - {"kSnappyCompression", kSnappyCompression}, - {"kZlibCompression", kZlibCompression}, - {"kBZip2Compression", kBZip2Compression}, - {"kLZ4Compression", kLZ4Compression}, - {"kLZ4HCCompression", kLZ4HCCompression}, - {"kXpressCompression", kXpressCompression}, - {"kZSTD", kZSTD}, - {"kZSTDNotFinalCompression", kZSTDNotFinalCompression}, - {"kDisableCompressionOption", kDisableCompressionOption}}; - -static std::unordered_map - block_base_table_index_type_string_map = { - {"kBinarySearch", BlockBasedTableOptions::IndexType::kBinarySearch}, - {"kHashSearch", BlockBasedTableOptions::IndexType::kHashSearch}, - {"kTwoLevelIndexSearch", - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}}; - -static std::unordered_map encoding_type_string_map = - {{"kPlain", kPlain}, {"kPrefix", kPrefix}}; - -static std::unordered_map - compaction_style_string_map = { - {"kCompactionStyleLevel", kCompactionStyleLevel}, - {"kCompactionStyleUniversal", kCompactionStyleUniversal}, - {"kCompactionStyleFIFO", kCompactionStyleFIFO}, - {"kCompactionStyleNone", kCompactionStyleNone}}; - -static std::unordered_map - compaction_pri_string_map = { - {"kByCompensatedSize", kByCompensatedSize}, - {"kOldestLargestSeqFirst", kOldestLargestSeqFirst}, - {"kOldestSmallestSeqFirst", kOldestSmallestSeqFirst}, - {"kMinOverlappingRatio", kMinOverlappingRatio}}; - -static std::unordered_map wal_recovery_mode_string_map = { - {"kTolerateCorruptedTailRecords", - 
WALRecoveryMode::kTolerateCorruptedTailRecords}, - {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, - {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, - {"kSkipAnyCorruptedRecords", WALRecoveryMode::kSkipAnyCorruptedRecords}}; - -static std::unordered_map - access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE}, - {"NORMAL", DBOptions::AccessHint::NORMAL}, - {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, - {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; - -static std::unordered_map info_log_level_string_map = - {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, - {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, - {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, - {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, - {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, - {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; +bool ParseSliceTransform( + const std::string& value, + std::shared_ptr* slice_transform); extern Status StringToMap( const std::string& opts_str, @@ -647,4 +145,82 @@ extern bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, const std::string& value); #endif // !ROCKSDB_LITE +struct OptionsHelper { + static std::map compaction_style_to_string; + static std::map compaction_pri_to_string; + static std::map + compaction_stop_style_to_string; + static std::unordered_map checksum_type_string_map; + static std::unordered_map + compression_type_string_map; +#ifndef ROCKSDB_LITE + static std::unordered_map cf_options_type_info; + static std::unordered_map + fifo_compaction_options_type_info; + static std::unordered_map + universal_compaction_options_type_info; + static std::unordered_map + compaction_stop_style_string_map; + static std::unordered_map db_options_type_info; + static std::unordered_map + lru_cache_options_type_info; + static std::unordered_map + block_base_table_index_type_string_map; + static std::unordered_map + block_base_table_data_block_index_type_string_map; + static std::unordered_map encoding_type_string_map; + static std::unordered_map + compaction_style_string_map; + static std::unordered_map + compaction_pri_string_map; + static std::unordered_map + wal_recovery_mode_string_map; + static std::unordered_map + access_hint_string_map; + static std::unordered_map + info_log_level_string_map; + static ColumnFamilyOptions dummy_cf_options; + static CompactionOptionsFIFO dummy_comp_options; + static LRUCacheOptions dummy_lru_cache_options; + static CompactionOptionsUniversal dummy_comp_options_universal; +#endif // !ROCKSDB_LITE +}; + +// Some aliasing +static auto& compaction_style_to_string = + OptionsHelper::compaction_style_to_string; +static auto& compaction_pri_to_string = OptionsHelper::compaction_pri_to_string; +static auto& compaction_stop_style_to_string = + OptionsHelper::compaction_stop_style_to_string; +static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map; +#ifndef ROCKSDB_LITE +static auto& cf_options_type_info = OptionsHelper::cf_options_type_info; +static auto& fifo_compaction_options_type_info = + OptionsHelper::fifo_compaction_options_type_info; +static auto& universal_compaction_options_type_info = + OptionsHelper::universal_compaction_options_type_info; +static auto& compaction_stop_style_string_map = + OptionsHelper::compaction_stop_style_string_map; +static auto& db_options_type_info = OptionsHelper::db_options_type_info; +static auto& lru_cache_options_type_info = + OptionsHelper::lru_cache_options_type_info; +static auto& compression_type_string_map = + 
OptionsHelper::compression_type_string_map; +static auto& block_base_table_index_type_string_map = + OptionsHelper::block_base_table_index_type_string_map; +static auto& block_base_table_data_block_index_type_string_map = + OptionsHelper::block_base_table_data_block_index_type_string_map; +static auto& encoding_type_string_map = OptionsHelper::encoding_type_string_map; +static auto& compaction_style_string_map = + OptionsHelper::compaction_style_string_map; +static auto& compaction_pri_string_map = + OptionsHelper::compaction_pri_string_map; +static auto& wal_recovery_mode_string_map = + OptionsHelper::wal_recovery_mode_string_map; +static auto& access_hint_string_map = OptionsHelper::access_hint_string_map; +static auto& info_log_level_string_map = + OptionsHelper::info_log_level_string_map; +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/thirdparty/rocksdb/options/options_parser.cc b/thirdparty/rocksdb/options/options_parser.cc index 2cb60a068c..2a85fa5343 100644 --- a/thirdparty/rocksdb/options/options_parser.cc +++ b/thirdparty/rocksdb/options/options_parser.cc @@ -17,6 +17,7 @@ #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "util/cast_util.h" +#include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/sync_point.h" @@ -41,12 +42,16 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, return Status::InvalidArgument( "cf_names.size() and cf_opts.size() must be the same"); } - std::unique_ptr writable; + std::unique_ptr wf; - Status s = env->NewWritableFile(file_name, &writable, EnvOptions()); + Status s = env->NewWritableFile(file_name, &wf, EnvOptions()); if (!s.ok()) { return s; } + std::unique_ptr writable; + writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(), + nullptr /* statistics */)); + std::string options_file_content; writable->Append(option_file_header + "[" + @@ -93,8 +98,7 @@ Status PersistRocksDBOptions(const DBOptions& db_opt, writable->Append(options_file_content + "\n"); } } - writable->Flush(); - writable->Fsync(); + writable->Sync(true /* use_fsync */); writable->Close(); return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( @@ -196,45 +200,6 @@ Status RocksDBOptionsParser::ParseStatement(std::string* name, return Status::OK(); } -namespace { -bool ReadOneLine(std::istringstream* iss, SequentialFile* seq_file, - std::string* output, bool* has_data, Status* result) { - const int kBufferSize = 4096; - char buffer[kBufferSize + 1]; - Slice input_slice; - - std::string line; - bool has_complete_line = false; - while (!has_complete_line) { - if (std::getline(*iss, line)) { - has_complete_line = !iss->eof(); - } else { - has_complete_line = false; - } - if (!has_complete_line) { - // if we're not sure whether we have a complete line, - // further read from the file. - if (*has_data) { - *result = seq_file->Read(kBufferSize, &input_slice, buffer); - } - if (input_slice.size() == 0) { - // meaning we have read all the data - *has_data = false; - break; - } else { - iss->str(line + input_slice.ToString()); - // reset the internal state of iss so that we can keep reading it. 
- iss->clear(); - *has_data = (input_slice.size() == kBufferSize); - continue; - } - } - } - *output = line; - return *has_data || has_complete_line; -} -} // namespace - Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env, bool ignore_unknown_options) { Reset(); @@ -268,6 +233,16 @@ Status RocksDBOptionsParser::Parse(const std::string& file_name, Env* env, if (!s.ok()) { return s; } + + // If the option file is not generated by a higher minor version, + // there shouldn't be any unknown option. + if (ignore_unknown_options && section == kOptionSectionVersion) { + if (db_version[0] < ROCKSDB_MAJOR || (db_version[0] == ROCKSDB_MAJOR && + db_version[1] <= ROCKSDB_MINOR)) { + ignore_unknown_options = false; + } + } + s = ParseSection(§ion, &title, &argument, line, line_num); if (!s.ok()) { return s; @@ -525,6 +500,16 @@ bool AreEqualOptions( case OptionType::kInt: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); + case OptionType::kInt32T: + return (*reinterpret_cast(offset1) == + *reinterpret_cast(offset2)); + case OptionType::kInt64T: + { + int64_t v1, v2; + GetUnaligned(reinterpret_cast(offset1), &v1); + GetUnaligned(reinterpret_cast(offset2), &v2); + return (v1 == v2); + } case OptionType::kVectorInt: return (*reinterpret_cast*>(offset1) == *reinterpret_cast*>(offset2)); @@ -578,6 +563,12 @@ bool AreEqualOptions( *reinterpret_cast( offset1) == *reinterpret_cast(offset2)); + case OptionType::kBlockBasedTableDataBlockIndexType: + return ( + *reinterpret_cast( + offset1) == + *reinterpret_cast( + offset2)); case OptionType::kWALRecoveryMode: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); @@ -587,8 +578,38 @@ bool AreEqualOptions( case OptionType::kInfoLogLevel: return (*reinterpret_cast(offset1) == *reinterpret_cast(offset2)); + case OptionType::kCompactionOptionsFIFO: { + CompactionOptionsFIFO lhs = + *reinterpret_cast(offset1); + CompactionOptionsFIFO rhs = + *reinterpret_cast(offset2); + if (lhs.max_table_files_size == rhs.max_table_files_size && + lhs.allow_compaction == rhs.allow_compaction) { + return true; + } + return false; + } + case OptionType::kCompactionOptionsUniversal: { + CompactionOptionsUniversal lhs = + *reinterpret_cast(offset1); + CompactionOptionsUniversal rhs = + *reinterpret_cast(offset2); + if (lhs.size_ratio == rhs.size_ratio && + lhs.min_merge_width == rhs.min_merge_width && + lhs.max_merge_width == rhs.max_merge_width && + lhs.max_size_amplification_percent == + rhs.max_size_amplification_percent && + lhs.compression_size_percent == rhs.compression_size_percent && + lhs.stop_style == rhs.stop_style && + lhs.allow_trivial_move == rhs.allow_trivial_move) { + return true; + } + return false; + } default: if (type_info.verification == OptionVerificationType::kByName || + type_info.verification == + OptionVerificationType::kByNameAllowFromNull || type_info.verification == OptionVerificationType::kByNameAllowNull) { std::string value1; bool result = @@ -608,6 +629,11 @@ bool AreEqualOptions( if (iter->second == kNullptrString || value1 == kNullptrString) { return true; } + } else if (type_info.verification == + OptionVerificationType::kByNameAllowFromNull) { + if (iter->second == kNullptrString) { + return true; + } } return (value1 == iter->second); } @@ -690,7 +716,7 @@ Status RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( Status RocksDBOptionsParser::VerifyDBOptions( const DBOptions& base_opt, const DBOptions& persisted_opt, - const std::unordered_map* opt_map, + const std::unordered_map* /*opt_map*/, 
OptionsSanityCheckLevel sanity_check_level) { for (auto pair : db_options_type_info) { if (pair.second.verification == OptionVerificationType::kDeprecated) { diff --git a/thirdparty/rocksdb/options/options_parser.h b/thirdparty/rocksdb/options/options_parser.h index 5545c0b0fa..5aab3e7e9b 100644 --- a/thirdparty/rocksdb/options/options_parser.h +++ b/thirdparty/rocksdb/options/options_parser.h @@ -9,7 +9,6 @@ #include #include -#include "options/options_helper.h" #include "options/options_sanity_check.h" #include "rocksdb/env.h" #include "rocksdb/options.h" diff --git a/thirdparty/rocksdb/options/options_settable_test.cc b/thirdparty/rocksdb/options/options_settable_test.cc index ab9989fb46..3a6bd6a882 100644 --- a/thirdparty/rocksdb/options/options_settable_test.cc +++ b/thirdparty/rocksdb/options/options_settable_test.cc @@ -13,15 +13,15 @@ #include -#include "options/options_parser.h" +#include "options/options_helper.h" #include "rocksdb/convenience.h" #include "util/testharness.h" #ifndef GFLAGS bool FLAGS_enable_print = false; #else -#include -using GFLAGS::ParseCommandLineFlags; +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); #endif // GFLAGS @@ -140,7 +140,10 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" "pin_l0_filter_and_index_blocks_in_cache=1;" + "pin_top_level_index_and_filter=1;" "index_type=kHashSearch;" + "data_block_index_type=kDataBlockBinaryAndHash;" + "data_block_hash_table_util_ratio=0.75;" "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" "block_size_deviation=8;block_restart_interval=4; " @@ -150,7 +153,9 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" "format_version=1;" "hash_index_allow_collision=false;" - "verify_compression=true;read_amp_bytes_per_bit=0", + "verify_compression=true;read_amp_bytes_per_bit=0;" + "enable_index_compression=false;" + "block_align=true", new_bbto)); ASSERT_EQ(unset_bytes_base, @@ -261,6 +266,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "manifest_preallocation_size=1222;" "allow_mmap_writes=false;" "stats_dump_period_sec=70127;" + "stats_persist_period_sec=54321;" + "stats_history_buffer_size=14159;" "allow_fallocate=true;" "allow_mmap_reads=false;" "use_direct_reads=false;" @@ -282,8 +289,13 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "avoid_flush_during_recovery=false;" "avoid_flush_during_shutdown=false;" "allow_ingest_behind=false;" + "preserve_deletes=false;" "concurrent_prepare=false;" - "manual_wal_flush=false;", + "two_write_queues=false;" + "manual_wal_flush=false;" + "seq_per_batch=false;" + "atomic_flush=false;" + "avoid_unnecessary_blocking_io=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -296,6 +308,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { delete[] new_options_ptr; } +template +inline int offset_of(T1 T2::*member) { + static T2 obj; + return int(size_t(&(obj.*member)) - size_t(&obj)); +} + // If the test fails, likely a new option is added to ColumnFamilyOptions // but it cannot be set through GetColumnFamilyOptionsFromString(), or the // test is not updated accordingly. 
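The offset_of helper introduced above computes a member's byte offset through a pointer-to-member, which works even for non-trivial types where offsetof is formally restricted. A self-contained illustration:

// Self-contained illustration of the offset_of helper defined above.
#include <cassert>
#include <cstddef>

struct Sample {
  int a;
  double b;
};

template <typename T1, typename T2>
inline int offset_of(T1 T2::*member) {
  static T2 obj;  // one static instance per instantiation supplies addresses
  return int(size_t(&(obj.*member)) - size_t(&obj));
}

int main() {
  // For a standard-layout type this matches the classic offsetof macro.
  assert(offset_of(&Sample::b) == static_cast<int>(offsetof(Sample, b)));
  return 0;
}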
@@ -334,6 +352,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offset_of(&ColumnFamilyOptions::table_factory), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::cf_paths), + sizeof(std::vector)}, + {offset_of(&ColumnFamilyOptions::compaction_thread_limiter), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; @@ -367,10 +389,12 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { options->rate_limit_delay_max_milliseconds = 33; options->compaction_options_universal = CompactionOptionsUniversal(); options->compression_opts = CompressionOptions(); + options->bottommost_compression_opts = CompressionOptions(); options->hard_rate_limit = 0; options->soft_rate_limit = 0; - options->compaction_options_fifo = CompactionOptionsFIFO(); + options->purge_redundant_kvs_while_flush = false; options->max_mem_compaction_level = 0; + options->compaction_filter = nullptr; char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; ColumnFamilyOptions* new_options = @@ -414,6 +438,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_write_buffer_number_to_maintain=84;" "merge_operator=aabcxehazrMergeOperator;" "memtable_prefix_bloom_size_ratio=0.4642;" + "memtable_whole_key_filtering=true;" "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;" "paranoid_file_checks=true;" "force_consistency_checks=true;" @@ -423,10 +448,13 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "inplace_update_support=false;" "compaction_style=kCompactionStyleFIFO;" "compaction_pri=kMinOverlappingRatio;" - "purge_redundant_kvs_while_flush=true;" "hard_pending_compaction_bytes_limit=0;" "disable_auto_compactions=false;" - "report_bg_io_stats=true;", + "report_bg_io_stats=true;" + "ttl=60;" + "sample_for_compression=0;" + "compaction_options_fifo={max_table_files_size=3;allow_" + "compaction=false;};", new_options)); ASSERT_EQ(unset_bytes_base, diff --git a/thirdparty/rocksdb/options/options_test.cc b/thirdparty/rocksdb/options/options_test.cc index fc4939beb4..586e5697cb 100644 --- a/thirdparty/rocksdb/options/options_test.cc +++ b/thirdparty/rocksdb/options/options_test.cc @@ -16,24 +16,29 @@ #include #include +#include "cache/lru_cache.h" +#include "cache/sharded_cache.h" #include "options/options_helper.h" #include "options/options_parser.h" #include "options/options_sanity_check.h" +#include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/leveldb_options.h" +#include "rocksdb/utilities/object_registry.h" #include "util/random.h" #include "util/stderr_logger.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" +#include "utilities/merge_operators/bytesxor.h" #ifndef GFLAGS bool FLAGS_enable_print = false; #else -#include -using GFLAGS::ParseCommandLineFlags; +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); #endif // GFLAGS @@ -60,7 +65,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { "kZSTD:" "kZSTDNotFinalCompression"}, {"bottommost_compression", "kLZ4Compression"}, - {"compression_opts", "4:5:6:7"}, + {"bottommost_compression_opts", "5:6:7:8:9:true"}, + {"compression_opts", "4:5:6:7:8:true"}, {"num_levels", "8"}, {"level0_file_num_compaction_trigger", "8"}, {"level0_slowdown_writes_trigger", "9"}, @@ -87,6 +93,7 @@ 
TEST_F(OptionsTest, GetOptionsFromMapTest) { {"compaction_measure_io_stats", "false"}, {"inplace_update_num_locks", "25"}, {"memtable_prefix_bloom_size_ratio", "0.26"}, + {"memtable_whole_key_filtering", "true"}, {"memtable_huge_page_size", "28"}, {"bloom_locality", "29"}, {"max_successive_merges", "30"}, @@ -124,6 +131,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"is_fd_close_on_exec", "true"}, {"skip_log_error_on_recovery", "false"}, {"stats_dump_period_sec", "46"}, + {"stats_persist_period_sec", "57"}, + {"stats_history_buffer_size", "69"}, {"advise_random_on_open", "true"}, {"use_adaptive_mutex", "false"}, {"new_table_reader_for_compaction_inputs", "true"}, @@ -157,7 +166,15 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.compression_opts.level, 5); ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); ASSERT_EQ(new_cf_opt.compression_opts.max_dict_bytes, 7); + ASSERT_EQ(new_cf_opt.compression_opts.zstd_max_train_bytes, 8); + ASSERT_EQ(new_cf_opt.compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.bottommost_compression, kLZ4Compression); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.max_dict_bytes, 8); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.zstd_max_train_bytes, 9); + ASSERT_EQ(new_cf_opt.bottommost_compression_opts.enabled, true); ASSERT_EQ(new_cf_opt.num_levels, 8); ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9); @@ -184,6 +201,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.inplace_update_support, true); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U); ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_size_ratio, 0.26); + ASSERT_EQ(new_cf_opt.memtable_whole_key_filtering, true); ASSERT_EQ(new_cf_opt.memtable_huge_page_size, 28U); ASSERT_EQ(new_cf_opt.bloom_locality, 29U); ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); @@ -249,6 +267,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_db_opt.stats_persist_period_sec, 57U); + ASSERT_EQ(new_db_opt.stats_history_buffer_size, 69U); ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); @@ -317,6 +337,34 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opt, new_cf_opt)); + // Comparator from object registry + std::string kCompName = "reverse_comp"; + static Registrar test_reg_a( + kCompName, [](const std::string& /*name*/, + std::unique_ptr* /*comparator_guard*/) { + return ReverseBytewiseComparator(); + }); + + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); + + // MergeOperator from object registry + std::unique_ptr bxo(new BytesXOROperator()); + std::string kMoName = bxo->Name(); + static Registrar> test_reg_b( + kMoName, [](const std::string& /*name*/, + std::unique_ptr>* + merge_operator_guard) { + merge_operator_guard->reset( + new std::shared_ptr(new BytesXOROperator())); + return merge_operator_guard->get(); + }); + + 
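The two static Registrar instances above rely on self-registration at static initialization time: constructing a file-scope object inserts the factory into a global registry before main() runs, so "comparator=reverse_comp" can later be resolved by name. The idiom, reduced to a generic sketch (this map-based registry is illustrative only, not the actual object_registry API):

// Generic self-registration sketch; illustrative, not rocksdb's API.
#include <functional>
#include <map>
#include <string>

template <typename T>
std::map<std::string, std::function<T*()>>& FactoryMap() {
  static std::map<std::string, std::function<T*()>> instance;
  return instance;
}

template <typename T>
struct RegistrarSketch {
  RegistrarSketch(const std::string& name, std::function<T*()> factory) {
    FactoryMap<T>()[name] = std::move(factory);  // runs before main()
  }
};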
ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); + // Wrong key/value pair ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); @@ -529,6 +577,101 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); + + // Check block cache options are overwritten when specified + // in new format as a struct. + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.5); + + // Set only block cache capacity. Check other values are + // reset to default values. + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), + GetDefaultCacheShardBits(new_opt.block_cache->GetCapacity())); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); + // Default values + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), + GetDefaultCacheShardBits( + new_opt.block_cache_compressed->GetCapacity())); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.0); + + // Set couple of block cache options. 
+ ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={num_shard_bits=5;" + "high_pri_pool_ratio=0.5;}", + &new_opt)); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 5); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.5); + + // Set couple of block cache options. + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); + ASSERT_TRUE(new_opt.block_cache != nullptr); + ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache)->GetHighPriPoolRatio(), 0.0); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetNumShardBits(), 4); + ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); + ASSERT_EQ(std::dynamic_pointer_cast( + new_opt.block_cache_compressed)->GetHighPriPoolRatio(), + 0.0); } #endif // !ROCKSDB_LITE @@ -598,8 +741,8 @@ TEST_F(OptionsTest, GetMemTableRepFactoryFromString) { &new_mem_factory)); ASSERT_NOK(GetMemTableRepFactoryFromString("cuckoo", &new_mem_factory)); - ASSERT_OK(GetMemTableRepFactoryFromString("cuckoo:1024", &new_mem_factory)); - ASSERT_EQ(std::string(new_mem_factory->Name()), "HashCuckooRepFactory"); + // CuckooHash memtable is already removed. 
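The struct-valued block_cache strings exercised above map directly onto the parameters of the public NewLRUCache factory; an equivalent programmatic setup, as a sketch:

// Programmatic equivalent of "block_cache={capacity=1M;num_shard_bits=4;
// strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" (sketch).
#include "rocksdb/cache.h"
#include "rocksdb/table.h"

rocksdb::BlockBasedTableOptions MakeTableOptionsSketch() {
  rocksdb::BlockBasedTableOptions bbto;
  bbto.block_cache = rocksdb::NewLRUCache(
      1024 * 1024 /* capacity: 1M */, 4 /* num_shard_bits */,
      true /* strict_capacity_limit */, 0.5 /* high_pri_pool_ratio */);
  return bbto;
}

Fields omitted from the string form fall back to defaults, which is exactly what the "capacity=2M"-only assertions check via GetDefaultCacheShardBits().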
+ ASSERT_NOK(GetMemTableRepFactoryFromString("cuckoo:1024", &new_mem_factory)); ASSERT_NOK(GetMemTableRepFactoryFromString("bad_factory", &new_mem_factory)); } @@ -619,6 +762,8 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { "write_buffer_size=10;max_write_buffer_number=16;" "block_based_table_factory={block_cache=1M;block_size=4;};" "compression_opts=4:5:6;create_if_missing=true;max_open_files=1;" + "bottommost_compression_opts=5:6:7;create_if_missing=true;max_open_files=" + "1;" "rate_limiter_bytes_per_sec=1024", &new_options)); @@ -626,7 +771,15 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.compression_opts.level, 5); ASSERT_EQ(new_options.compression_opts.strategy, 6); ASSERT_EQ(new_options.compression_opts.max_dict_bytes, 0); + ASSERT_EQ(new_options.compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.compression_opts.enabled, false); ASSERT_EQ(new_options.bottommost_compression, kDisableCompressionOption); + ASSERT_EQ(new_options.bottommost_compression_opts.window_bits, 5); + ASSERT_EQ(new_options.bottommost_compression_opts.level, 6); + ASSERT_EQ(new_options.bottommost_compression_opts.strategy, 7); + ASSERT_EQ(new_options.bottommost_compression_opts.max_dict_bytes, 0); + ASSERT_EQ(new_options.bottommost_compression_opts.zstd_max_train_bytes, 0); + ASSERT_EQ(new_options.bottommost_compression_opts.enabled, false); ASSERT_EQ(new_options.write_buffer_size, 10U); ASSERT_EQ(new_options.max_write_buffer_number, 16); BlockBasedTableOptions new_block_based_table_options = @@ -660,6 +813,25 @@ TEST_F(OptionsTest, DBOptionsSerialization) { ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_options, new_options)); } +TEST_F(OptionsTest, OptionsComposeDecompose) { + // build an Options from DBOptions + CFOptions, then decompose it to verify + // we get same constituent options. 
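The round trip this test verifies is purely constructor-based: Options composes from a (DBOptions, ColumnFamilyOptions) pair, and each half can be extracted again through a converting constructor. In isolation:

// The compose/decompose round trip under test, in isolation (sketch).
#include "rocksdb/options.h"

void ComposeDecomposeSketch() {
  rocksdb::DBOptions db_opts;
  rocksdb::ColumnFamilyOptions cf_opts;
  rocksdb::Options combined(db_opts, cf_opts);     // compose
  rocksdb::DBOptions db_back(combined);            // decompose DB half
  rocksdb::ColumnFamilyOptions cf_back(combined);  // decompose CF half
}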
+ DBOptions base_db_opts; + ColumnFamilyOptions base_cf_opts; + + Random rnd(301); + test::RandomInitDBOptions(&base_db_opts, &rnd); + test::RandomInitCFOptions(&base_cf_opts, &rnd); + + Options base_opts(base_db_opts, base_cf_opts); + DBOptions new_db_opts(base_opts); + ColumnFamilyOptions new_cf_opts(base_opts); + + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(base_db_opts, new_db_opts)); + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(base_cf_opts, new_cf_opts)); + delete new_cf_opts.compaction_filter; +} + TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { ColumnFamilyOptions base_opt, new_opt; Random rnd(302); @@ -1100,37 +1272,79 @@ TEST_F(OptionsParserTest, DuplicateCFOptions) { } TEST_F(OptionsParserTest, IgnoreUnknownOptions) { - DBOptions db_opt; - db_opt.max_open_files = 12345; - db_opt.max_background_flushes = 301; - db_opt.max_total_wal_size = 1024; - ColumnFamilyOptions cf_opt; + for (int case_id = 0; case_id < 5; case_id++) { + DBOptions db_opt; + db_opt.max_open_files = 12345; + db_opt.max_background_flushes = 301; + db_opt.max_total_wal_size = 1024; + ColumnFamilyOptions cf_opt; - std::string options_file_content = - "# This is a testing option string.\n" - "# Currently we only support \"#\" styled comment.\n" - "\n" - "[Version]\n" - " rocksdb_version=3.14.0\n" - " options_file_version=1\n" - "[DBOptions]\n" - " max_open_files=12345\n" - " max_background_flushes=301\n" - " max_total_wal_size=1024 # keep_log_file_num=1000\n" - " unknown_db_option1=321\n" - " unknown_db_option2=false\n" - "[CFOptions \"default\"]\n" - " unknown_cf_option1=hello\n" - "[CFOptions \"something_else\"]\n" - " unknown_cf_option2=world\n" - " # if a section is blank, we will use the default\n"; + std::string version_string; + bool should_ignore = true; + if (case_id == 0) { + // same version + should_ignore = false; + version_string = + ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR) + ".0"; + } else if (case_id == 1) { + // higher minor version + should_ignore = true; + version_string = + ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR + 1) + ".0"; + } else if (case_id == 2) { + // higher major version. + should_ignore = true; + version_string = ToString(ROCKSDB_MAJOR + 1) + ".0.0"; + } else if (case_id == 3) { + // lower minor version +#if ROCKSDB_MINOR == 0 + continue; +#else + version_string = + ToString(ROCKSDB_MAJOR) + "." + ToString(ROCKSDB_MINOR - 1) + ".0"; + should_ignore = false; +#endif + } else { + // lower major version + should_ignore = false; + version_string = + ToString(ROCKSDB_MAJOR - 1) + "." 
+ ToString(ROCKSDB_MINOR) + ".0"; + } - const std::string kTestFileName = "test-rocksdb-options.ini"; - env_->WriteToNewFile(kTestFileName, options_file_content); - RocksDBOptionsParser parser; - ASSERT_NOK(parser.Parse(kTestFileName, env_.get())); - ASSERT_OK(parser.Parse(kTestFileName, env_.get(), - true /* ignore_unknown_options */)); + std::string options_file_content = + "# This is a testing option string.\n" + "# Currently we only support \"#\" styled comment.\n" + "\n" + "[Version]\n" + " rocksdb_version=" + + version_string + + "\n" + " options_file_version=1\n" + "[DBOptions]\n" + " max_open_files=12345\n" + " max_background_flushes=301\n" + " max_total_wal_size=1024 # keep_log_file_num=1000\n" + " unknown_db_option1=321\n" + " unknown_db_option2=false\n" + "[CFOptions \"default\"]\n" + " unknown_cf_option1=hello\n" + "[CFOptions \"something_else\"]\n" + " unknown_cf_option2=world\n" + " # if a section is blank, we will use the default\n"; + + const std::string kTestFileName = "test-rocksdb-options.ini"; + env_->DeleteFile(kTestFileName); + env_->WriteToNewFile(kTestFileName, options_file_content); + RocksDBOptionsParser parser; + ASSERT_NOK(parser.Parse(kTestFileName, env_.get())); + if (should_ignore) { + ASSERT_OK(parser.Parse(kTestFileName, env_.get(), + true /* ignore_unknown_options */)); + } else { + ASSERT_NOK(parser.Parse(kTestFileName, env_.get(), + true /* ignore_unknown_options */)); + } + } } TEST_F(OptionsParserTest, ParseVersion) { @@ -1351,6 +1565,7 @@ TEST_F(OptionsParserTest, DifferentDefault) { const std::string kOptionsFileName = "test-persisted-options.ini"; ColumnFamilyOptions cf_level_opts; + ASSERT_EQ(CompactionPri::kMinOverlappingRatio, cf_level_opts.compaction_pri); cf_level_opts.OptimizeLevelStyleCompaction(); ColumnFamilyOptions cf_univ_opts; @@ -1420,6 +1635,14 @@ TEST_F(OptionsParserTest, DifferentDefault) { Options old_default_opts; old_default_opts.OldDefaults(5, 2); ASSERT_EQ(16 * 1024U * 1024U, old_default_opts.delayed_write_rate); + ASSERT_TRUE(old_default_opts.compaction_pri == + CompactionPri::kByCompensatedSize); + } + { + Options old_default_opts; + old_default_opts.OldDefaults(5, 18); + ASSERT_TRUE(old_default_opts.compaction_pri == + CompactionPri::kByCompensatedSize); } Options small_opts; @@ -1525,6 +1748,15 @@ TEST_F(OptionsSanityCheckTest, SanityCheck) { // merge_operator { + // Test when going from nullptr -> merge operator + opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + + // persist the change + ASSERT_OK(PersistCFOptions(opts)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); + for (int test = 0; test < 5; ++test) { // change the merge operator opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); @@ -1535,6 +1767,15 @@ TEST_F(OptionsSanityCheckTest, SanityCheck) { ASSERT_OK(PersistCFOptions(opts)); ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); } + + // Test when going from merge operator -> nullptr + opts.merge_operator = nullptr; + ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + + // persist the change + ASSERT_OK(PersistCFOptions(opts)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); } // compaction_filter @@ -1603,6 +1844,18 @@ bool IsEscapedString(const std::string& str) { } } // namespace +TEST_F(OptionsParserTest, IntegerParsing) { + 
ASSERT_EQ(ParseUint64("18446744073709551615"), 18446744073709551615U); + ASSERT_EQ(ParseUint32("4294967295"), 4294967295U); + ASSERT_EQ(ParseSizeT("18446744073709551615"), 18446744073709551615U); + ASSERT_EQ(ParseInt64("9223372036854775807"), 9223372036854775807U); + ASSERT_EQ(ParseInt64("-9223372036854775808"), port::kMinInt64); + ASSERT_EQ(ParseInt32("2147483647"), 2147483647U); + ASSERT_EQ(ParseInt32("-2147483648"), port::kMinInt32); + ASSERT_EQ(ParseInt("-32767"), -32767); + ASSERT_EQ(ParseDouble("-1.234567"), -1.234567); +} + TEST_F(OptionsParserTest, EscapeOptionString) { ASSERT_EQ(UnescapeOptionString( "This is a test string with \\# \\: and \\\\ escape chars."), diff --git a/thirdparty/rocksdb/port/jemalloc_helper.h b/thirdparty/rocksdb/port/jemalloc_helper.h new file mode 100644 index 0000000000..0c216face1 --- /dev/null +++ b/thirdparty/rocksdb/port/jemalloc_helper.h @@ -0,0 +1,53 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef ROCKSDB_JEMALLOC +#ifdef __FreeBSD__ +#include +#else +#include +#endif + +#ifndef JEMALLOC_CXX_THROW +#define JEMALLOC_CXX_THROW +#endif + +// Declare non-standard jemalloc APIs as weak symbols. We can null-check these +// symbols to detect whether jemalloc is linked with the binary. +extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); +extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); +extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); +extern "C" void dallocx(void*, int) __attribute__((__weak__)); +extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); +extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) + __attribute__((__weak__)); +extern "C" int mallctlnametomib(const char*, size_t*, size_t*) + __attribute__((__weak__)); +extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, + size_t) __attribute__((__weak__)); +extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, + const char*) __attribute__((__weak__)); +extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) + JEMALLOC_CXX_THROW __attribute__((__weak__)); + +// Check if Jemalloc is linked with the binary. Note the main program might be +// using a different memory allocator even this method return true. 
+// It is loosely based on folly::usingJEMalloc(), minus the check that actually +// allocate memory and see if it is through jemalloc, to handle the dlopen() +// case: +// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147 +static inline bool HasJemalloc() { + return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr && + sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr && + nallocx != nullptr && mallctl != nullptr && + mallctlnametomib != nullptr && mallctlbymib != nullptr && + malloc_stats_print != nullptr && malloc_usable_size != nullptr; +} + +#endif // ROCKSDB_JEMALLOC diff --git a/thirdparty/rocksdb/port/likely.h b/thirdparty/rocksdb/port/likely.h index e5ef786f2e..397d757133 100644 --- a/thirdparty/rocksdb/port/likely.h +++ b/thirdparty/rocksdb/port/likely.h @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef PORT_LIKELY_H_ -#define PORT_LIKELY_H_ +#pragma once #if defined(__GNUC__) && __GNUC__ >= 4 #define LIKELY(x) (__builtin_expect((x), 1)) @@ -17,5 +16,3 @@ #define LIKELY(x) (x) #define UNLIKELY(x) (x) #endif - -#endif // PORT_LIKELY_H_ diff --git a/thirdparty/rocksdb/port/dirent.h b/thirdparty/rocksdb/port/port_dirent.h similarity index 89% rename from thirdparty/rocksdb/port/dirent.h rename to thirdparty/rocksdb/port/port_dirent.h index 7bcc356978..cb1adbe129 100644 --- a/thirdparty/rocksdb/port/dirent.h +++ b/thirdparty/rocksdb/port/port_dirent.h @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. -#ifndef STORAGE_LEVELDB_PORT_DIRENT_H_ -#define STORAGE_LEVELDB_PORT_DIRENT_H_ +#pragma once #ifdef ROCKSDB_PLATFORM_POSIX #include @@ -43,5 +42,3 @@ using port::closedir; } // namespace rocksdb #endif // OS_WIN - -#endif // STORAGE_LEVELDB_PORT_DIRENT_H_ diff --git a/thirdparty/rocksdb/port/port_example.h b/thirdparty/rocksdb/port/port_example.h index 05b3240669..a94dc93c26 100644 --- a/thirdparty/rocksdb/port/port_example.h +++ b/thirdparty/rocksdb/port/port_example.h @@ -12,8 +12,7 @@ // specific port_.h file. Use this file as a reference for // how to port this package to a new platform. -#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ -#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ +#pragma once namespace rocksdb { namespace port { @@ -100,5 +99,3 @@ extern bool Snappy_Uncompress(const char* input_data, size_t input_length, } // namespace port } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_ diff --git a/thirdparty/rocksdb/port/port_posix.cc b/thirdparty/rocksdb/port/port_posix.cc index 129933bb1f..80081e480e 100644 --- a/thirdparty/rocksdb/port/port_posix.cc +++ b/thirdparty/rocksdb/port/port_posix.cc @@ -25,6 +25,21 @@ #include "util/logging.h" namespace rocksdb { + +// We want to give users opportunity to default all the mutexes to adaptive if +// not specified otherwise. This enables a quick way to conduct various +// performance related experiements. +// +// NB! Support for adaptive mutexes is turned on by definining +// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use RocksDB +// build environment then this happens automatically; otherwise it's up to the +// consumer to define the identifier. 
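The HasJemalloc() probe works because a weak symbol that is never defined resolves to a null address, so the address of the function doubles as a link-time presence check. Reduced demonstration of the GCC/Clang semantics:

// Reduced demonstration of the weak-symbol probe used by HasJemalloc():
// if no linked object defines mallocx, the weak declaration resolves to
// nullptr instead of producing a link error.
#include <cstddef>

extern "C" void* mallocx(size_t, int) __attribute__((__weak__));

bool LinkedAgainstMallocx() {
  return mallocx != nullptr;  // true only when jemalloc supplied a definition
}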
+#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX +extern const bool kDefaultToAdaptiveMutex = true; +#else +extern const bool kDefaultToAdaptiveMutex = false; +#endif + namespace port { static int PthreadCall(const char* label, int result) { @@ -36,6 +51,7 @@ static int PthreadCall(const char* label, int result) { } Mutex::Mutex(bool adaptive) { + (void) adaptive; #ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX if (!adaptive) { PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); @@ -187,12 +203,10 @@ int GetMaxOpenFiles() { void *cacheline_aligned_alloc(size_t size) { #if __GNUC__ < 5 && defined(__SANITIZE_ADDRESS__) return malloc(size); -#elif defined(_ISOC11_SOURCE) - return aligned_alloc(CACHE_LINE_SIZE, size); #elif ( _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || defined(__APPLE__)) void *m; errno = posix_memalign(&m, CACHE_LINE_SIZE, size); - return errno ? NULL : m; + return errno ? nullptr : m; #else return malloc(size); #endif diff --git a/thirdparty/rocksdb/port/port_posix.h b/thirdparty/rocksdb/port/port_posix.h index fe0d42644c..63d7239fe6 100644 --- a/thirdparty/rocksdb/port/port_posix.h +++ b/thirdparty/rocksdb/port/port_posix.h @@ -82,12 +82,18 @@ #endif namespace rocksdb { + +extern const bool kDefaultToAdaptiveMutex; + namespace port { // For use at db/file_indexer.h kLevelMaxIndex +const uint32_t kMaxUint32 = std::numeric_limits::max(); const int kMaxInt32 = std::numeric_limits::max(); +const int kMinInt32 = std::numeric_limits::min(); const uint64_t kMaxUint64 = std::numeric_limits::max(); const int64_t kMaxInt64 = std::numeric_limits::max(); +const int64_t kMinInt64 = std::numeric_limits::min(); const size_t kMaxSizet = std::numeric_limits::max(); static const bool kLittleEndian = PLATFORM_IS_LITTLE_ENDIAN; @@ -97,19 +103,7 @@ class CondVar; class Mutex { public: -// We want to give users opportunity to default all the mutexes to adaptive if -// not specified otherwise. This enables a quick way to conduct various -// performance related experiements. -// -// NB! Support for adaptive mutexes is turned on by definining -// ROCKSDB_PTHREAD_ADAPTIVE_MUTEX during the compilation. If you use RocksDB -// build environment then this happens automatically; otherwise it's up to the -// consumer to define the identifier. 
-#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX - explicit Mutex(bool adaptive = true); -#else - explicit Mutex(bool adaptive = false); -#endif + explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex); ~Mutex(); void Lock(); diff --git a/thirdparty/rocksdb/port/stack_trace.cc b/thirdparty/rocksdb/port/stack_trace.cc index baaf140142..8f8135a446 100644 --- a/thirdparty/rocksdb/port/stack_trace.cc +++ b/thirdparty/rocksdb/port/stack_trace.cc @@ -13,7 +13,7 @@ namespace rocksdb { namespace port { void InstallStackTraceHandler() {} -void PrintStack(int first_frames_to_skip) {} +void PrintStack(int /*first_frames_to_skip*/) {} } // namespace port } // namespace rocksdb @@ -32,7 +32,7 @@ namespace port { namespace { -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_FREEBSD) const char* GetExecutableName() { static char name[1024]; diff --git a/thirdparty/rocksdb/port/sys_time.h b/thirdparty/rocksdb/port/sys_time.h index 1e2ad0f5d6..2f83da8b3e 100644 --- a/thirdparty/rocksdb/port/sys_time.h +++ b/thirdparty/rocksdb/port/sys_time.h @@ -10,8 +10,7 @@ // This file is a portable substitute for sys/time.h which does not exist on // Windows -#ifndef STORAGE_LEVELDB_PORT_SYS_TIME_H_ -#define STORAGE_LEVELDB_PORT_SYS_TIME_H_ +#pragma once #if defined(OS_WIN) && defined(_MSC_VER) @@ -44,5 +43,3 @@ using port::localtime_r; #include #include #endif - -#endif // STORAGE_LEVELDB_PORT_SYS_TIME_H_ diff --git a/thirdparty/rocksdb/port/util_logger.h b/thirdparty/rocksdb/port/util_logger.h index a8255ad6d6..ba424705b2 100644 --- a/thirdparty/rocksdb/port/util_logger.h +++ b/thirdparty/rocksdb/port/util_logger.h @@ -7,8 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ -#define STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ +#pragma once // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation @@ -19,5 +18,3 @@ #elif defined(OS_WIN) #include "port/win/win_logger.h" #endif - -#endif // STORAGE_LEVELDB_PORT_UTIL_LOGGER_H_ diff --git a/thirdparty/rocksdb/port/win/env_default.cc b/thirdparty/rocksdb/port/win/env_default.cc index 52a984f74c..d24c21918a 100644 --- a/thirdparty/rocksdb/port/win/env_default.cc +++ b/thirdparty/rocksdb/port/win/env_default.cc @@ -11,16 +11,14 @@ #include #include "port/win/env_win.h" +#include "util/compression_context_cache.h" +#include "util/sync_point.h" +#include "util/thread_local.h" namespace rocksdb { namespace port { -// We choose to create this on the heap and using std::once for the following -// reasons -// 1) Currently available MS compiler does not implement atomic C++11 -// initialization of -// function local statics -// 2) We choose not to destroy the env because joining the threads from the +// We choose not to destroy the env because joining the threads from the // system loader // which destroys the statics (same as from DLLMain) creates a system loader // dead-lock. 
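The mutex hunks above replace a per-signature #ifdef with a single compile-time constant, so the macro is consulted in exactly one translation unit. The shape of the refactor, condensed from the two files:

// port_posix.h: declaration only, used as the default argument.
namespace rocksdb {
extern const bool kDefaultToAdaptiveMutex;
namespace port {
class Mutex {
 public:
  explicit Mutex(bool adaptive = kDefaultToAdaptiveMutex);
  // ...
};
}  // namespace port
}  // namespace rocksdb

// port_posix.cc: the one place ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX is checked.
namespace rocksdb {
#ifdef ROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX
extern const bool kDefaultToAdaptiveMutex = true;
#else
extern const bool kDefaultToAdaptiveMutex = false;
#endif
}  // namespace rocksdb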
@@ -29,14 +27,15 @@ namespace { std::once_flag winenv_once_flag; Env* envptr; }; - } Env* Env::Default() { using namespace port; + ThreadLocalPtr::InitSingletons(); + CompressionContextCache::InitSingleton(); + INIT_SYNC_POINT_SINGLETONS(); std::call_once(winenv_once_flag, []() { envptr = new WinEnv(); }); return envptr; } } - diff --git a/thirdparty/rocksdb/port/win/env_win.cc b/thirdparty/rocksdb/port/win/env_win.cc index 462148893b..9abb14d67e 100644 --- a/thirdparty/rocksdb/port/win/env_win.cc +++ b/thirdparty/rocksdb/port/win/env_win.cc @@ -24,7 +24,7 @@ #include "rocksdb/slice.h" #include "port/port.h" -#include "port/dirent.h" +#include "port/port_dirent.h" #include "port/win/win_logger.h" #include "port/win/io_win.h" @@ -35,6 +35,10 @@ #include // for uuid generation #include +#include +#include "strsafe.h" + +#include namespace rocksdb { @@ -44,10 +48,16 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { namespace { +// Sector size used when physical sector size cannot be obtained from device. +static const size_t kSectorSize = 512; + // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; typedef std::unique_ptr UniqueCloseHandlePtr; +const auto FindCloseFunc = [](HANDLE h) { ::FindClose(h); }; +typedef std::unique_ptr UniqueFindClosePtr; + void WinthreadCall(const char* label, std::error_code result) { if (0 != result.value()) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result.value())); @@ -60,10 +70,11 @@ void WinthreadCall(const char* label, std::error_code result) { namespace port { WinEnvIO::WinEnvIO(Env* hosted_env) - : hosted_env_(hosted_env), - page_size_(4 * 1012), + : hosted_env_(hosted_env), + page_size_(4 * 1024), allocation_granularity_(page_size_), perf_counter_frequency_(0), + nano_seconds_per_period_(0), GetSystemTimePreciseAsFileTime_(NULL) { SYSTEM_INFO sinfo; @@ -74,15 +85,21 @@ WinEnvIO::WinEnvIO(Env* hosted_env) { LARGE_INTEGER qpf; - BOOL ret = QueryPerformanceFrequency(&qpf); + BOOL ret __attribute__((__unused__)); + ret = QueryPerformanceFrequency(&qpf); assert(ret == TRUE); perf_counter_frequency_ = qpf.QuadPart; + + if (std::nano::den % perf_counter_frequency_ == 0) { + nano_seconds_per_period_ = std::nano::den / perf_counter_frequency_; + } } HMODULE module = GetModuleHandle("kernel32.dll"); if (module != NULL) { - GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)GetProcAddress( - module, "GetSystemTimePreciseAsFileTime"); + GetSystemTimePreciseAsFileTime_ = + (FnGetSystemTimePreciseAsFileTime)GetProcAddress( + module, "GetSystemTimePreciseAsFileTime"); } } @@ -92,13 +109,26 @@ WinEnvIO::~WinEnvIO() { Status WinEnvIO::DeleteFile(const std::string& fname) { Status result; - if (_unlink(fname.c_str())) { - result = IOError("Failed to delete: " + fname, errno); + BOOL ret = RX_DeleteFile(RX_FN(fname).c_str()); + + if(!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to delete: " + fname, + lastError); } return result; } +Status WinEnvIO::Truncate(const std::string& fname, size_t size) { + Status s; + int result = rocksdb::port::Truncate(fname, size); + if (result != 0) { + s = IOError("Failed to truncate: " + fname, errno); + } + return s; +} + Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { time_t time = std::time(nullptr); if (time == (time_t)(-1)) { @@ -110,8 +140,8 @@ Status WinEnvIO::GetCurrentTime(int64_t* unix_time) { } Status WinEnvIO::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { + 
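From here on, every direct CreateFileA call becomes RX_CreateFile(RX_FN(...)). The RX_ layer is defined in port/win/port_win.h, which is outside this diff; by assumption it has roughly this shape, switching between the wide-character and ANSI Win32 APIs:

// Assumed shape of the RX_* indirection (the real definitions live in
// port/win/port_win.h, not shown in this diff): with UTF-8 filename support
// enabled, RX_FN converts a UTF-8 std::string to a wide string and the RX_
// aliases expand to the W-suffixed APIs; otherwise they are pass-throughs.
#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES
#define RX_CreateFile CreateFileW
#define RX_FN(a) rocksdb::port::utf8_to_utf16(a)  // assumed helper name
#else
#define RX_CreateFile CreateFileA
#define RX_FN(a) (a)
#endif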
std::unique_ptr* result, + const EnvOptions& options) { Status s; result->reset(); @@ -129,17 +159,17 @@ Status WinEnvIO::NewSequentialFile(const std::string& fname, { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, - OPEN_EXISTING, // Original fopen mode is "rb" - fileFlags, NULL); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, + OPEN_EXISTING, // Original fopen mode is "rb" + fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("Failed to open NewSequentialFile" + fname, - lastError); + lastError); } else { result->reset(new WinSequentialFile(fname, hFile, options)); } @@ -147,8 +177,8 @@ Status WinEnvIO::NewSequentialFile(const std::string& fname, } Status WinEnvIO::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& options) { result->reset(); Status s; @@ -167,16 +197,16 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = - CreateFileA(fname.c_str(), GENERIC_READ, - FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, - NULL, OPEN_EXISTING, fileFlags, NULL); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, OPEN_EXISTING, fileFlags, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "NewRandomAccessFile failed to Create/Open: " + fname, lastError); + "NewRandomAccessFile failed to Create/Open: " + fname, lastError); } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); @@ -192,54 +222,57 @@ Status WinEnvIO::NewRandomAccessFile(const std::string& fname, // Will not map empty files if (fileSize == 0) { return IOError( - "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); + "NewRandomAccessFile failed to map empty file: " + fname, EINVAL); } - HANDLE hMap = CreateFileMappingA(hFile, NULL, PAGE_READONLY, - 0, // Whole file at its present length - 0, - NULL); // Mapping name + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READONLY, + 0, // At its present length + 0, + NULL); // Mapping name if (!hMap) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "Failed to create file mapping for NewRandomAccessFile: " + fname, - lastError); + "Failed to create file mapping for NewRandomAccessFile: " + fname, + lastError); } UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); const void* mapped_region = MapViewOfFileEx(hMap, FILE_MAP_READ, - 0, // High DWORD of access start - 0, // Low DWORD - fileSize, - NULL); // Let the OS choose the mapping + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping if (!mapped_region) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, - lastError); + "Failed to MapViewOfFile for NewRandomAccessFile: " + fname, + lastError); } result->reset(new WinMmapReadableFile(fname, hFile, hMap, mapped_region, - fileSize)); + static_cast(fileSize))); mapGuard.release(); fileGuard.release(); } } else { - result->reset(new WinRandomAccessFile(fname, hFile, page_size_, options)); + result->reset(new WinRandomAccessFile(fname, hFile, + 
std::max(GetSectorSize(fname), + page_size_), + options)); fileGuard.release(); } return s; } Status WinEnvIO::OpenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options, - bool reopen) { + std::unique_ptr* result, + const EnvOptions& options, + bool reopen) { const size_t c_BufferCapacity = 64 * 1024; @@ -251,7 +284,7 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, DWORD fileFlags = FILE_ATTRIBUTE_NORMAL; if (local_options.use_direct_writes && !local_options.use_mmap_writes) { - fileFlags = FILE_FLAG_NO_BUFFERING; + fileFlags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; } // Desired access. We are want to write only here but if we want to memory @@ -264,8 +297,7 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, if (local_options.use_mmap_writes) { desired_access |= GENERIC_READ; - } - else { + } else { // Adding this solely for tests to pass (fault_injection_test, // wal_manager_test). shared_mode |= (FILE_SHARE_WRITE | FILE_SHARE_DELETE); @@ -280,20 +312,21 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA( - fname.c_str(), - desired_access, // Access desired - shared_mode, - NULL, // Security attributes - creation_disposition, // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC - fileFlags, // Flags - NULL); // Template File + hFile = RX_CreateFile( + RX_FN(fname).c_str(), + desired_access, // Access desired + shared_mode, + NULL, // Security attributes + // Posix env says (reopen) ? (O_CREATE | O_APPEND) : O_CREAT | O_TRUNC + creation_disposition, + fileFlags, // Flags + NULL); // Template File } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "Failed to create a NewWriteableFile: " + fname, lastError); + "Failed to create a NewWriteableFile: " + fname, lastError); } // We will start writing at the end, appending @@ -304,7 +337,8 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, if (!ret) { auto lastError = GetLastError(); return IOErrorFromWindowsError( - "Failed to create a ReopenWritableFile move to the end: " + fname, lastError); + "Failed to create a ReopenWritableFile move to the end: " + fname, + lastError); } } @@ -312,18 +346,21 @@ Status WinEnvIO::OpenWritableFile(const std::string& fname, // We usually do not use mmmapping on SSD and thus we pass memory // page_size result->reset(new WinMmapFile(fname, hFile, page_size_, - allocation_granularity_, local_options)); + allocation_granularity_, local_options)); } else { // Here we want the buffer allocation to be aligned by the SSD page size // and to be a multiple of it - result->reset(new WinWritableFile(fname, hFile, page_size_, - c_BufferCapacity, local_options)); + result->reset(new WinWritableFile(fname, hFile, + std::max(GetSectorSize(fname), + GetPageSize()), + c_BufferCapacity, local_options)); } return s; } Status WinEnvIO::NewRandomRWFile(const std::string & fname, - std::unique_ptr* result, const EnvOptions & options) { + std::unique_ptr* result, + const EnvOptions & options) { Status s; @@ -331,7 +368,7 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, // Random access is to disable read-ahead as the system reads too much data DWORD desired_access = GENERIC_READ | GENERIC_WRITE; DWORD shared_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; - DWORD creation_disposition = OPEN_ALWAYS; // Create if necessary or open existing + DWORD 
creation_disposition = OPEN_EXISTING; // Fail if file does not exist DWORD file_flags = FILE_FLAG_RANDOM_ACCESS; if (options.use_direct_reads && options.use_direct_writes) { @@ -344,13 +381,13 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, { IOSTATS_TIMER_GUARD(open_nanos); hFile = - CreateFileA(fname.c_str(), - desired_access, - shared_mode, - NULL, // Security attributes - creation_disposition, - file_flags, - NULL); + RX_CreateFile(RX_FN(fname).c_str(), + desired_access, + shared_mode, + NULL, // Security attributes + creation_disposition, + file_flags, + NULL); } if (INVALID_HANDLE_VALUE == hFile) { @@ -360,78 +397,224 @@ Status WinEnvIO::NewRandomRWFile(const std::string & fname, } UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); - result->reset(new WinRandomRWFile(fname, hFile, page_size_, options)); + result->reset(new WinRandomRWFile(fname, hFile, + std::max(GetSectorSize(fname), + GetPageSize()), + options)); + fileGuard.release(); + + return s; +} + +Status WinEnvIO::NewMemoryMappedFileBuffer( + const std::string & fname, + std::unique_ptr* result) { + Status s; + result->reset(); + + DWORD fileFlags = FILE_ATTRIBUTE_READONLY; + + HANDLE hFile = INVALID_HANDLE_VALUE; + { + IOSTATS_TIMER_GUARD(open_nanos); + hFile = RX_CreateFile( + RX_FN(fname).c_str(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, + NULL, + OPEN_EXISTING, // Open only if it exists + fileFlags, + NULL); + } + + if (INVALID_HANDLE_VALUE == hFile) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError( + "Failed to open NewMemoryMappedFileBuffer: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr fileGuard(hFile, CloseHandleFunc); + + uint64_t fileSize = 0; + s = GetFileSize(fname, &fileSize); + if (!s.ok()) { + return s; + } + // Will not map empty files + if (fileSize == 0) { + return Status::NotSupported( + "NewMemoryMappedFileBuffer can not map zero length files: " + fname); + } + + // size_t is 32-bit with 32-bit builds + if (fileSize > std::numeric_limits::max()) { + return Status::NotSupported( + "The specified file size does not fit into 32-bit memory addressing: " + + fname); + } + + HANDLE hMap = RX_CreateFileMapping(hFile, NULL, PAGE_READWRITE, + 0, // Whole file at its present length + 0, + NULL); // Mapping name + + if (!hMap) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to create file mapping for: " + fname, lastError); + } + UniqueCloseHandlePtr mapGuard(hMap, CloseHandleFunc); + + void* base = MapViewOfFileEx(hMap, FILE_MAP_WRITE, + 0, // High DWORD of access start + 0, // Low DWORD + static_cast(fileSize), + NULL); // Let the OS choose the mapping + + if (!base) { + auto lastError = GetLastError(); + return IOErrorFromWindowsError( + "Failed to MapViewOfFile for NewMemoryMappedFileBuffer: " + fname, + lastError); + } + + result->reset(new WinMemoryMappedBuffer(hFile, hMap, base, + static_cast(fileSize))); + + mapGuard.release(); fileGuard.release(); return s; } Status WinEnvIO::NewDirectory(const std::string& name, - std::unique_ptr* result) { + std::unique_ptr* result) { Status s; // Must be nullptr on failure result->reset(); - // Must fail if directory does not exist + if (!DirExists(name)) { - s = IOError("Directory does not exist: " + name, EEXIST); - } else { + s = IOErrorFromWindowsError( + "open folder: " + name, ERROR_DIRECTORY); + return s; + } + + HANDLE handle = INVALID_HANDLE_VALUE; + // 0 - for access means read metadata + { IOSTATS_TIMER_GUARD(open_nanos); - 
result->reset(new WinDirectory); + handle = RX_CreateFile( + RX_FN(name).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + } + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open folder: " + name, lastError); + return s; } + + result->reset(new WinDirectory(handle)); + return s; } Status WinEnvIO::FileExists(const std::string& fname) { - // F_OK == 0 - const int F_OK_ = 0; - return _access(fname.c_str(), F_OK_) == 0 ? Status::OK() - : Status::NotFound(); + Status s; + // TODO: This does not follow symbolic links at this point + // which is consistent with _access() impl on windows + // but can be added + WIN32_FILE_ATTRIBUTE_DATA attrs; + if (FALSE == RX_GetFileAttributesEx(RX_FN(fname).c_str(), + GetFileExInfoStandard, &attrs)) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_ACCESS_DENIED: + case ERROR_NOT_FOUND: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + s = Status::NotFound(); + break; + default: + s = IOErrorFromWindowsError("Unexpected error for: " + fname, + lastError); + break; + } + } + return s; } Status WinEnvIO::GetChildren(const std::string& dir, - std::vector* result) { + std::vector* result) { + Status status; result->clear(); std::vector output; - Status status; + RX_WIN32_FIND_DATA data; + memset(&data, 0, sizeof(data)); + std::string pattern(dir); + pattern.append("\\").append("*"); - auto CloseDir = [](DIR* p) { closedir(p); }; - std::unique_ptr dirp(opendir(dir.c_str()), - CloseDir); - - if (!dirp) { - switch (errno) { - case EACCES: - case ENOENT: - case ENOTDIR: - return Status::NotFound(); - default: - return IOError(dir, errno); - } - } else { - if (result->capacity() > 0) { - output.reserve(result->capacity()); - } + HANDLE handle = RX_FindFirstFileEx(RX_FN(pattern).c_str(), + // Do not want alternative name + FindExInfoBasic, + &data, + FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); - struct dirent* ent = readdir(dirp.get()); - while (ent) { - output.push_back(ent->d_name); - ent = readdir(dirp.get()); + if (handle == INVALID_HANDLE_VALUE) { + auto lastError = GetLastError(); + switch (lastError) { + case ERROR_NOT_FOUND: + case ERROR_ACCESS_DENIED: + case ERROR_FILE_NOT_FOUND: + case ERROR_PATH_NOT_FOUND: + status = Status::NotFound(); + break; + default: + status = IOErrorFromWindowsError( + "Failed to GetChhildren for: " + dir, lastError); } + return status; } - output.swap(*result); + UniqueFindClosePtr fc(handle, FindCloseFunc); + if (result->capacity() > 0) { + output.reserve(result->capacity()); + } + + // For safety + data.cFileName[MAX_PATH - 1] = 0; + + while (true) { + auto x = RX_FILESTRING(data.cFileName, RX_FNLEN(data.cFileName)); + output.emplace_back(FN_TO_RX(x)); + BOOL ret =- RX_FindNextFile(handle, &data); + // If the function fails the return value is zero + // and non-zero otherwise. Not TRUE or FALSE. 
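GetChildren now drives the Win32 find API directly instead of the POSIX-emulation dirent shim. The core enumeration pattern in isolation, with the same RAII close guard (sketch; ANSI variant, whereas the code above goes through the RX_* wrappers and maps errors onto Status):

// The GetChildren() enumeration pattern in isolation (sketch).
#include <windows.h>
#include <memory>
#include <string>
#include <vector>

std::vector<std::string> ListDirectorySketch(const std::string& dir) {
  std::vector<std::string> names;
  WIN32_FIND_DATAA data;
  HANDLE h = FindFirstFileExA((dir + "\\*").c_str(), FindExInfoBasic, &data,
                              FindExSearchNameMatch, NULL, 0);
  if (h == INVALID_HANDLE_VALUE) {
    return names;  // the real code inspects GetLastError() here
  }
  // RAII: FindClose runs on every exit path, like UniqueFindClosePtr above.
  std::unique_ptr<void, decltype(&::FindClose)> guard(h, &::FindClose);
  do {
    names.emplace_back(data.cFileName);
  } while (FindNextFileA(h, &data) != FALSE);  // zero return ends the scan
  return names;
}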
+ if (ret == FALSE) { + // Posix does not care why we stopped + break; + } + data.cFileName[MAX_PATH - 1] = 0; + } + output.swap(*result); return status; } Status WinEnvIO::CreateDir(const std::string& name) { Status result; - - if (_mkdir(name.c_str()) != 0) { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); } return result; @@ -444,24 +627,27 @@ Status WinEnvIO::CreateDirIfMissing(const std::string& name) { return result; } - if (_mkdir(name.c_str()) != 0) { - if (errno == EEXIST) { - result = - Status::IOError("`" + name + "' exists but is not a directory"); + BOOL ret = RX_CreateDirectory(RX_FN(name).c_str(), NULL); + if (!ret) { + auto lastError = GetLastError(); + if (lastError != ERROR_ALREADY_EXISTS) { + result = IOErrorFromWindowsError( + "Failed to create a directory: " + name, lastError); } else { - auto code = errno; - result = IOError("Failed to create dir: " + name, code); + result = + Status::IOError(name + ": exists but is not a directory"); } } - return result; } Status WinEnvIO::DeleteDir(const std::string& name) { Status result; - if (_rmdir(name.c_str()) != 0) { - auto code = errno; - result = IOError("Failed to remove dir: " + name, code); + BOOL ret = RX_RemoveDirectory(RX_FN(name).c_str()); + if (!ret) { + auto lastError = GetLastError(); + result = IOErrorFromWindowsError("Failed to remove dir: " + name, + lastError); } return result; } @@ -471,7 +657,8 @@ Status WinEnvIO::GetFileSize(const std::string& fname, Status s; WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { ULARGE_INTEGER file_size; file_size.HighPart = attrs.nFileSizeHigh; file_size.LowPart = attrs.nFileSizeLow; @@ -506,7 +693,8 @@ Status WinEnvIO::GetFileModificationTime(const std::string& fname, Status s; WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(fname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(fname).c_str(), GetFileExInfoStandard, + &attrs)) { *file_mtime = FileTimeToUnixTime(attrs.ftLastWriteTime); } else { auto lastError = GetLastError(); @@ -524,7 +712,8 @@ Status WinEnvIO::RenameFile(const std::string& src, // rename() is not capable of replacing the existing file as on Linux // so use OS API directly - if (!MoveFileExA(src.c_str(), target.c_str(), MOVEFILE_REPLACE_EXISTING)) { + if (!RX_MoveFileEx(RX_FN(src).c_str(), RX_FN(target).c_str(), + MOVEFILE_REPLACE_EXISTING)) { DWORD lastError = GetLastError(); std::string text("Failed to rename: "); @@ -540,8 +729,11 @@ Status WinEnvIO::LinkFile(const std::string& src, const std::string& target) { Status result; - if (!CreateHardLinkA(target.c_str(), src.c_str(), NULL)) { + if (!RX_CreateHardLink(RX_FN(target).c_str(), RX_FN(src).c_str(), NULL)) { DWORD lastError = GetLastError(); + if (lastError == ERROR_NOT_SAME_DEVICE) { + return Status::NotSupported("No cross FS links allowed"); + } std::string text("Failed to link: "); text.append(src).append(" to: ").append(target); @@ -552,8 +744,108 @@ Status WinEnvIO::LinkFile(const std::string& src, return result; } +Status WinEnvIO::NumFileLinks(const std::string& fname, uint64_t* count) { + Status s; + HANDLE handle = RX_CreateFile( + RX_FN(fname).c_str(), 0, + FILE_SHARE_DELETE | 
FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL); + + if (INVALID_HANDLE_VALUE == handle) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("NumFileLinks: " + fname, lastError); + return s; + } + UniqueCloseHandlePtr handle_guard(handle, CloseHandleFunc); + FILE_STANDARD_INFO standard_info; + if (0 != GetFileInformationByHandleEx(handle, FileStandardInfo, + &standard_info, + sizeof(standard_info))) { + *count = standard_info.NumberOfLinks; + } else { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("GetFileInformationByHandleEx: " + fname, + lastError); + } + return s; +} + +Status WinEnvIO::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { +// For MinGW builds +#if (_WIN32_WINNT == _WIN32_WINNT_VISTA) + Status s = Status::NotSupported(); +#else + assert(res != nullptr); + Status s; + if (res == nullptr) { + s = Status::InvalidArgument("res"); + return s; + } + + // 0 - for access means read metadata + HANDLE file_1 = RX_CreateFile( + RX_FN(first).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, + OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_1) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + first, lastError); + return s; + } + UniqueCloseHandlePtr g_1(file_1, CloseHandleFunc); + + HANDLE file_2 = RX_CreateFile( + RX_FN(second).c_str(), 0, + FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE, + NULL, OPEN_EXISTING, + FILE_FLAG_BACKUP_SEMANTICS, // make opening folders possible + NULL); + + if (INVALID_HANDLE_VALUE == file_2) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("open file: " + second, lastError); + return s; + } + UniqueCloseHandlePtr g_2(file_2, CloseHandleFunc); + + FILE_ID_INFO FileInfo_1; + BOOL result = GetFileInformationByHandleEx(file_1, FileIdInfo, &FileInfo_1, + sizeof(FileInfo_1)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + first, lastError); + return s; + } + + FILE_ID_INFO FileInfo_2; + result = GetFileInformationByHandleEx(file_2, FileIdInfo, &FileInfo_2, + sizeof(FileInfo_2)); + + if (!result) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("stat file: " + second, lastError); + return s; + } + + if (FileInfo_1.VolumeSerialNumber == FileInfo_2.VolumeSerialNumber) { + *res = (0 == memcmp(FileInfo_1.FileId.Identifier, + FileInfo_2.FileId.Identifier, + sizeof(FileInfo_1.FileId.Identifier))); + } else { + *res = false; + } +#endif + return s; +} + Status WinEnvIO::LockFile(const std::string& lockFname, - FileLock** lock) { + FileLock** lock) { assert(lock != nullptr); *lock = NULL; @@ -568,15 +860,16 @@ Status WinEnvIO::LockFile(const std::string& lockFname, HANDLE hFile = 0; { IOSTATS_TIMER_GUARD(open_nanos); - hFile = CreateFileA(lockFname.c_str(), (GENERIC_READ | GENERIC_WRITE), - ExclusiveAccessON, NULL, CREATE_ALWAYS, - FILE_ATTRIBUTE_NORMAL, NULL); + hFile = RX_CreateFile(RX_FN(lockFname).c_str(), + (GENERIC_READ | GENERIC_WRITE), + ExclusiveAccessON, NULL, CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); } if (INVALID_HANDLE_VALUE == hFile) { auto lastError = GetLastError(); result = IOErrorFromWindowsError( - "Failed to create lock file: " + lockFname, lastError); + "Failed to create lock file: " + lockFname, lastError); } else { *lock = new WinFileLock(hFile); } @@ -595,12 +888,12 @@ Status 
WinEnvIO::UnlockFile(FileLock* lock) {
 }

 Status WinEnvIO::GetTestDirectory(std::string* result) {
+  std::string output;

   const char* env = getenv("TEST_TMPDIR");
   if (env && env[0] != '\0') {
     output = env;
-    CreateDir(output);
   } else {
     env = getenv("TMP");
@@ -609,9 +902,8 @@ Status WinEnvIO::GetTestDirectory(std::string* result) {
     } else {
       output = "c:\\tmp";
     }
-
-    CreateDir(output);
   }
+  CreateDir(output);

   output.append("\\testrocksdb-");
   output.append(std::to_string(_getpid()));
@@ -624,7 +916,7 @@ Status WinEnvIO::GetTestDirectory(std::string* result) {
 }

 Status WinEnvIO::NewLogger(const std::string& fname,
-  std::shared_ptr<Logger>* result) {
+                           std::shared_ptr<Logger>* result) {
   Status s;

   result->reset();
@@ -632,15 +924,15 @@ Status WinEnvIO::NewLogger(const std::string& fname,
   HANDLE hFile = 0;
   {
     IOSTATS_TIMER_GUARD(open_nanos);
-    hFile = CreateFileA(
-      fname.c_str(), GENERIC_WRITE,
-      FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDb log files are
-                                            // renamed and deleted before
-                                            // they are closed. This enables
-                                            // doing so.
-      NULL,
-      CREATE_ALWAYS,  // Original fopen mode is "w"
-      FILE_ATTRIBUTE_NORMAL, NULL);
+    hFile = RX_CreateFile(
+        RX_FN(fname).c_str(), GENERIC_WRITE,
+        FILE_SHARE_READ | FILE_SHARE_DELETE,  // In RocksDb log files are
+                                              // renamed and deleted before
+                                              // they are closed. This enables
+                                              // doing so.
+        NULL,
+        CREATE_ALWAYS,  // Original fopen mode is "w"
+        FILE_ATTRIBUTE_NORMAL, NULL);
   }

   if (INVALID_HANDLE_VALUE == hFile) {
@@ -687,21 +979,29 @@ uint64_t WinEnvIO::NowMicros() {
     return li.QuadPart;
   }
   using namespace std::chrono;
-  return duration_cast<microseconds>(system_clock::now().time_since_epoch()).count();
+  return duration_cast<microseconds>(
+      high_resolution_clock::now().time_since_epoch()).count();
 }

 uint64_t WinEnvIO::NowNanos() {
-  // all std::chrono clocks on windows have the same resolution that is only
-  // good enough for microseconds but not nanoseconds
-  // On Windows 8 and Windows 2012 Server
-  // GetSystemTimePreciseAsFileTime(&current_time) can be used
-  LARGE_INTEGER li;
-  QueryPerformanceCounter(&li);
-  // Convert to nanoseconds first to avoid loss of precision
-  // and divide by frequency
-  li.QuadPart *= std::nano::den;
-  li.QuadPart /= perf_counter_frequency_;
-  return li.QuadPart;
+  if (nano_seconds_per_period_ != 0) {
+    // all std::chrono clocks on windows have the same resolution that is only
+    // good enough for microseconds but not nanoseconds
+    // On Windows 8 and Windows 2012 Server
+    // GetSystemTimePreciseAsFileTime(&current_time) can be used
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    // Convert performance counter to nanoseconds by precomputed ratio.
+    // Directly multiplying nano::den with li.QuadPart causes overflow.
+    // Only do this when nano::den is divisible by perf_counter_frequency_,
+    // which most likely is the case in reality. If it's not, fall back to
+    // high_resolution_clock, which may be less precise under old compilers.
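+    // Editorial sketch (an assumption about how nano_seconds_per_period_ is
+    // initialized; not part of this change): the ratio is only usable when
+    // std::nano::den divides evenly by the counter frequency, e.g.
+    //
+    //   LARGE_INTEGER qpf;
+    //   QueryPerformanceFrequency(&qpf);  // commonly 10 MHz on modern Windows
+    //   if (std::nano::den % qpf.QuadPart == 0) {
+    //     nano_seconds_per_period_ = std::nano::den / qpf.QuadPart;  // 100 ns/tick
+    //   }
+    //
+    // With that precomputed ratio, the single multiply below cannot overflow
+    // the way li.QuadPart * std::nano::den could.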
+    li.QuadPart *= nano_seconds_per_period_;
+    return li.QuadPart;
+  }
+  using namespace std::chrono;
+  return duration_cast<nanoseconds>(
+      high_resolution_clock::now().time_since_epoch()).count();
 }

 Status WinEnvIO::GetHostName(char* name, uint64_t len) {
@@ -720,29 +1020,32 @@ Status WinEnvIO::GetHostName(char* name, uint64_t len) {
 }

 Status WinEnvIO::GetAbsolutePath(const std::string& db_path,
-  std::string* output_path) {
+                                 std::string* output_path) {
   // Check if we already have an absolute path
-  // that starts with non dot and has a semicolon in it
-  if ((!db_path.empty() && (db_path[0] == '/' || db_path[0] == '\\')) ||
-      (db_path.size() > 2 && db_path[0] != '.' &&
-       ((db_path[1] == ':' && db_path[2] == '\\') ||
-        (db_path[1] == ':' && db_path[2] == '/')))) {
+  // For test compatibility we will consider a starting slash as an
+  // absolute path
+  if ((!db_path.empty() && (db_path[0] == '\\' || db_path[0] == '/')) ||
+      !RX_PathIsRelative(RX_FN(db_path).c_str())) {
     *output_path = db_path;
     return Status::OK();
   }

-  std::string result;
-  result.resize(_MAX_PATH);
+  RX_FILESTRING result;
+  result.resize(MAX_PATH);

-  char* ret = _getcwd(&result[0], _MAX_PATH);
-  if (ret == nullptr) {
-    return Status::IOError("Failed to get current working directory",
-      strerror(errno));
+  // Hopefully nothing changes the current directory while we do this;
+  // however, _getcwd also suffers from the same limitation
+  DWORD len = RX_GetCurrentDirectory(MAX_PATH, &result[0]);
+  if (len == 0) {
+    auto lastError = GetLastError();
+    return IOErrorFromWindowsError("Failed to get current working directory",
+                                   lastError);
   }

-  result.resize(strlen(result.data()));
+  result.resize(len);
+  std::string res = FN_TO_RX(result);

-  result.swap(*output_path);
+  res.swap(*output_path);
   return Status::OK();
 }

@@ -762,8 +1065,8 @@ std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) {
   char* p = &result[0];

   int len = snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ",
-    t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
-    t.tm_min, t.tm_sec);
+                     t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour,
+                     t.tm_min, t.tm_sec);
   assert(len > 0);

   result.resize(len);
@@ -773,42 +1076,109 @@ std::string WinEnvIO::TimeToString(uint64_t secondsSince1970) {
 }

 EnvOptions WinEnvIO::OptimizeForLogWrite(const EnvOptions& env_options,
-  const DBOptions& db_options) const {
-  EnvOptions optimized = env_options;
+                                         const DBOptions& db_options) const {
  EnvOptions optimized(env_options);
+  // These two are the same as the default optimizations
   optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
+  optimized.writable_file_max_buffer_size =
+      db_options.writable_file_max_buffer_size;
+
+  // This adversely affects the p99.9 write latency on Windows
   optimized.use_mmap_writes = false;
-  // This is because we flush only whole pages on unbuffered io and
-  // the last records are not guaranteed to be flushed.
+  // Direct writes would produce a large perf impact on
+  // Windows. Pre-allocate space for the WAL instead.
   optimized.use_direct_writes = false;
-  // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
-  // breaks TransactionLogIteratorStallAtLastRecord unit test.
Fix the unit - // test and make this false - optimized.fallocate_with_keep_size = true; return optimized; } EnvOptions WinEnvIO::OptimizeForManifestWrite( - const EnvOptions& env_options) const { - EnvOptions optimized = env_options; + const EnvOptions& env_options) const { + EnvOptions optimized(env_options); optimized.use_mmap_writes = false; - optimized.use_direct_writes = false; - optimized.fallocate_with_keep_size = true; + optimized.use_direct_reads = false; + return optimized; +} + +EnvOptions WinEnvIO::OptimizeForManifestRead( + const EnvOptions& env_options) const { + EnvOptions optimized(env_options); + optimized.use_mmap_writes = false; + optimized.use_direct_reads = false; return optimized; } // Returns true iff the named directory exists and is a directory. bool WinEnvIO::DirExists(const std::string& dname) { WIN32_FILE_ATTRIBUTE_DATA attrs; - if (GetFileAttributesExA(dname.c_str(), GetFileExInfoStandard, &attrs)) { + if (RX_GetFileAttributesEx(RX_FN(dname).c_str(), + GetFileExInfoStandard, &attrs)) { return 0 != (attrs.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY); } return false; } +size_t WinEnvIO::GetSectorSize(const std::string& fname) { + size_t sector_size = kSectorSize; + + if (RX_PathIsRelative(RX_FN(fname).c_str())) { + return sector_size; + } + + // obtain device handle + char devicename[7] = "\\\\.\\"; + int erresult = strncat_s(devicename, sizeof(devicename), fname.c_str(), 2); + + if (erresult) { + assert(false); + return sector_size; + } + + HANDLE hDevice = CreateFile(devicename, 0, 0, nullptr, OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, nullptr); + + if (hDevice == INVALID_HANDLE_VALUE) { + return sector_size; + } + + STORAGE_PROPERTY_QUERY spropertyquery; + spropertyquery.PropertyId = StorageAccessAlignmentProperty; + spropertyquery.QueryType = PropertyStandardQuery; + + BYTE output_buffer[sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR)]; + DWORD output_bytes = 0; + + BOOL ret = DeviceIoControl(hDevice, IOCTL_STORAGE_QUERY_PROPERTY, + &spropertyquery, sizeof(spropertyquery), + output_buffer, + sizeof(STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR), + &output_bytes, nullptr); + + if (ret) { + sector_size = ((STORAGE_ACCESS_ALIGNMENT_DESCRIPTOR *)output_buffer)->BytesPerLogicalSector; + } else { + // many devices do not support StorageProcessAlignmentProperty. 
Any failure here and we + // fall back to logical alignment + + DISK_GEOMETRY_EX geometry = { 0 }; + ret = DeviceIoControl(hDevice, IOCTL_DISK_GET_DRIVE_GEOMETRY, + nullptr, 0, &geometry, sizeof(geometry), &output_bytes, nullptr); + if (ret) { + sector_size = geometry.Geometry.BytesPerSector; + } + } + + if (hDevice != INVALID_HANDLE_VALUE) { + CloseHandle(hDevice); + } + + return sector_size; +} + //////////////////////////////////////////////////////////////////////// // WinEnvThreads -WinEnvThreads::WinEnvThreads(Env* hosted_env) : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { +WinEnvThreads::WinEnvThreads(Env* hosted_env) + : hosted_env_(hosted_env), thread_pools_(Env::Priority::TOTAL) { for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( @@ -827,8 +1197,9 @@ WinEnvThreads::~WinEnvThreads() { } } -void WinEnvThreads::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, void(*unschedFunction)(void* arg)) { +void WinEnvThreads::Schedule(void(*function)(void*), void* arg, + Env::Priority pri, void* tag, + void(*unschedFunction)(void* arg)) { assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); } @@ -923,8 +1294,7 @@ WinEnv::~WinEnv() { delete thread_status_updater_; } -Status WinEnv::GetThreadList( - std::vector* thread_list) { +Status WinEnv::GetThreadList(std::vector* thread_list) { assert(thread_status_updater_); return thread_status_updater_->GetThreadList(thread_list); } @@ -933,19 +1303,23 @@ Status WinEnv::DeleteFile(const std::string& fname) { return winenv_io_.DeleteFile(fname); } +Status WinEnv::Truncate(const std::string& fname, size_t size) { + return winenv_io_.Truncate(fname, size); +} + Status WinEnv::GetCurrentTime(int64_t* unix_time) { return winenv_io_.GetCurrentTime(unix_time); } Status WinEnv::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& options) { return winenv_io_.NewSequentialFile(fname, result, options); } Status WinEnv::NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& options) { return winenv_io_.NewRandomAccessFile(fname, result, options); } @@ -956,17 +1330,25 @@ Status WinEnv::NewWritableFile(const std::string& fname, } Status WinEnv::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, const EnvOptions& options) { + std::unique_ptr* result, + const EnvOptions& options) { return winenv_io_.OpenWritableFile(fname, result, options, true); } Status WinEnv::NewRandomRWFile(const std::string & fname, - unique_ptr* result, const EnvOptions & options) { + std::unique_ptr* result, + const EnvOptions & options) { return winenv_io_.NewRandomRWFile(fname, result, options); } +Status WinEnv::NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) { + return winenv_io_.NewMemoryMappedFileBuffer(fname, result); +} + Status WinEnv::NewDirectory(const std::string& name, - std::unique_ptr* result) { + std::unique_ptr* result) { return winenv_io_.NewDirectory(name, result); } @@ -975,7 +1357,7 @@ Status WinEnv::FileExists(const std::string& fname) { } Status WinEnv::GetChildren(const std::string& dir, - std::vector* result) { + std::vector* result) { return winenv_io_.GetChildren(dir, result); } @@ -992,25 +1374,34 @@ Status WinEnv::DeleteDir(const 
std::string& name) { } Status WinEnv::GetFileSize(const std::string& fname, - uint64_t* size) { + uint64_t* size) { return winenv_io_.GetFileSize(fname, size); } Status WinEnv::GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) { + uint64_t* file_mtime) { return winenv_io_.GetFileModificationTime(fname, file_mtime); } Status WinEnv::RenameFile(const std::string& src, - const std::string& target) { + const std::string& target) { return winenv_io_.RenameFile(src, target); } Status WinEnv::LinkFile(const std::string& src, - const std::string& target) { + const std::string& target) { return winenv_io_.LinkFile(src, target); } +Status WinEnv::NumFileLinks(const std::string& fname, uint64_t* count) { + return winenv_io_.NumFileLinks(fname, count); +} + +Status WinEnv::AreFilesSame(const std::string& first, + const std::string& second, bool* res) { + return winenv_io_.AreFilesSame(first, second, res); +} + Status WinEnv::LockFile(const std::string& lockFname, FileLock** lock) { return winenv_io_.LockFile(lockFname, lock); @@ -1025,7 +1416,7 @@ Status WinEnv::GetTestDirectory(std::string* result) { } Status WinEnv::NewLogger(const std::string& fname, - std::shared_ptr* result) { + std::shared_ptr* result) { return winenv_io_.NewLogger(fname, result); } @@ -1051,8 +1442,8 @@ std::string WinEnv::TimeToString(uint64_t secondsSince1970) { } void WinEnv::Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) { + void* tag, + void(*unschedFunction)(void* arg)) { return winenv_threads_.Schedule(function, arg, pri, tag, unschedFunction); } @@ -1093,13 +1484,18 @@ void WinEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { return winenv_threads_.IncBackgroundThreadsIfNeeded(num, pri); } +EnvOptions WinEnv::OptimizeForManifestRead( + const EnvOptions& env_options) const { + return winenv_io_.OptimizeForManifestRead(env_options); +} + EnvOptions WinEnv::OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const { + const DBOptions& db_options) const { return winenv_io_.OptimizeForLogWrite(env_options, db_options); } EnvOptions WinEnv::OptimizeForManifestWrite( - const EnvOptions& env_options) const { + const EnvOptions& env_options) const { return winenv_io_.OptimizeForManifestWrite(env_options); } diff --git a/thirdparty/rocksdb/port/win/env_win.h b/thirdparty/rocksdb/port/win/env_win.h index ce1a61d416..7a4d48de2e 100644 --- a/thirdparty/rocksdb/port/win/env_win.h +++ b/thirdparty/rocksdb/port/win/env_win.h @@ -47,8 +47,7 @@ class WinEnvThreads { WinEnvThreads& operator=(const WinEnvThreads&) = delete; void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)); + void* tag, void(*unschedFunction)(void* arg)); int UnSchedule(void* arg, Env::Priority pri); @@ -72,8 +71,8 @@ class WinEnvThreads { private: - Env* hosted_env_; - mutable std::mutex mu_; + Env* hosted_env_; + mutable std::mutex mu_; std::vector thread_pools_; std::vector threads_to_join_; @@ -89,34 +88,40 @@ class WinEnvIO { virtual Status DeleteFile(const std::string& fname); + Status Truncate(const std::string& fname, size_t size); + virtual Status GetCurrentTime(int64_t* unix_time); virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); + std::unique_ptr* result, + const EnvOptions& options); // Helper for NewWritable and ReopenWritableFile virtual Status OpenWritableFile(const std::string& fname, - 
std::unique_ptr* result, - const EnvOptions& options, - bool reopen); + std::unique_ptr* result, + const EnvOptions& options, + bool reopen); virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options); + std::unique_ptr* result, + const EnvOptions& options); // The returned file will only be accessed by one thread at a time. virtual Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options); + std::unique_ptr* result, + const EnvOptions& options); + + virtual Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result); virtual Status NewDirectory(const std::string& name, - std::unique_ptr* result); + std::unique_ptr* result); virtual Status FileExists(const std::string& fname); virtual Status GetChildren(const std::string& dir, - std::vector* result); + std::vector* result); virtual Status CreateDir(const std::string& name); @@ -124,29 +129,31 @@ class WinEnvIO { virtual Status DeleteDir(const std::string& name); - virtual Status GetFileSize(const std::string& fname, - uint64_t* size); + virtual Status GetFileSize(const std::string& fname, uint64_t* size); static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); virtual Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime); + uint64_t* file_mtime); + + virtual Status RenameFile(const std::string& src, const std::string& target); + + virtual Status LinkFile(const std::string& src, const std::string& target); - virtual Status RenameFile(const std::string& src, - const std::string& target); + virtual Status NumFileLinks(const std::string& /*fname*/, + uint64_t* /*count*/); - virtual Status LinkFile(const std::string& src, - const std::string& target); + virtual Status AreFilesSame(const std::string& first, + const std::string& second, bool* res); - virtual Status LockFile(const std::string& lockFname, - FileLock** lock); + virtual Status LockFile(const std::string& lockFname, FileLock** lock); virtual Status UnlockFile(FileLock* lock); virtual Status GetTestDirectory(std::string* result); virtual Status NewLogger(const std::string& fname, - std::shared_ptr* result); + std::shared_ptr* result); virtual uint64_t NowMicros(); @@ -155,15 +162,18 @@ class WinEnvIO { virtual Status GetHostName(char* name, uint64_t len); virtual Status GetAbsolutePath(const std::string& db_path, - std::string* output_path); + std::string* output_path); virtual std::string TimeToString(uint64_t secondsSince1970); virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const; + const DBOptions& db_options) const; virtual EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const; + const EnvOptions& env_options) const; + + virtual EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const; size_t GetPageSize() const { return page_size_; } @@ -171,16 +181,19 @@ class WinEnvIO { uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } + static size_t GetSectorSize(const std::string& fname); + private: // Returns true iff the named directory exists and is a directory. 
virtual bool DirExists(const std::string& dname); typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); - Env* hosted_env_; - size_t page_size_; - size_t allocation_granularity_; - uint64_t perf_counter_frequency_; + Env* hosted_env_; + size_t page_size_; + size_t allocation_granularity_; + uint64_t perf_counter_frequency_; + uint64_t nano_seconds_per_period_; FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; @@ -192,15 +205,17 @@ class WinEnv : public Env { Status DeleteFile(const std::string& fname) override; + Status Truncate(const std::string& fname, size_t size) override; + Status GetCurrentTime(int64_t* unix_time) override; Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + std::unique_ptr* result, + const EnvOptions& options) override; Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + std::unique_ptr* result, + const EnvOptions& options) override; Status NewWritableFile(const std::string& fname, std::unique_ptr* result, @@ -214,21 +229,25 @@ class WinEnv : public Env { // // The returned file will only be accessed by one thread at a time. Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + std::unique_ptr* result, + const EnvOptions& options) override; // The returned file will only be accessed by one thread at a time. Status NewRandomRWFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override; + std::unique_ptr* result, + const EnvOptions& options) override; + + Status NewMemoryMappedFileBuffer( + const std::string& fname, + std::unique_ptr* result) override; Status NewDirectory(const std::string& name, - std::unique_ptr* result) override; + std::unique_ptr* result) override; Status FileExists(const std::string& fname) override; Status GetChildren(const std::string& dir, - std::vector* result) override; + std::vector* result) override; Status CreateDir(const std::string& name) override; @@ -237,26 +256,30 @@ class WinEnv : public Env { Status DeleteDir(const std::string& name) override; Status GetFileSize(const std::string& fname, - uint64_t* size) override; + uint64_t* size) override; Status GetFileModificationTime(const std::string& fname, - uint64_t* file_mtime) override; + uint64_t* file_mtime) override; Status RenameFile(const std::string& src, - const std::string& target) override; + const std::string& target) override; Status LinkFile(const std::string& src, - const std::string& target) override; + const std::string& target) override; + + Status NumFileLinks(const std::string& fname, uint64_t* count) override; - Status LockFile(const std::string& lockFname, - FileLock** lock) override; + Status AreFilesSame(const std::string& first, + const std::string& second, bool* res) override; + + Status LockFile(const std::string& lockFname, FileLock** lock) override; Status UnlockFile(FileLock* lock) override; Status GetTestDirectory(std::string* result) override; Status NewLogger(const std::string& fname, - std::shared_ptr* result) override; + std::shared_ptr* result) override; uint64_t NowMicros() override; @@ -265,16 +288,14 @@ class WinEnv : public Env { Status GetHostName(char* name, uint64_t len) override; Status GetAbsolutePath(const std::string& db_path, - std::string* output_path) override; + std::string* output_path) override; std::string TimeToString(uint64_t secondsSince1970) override; - Status 
GetThreadList( - std::vector* thread_list) override; + Status GetThreadList(std::vector* thread_list) override; void Schedule(void(*function)(void*), void* arg, Env::Priority pri, - void* tag, - void(*unschedFunction)(void* arg)) override; + void* tag, void(*unschedFunction)(void* arg)) override; int UnSchedule(void* arg, Env::Priority pri) override; @@ -294,15 +315,19 @@ class WinEnv : public Env { void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; + EnvOptions OptimizeForManifestRead( + const EnvOptions& env_options) const override; + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options, - const DBOptions& db_options) const override; + const DBOptions& db_options) const override; EnvOptions OptimizeForManifestWrite( - const EnvOptions& env_options) const override; + const EnvOptions& env_options) const override; + private: - WinEnvIO winenv_io_; + WinEnvIO winenv_io_; WinEnvThreads winenv_threads_; }; diff --git a/thirdparty/rocksdb/port/win/io_win.cc b/thirdparty/rocksdb/port/win/io_win.cc index 3d2533a2ef..128cb60b9f 100644 --- a/thirdparty/rocksdb/port/win/io_win.cc +++ b/thirdparty/rocksdb/port/win/io_win.cc @@ -30,7 +30,7 @@ bool IsPowerOfTwo(const size_t alignment) { } inline -bool IsSectorAligned(const size_t off) { +bool IsSectorAligned(const size_t off) { return (off & (kSectorSize - 1)) == 0; } @@ -67,9 +67,20 @@ std::string GetWindowsErrSz(DWORD err) { // Because all the reads/writes happen by the specified offset, the caller in // theory should not // rely on the current file offset. -SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, - uint64_t offset) { - assert(numBytes <= std::numeric_limits::max()); +Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written) { + + Status s; + bytes_written = 0; + + size_t num_bytes = data.size(); + if (num_bytes > std::numeric_limits::max()) { + // May happen in 64-bit builds where size_t is 64-bits but + // long is still 32-bit, but that's the API here at the moment + return Status::InvalidArgument("num_bytes is too large for a single write: " + + file_data->GetName()); + } + OVERLAPPED overlapped = { 0 }; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -77,23 +88,32 @@ SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, overlapped.Offset = offsetUnion.LowPart; overlapped.OffsetHigh = offsetUnion.HighPart; - SSIZE_T result = 0; - - unsigned long bytesWritten = 0; + DWORD bytesWritten = 0; - if (FALSE == WriteFile(hFile, src, static_cast(numBytes), &bytesWritten, - &overlapped)) { - result = -1; + if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast(num_bytes), + &bytesWritten, &overlapped)) { + auto lastError = GetLastError(); + s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(), + lastError); } else { - result = bytesWritten; + bytes_written = bytesWritten; } - return result; + return s; } // See comments for pwrite above -SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset) { - assert(numBytes <= std::numeric_limits::max()); +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read) { + + Status s; + bytes_read = 0; + + if (num_bytes > std::numeric_limits::max()) { + return Status::InvalidArgument("num_bytes is too large for a single read: " + + file_data->GetName()); + } + OVERLAPPED overlapped = { 0 }; ULARGE_INTEGER offsetUnion; offsetUnion.QuadPart = offset; @@ -101,18 +121,21 @@ SSIZE_T pread(HANDLE 
hFile, char* src, size_t numBytes, uint64_t offset) {
   overlapped.Offset = offsetUnion.LowPart;
   overlapped.OffsetHigh = offsetUnion.HighPart;

-  SSIZE_T result = 0;
-
-  unsigned long bytesRead = 0;
+  DWORD bytesRead = 0;

-  if (FALSE == ReadFile(hFile, src, static_cast<DWORD>(numBytes), &bytesRead,
-    &overlapped)) {
-    return -1;
+  if (FALSE == ReadFile(file_data->GetFileHandle(), src,
+                        static_cast<DWORD>(num_bytes),
+                        &bytesRead, &overlapped)) {
+    auto lastError = GetLastError();
+    // EOF is OK with zero bytes read
+    if (lastError != ERROR_HANDLE_EOF) {
+      s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
+                                  lastError);
+    }
   } else {
-    result = bytesRead;
+    bytes_read = bytesRead;
   }

-  return result;
+  return s;
 }

 // SetFileInformationByHandle() is capable of fast pre-allocates.
@@ -157,9 +180,11 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {
   if (max_size < kMaxVarint64Length * 3) {
     return 0;
   }
-
-  // This function has to be re-worked for cases when
-  // ReFS file system introduced on Windows Server 2012 is used
+#if (_WIN32_WINNT == _WIN32_WINNT_VISTA)
+  // MinGW, as defined by the CMake file.
+  // yuslepukhin: I hate the guts of the above macros.
+  // This impl does not guarantee uniqueness everywhere
+  // but is reasonably good
   BY_HANDLE_FILE_INFORMATION FileInfo;

   BOOL result = GetFileInformationByHandle(hFile, &FileInfo);
@@ -177,6 +202,33 @@ size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size) {

   assert(rid >= id);
   return static_cast<size_t>(rid - id);
+#else
+  FILE_ID_INFO FileInfo;
+  BOOL result = GetFileInformationByHandleEx(hFile, FileIdInfo, &FileInfo,
+                                             sizeof(FileInfo));
+
+  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
+
+  if (!result) {
+    return 0;
+  }
+
+  static_assert(sizeof(uint64_t) == sizeof(FileInfo.VolumeSerialNumber),
+                "Wrong sizeof expectations");
+  // FileId.Identifier is an array of 16 BYTEs, we encode them as two uint64_t
+  static_assert(sizeof(uint64_t) * 2 == sizeof(FileInfo.FileId.Identifier),
+                "Wrong sizeof expectations");
+
+  char* rid = id;
+  rid = EncodeVarint64(rid, uint64_t(FileInfo.VolumeSerialNumber));
+  uint64_t* file_id = reinterpret_cast<uint64_t*>(&FileInfo.FileId.Identifier[0]);
+  rid = EncodeVarint64(rid, *file_id);
+  ++file_id;
+  rid = EncodeVarint64(rid, *file_id);
+
+  assert(rid >= id);
+  return static_cast<size_t>(rid - id);
+#endif
 }

////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -192,8 +244,8 @@ WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
     length_(length) {}

 WinMmapReadableFile::~WinMmapReadableFile() {
-  BOOL ret = ::UnmapViewOfFile(mapped_region_);
-  (void)ret;
+  BOOL ret __attribute__((__unused__));
+  ret = ::UnmapViewOfFile(mapped_region_);
   assert(ret);

   ret = ::CloseHandle(hMap_);
@@ -208,7 +260,7 @@ Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
     *result = Slice();
     return IOError(filename_, EINVAL);
   } else if (offset + n > length_) {
-    n = length_ - offset;
+    n = length_ - static_cast<size_t>(offset);
   }
   *result = Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
@@ -265,7 +317,7 @@ Status WinMmapFile::MapNewRegion() {

   assert(mapped_begin_ == nullptr);

-  size_t minDiskSize = file_offset_ + view_size_;
+  size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;

   if (minDiskSize > reserved_size_) {
     status = Allocate(file_offset_, view_size_);
@@ -279,7 +331,8 @@ Status WinMmapFile::MapNewRegion() {

   if (hMap_ != NULL) {
     // Unmap the previous one
-    BOOL ret = ::CloseHandle(hMap_);
+    BOOL ret
__attribute__((__unused__)); + ret = ::CloseHandle(hMap_); assert(ret); hMap_ = NULL; } @@ -526,7 +579,7 @@ Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) { // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, view_size_); + size_t spaceToReserve = Roundup(static_cast(offset + len), view_size_); // Nothing to do if (spaceToReserve <= reserved_size_) { return status; @@ -556,34 +609,42 @@ WinSequentialFile::~WinSequentialFile() { } Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) { - assert(result != nullptr && !WinFileData::use_direct_io()); Status s; size_t r = 0; + assert(result != nullptr); + if (WinFileData::use_direct_io()) { + return Status::NotSupported("Read() does not support direct_io"); + } + // Windows ReadFile API accepts a DWORD. - // While it is possible to read in a loop if n is > UINT_MAX - // it is a highly unlikely case. - if (n > UINT_MAX) { - return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); + // While it is possible to read in a loop if n is too big + // it is an unlikely case. + if (n > std::numeric_limits::max()) { + return Status::InvalidArgument("n is too big for a single ReadFile: " + + filename_); } DWORD bytesToRead = static_cast(n); //cast is safe due to the check above DWORD bytesRead = 0; BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL); - if (ret == TRUE) { + if (ret != FALSE) { r = bytesRead; } else { - return IOErrorFromWindowsError(filename_, GetLastError()); + auto lastError = GetLastError(); + if (lastError != ERROR_HANDLE_EOF) { + s = IOErrorFromWindowsError("ReadFile failed: " + filename_, + lastError); + } } *result = Slice(scratch, r); - return s; } -SSIZE_T WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset) const { - return pread(GetFileHandle(), src, numBytes, offset); +Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const { + return pread(this, src, numBytes, offset, bytes_read); } Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, @@ -591,27 +652,19 @@ Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* resul Status s; - assert(WinFileData::use_direct_io()); - - // Windows ReadFile API accepts a DWORD. - // While it is possible to read in a loop if n is > UINT_MAX - // it is a highly unlikely case. - if (n > UINT_MAX) { - return IOErrorFromWindowsError(GetName(), ERROR_INVALID_PARAMETER); + if (!WinFileData::use_direct_io()) { + return Status::NotSupported("This function is only used for direct_io"); } - auto r = PositionedReadInternal(scratch, n, offset); - - if (r < 0) { - auto lastError = GetLastError(); - // Posix impl wants to treat reads from beyond - // of the file as OK. - if (lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError(GetName(), lastError); - } + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(n)) { + return Status::InvalidArgument( + "WinSequentialFile::PositionedRead: offset is not properly aligned"); } - *result = Slice(scratch, (r < 0) ? 
0 : size_t(r)); + size_t bytes_read = 0; // out param + s = PositionedReadInternal(scratch, static_cast(n), offset, bytes_read); + *result = Slice(scratch, bytes_read); return s; } @@ -619,15 +672,18 @@ Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* resul Status WinSequentialFile::Skip(uint64_t n) { // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit // integer. As such it is a highly unlikley case to have n so large. - if (n > _I64_MAX) { - return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); + if (n > static_cast(std::numeric_limits::max())) { + return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" + + filename_); } LARGE_INTEGER li; - li.QuadPart = static_cast(n); //cast is safe due to the check above + li.QuadPart = static_cast(n); //cast is safe due to the check above BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT); if (ret == FALSE) { - return IOErrorFromWindowsError(filename_, GetLastError()); + auto lastError = GetLastError(); + return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_, + lastError); } return Status::OK(); } @@ -640,10 +696,11 @@ Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) { /// WinRandomAccessBase inline -SSIZE_T WinRandomAccessImpl::PositionedReadInternal(char* src, +Status WinRandomAccessImpl::PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset) const { - return pread(file_base_->GetFileHandle(), src, numBytes, offset); + uint64_t offset, + size_t& bytes_read) const { + return pread(file_base_, src, numBytes, offset, bytes_read); } inline @@ -664,8 +721,10 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, // Check buffer alignment if (file_base_->use_direct_io()) { - if (!IsAligned(alignment_, scratch)) { - return Status::InvalidArgument("WinRandomAccessImpl::ReadImpl: scratch is not properly aligned"); + if (!IsSectorAligned(static_cast(offset)) || + !IsAligned(alignment_, scratch)) { + return Status::InvalidArgument( + "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned"); } } @@ -674,23 +733,9 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result, return s; } - size_t left = n; - char* dest = scratch; - - SSIZE_T r = PositionedReadInternal(scratch, left, offset); - if (r > 0) { - left -= r; - } else if (r < 0) { - auto lastError = GetLastError(); - // Posix impl wants to treat reads from beyond - // of the file as OK. - if(lastError != ERROR_HANDLE_EOF) { - s = IOErrorFromWindowsError(file_base_->GetName(), lastError); - } - } - - *result = Slice(scratch, (r < 0) ? 
0 : n - left); - + size_t bytes_read = 0; + s = PositionedReadInternal(scratch, n, offset, bytes_read); + *result = Slice(scratch, bytes_read); return s; } @@ -749,7 +794,7 @@ WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment) BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos, FILE_CURRENT); // Querying no supped to fail - if (ret) { + if (ret != 0) { next_write_offset_ = pos.QuadPart; } else { assert(false); @@ -761,32 +806,24 @@ Status WinWritableImpl::AppendImpl(const Slice& data) { Status s; - assert(data.size() < std::numeric_limits::max()); + if (data.size() > std::numeric_limits::max()) { + return Status::InvalidArgument("data is too long for a single write" + + file_data_->GetName()); + } - uint64_t written = 0; - (void)written; + size_t bytes_written = 0; // out param if (file_data_->use_direct_io()) { - // With no offset specified we are appending // to the end of the file - assert(IsSectorAligned(next_write_offset_)); - assert(IsSectorAligned(data.size())); - assert(IsAligned(GetAlignement(), data.data())); - - SSIZE_T ret = pwrite(file_data_->GetFileHandle(), data.data(), - data.size(), next_write_offset_); - - if (ret < 0) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to pwrite for: " + file_data_->GetName(), lastError); - } - else { - written = ret; + if (!IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + s = Status::InvalidArgument( + "WriteData must be page aligned, size must be sector aligned"); + } else { + s = pwrite(file_data_, data, next_write_offset_, bytes_written); } - } else { DWORD bytesWritten = 0; @@ -796,15 +833,21 @@ Status WinWritableImpl::AppendImpl(const Slice& data) { s = IOErrorFromWindowsError( "Failed to WriteFile: " + file_data_->GetName(), lastError); - } - else { - written = bytesWritten; + } else { + bytes_written = bytesWritten; } } if(s.ok()) { - assert(written == data.size()); - next_write_offset_ += data.size(); + if (bytes_written == data.size()) { + // This matters for direct_io cases where + // we rely on the fact that next_write_offset_ + // is sector aligned + next_write_offset_ += bytes_written; + } else { + s = Status::IOError("Failed to write all bytes: " + + file_data_->GetName()); + } } return s; @@ -814,39 +857,44 @@ inline Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) { if(file_data_->use_direct_io()) { - assert(IsSectorAligned(offset)); - assert(IsSectorAligned(data.size())); - assert(IsAligned(GetAlignement(), data.data())); + if (!IsSectorAligned(static_cast(offset)) || + !IsSectorAligned(data.size()) || + !IsAligned(static_cast(GetAlignement()), data.data())) { + return Status::InvalidArgument( + "Data and offset must be page aligned, size must be sector aligned"); + } } - Status s; - - SSIZE_T ret = pwrite(file_data_->GetFileHandle(), data.data(), data.size(), offset); + size_t bytes_written = 0; + Status s = pwrite(file_data_, data, offset, bytes_written); - // Error break - if (ret < 0) { - auto lastError = GetLastError(); - s = IOErrorFromWindowsError( - "Failed to pwrite for: " + file_data_->GetName(), lastError); - } - else { - assert(size_t(ret) == data.size()); - // For sequential write this would be simple - // size extension by data.size() - uint64_t write_end = offset + data.size(); - if (write_end >= next_write_offset_) { - next_write_offset_ = write_end; + if(s.ok()) { + if (bytes_written == data.size()) { + // For sequential write this would be simple + // size 
extension by data.size() + uint64_t write_end = offset + bytes_written; + if (write_end >= next_write_offset_) { + next_write_offset_ = write_end; + } + } else { + s = Status::IOError("Failed to write all of the requested data: " + + file_data_->GetName()); } } return s; } -// Need to implement this so the file is truncated correctly -// when buffered and unbuffered mode inline Status WinWritableImpl::TruncateImpl(uint64_t size) { + + // It is tempting to check for the size for sector alignment + // but truncation may come at the end and there is not a requirement + // for this to be sector aligned so long as we do not attempt to write + // after that. The interface docs state that the behavior is undefined + // in that case. Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(), size); + if (s.ok()) { next_write_offset_ = size; } @@ -861,14 +909,14 @@ Status WinWritableImpl::CloseImpl() { auto hFile = file_data_->GetFileHandle(); assert(INVALID_HANDLE_VALUE != hFile); - if (fsync(hFile) < 0) { + if (!::FlushFileBuffers(hFile)) { auto lastError = GetLastError(); - s = IOErrorFromWindowsError("fsync failed at Close() for: " + + s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " + file_data_->GetName(), lastError); } - if(!file_data_->CloseFile()) { + if(!file_data_->CloseFile() && s.ok()) { auto lastError = GetLastError(); s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(), lastError); @@ -879,11 +927,10 @@ Status WinWritableImpl::CloseImpl() { inline Status WinWritableImpl::SyncImpl() { Status s; - // Calls flush buffers - if (fsync(file_data_->GetFileHandle()) < 0) { + if (!::FlushFileBuffers (file_data_->GetFileHandle())) { auto lastError = GetLastError(); s = IOErrorFromWindowsError( - "fsync failed at Sync() for: " + file_data_->GetName(), lastError); + "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError); } return s; } @@ -897,7 +944,7 @@ Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) { // Make sure that we reserve an aligned amount of space // since the reservation block size is driven outside so we want // to check if we are ok with reservation here - size_t spaceToReserve = Roundup(offset + len, alignment_); + size_t spaceToReserve = Roundup(static_cast(offset + len), static_cast(alignment_)); // Nothing to do if (spaceToReserve <= reservedsize_) { return status; @@ -930,7 +977,7 @@ WinWritableFile::~WinWritableFile() { bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinWritableFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinWritableFile::Append(const Slice& data) { @@ -963,6 +1010,8 @@ Status WinWritableFile::Sync() { Status WinWritableFile::Fsync() { return SyncImpl(); } +bool WinWritableFile::IsSyncThreadSafe() const { return true; } + uint64_t WinWritableFile::GetFileSize() { return GetFileNextWriteOffset(); } @@ -988,7 +1037,7 @@ WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile, bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); } size_t WinRandomRWFile::GetRequiredBufferAlignment() const { - return GetAlignement(); + return static_cast(GetAlignement()); } Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { @@ -1012,16 +1061,41 @@ Status WinRandomRWFile::Close() { return CloseImpl(); } +////////////////////////////////////////////////////////////////////////// +/// 
WinMemoryMappedBufer +WinMemoryMappedBuffer::~WinMemoryMappedBuffer() { + BOOL ret = FALSE; + if (base_ != nullptr) { + ret = ::UnmapViewOfFile(base_); + assert(ret); + base_ = nullptr; + } + if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(map_handle_); + assert(ret); + map_handle_ = NULL; + } + if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) { + ret = ::CloseHandle(file_handle_); + assert(ret); + file_handle_ = NULL; + } +} + ////////////////////////////////////////////////////////////////////////// /// WinDirectory Status WinDirectory::Fsync() { return Status::OK(); } +size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(handle_, id, max_size); +} ////////////////////////////////////////////////////////////////////////// /// WinFileLock WinFileLock::~WinFileLock() { - BOOL ret = ::CloseHandle(hFile_); + BOOL ret __attribute__((__unused__)); + ret = ::CloseHandle(hFile_); assert(ret); } diff --git a/thirdparty/rocksdb/port/win/io_win.h b/thirdparty/rocksdb/port/win/io_win.h index 2c1d5a1ea9..1c9d803b13 100644 --- a/thirdparty/rocksdb/port/win/io_win.h +++ b/thirdparty/rocksdb/port/win/io_win.h @@ -27,7 +27,9 @@ std::string GetWindowsErrSz(DWORD err); inline Status IOErrorFromWindowsError(const std::string& context, DWORD err) { return ((err == ERROR_HANDLE_DISK_FULL) || (err == ERROR_DISK_FULL)) ? Status::NoSpace(context, GetWindowsErrSz(err)) - : Status::IOError(context, GetWindowsErrSz(err)); + : ((err == ERROR_FILE_NOT_FOUND) || (err == ERROR_PATH_NOT_FOUND)) + ? Status::PathNotFound(context, GetWindowsErrSz(err)) + : Status::IOError(context, GetWindowsErrSz(err)); } inline Status IOErrorFromLastWindowsError(const std::string& context) { @@ -37,25 +39,18 @@ inline Status IOErrorFromLastWindowsError(const std::string& context) { inline Status IOError(const std::string& context, int err_number) { return (err_number == ENOSPC) ? Status::NoSpace(context, strerror(err_number)) - : Status::IOError(context, strerror(err_number)); + : (err_number == ENOENT) + ? Status::PathNotFound(context, strerror(err_number)) + : Status::IOError(context, strerror(err_number)); } -// Note the below two do not set errno because they are used only here in this -// file -// on a Windows handle and, therefore, not necessary. Translating GetLastError() -// to errno -// is a sad business -inline int fsync(HANDLE hFile) { - if (!FlushFileBuffers(hFile)) { - return -1; - } - - return 0; -} +class WinFileData; -SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes, uint64_t offset); +Status pwrite(const WinFileData* file_data, const Slice& data, + uint64_t offset, size_t& bytes_written); -SSIZE_T pread(HANDLE hFile, char* src, size_t numBytes, uint64_t offset); +Status pread(const WinFileData* file_data, char* src, size_t num_bytes, + uint64_t offset, size_t& bytes_read); Status fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size); @@ -67,7 +62,7 @@ class WinFileData { protected: const std::string filename_; HANDLE hFile_; - // If ture, the I/O issued would be direct I/O which the buffer + // If true, the I/O issued would be direct I/O which the buffer // will need to be aligned (not sure there is a guarantee that the buffer // passed in is aligned). 
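  // Editorial illustration (not upstream code): the alignment contract the
  // comment above alludes to can be checked with the helpers defined earlier
  // in io_win.cc, e.g.
  //
  //   bool DirectIOArgsOk(const void* buf, size_t size, size_t alignment) {
  //     return IsAligned(alignment, buf) && IsSectorAligned(size);
  //   }
  //
  // where a false result should surface as Status::InvalidArgument rather
  // than an assert, matching the checks in ReadImpl and AppendImpl.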
const bool use_direct_io_; @@ -104,8 +99,8 @@ class WinFileData { class WinSequentialFile : protected WinFileData, public SequentialFile { // Override for behavior change when creating a custom env - virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset) const; + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; public: WinSequentialFile(const std::string& fname, HANDLE f, @@ -240,8 +235,8 @@ class WinRandomAccessImpl { size_t alignment_; // Override for behavior change when creating a custom env - virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes, - uint64_t offset) const; + virtual Status PositionedReadInternal(char* src, size_t numBytes, + uint64_t offset, size_t& bytes_read) const; WinRandomAccessImpl(WinFileData* file_base, size_t alignment, const EnvOptions& options); @@ -368,6 +363,8 @@ class WinWritableFile : private WinFileData, virtual Status Fsync() override; + virtual bool IsSyncThreadSafe() const override; + // Indicates if the class makes use of direct I/O // Use PositionedAppend virtual bool use_direct_io() const override; @@ -418,11 +415,30 @@ class WinRandomRWFile : private WinFileData, virtual Status Close() override; }; +class WinMemoryMappedBuffer : public MemoryMappedFileBuffer { +private: + HANDLE file_handle_; + HANDLE map_handle_; +public: + WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base, size_t size) : + MemoryMappedFileBuffer(base, size), + file_handle_(file_handle), + map_handle_(map_handle) {} + ~WinMemoryMappedBuffer() override; +}; + class WinDirectory : public Directory { + HANDLE handle_; public: - WinDirectory() {} - + explicit WinDirectory(HANDLE h) noexcept : handle_(h) { + assert(handle_ != INVALID_HANDLE_VALUE); + } + ~WinDirectory() { + ::CloseHandle(handle_); + } virtual Status Fsync() override; + + size_t GetUniqueId(char* id, size_t max_size) const override; }; class WinFileLock : public FileLock { diff --git a/thirdparty/rocksdb/port/win/port_win.cc b/thirdparty/rocksdb/port/win/port_win.cc index b3fccbd930..03ba6ef428 100644 --- a/thirdparty/rocksdb/port/win/port_win.cc +++ b/thirdparty/rocksdb/port/win/port_win.cc @@ -14,7 +14,7 @@ #include "port/win/port_win.h" #include -#include "port/dirent.h" +#include "port/port_dirent.h" #include "port/sys_time.h" #include @@ -26,11 +26,33 @@ #include #include +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +// utf8 <-> utf16 +#include +#include +#include +#endif + #include "util/logging.h" namespace rocksdb { + +extern const bool kDefaultToAdaptiveMutex = false; + namespace port { +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES +std::string utf16_to_utf8(const std::wstring& utf16) { + std::wstring_convert,wchar_t> convert; + return convert.to_bytes(utf16); +} + +std::wstring utf8_to_utf16(const std::string& utf8) { + std::wstring_convert> converter; + return converter.from_bytes(utf8); +} +#endif + void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { using namespace std::chrono; @@ -108,19 +130,20 @@ void InitOnce(OnceType* once, void (*initializer)()) { // Private structure, exposed only by pointer struct DIR { - intptr_t handle_; - bool firstread_; - struct __finddata64_t data_; + HANDLE handle_; + bool firstread_; + RX_WIN32_FIND_DATA data_; dirent entry_; - DIR() : handle_(-1), firstread_(true) {} + DIR() : handle_(INVALID_HANDLE_VALUE), + firstread_(true) {} DIR(const DIR&) = delete; DIR& operator=(const DIR&) = delete; ~DIR() { - if (-1 != handle_) { - _findclose(handle_); 
+ if (INVALID_HANDLE_VALUE != handle_) { + ::FindClose(handle_); } } }; @@ -136,19 +159,26 @@ DIR* opendir(const char* name) { std::unique_ptr
<DIR>
    dir(new DIR); - dir->handle_ = _findfirst64(pattern.c_str(), &dir->data_); + dir->handle_ = RX_FindFirstFileEx(RX_FN(pattern).c_str(), + FindExInfoBasic, // Do not want alternative name + &dir->data_, + FindExSearchNameMatch, + NULL, // lpSearchFilter + 0); - if (dir->handle_ == -1) { + if (dir->handle_ == INVALID_HANDLE_VALUE) { return nullptr; } - strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), dir->data_.name); + RX_FILESTRING x(dir->data_.cFileName, RX_FNLEN(dir->data_.cFileName)); + strcpy_s(dir->entry_.d_name, sizeof(dir->entry_.d_name), + FN_TO_RX(x).c_str()); return dir.release(); } struct dirent* readdir(DIR* dirp) { - if (!dirp || dirp->handle_ == -1) { + if (!dirp || dirp->handle_ == INVALID_HANDLE_VALUE) { errno = EBADF; return nullptr; } @@ -158,13 +188,15 @@ struct dirent* readdir(DIR* dirp) { return &dirp->entry_; } - auto ret = _findnext64(dirp->handle_, &dirp->data_); + auto ret = RX_FindNextFile(dirp->handle_, &dirp->data_); - if (ret != 0) { + if (ret == 0) { return nullptr; } - strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), dirp->data_.name); + RX_FILESTRING x(dirp->data_.cFileName, RX_FNLEN(dirp->data_.cFileName)); + strcpy_s(dirp->entry_.d_name, sizeof(dirp->entry_.d_name), + FN_TO_RX(x).c_str()); return &dirp->entry_; } @@ -174,11 +206,15 @@ int closedir(DIR* dirp) { return 0; } -int truncate(const char* path, int64_t len) { +int truncate(const char* path, int64_t length) { if (path == nullptr) { errno = EFAULT; return -1; } + return rocksdb::port::Truncate(path, length); +} + +int Truncate(std::string path, int64_t len) { if (len < 0) { errno = EINVAL; @@ -186,7 +222,7 @@ int truncate(const char* path, int64_t len) { } HANDLE hFile = - CreateFile(path, GENERIC_READ | GENERIC_WRITE, + RX_CreateFile(RX_FN(path).c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, // Security attrs OPEN_EXISTING, // Truncate existing file only diff --git a/thirdparty/rocksdb/port/win/port_win.h b/thirdparty/rocksdb/port/win/port_win.h index f3c8669051..de41cdc7f0 100644 --- a/thirdparty/rocksdb/port/win/port_win.h +++ b/thirdparty/rocksdb/port/win/port_win.h @@ -9,8 +9,7 @@ // // See port_example.h for documentation for the following types/functions. 
-#ifndef STORAGE_LEVELDB_PORT_PORT_WIN_H_
-#define STORAGE_LEVELDB_PORT_PORT_WIN_H_
+#pragma once

 // Always want minimum headers
 #ifndef WIN32_LEAN_AND_MEAN
@@ -28,6 +27,7 @@
 #include
 #include
 #include
+#include
 #include
@@ -78,21 +78,12 @@ namespace rocksdb {

 #define PREFETCH(addr, rw, locality)

-namespace port {
-
-// VS 15
-#if (defined _MSC_VER) && (_MSC_VER >= 1900)
-
-#define ROCKSDB_NOEXCEPT noexcept
-
-// For use at db/file_indexer.h kLevelMaxIndex
-const int kMaxInt32 = std::numeric_limits<int>::max();
-const uint64_t kMaxUint64 = std::numeric_limits<uint64_t>::max();
-const int64_t kMaxInt64 = std::numeric_limits<int64_t>::max();
+extern const bool kDefaultToAdaptiveMutex;

-const size_t kMaxSizet = std::numeric_limits<size_t>::max();
+namespace port {

-#else  //_MSC_VER
+// VS < 2015
+#if defined(_MSC_VER) && (_MSC_VER < 1900)

 // VS 15 has snprintf
 #define snprintf _snprintf
@@ -102,8 +93,11 @@ const size_t kMaxSizet = std::numeric_limits<size_t>::max();
 // therefore, use the same limits

 // For use at db/file_indexer.h kLevelMaxIndex
+const uint32_t kMaxUint32 = UINT32_MAX;
 const int kMaxInt32 = INT32_MAX;
+const int kMinInt32 = INT32_MIN;
 const int64_t kMaxInt64 = INT64_MAX;
+const int64_t kMinInt64 = INT64_MIN;
 const uint64_t kMaxUint64 = UINT64_MAX;

 #ifdef _WIN64
@@ -112,6 +106,20 @@ const size_t kMaxSizet = UINT64_MAX;
 const size_t kMaxSizet = UINT_MAX;
 #endif

+#else  // VS >= 2015 or MinGW
+
+#define ROCKSDB_NOEXCEPT noexcept
+
+// For use at db/file_indexer.h kLevelMaxIndex
+const uint32_t kMaxUint32 = std::numeric_limits<uint32_t>::max();
+const int kMaxInt32 = std::numeric_limits<int>::max();
+const int kMinInt32 = std::numeric_limits<int>::min();
+const uint64_t kMaxUint64 = std::numeric_limits<uint64_t>::max();
+const int64_t kMaxInt64 = std::numeric_limits<int64_t>::max();
+const int64_t kMinInt64 = std::numeric_limits<int64_t>::min();
+
+const size_t kMaxSizet = std::numeric_limits<size_t>::max();
+
 #endif  //_MSC_VER

 const bool kLittleEndian = true;
@@ -121,7 +129,7 @@ class CondVar;

 class Mutex {
  public:
-  /* implicit */ Mutex(bool adaptive = false)
+  /* implicit */ Mutex(bool adaptive = kDefaultToAdaptiveMutex)
 #ifndef NDEBUG
     : locked_(false)
 #endif
@@ -241,14 +249,9 @@ extern void InitOnce(OnceType* once, void (*initializer)());
 #endif

 #ifdef ROCKSDB_JEMALLOC
-#include "jemalloc/jemalloc.h"
 // Separate inlines so they can be replaced if needed
-inline void* jemalloc_aligned_alloc( size_t size, size_t alignment) {
-  return je_aligned_alloc(alignment, size);
-}
-inline void jemalloc_aligned_free(void* p) {
-  je_free(p);
-}
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) ROCKSDB_NOEXCEPT;
+void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT;
 #endif

 inline void *cacheline_aligned_alloc(size_t size) {
@@ -330,11 +333,62 @@ inline void* pthread_getspecific(pthread_key_t key) {
 // using C-runtime to implement. Note, this does not
 // fill space with zeros in case the file is extended.
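// Usage sketch (editorial, with a hypothetical path): the return convention
// matches the POSIX call, with errno carrying the failure reason.
//
//   if (rocksdb::port::truncate("c:\\tmp\\wal.log", 1 << 20) != 0) {
//     // handle errno; on success, an extended tail is not explicitly zeroed
//   }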
int truncate(const char* path, int64_t length); +int Truncate(std::string path, int64_t length); void Crash(const std::string& srcfile, int srcline); extern int GetMaxOpenFiles(); +std::string utf16_to_utf8(const std::wstring& utf16); +std::wstring utf8_to_utf16(const std::string& utf8); } // namespace port + +#ifdef ROCKSDB_WINDOWS_UTF8_FILENAMES + +#define RX_FILESTRING std::wstring +#define RX_FN(a) rocksdb::port::utf8_to_utf16(a) +#define FN_TO_RX(a) rocksdb::port::utf16_to_utf8(a) +#define RX_FNLEN(a) ::wcslen(a) + +#define RX_DeleteFile DeleteFileW +#define RX_CreateFile CreateFileW +#define RX_CreateFileMapping CreateFileMappingW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_FindFirstFileEx FindFirstFileExW +#define RX_FindNextFile FindNextFileW +#define RX_WIN32_FIND_DATA WIN32_FIND_DATAW +#define RX_CreateDirectory CreateDirectoryW +#define RX_RemoveDirectory RemoveDirectoryW +#define RX_GetFileAttributesEx GetFileAttributesExW +#define RX_MoveFileEx MoveFileExW +#define RX_CreateHardLink CreateHardLinkW +#define RX_PathIsRelative PathIsRelativeW +#define RX_GetCurrentDirectory GetCurrentDirectoryW + +#else + +#define RX_FILESTRING std::string +#define RX_FN(a) a +#define FN_TO_RX(a) a +#define RX_FNLEN(a) strlen(a) + +#define RX_DeleteFile DeleteFileA +#define RX_CreateFile CreateFileA +#define RX_CreateFileMapping CreateFileMappingA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_FindFirstFileEx FindFirstFileExA +#define RX_CreateDirectory CreateDirectoryA +#define RX_FindNextFile FindNextFileA +#define RX_WIN32_FIND_DATA WIN32_FIND_DATA +#define RX_CreateDirectory CreateDirectoryA +#define RX_RemoveDirectory RemoveDirectoryA +#define RX_GetFileAttributesEx GetFileAttributesExA +#define RX_MoveFileEx MoveFileExA +#define RX_CreateHardLink CreateHardLinkA +#define RX_PathIsRelative PathIsRelativeA +#define RX_GetCurrentDirectory GetCurrentDirectoryA + +#endif + using port::pthread_key_t; using port::pthread_key_create; using port::pthread_key_delete; @@ -343,5 +397,3 @@ using port::pthread_getspecific; using port::truncate; } // namespace rocksdb - -#endif // STORAGE_LEVELDB_PORT_PORT_WIN_H_ diff --git a/thirdparty/rocksdb/port/win/win_jemalloc.cc b/thirdparty/rocksdb/port/win/win_jemalloc.cc index fc46e189c4..3268a56aff 100644 --- a/thirdparty/rocksdb/port/win/win_jemalloc.cc +++ b/thirdparty/rocksdb/port/win/win_jemalloc.cc @@ -13,10 +13,39 @@ #include #include "jemalloc/jemalloc.h" +#include "port/win/port_win.h" + +#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY) +#include +#if (ZSTD_VERSION_NUMBER >= 500) +namespace rocksdb { +namespace port { +void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) { + return je_malloc(size); +} +void JemallocDeallocateForZSTD(void* /* opaque */, void* address) { + je_free(address); +} +ZSTD_customMem GetJeZstdAllocationOverrides() { + return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr}; +} +} // namespace port +} // namespace rocksdb +#endif // (ZSTD_VERSION_NUMBER >= 500) +#endif // defined(ZSTD) defined(ZSTD_STATIC_LINKING_ONLY) // Global operators to be replaced by a linker when this file is // a part of the build +namespace rocksdb { +namespace port { +void* jemalloc_aligned_alloc(size_t size, size_t alignment) ROCKSDB_NOEXCEPT { + return je_aligned_alloc(alignment, size); +} +void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT { je_free(p); } +} // namespace port +} // namespace rocksdb + void* operator new(size_t size) { void* p = je_malloc(size); if (!p) { @@ -44,4 
+73,3 @@ void operator delete[](void* p) {
   je_free(p);
 }
-
diff --git a/thirdparty/rocksdb/port/win/win_jemalloc.cc b/thirdparty/rocksdb/port/win/win_jemalloc.cc
index fc46e189c4..3268a56aff 100644
--- a/thirdparty/rocksdb/port/win/win_jemalloc.cc
+++ b/thirdparty/rocksdb/port/win/win_jemalloc.cc
@@ -13,10 +13,39 @@
 #include
 #include "jemalloc/jemalloc.h"
+#include "port/win/port_win.h"
+
+#if defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY)
+#include <zstd.h>
+#if (ZSTD_VERSION_NUMBER >= 500)
+namespace rocksdb {
+namespace port {
+void* JemallocAllocateForZSTD(void* /* opaque */, size_t size) {
+  return je_malloc(size);
+}
+void JemallocDeallocateForZSTD(void* /* opaque */, void* address) {
+  je_free(address);
+}
+ZSTD_customMem GetJeZstdAllocationOverrides() {
+  return {JemallocAllocateForZSTD, JemallocDeallocateForZSTD, nullptr};
+}
+}  // namespace port
+}  // namespace rocksdb
+#endif  // (ZSTD_VERSION_NUMBER >= 500)
+#endif  // defined(ZSTD) && defined(ZSTD_STATIC_LINKING_ONLY)

 // Global operators to be replaced by a linker when this file is
 // a part of the build
+
+namespace rocksdb {
+namespace port {
+void* jemalloc_aligned_alloc(size_t size, size_t alignment) ROCKSDB_NOEXCEPT {
+  return je_aligned_alloc(alignment, size);
+}
+void jemalloc_aligned_free(void* p) ROCKSDB_NOEXCEPT { je_free(p); }
+}  // namespace port
+}  // namespace rocksdb
+
 void* operator new(size_t size) {
   void* p = je_malloc(size);
   if (!p) {
@@ -44,4
diff --git a/thirdparty/rocksdb/port/win/win_logger.cc b/thirdparty/rocksdb/port/win/win_logger.cc
index 0bace9f31f..af722d9054 100644
--- a/thirdparty/rocksdb/port/win/win_logger.cc
+++ b/thirdparty/rocksdb/port/win/win_logger.cc
@@ -36,9 +36,13 @@ WinLogger::WinLogger(uint64_t (*gettid)(), Env* env, HANDLE file,
       log_size_(0),
       last_flush_micros_(0),
       env_(env),
-      flush_pending_(false) {}
+      flush_pending_(false) {
+  assert(file_ != NULL);
+  assert(file_ != INVALID_HANDLE_VALUE);
+}

 void WinLogger::DebugWriter(const char* str, int len) {
+  assert(file_ != INVALID_HANDLE_VALUE);
   DWORD bytesWritten = 0;
   BOOL ret = WriteFile(file_, str, len, &bytesWritten, NULL);
   if (ret == FALSE) {
@@ -47,11 +51,38 @@ void WinLogger::DebugWriter(const char* str, int len) {
   }
 }

-WinLogger::~WinLogger() { close(); }
+WinLogger::~WinLogger() {
+  CloseInternal();
+}
+
+Status WinLogger::CloseImpl() {
+  return CloseInternal();
+}

-void WinLogger::close() { CloseHandle(file_); }
+Status WinLogger::CloseInternal() {
+  Status s;
+  if (INVALID_HANDLE_VALUE != file_) {
+    BOOL ret = FlushFileBuffers(file_);
+    if (ret == 0) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to flush LOG on Close() ",
+                                  lastError);
+    }
+    ret = CloseHandle(file_);
+    // On error the return value is zero
+    if (ret == 0 && s.ok()) {
+      auto lastError = GetLastError();
+      s = IOErrorFromWindowsError("Failed to close LOG on Close() ",
+                                  lastError);
+    }
+    file_ = INVALID_HANDLE_VALUE;
+    closed_ = true;
+  }
+  return s;
+}

 void WinLogger::Flush() {
+  assert(file_ != INVALID_HANDLE_VALUE);
   if (flush_pending_) {
     flush_pending_ = false;
     // With Windows API writes go to OS buffers directly so no fflush needed
@@ -64,6 +95,7 @@ void WinLogger::Flush() {

 void WinLogger::Logv(const char* format, va_list ap) {
   IOSTATS_TIMER_GUARD(logger_nanos);
+  assert(file_ != INVALID_HANDLE_VALUE);

   const uint64_t thread_id = (*gettid_)();
diff --git a/thirdparty/rocksdb/port/win/win_logger.h b/thirdparty/rocksdb/port/win/win_logger.h
index 2d44f506d1..0982f142f6 100644
--- a/thirdparty/rocksdb/port/win/win_logger.h
+++ b/thirdparty/rocksdb/port/win/win_logger.h
@@ -36,8 +36,6 @@ class WinLogger : public rocksdb::Logger {

   WinLogger& operator=(const WinLogger&) = delete;

-  void close();
-
   void Flush() override;

   using rocksdb::Logger::Logv;
@@ -47,6 +45,10 @@ class WinLogger : public rocksdb::Logger {

   void DebugWriter(const char* str, int len);

+ protected:
+
+  Status CloseImpl() override;
+
  private:
   HANDLE file_;
   uint64_t (*gettid_)();  // Return the thread id for the current thread
@@ -55,6 +57,8 @@ class WinLogger : public rocksdb::Logger {
   Env* env_;
   bool flush_pending_;

+  Status CloseInternal();
+
   const static uint64_t flush_every_seconds_ = 5;
 };
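The logger change above replaces a bare `CloseHandle` with a close-once helper: flush, then close, record the first failure, and invalidate the handle so the destructor and the explicit `Close()` path can both call it safely. A simplified sketch of that pattern using `FILE*` and a string status as stand-ins for `HANDLE` and `rocksdb::Status`:

```cpp
#include <cstdio>
#include <string>

using Status = std::string;  // stand-in for rocksdb::Status

class FileLogger {
 public:
  explicit FileLogger(std::FILE* f) : file_(f) {}
  ~FileLogger() { CloseInternal(); }            // destructor path
  Status Close() { return CloseInternal(); }    // explicit-close path
  bool closed() const { return closed_; }

 private:
  // Flush, close, remember the first error, and null the handle so a
  // second call is a harmless no-op.
  Status CloseInternal() {
    Status s = "OK";
    if (file_ != nullptr) {
      if (std::fflush(file_) != 0) s = "failed to flush LOG on Close()";
      if (std::fclose(file_) != 0 && s == "OK") s = "failed to close LOG on Close()";
      file_ = nullptr;
      closed_ = true;
    }
    return s;
  }
  std::FILE* file_;
  bool closed_ = false;
};

int main() {
  FileLogger logger(std::fopen("LOG", "w"));  // hypothetical log file
  Status s = logger.Close();  // first close does the work
  // The destructor calls CloseInternal() again; file_ is null, so it no-ops.
  return s == "OK" ? 0 : 1;
}
```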
diff --git a/thirdparty/rocksdb/port/win/win_thread.cc b/thirdparty/rocksdb/port/win/win_thread.cc
index e55ca7450b..9a976e2c6b 100644
--- a/thirdparty/rocksdb/port/win/win_thread.cc
+++ b/thirdparty/rocksdb/port/win/win_thread.cc
@@ -39,12 +39,17 @@ struct WindowsThread::Data {

 void WindowsThread::Init(std::function<void()>&& func) {

-  data_.reset(new Data(std::move(func)));
+  data_ = std::make_shared<Data>(std::move(func));
+  // We create another instance of std::shared_ptr to get an additional ref
+  // since we may detach and destroy this instance before the threadproc
+  // may start to run. We choose to allocate this additional ref on the heap
+  // so that we do not need to synchronize before allowing this thread to
+  // proceed.
+  std::unique_ptr<std::shared_ptr<Data>> th_data(
+      new std::shared_ptr<Data>(data_));

   data_->handle_ = _beginthreadex(NULL,
     0,    // stack size
     &Data::ThreadProc,
-    data_.get(),
+    th_data.get(),
     0,   // init flag
     &th_id_);

@@ -53,6 +58,7 @@ void WindowsThread::Init(std::function<void()>&& func) {
       std::errc::resource_unavailable_try_again),
       "Unable to create a thread");
   }
+  th_data.release();
 }

 WindowsThread::WindowsThread() :
@@ -129,10 +135,12 @@ void WindowsThread::join() {
     assert(false);
     throw std::system_error(static_cast<int>(lastError),
       std::system_category(),
-      "WaitForSingleObjectFailed");
+      "WaitForSingleObjectFailed: thread join");
   }

-  CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+  BOOL rc;
+  rc = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
+  assert(rc != 0);

   data_->handle_ = 0;
 }
@@ -148,7 +156,7 @@ bool WindowsThread::detach() {

   BOOL ret = CloseHandle(reinterpret_cast<HANDLE>(data_->handle_));
   data_->handle_ = 0;

-  return (ret == TRUE);
+  return (ret != 0);
 }

 void WindowsThread::swap(WindowsThread& o) {
@@ -157,9 +165,9 @@ void WindowsThread::swap(WindowsThread& o) {
 }

 unsigned int __stdcall WindowsThread::Data::ThreadProc(void* arg) {
-  auto data = reinterpret_cast<Data*>(arg);
-  data->func_();
-  _endthreadex(0);
+  auto ptr = reinterpret_cast<std::shared_ptr<Data>*>(arg);
+  std::unique_ptr<std::shared_ptr<Data>> data(ptr);
+  (*data)->func_();
   return 0;
 }
 } // namespace port
diff --git a/thirdparty/rocksdb/port/win/win_thread.h b/thirdparty/rocksdb/port/win/win_thread.h
index 993cc02731..1d5b225e6c 100644
--- a/thirdparty/rocksdb/port/win/win_thread.h
+++ b/thirdparty/rocksdb/port/win/win_thread.h
@@ -28,7 +28,7 @@ class WindowsThread {

   struct Data;

-  std::unique_ptr<Data> data_;
+  std::shared_ptr<Data> data_;
   unsigned int th_id_;

   void Init(std::function<void()>&&);
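The thread fix above hands a second, heap-allocated `shared_ptr` to the thread entry point, so the state stays alive even if the creator detaches and drops its own reference before the thread ever runs. A self-contained sketch of that ownership handoff using `std::thread` in place of `_beginthreadex`:

```cpp
#include <chrono>
#include <cstdio>
#include <functional>
#include <memory>
#include <thread>

struct Data {
  std::function<void()> func_;
};

// Thread entry point: immediately adopt the extra reference so it is
// released exactly once, when the thread finishes.
void ThreadProc(std::shared_ptr<Data>* arg) {
  std::unique_ptr<std::shared_ptr<Data>> data(arg);
  (*data)->func_();
}

int main() {
  auto data = std::make_shared<Data>();
  data->func_ = [] { std::puts("worker ran"); };
  // Heap-allocated second reference handed to the thread, as in the patch.
  auto* extra = new std::shared_ptr<Data>(data);
  std::thread t(ThreadProc, extra);
  t.detach();    // safe: the thread owns its own reference now
  data.reset();  // the creator may drop its copy immediately
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // let it finish
  return 0;
}
```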
diff --git a/thirdparty/rocksdb/src.mk b/thirdparty/rocksdb/src.mk
index 5bd5236fa1..55b4e3427c 100644
--- a/thirdparty/rocksdb/src.mk
+++ b/thirdparty/rocksdb/src.mk
@@ -11,20 +11,23 @@ LIB_SOURCES = \
   db/compaction_iterator.cc \
   db/compaction_job.cc \
   db/compaction_picker.cc \
+  db/compaction_picker_fifo.cc \
   db/compaction_picker_universal.cc \
   db/convenience.cc \
   db/db_filesnapshot.cc \
   db/db_impl.cc \
-  db/db_impl_write.cc \
   db/db_impl_compaction_flush.cc \
-  db/db_impl_files.cc \
-  db/db_impl_open.cc \
   db/db_impl_debug.cc \
   db/db_impl_experimental.cc \
+  db/db_impl_files.cc \
+  db/db_impl_open.cc \
   db/db_impl_readonly.cc \
+  db/db_impl_secondary.cc \
+  db/db_impl_write.cc \
   db/db_info_dumper.cc \
   db/db_iter.cc \
   db/dbformat.cc \
+  db/error_handler.cc \
   db/event_helpers.cc \
   db/experimental.cc \
   db/external_sst_file_ingestion_job.cc \
@@ -32,16 +35,18 @@ LIB_SOURCES = \
   db/flush_job.cc \
   db/flush_scheduler.cc \
   db/forward_iterator.cc \
+  db/in_memory_stats_history.cc \
   db/internal_stats.cc \
+  db/logs_with_prep_tracker.cc \
   db/log_reader.cc \
   db/log_writer.cc \
   db/malloc_stats.cc \
-  db/managed_iterator.cc \
   db/memtable.cc \
   db/memtable_list.cc \
   db/merge_helper.cc \
   db/merge_operator.cc \
   db/range_del_aggregator.cc \
+  db/range_tombstone_fragmenter.cc \
   db/repair.cc \
   db/snapshot_impl.cc \
   db/table_cache.cc \
@@ -63,7 +68,6 @@ LIB_SOURCES = \
   env/io_posix.cc \
   env/mock_env.cc \
   memtable/alloc_tracker.cc \
-  memtable/hash_cuckoo_rep.cc \
   memtable/hash_linklist_rep.cc \
   memtable/hash_skiplist_rep.cc \
   memtable/skiplistrep.cc \
@@ -96,11 +100,14 @@ LIB_SOURCES = \
   table/block_based_table_factory.cc \
   table/block_based_table_reader.cc \
   table/block_builder.cc \
+  table/block_fetcher.cc \
   table/block_prefix_index.cc \
   table/bloom_block.cc \
   table/cuckoo_table_builder.cc \
   table/cuckoo_table_factory.cc \
   table/cuckoo_table_reader.cc \
+  table/data_block_hash_index.cc \
+  table/data_block_footer.cc \
   table/flush_block_policy.cc \
   table/format.cc \
   table/full_filter_block.cc \
@@ -116,6 +123,7 @@ LIB_SOURCES = \
   table/plain_table_index.cc \
   table/plain_table_key_coding.cc \
   table/plain_table_reader.cc \
+  table/sst_file_reader.cc \
   table/sst_file_writer.cc \
   table/table_properties.cc \
   table/two_level_iterator.cc \
@@ -127,7 +135,9 @@ LIB_SOURCES = \
   util/coding.cc \
   util/compaction_job_stats_impl.cc \
   util/comparator.cc \
+  util/compression_context_cache.cc \
   util/concurrent_arena.cc \
+  util/concurrent_task_limiter_impl.cc \
   util/crc32c.cc \
   util/delete_scheduler.cc \
   util/dynamic_bloom.cc \
@@ -137,6 +147,7 @@ LIB_SOURCES = \
   util/filename.cc \
   util/filter_policy.cc \
   util/hash.cc \
+  util/jemalloc_nodump_allocator.cc \
   util/log_buffer.cc \
   util/murmurhash.cc \
   util/random.cc \
@@ -144,43 +155,40 @@ LIB_SOURCES = \
   util/slice.cc \
   util/sst_file_manager_impl.cc \
   util/status.cc \
-  util/status_message.cc \
   util/string_util.cc \
   util/sync_point.cc \
+  util/sync_point_impl.cc \
   util/thread_local.cc \
   util/threadpool_imp.cc \
+  util/trace_replay.cc \
   util/transaction_test_util.cc \
   util/xxhash.cc \
   utilities/backupable/backupable_db.cc \
+  utilities/blob_db/blob_compaction_filter.cc \
   utilities/blob_db/blob_db.cc \
   utilities/blob_db/blob_db_impl.cc \
+  utilities/blob_db/blob_db_impl_filesnapshot.cc \
   utilities/blob_db/blob_file.cc \
+  utilities/blob_db/blob_log_format.cc \
   utilities/blob_db/blob_log_reader.cc \
   utilities/blob_db/blob_log_writer.cc \
-  utilities/blob_db/blob_log_format.cc \
-  utilities/blob_db/ttl_extractor.cc \
   utilities/cassandra/cassandra_compaction_filter.cc \
   utilities/cassandra/format.cc \
   utilities/cassandra/merge_operator.cc \
   utilities/checkpoint/checkpoint_impl.cc \
   utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \
   utilities/convenience/info_log_finder.cc \
-  utilities/date_tiered/date_tiered_db_impl.cc \
-  utilities/debug.cc \
-  utilities/document/document_db.cc \
-  utilities/document/json_document.cc \
-  utilities/document/json_document_builder.cc \
+  utilities/debug.cc \
   utilities/env_mirror.cc \
   utilities/env_timed.cc \
-  utilities/geodb/geodb_impl.cc \
   utilities/leveldb_options/leveldb_options.cc \
-  utilities/lua/rocks_lua_compaction_filter.cc \
   utilities/memory/memory_util.cc \
   utilities/merge_operators/max.cc \
   utilities/merge_operators/put.cc \
   utilities/merge_operators/string_append/stringappend.cc \
   utilities/merge_operators/string_append/stringappend2.cc \
   utilities/merge_operators/uint64add.cc \
+  utilities/merge_operators/bytesxor.cc \
   utilities/option_change_migration/option_change_migration.cc \
   utilities/options/options_util.cc \
   utilities/persistent_cache/block_cache_tier.cc \
@@ -188,28 +196,44 @@ LIB_SOURCES = \
   utilities/persistent_cache/block_cache_tier_metadata.cc \
   utilities/persistent_cache/persistent_cache_tier.cc \
   utilities/persistent_cache/volatile_tier_impl.cc \
-  utilities/redis/redis_lists.cc \
   utilities/simulator_cache/sim_cache.cc \
-  utilities/spatialdb/spatial_db.cc \
   utilities/table_properties_collectors/compact_on_deletion_collector.cc \
+  utilities/trace/file_trace_reader_writer.cc \
+  utilities/transactions/optimistic_transaction.cc \
   utilities/transactions/optimistic_transaction_db_impl.cc \
-  utilities/transactions/optimistic_transaction.cc \
+  utilities/transactions/pessimistic_transaction.cc \
+  utilities/transactions/pessimistic_transaction_db.cc \
+  utilities/transactions/snapshot_checker.cc \
   utilities/transactions/transaction_base.cc \
-  utilities/transactions/pessimistic_transaction_db.cc \
   utilities/transactions/transaction_db_mutex_impl.cc \
-  utilities/transactions/pessimistic_transaction.cc \
   utilities/transactions/transaction_lock_mgr.cc \
   utilities/transactions/transaction_util.cc \
-  utilities/transactions/write_prepared_txn.cc \
+  utilities/transactions/write_prepared_txn.cc \
+  utilities/transactions/write_prepared_txn_db.cc \
+  utilities/transactions/write_unprepared_txn.cc \
+  utilities/transactions/write_unprepared_txn_db.cc \
   utilities/ttl/db_ttl_impl.cc \
   utilities/write_batch_with_index/write_batch_with_index.cc \
   utilities/write_batch_with_index/write_batch_with_index_internal.cc \
+
+ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
+LIB_SOURCES_ASM =\
+  util/crc32c_ppc_asm.S
+LIB_SOURCES_C = \
+  util/crc32c_ppc.c
+else
+LIB_SOURCES_ASM =
+LIB_SOURCES_C =
+endif
+
 TOOL_LIB_SOURCES = \
-  tools/ldb_cmd.cc \
-  tools/ldb_tool.cc \
-  tools/sst_dump_tool.cc \
-  utilities/blob_db/blob_dump_tool.cc \
+  tools/ldb_cmd.cc \
+  tools/ldb_tool.cc \
+  tools/sst_dump_tool.cc \
+  utilities/blob_db/blob_dump_tool.cc \
+
+ANALYZER_LIB_SOURCES = \
+  tools/trace_analyzer_tool.cc \

 MOCK_LIB_SOURCES = \
   table/mock_table.cc \
@@ -218,21 +242,18 @@ MOCK_LIB_SOURCES = \
 BENCH_LIB_SOURCES = \
   tools/db_bench_tool.cc \

-EXP_LIB_SOURCES = \
-  utilities/col_buf_encoder.cc \
-  utilities/col_buf_decoder.cc \
-  utilities/column_aware_encoding_util.cc
-
 TEST_LIB_SOURCES = \
-  util/testharness.cc \
-  util/testutil.cc \
-  db/db_test_util.cc \
-  utilities/cassandra/test_utils.cc \
+  db/db_test_util.cc \
+  util/testharness.cc \
+  util/testutil.cc \
+  utilities/cassandra/test_utils.cc \

-MAIN_SOURCES = \
-  cache/cache_bench.cc \
-  cache/cache_test.cc \
+MAIN_SOURCES = \
+  cache/cache_bench.cc \
+  cache/cache_test.cc \
   db/column_family_test.cc \
+  db/compact_files_test.cc \
+  db/compaction_iterator_test.cc \
   db/compaction_job_stats_test.cc \
   db/compaction_job_test.cc \
   db/compaction_picker_test.cc \
@@ -240,47 +261,70 @@ MAIN_SOURCES = \
   db/corruption_test.cc \
   db/cuckoo_table_db_test.cc \
   db/db_basic_test.cc \
+  db/db_blob_index_test.cc \
   db/db_block_cache_test.cc \
   db/db_bloom_filter_test.cc \
   db/db_compaction_filter_test.cc \
   db/db_compaction_test.cc \
   db/db_dynamic_level_test.cc \
   db/db_encryption_test.cc \
-  db/db_flush_test.cc \
+  db/db_flush_test.cc \
   db/db_inplace_update_test.cc \
   db/db_io_failure_test.cc \
   db/db_iter_test.cc \
+  db/db_iter_stress_test.cc \
   db/db_iterator_test.cc \
   db/db_log_iter_test.cc \
   db/db_memtable_test.cc \
   db/db_merge_operator_test.cc \
   db/db_options_test.cc \
+  db/db_properties_test.cc \
   db/db_range_del_test.cc \
+  db/db_secondary_test.cc \
   db/db_sst_test.cc \
   db/db_statistics_test.cc \
   db/db_table_properties_test.cc \
   db/db_tailing_iter_test.cc \
   db/db_test.cc \
+  db/db_test2.cc \
   db/db_universal_compaction_test.cc \
   db/db_wal_test.cc \
   db/db_write_test.cc \
   db/dbformat_test.cc \
   db/deletefile_test.cc \
+  db/env_timed_test.cc \
+  db/error_handler_test.cc \
   db/external_sst_file_basic_test.cc \
   db/external_sst_file_test.cc \
   db/fault_injection_test.cc \
   db/file_indexer_test.cc \
+  db/file_reader_writer_test.cc \
   db/filename_test.cc \
   db/flush_job_test.cc \
+  db/hash_table_test.cc \
+  db/hash_test.cc \
+  db/heap_test.cc \
   db/listener_test.cc \
   db/log_test.cc \
+  db/lru_cache_test.cc \
   db/manual_compaction_test.cc \
+  db/memtable_list_test.cc \
+  db/merge_helper_test.cc \
   db/merge_test.cc \
+  db/obsolete_files_test.cc \
+  db/options_settable_test.cc \
   db/options_file_test.cc \
+  db/partitioned_filter_block_test.cc \
   db/perf_context_test.cc \
+  db/persistent_cache_test.cc \
   db/plain_table_db_test.cc \
   db/prefix_test.cc \
+  db/repair_test.cc \
+  db/range_del_aggregator_test.cc \
+  db/range_del_aggregator_bench.cc \
+  db/range_tombstone_fragmenter_test.cc \
   db/table_properties_collector_test.cc \
+  db/util_merge_operators_test.cc \
   db/version_builder_test.cc \
   db/version_edit_test.cc \
   db/version_set_test.cc \
@@ -304,8 +348,10 @@ MAIN_SOURCES = \
   table/cleanable_test.cc \
   table/cuckoo_table_builder_test.cc \
   table/cuckoo_table_reader_test.cc \
+  table/data_block_hash_index_test.cc \
   table/full_filter_block_test.cc \
   table/merger_test.cc \
+  table/sst_file_reader_test.cc \
   table/table_reader_bench.cc \
   table/table_test.cc \
   third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \
@@ -315,6 +361,7 @@ MAIN_SOURCES = \
   tools/ldb_cmd_test.cc \
   tools/reduce_levels_test.cc \
   tools/sst_dump_test.cc \
+  tools/trace_analyzer_test.cc \
   util/arena_test.cc \
   util/auto_roll_logger_test.cc \
   util/autovector_test.cc \
@@ -326,6 +373,7 @@ MAIN_SOURCES = \
   util/filelock_test.cc \
   util/log_write_bench.cc \
   util/rate_limiter_test.cc \
+  util/repeatable_thread_test.cc \
   util/slice_transform_test.cc \
   util/timer_queue_test.cc \
   util/thread_list_test.cc \
@@ -337,24 +385,17 @@ MAIN_SOURCES = \
   utilities/cassandra/cassandra_row_merge_test.cc \
   utilities/cassandra/cassandra_serialize_test.cc \
   utilities/checkpoint/checkpoint_test.cc \
-  utilities/column_aware_encoding_exp.cc \
-  utilities/column_aware_encoding_test.cc \
-  utilities/date_tiered/date_tiered_test.cc \
-  utilities/document/document_db_test.cc \
-  utilities/document/json_document_test.cc \
-  utilities/geodb/geodb_test.cc \
-  utilities/lua/rocks_lua_test.cc \
   utilities/memory/memory_test.cc \
   utilities/merge_operators/string_append/stringappend_test.cc \
   utilities/object_registry_test.cc \
   utilities/option_change_migration/option_change_migration_test.cc \
   utilities/options/options_util_test.cc \
-  utilities/redis/redis_lists_test.cc \
   utilities/simulator_cache/sim_cache_test.cc \
-  utilities/spatialdb/spatial_db_test.cc \
   utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \
   utilities/transactions/optimistic_transaction_test.cc \
   utilities/transactions/transaction_test.cc \
+  utilities/transactions/write_prepared_transaction_test.cc \
+  utilities/transactions/write_unprepared_transaction_test.cc \
   utilities/ttl/ttl_test.cc \
   utilities/write_batch_with_index/write_batch_with_index_test.cc \
@@ -364,7 +405,13 @@ JNI_NATIVE_SOURCES = \
   java/rocksjni/checkpoint.cc \
   java/rocksjni/clock_cache.cc \
   java/rocksjni/columnfamilyhandle.cc \
+  java/rocksjni/compact_range_options.cc \
   java/rocksjni/compaction_filter.cc \
+  java/rocksjni/compaction_filter_factory.cc \
+  java/rocksjni/compaction_filter_factory_jnicallback.cc \
+  java/rocksjni/compaction_job_info.cc \
+  java/rocksjni/compaction_job_stats.cc \
+  java/rocksjni/compaction_options.cc \
   java/rocksjni/compaction_options_fifo.cc \
   java/rocksjni/compaction_options_universal.cc \
   java/rocksjni/comparator.cc \
@@ -375,26 +422,50 @@ JNI_NATIVE_SOURCES = \
   java/rocksjni/ingest_external_file_options.cc \
   java/rocksjni/filter.cc \
   java/rocksjni/iterator.cc \
+  java/rocksjni/jnicallback.cc \
   java/rocksjni/loggerjnicallback.cc \
   java/rocksjni/lru_cache.cc \
   java/rocksjni/memtablejni.cc \
+  java/rocksjni/memory_util.cc \
   java/rocksjni/merge_operator.cc \
+  java/rocksjni/native_comparator_wrapper_test.cc \
+  java/rocksjni/optimistic_transaction_db.cc \
+  java/rocksjni/optimistic_transaction_options.cc \
   java/rocksjni/options.cc \
+  java/rocksjni/options_util.cc \
+  java/rocksjni/persistent_cache.cc \
   java/rocksjni/ratelimiterjni.cc \
   java/rocksjni/remove_emptyvalue_compactionfilterjni.cc \
   java/rocksjni/cassandra_compactionfilterjni.cc \
+  java/rocksjni/cassandra_value_operator.cc \
   java/rocksjni/restorejni.cc \
+  java/rocksjni/rocks_callback_object.cc \
   java/rocksjni/rocksjni.cc \
   java/rocksjni/rocksdb_exception_test.cc \
   java/rocksjni/slice.cc \
   java/rocksjni/snapshot.cc \
+  java/rocksjni/sst_file_manager.cc \
   java/rocksjni/sst_file_writerjni.cc \
   java/rocksjni/statistics.cc \
   java/rocksjni/statisticsjni.cc \
   java/rocksjni/table.cc \
+  java/rocksjni/table_filter.cc \
+  java/rocksjni/table_filter_jnicallback.cc \
+  java/rocksjni/thread_status.cc \
+  java/rocksjni/trace_writer.cc \
+  java/rocksjni/trace_writer_jnicallback.cc \
+  java/rocksjni/transaction.cc \
+  java/rocksjni/transaction_db.cc \
+  java/rocksjni/transaction_options.cc \
+  java/rocksjni/transaction_db_options.cc \
   java/rocksjni/transaction_log.cc \
+  java/rocksjni/transaction_notifier.cc \
+  java/rocksjni/transaction_notifier_jnicallback.cc \
   java/rocksjni/ttl.cc \
+  java/rocksjni/wal_filter.cc \
+  java/rocksjni/wal_filter_jnicallback.cc \
   java/rocksjni/write_batch.cc \
   java/rocksjni/writebatchhandlerjnicallback.cc \
   java/rocksjni/write_batch_test.cc \
-  java/rocksjni/write_batch_with_index.cc
+  java/rocksjni/write_batch_with_index.cc \
+  java/rocksjni/write_buffer_manager.cc
diff --git a/thirdparty/rocksdb/table/adaptive_table_factory.cc b/thirdparty/rocksdb/table/adaptive_table_factory.cc
index 47069f8669..bbba3b9193 100644
--- a/thirdparty/rocksdb/table/adaptive_table_factory.cc
+++ b/thirdparty/rocksdb/table/adaptive_table_factory.cc
@@ -42,9 +42,9 @@ extern const uint64_t kCuckooTableMagicNumber;

 Status AdaptiveTableFactory::NewTableReader(
     const TableReaderOptions& table_reader_options,
-    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table,
-    bool prefetch_index_and_filter_in_cache) const {
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table,
+    bool /*prefetch_index_and_filter_in_cache*/) const {
   Footer footer;
   auto s = ReadFooterFromFile(file.get(), nullptr /* prefetch_buffer */,
                               file_size, &footer);
diff --git a/thirdparty/rocksdb/table/adaptive_table_factory.h b/thirdparty/rocksdb/table/adaptive_table_factory.h
index b7b52ba96f..5534c8b372 100644
--- a/thirdparty/rocksdb/table/adaptive_table_factory.h
+++ b/thirdparty/rocksdb/table/adaptive_table_factory.h
@@ -14,7 +14,6 @@ namespace rocksdb {

 struct EnvOptions;

-using std::unique_ptr;
 class Status;
 class RandomAccessFile;
 class WritableFile;
@@ -35,8 +34,8 @@ class AdaptiveTableFactory : public TableFactory {

   Status NewTableReader(
       const TableReaderOptions& table_reader_options,
-      unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
-      unique_ptr<TableReader>* table,
+      std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+      std::unique_ptr<TableReader>* table,
       bool prefetch_index_and_filter_in_cache = true) const override;

   TableBuilder* NewTableBuilder(
@@ -44,8 +43,9 @@ class AdaptiveTableFactory : public TableFactory {
       uint32_t column_family_id, WritableFileWriter* file) const override;

   // Sanitizes the specified DB Options.
-  Status SanitizeOptions(const DBOptions& db_opts,
-                         const ColumnFamilyOptions& cf_opts) const override {
+  Status SanitizeOptions(
+      const DBOptions& /*db_opts*/,
+      const ColumnFamilyOptions& /*cf_opts*/) const override {
     return Status::OK();
   }
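For context, the adaptive factory touched above works by reading the table footer and dispatching on its magic number to the matching reader. A minimal sketch of that dispatch idea (the magic values and reader types here are stand-ins, not the real RocksDB ones):

```cpp
#include <cstdint>
#include <memory>
#include <string>

struct TableReader { virtual ~TableReader() = default; };
struct PlainReader final : TableReader {};
struct BlockBasedReader final : TableReader {};

// Hypothetical magic numbers; the real values live in each table format.
constexpr uint64_t kPlainTableMagic = 0x1111111111111111ull;
constexpr uint64_t kBlockBasedTableMagic = 0x2222222222222222ull;

std::unique_ptr<TableReader> NewReader(uint64_t footer_magic, std::string* err) {
  if (footer_magic == kPlainTableMagic) return std::make_unique<PlainReader>();
  if (footer_magic == kBlockBasedTableMagic) {
    return std::make_unique<BlockBasedReader>();
  }
  *err = "Unidentified table format";
  return nullptr;
}

int main() {
  std::string err;
  auto r = NewReader(kPlainTableMagic, &err);
  return r ? 0 : 1;
}
```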
diff --git a/thirdparty/rocksdb/table/block.cc b/thirdparty/rocksdb/table/block.cc
index 372bbd2f0b..7c83ebb640 100644
--- a/thirdparty/rocksdb/table/block.cc
+++ b/thirdparty/rocksdb/table/block.cc
@@ -20,6 +20,7 @@
 #include "port/stack_trace.h"
 #include "rocksdb/comparator.h"
 #include "table/block_prefix_index.h"
+#include "table/data_block_footer.h"
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/logging.h"
@@ -33,35 +34,138 @@ namespace rocksdb {
 //
 // If any errors are detected, returns nullptr. Otherwise, returns a
 // pointer to the key delta (just past the three decoded values).
-static inline const char* DecodeEntry(const char* p, const char* limit,
-                                      uint32_t* shared,
-                                      uint32_t* non_shared,
-                                      uint32_t* value_length) {
-  if (limit - p < 3) return nullptr;
-  *shared = reinterpret_cast<const unsigned char*>(p)[0];
-  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
-  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
-  if ((*shared | *non_shared | *value_length) < 128) {
-    // Fast path: all three values are encoded in one byte each
-    p += 3;
-  } else {
-    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
-    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
-    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
+struct DecodeEntry {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    assert(limit - p >= 3);
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    // Using an assert in place of "return null" since we should not pay the
+    // cost of checking for corruption on every single key decoding
+    assert(!(static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)));
+    return p;
+  }
+};
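The fast path above exploits varint32 encoding: any value below 128 occupies exactly one byte, so when all three length fields are small the decoder reads three raw bytes and skips the general varint loop. A self-contained sketch with a minimal varint decoder in the spirit of util/coding.h (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Minimal varint32 decoder (simplified from util/coding.h).
static const char* GetVarint32(const char* p, const char* limit, uint32_t* v) {
  uint32_t result = 0;
  for (int shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*p++);
    result |= (byte & 127) << shift;
    if ((byte & 128) == 0) { *v = result; return p; }
  }
  return nullptr;
}

// Fast-path decode of (shared, non_shared, value_length), as in DecodeEntry.
static const char* DecodeEntrySketch(const char* p, const char* limit,
                                     uint32_t* shared, uint32_t* non_shared,
                                     uint32_t* value_length) {
  assert(limit - p >= 3);
  *shared = static_cast<unsigned char>(p[0]);
  *non_shared = static_cast<unsigned char>(p[1]);
  *value_length = static_cast<unsigned char>(p[2]);
  if ((*shared | *non_shared | *value_length) < 128) {
    return p + 3;  // all three values fit in one byte each
  }
  if ((p = GetVarint32(p, limit, shared)) == nullptr) return nullptr;
  if ((p = GetVarint32(p, limit, non_shared)) == nullptr) return nullptr;
  return GetVarint32(p, limit, value_length);
}

int main() {
  const char buf[3] = {5, 10, 20};
  uint32_t s, n, v;
  const char* q = DecodeEntrySketch(buf, buf + 3, &s, &n, &v);
  assert(q == buf + 3 && s == 5 && n == 10 && v == 20);
  return 0;
}
```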
+
+// Helper routine: similar to DecodeEntry but does not have assertions.
+// Instead, returns nullptr so that caller can detect and report failure.
+struct CheckAndDecodeEntry {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared,
+                                uint32_t* value_length) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) {
+      return nullptr;
+    }
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+    if ((*shared | *non_shared | *value_length) < 128) {
+      // Fast path: all three values are encoded in one byte each
+      p += 3;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
+        return nullptr;
+      }
+    }
+
+    if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
+      return nullptr;
+    }
+    return p;
   }
+};

-  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
-    return nullptr;
+struct DecodeKey {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    uint32_t value_length;
+    return DecodeEntry()(p, limit, shared, non_shared, &value_length);
   }
-  return p;
+};
+
+// In format_version 4, which is used by index blocks, the value size is not
+// encoded before the entry, as the value is known to be the handle with the
+// known size.
+struct DecodeKeyV4 {
+  inline const char* operator()(const char* p, const char* limit,
+                                uint32_t* shared, uint32_t* non_shared) {
+    // We need 2 bytes for shared and non_shared size. We also need one more
+    // byte either for value size or the actual value in case of value delta
+    // encoding.
+    if (limit - p < 3) return nullptr;
+    *shared = reinterpret_cast<const unsigned char*>(p)[0];
+    *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+    if ((*shared | *non_shared) < 128) {
+      // Fast path: both values are encoded in one byte each
+      p += 2;
+    } else {
+      if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
+      if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
+    }
+    return p;
+  }
+};
+
+void DataBlockIter::Next() {
+  assert(Valid());
+  ParseNextDataKey<DecodeEntry>();
 }

-void BlockIter::Next() {
+void DataBlockIter::NextOrReport() {
   assert(Valid());
-  ParseNextKey();
+  ParseNextDataKey<CheckAndDecodeEntry>();
 }

-void BlockIter::Prev() {
+void IndexBlockIter::Next() {
+  assert(Valid());
+  ParseNextIndexKey();
+}
+
+void IndexBlockIter::Prev() {
+  assert(Valid());
+  // Scan backwards to a restart point before current_
+  const uint32_t original = current_;
+  while (GetRestartPoint(restart_index_) >= original) {
+    if (restart_index_ == 0) {
+      // No more entries
+      current_ = restarts_;
+      restart_index_ = num_restarts_;
+      return;
+    }
+    restart_index_--;
+  }
+  SeekToRestartPoint(restart_index_);
+  do {
+    if (!ParseNextIndexKey()) {
+      break;
+    }
+    // Loop until end of current entry hits the start of original entry
+  } while (NextEntryOffset() < original);
+}
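The DecodeEntry/CheckAndDecodeEntry split is a decode-policy pattern: the hot path (`Next()`) instantiates the asserting decoder, while the `...OrReport` entry points instantiate the checking decoder that surfaces corruption instead of aborting. A minimal sketch of that template-policy selection (stand-in types, not the RocksDB structs):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Unchecked policy: validation is paid only in debug builds.
struct Unchecked {
  bool operator()(const uint8_t* p, const uint8_t* limit) const {
    assert(limit - p >= 3);
    return true;
  }
};

// Checked policy: validates and reports instead of aborting.
struct Checked {
  bool operator()(const uint8_t* p, const uint8_t* limit) const {
    return limit - p >= 3;
  }
};

template <typename DecodePolicy>
bool Step(const uint8_t* p, const uint8_t* limit) {
  return DecodePolicy()(p, limit);
}

int main() {
  uint8_t buf[2] = {0, 0};
  // Next() would instantiate Step<Unchecked>; NextOrReport() uses Step<Checked>.
  if (!Step<Checked>(buf, buf + 2)) std::puts("corruption reported, not aborted");
  return 0;
}
```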
+// Similar to IndexBlockIter::Prev but also caches the prev entries
+void DataBlockIter::Prev() {
   assert(Valid());

   assert(prev_entries_idx_ == -1 ||
@@ -87,7 +191,7 @@ void BlockIter::Prev() {
     const Slice current_key(key_ptr, current_prev_entry.key_size);

     current_ = current_prev_entry.offset;
-    key_.SetInternalKey(current_key, false /* copy */);
+    key_.SetKey(current_key, false /* copy */);
     value_ = current_prev_entry.value;

     return;
@@ -113,7 +217,7 @@ void BlockIter::Prev() {
   SeekToRestartPoint(restart_index_);

   do {
-    if (!ParseNextKey()) {
+    if (!ParseNextDataKey<DecodeEntry>()) {
       break;
     }
     Slice current_key = key();
@@ -135,7 +239,151 @@ void BlockIter::Prev() {
   prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
 }

-void BlockIter::Seek(const Slice& target) {
+void DataBlockIter::Seek(const Slice& target) {
+  Slice seek_key = target;
+  PERF_TIMER_GUARD(block_seek_nanos);
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  uint32_t index = 0;
+  bool ok = BinarySeek<DecodeEntry>(seek_key, 0, num_restarts_ - 1, &index,
+                                    comparator_);
+
+  if (!ok) {
+    return;
+  }
+  SeekToRestartPoint(index);
+  // Linear search (within restart block) for first key >= target
+
+  while (true) {
+    if (!ParseNextDataKey<DecodeEntry>() || Compare(key_, seek_key) >= 0) {
+      return;
+    }
+  }
+}
+
+// Optimized Seek for point lookup for an internal key `target`
+// target = "seek_user_key @ type | seqno".
+//
+// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex, this function behaves identically to Seek().
+//
+// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
+// or kTypeBlobIndex:
+//
+// If the return value is FALSE, iter location is undefined, and it means:
+// 1) there is no key in this block falling into the range:
+//    ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
+//    inclusive; AND
+// 2) the last key of this block has a greater user_key from seek_user_key
+//
+// If the return value is TRUE, iter location has two possibilities:
+// 1) If iter is valid, it is set to a location as if set by BinarySeek. In
+//    this case, it points to the first key_ with a larger user_key or a
+//    matching user_key with a seqno no greater than the seeking seqno.
+// 2) If the iter is invalid, it means that either all the user_key is less
+//    than the seek_user_key, or the block ends with a matching user_key but
+//    with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
+//    but larger type).
+bool DataBlockIter::SeekForGetImpl(const Slice& target) {
+  Slice user_key = ExtractUserKey(target);
+  uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
+  uint8_t entry = data_block_hash_index_->Lookup(data_, map_offset, user_key);
+
+  if (entry == kCollision) {
+    // HashSeek not effective, falling back
+    Seek(target);
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    // Even if we cannot find the user_key in this block, the result may
+    // exist in the next block. Consider this example:
+    //
+    // Block N:      [aab@100, ... , app@120]
+    // boundary key: axy@50 (we make minimal assumption about a boundary key)
+    // Block N+1:    [axy@10, ... ]
+    //
+    // If seek_key = axy@60, the search will start from Block N.
+    // Even if the user_key is not found in the hash map, the caller still
+    // has to continue searching the next block.
+    //
+    // In this case, we pretend the key is in the last restart interval.
+    // The while-loop below will search the last restart interval for the
+    // key. It will stop at the first key that is larger than the seek_key,
+    // or at the end of the block if none is larger.
+    entry = static_cast<uint8_t>(num_restarts_ - 1);
+  }
+
+  uint32_t restart_index = entry;
+
+  // check if the key is in the restart_interval
+  assert(restart_index < num_restarts_);
+  SeekToRestartPoint(restart_index);
+
+  const char* limit = nullptr;
+  if (restart_index_ + 1 < num_restarts_) {
+    limit = data_ + GetRestartPoint(restart_index_ + 1);
+  } else {
+    limit = data_ + restarts_;
+  }
+
+  while (true) {
+    // Here we only linear seek the target key inside the restart interval.
+    // If a key does not exist inside a restart interval, we avoid
+    // further searching the block content across restart interval boundary.
+    //
+    // TODO(fwu): check the left and right boundary of the restart interval
+    // to avoid linear seek a target key that is out of range.
+    if (!ParseNextDataKey<DecodeEntry>(limit) || Compare(key_, target) >= 0) {
+      // we stop at the first potential matching user key.
+      break;
+    }
+  }
+
+  if (current_ == restarts_) {
+    // Search reaches to the end of the block. There are three possibilities:
+    // 1) there is only one user_key match in the block (otherwise collision).
+    //    the matching user_key resides in the last restart interval, and it
+    //    is the last key of the restart interval and of the block as well.
+    //    ParseNextDataKey() skipped it as its [ type | seqno ] is smaller.
+    //
+    // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
+    //    AND all existing user_keys in the restart interval are smaller than
+    //    seek_user_key.
+    //
+    // 3) The seek_key is a false positive and happens to be hashed to the
+    //    last restart interval, AND all existing user_keys in the restart
+    //    interval are smaller than seek_user_key.
+    //
+    // The result may exist in the next block in each case, so we return true.
+    return true;
+  }
+
+  if (user_comparator_->Compare(key_.GetUserKey(), user_key) != 0) {
+    // the key is not in this block and cannot be at the next block either.
+    return false;
+  }
+
+  // Here we are conservative and only support a limited set of cases
+  ValueType value_type = ExtractValueType(key_.GetKey());
+  if (value_type != ValueType::kTypeValue &&
+      value_type != ValueType::kTypeDeletion &&
+      value_type != ValueType::kTypeSingleDeletion &&
+      value_type != ValueType::kTypeBlobIndex) {
+    Seek(target);
+    return true;
+  }
+
+  // Result found, and the iter is correctly set.
+  return true;
+}
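SeekForGetImpl is a three-way dispatch on the hash-index lookup: a collision falls back to binary search, "no entry" probes the last restart interval (the key may live in the next block), and a hit restricts the scan to a single restart interval. A minimal sketch of that contract (marker values and the map-based index are stand-ins for DataBlockHashIndex):

```cpp
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

// Marker values mirroring the kNoEntry/kCollision idea (values are stand-ins).
constexpr uint8_t kNoEntry = 255;
constexpr uint8_t kCollision = 254;

struct HashIndexSketch {
  std::unordered_map<std::string, uint8_t> buckets;
  uint8_t Lookup(const std::string& user_key) const {
    auto it = buckets.find(user_key);
    return it == buckets.end() ? kNoEntry : it->second;
  }
};

void SeekForGetSketch(const HashIndexSketch& index, const std::string& key,
                      uint32_t num_restarts) {
  uint8_t entry = index.Lookup(key);
  if (entry == kCollision) {
    std::puts("fall back to full binary search over restart points");
  } else if (entry == kNoEntry) {
    // The key may still exist in the next block: probe the last interval.
    std::printf("probe last restart interval %u\n", num_restarts - 1);
  } else {
    std::printf("linear scan of restart interval %u only\n", entry);
  }
}

int main() {
  HashIndexSketch index;
  index.buckets["alpha"] = 3;
  SeekForGetSketch(index, "alpha", 16);  // hit: scan one interval
  SeekForGetSketch(index, "beta", 16);   // miss: probe last interval
  return 0;
}
```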
+void IndexBlockIter::Seek(const Slice& target) {
+  Slice seek_key = target;
+  if (!key_includes_seq_) {
+    seek_key = ExtractUserKey(target);
+  }
   PERF_TIMER_GUARD(block_seek_nanos);
   if (data_ == nullptr) {  // Not init yet
     return;
@@ -144,8 +392,12 @@ void BlockIter::Seek(const Slice& target) {
   bool ok = false;
   if (prefix_index_) {
     ok = PrefixSeek(target, &index);
+  } else if (value_delta_encoded_) {
+    ok = BinarySeek<DecodeKeyV4>(seek_key, 0, num_restarts_ - 1, &index,
+                                 comparator_);
   } else {
-    ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
+    ok = BinarySeek<DecodeKey>(seek_key, 0, num_restarts_ - 1, &index,
+                               comparator_);
   }

   if (!ok) {
@@ -155,57 +407,85 @@ void BlockIter::Seek(const Slice& target) {
   // Linear search (within restart block) for first key >= target

   while (true) {
-    if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
+    if (!ParseNextIndexKey() || Compare(key_, seek_key) >= 0) {
       return;
     }
   }
 }

-void BlockIter::SeekForPrev(const Slice& target) {
+void DataBlockIter::SeekForPrev(const Slice& target) {
   PERF_TIMER_GUARD(block_seek_nanos);
+  Slice seek_key = target;
   if (data_ == nullptr) {  // Not init yet
     return;
   }
   uint32_t index = 0;
-  bool ok = false;
-  ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
+  bool ok = BinarySeek<DecodeEntry>(seek_key, 0, num_restarts_ - 1, &index,
+                                    comparator_);

   if (!ok) {
     return;
   }
   SeekToRestartPoint(index);
-  // Linear search (within restart block) for first key >= target
+  // Linear search (within restart block) for first key >= seek_key

-  while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
+  while (ParseNextDataKey<DecodeEntry>() && Compare(key_, seek_key) < 0) {
   }
   if (!Valid()) {
     SeekToLast();
   } else {
-    while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
+    while (Valid() && Compare(key_, seek_key) > 0) {
       Prev();
     }
   }
 }

-void BlockIter::SeekToFirst() {
+void DataBlockIter::SeekToFirst() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(0);
+  ParseNextDataKey<DecodeEntry>();
+}
+
+void DataBlockIter::SeekToFirstOrReport() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(0);
+  ParseNextDataKey<CheckAndDecodeEntry>();
+}
+
+void IndexBlockIter::SeekToFirst() {
   if (data_ == nullptr) {  // Not init yet
     return;
   }
   SeekToRestartPoint(0);
-  ParseNextKey();
+  ParseNextIndexKey();
+}
+
+void DataBlockIter::SeekToLast() {
+  if (data_ == nullptr) {  // Not init yet
+    return;
+  }
+  SeekToRestartPoint(num_restarts_ - 1);
+  while (ParseNextDataKey<DecodeEntry>() && NextEntryOffset() < restarts_) {
+    // Keep skipping
+  }
 }

-void BlockIter::SeekToLast() {
+void IndexBlockIter::SeekToLast() {
   if (data_ == nullptr) {  // Not init yet
     return;
   }
   SeekToRestartPoint(num_restarts_ - 1);
-  while (ParseNextKey() && NextEntryOffset() < restarts_) {
+  while (ParseNextIndexKey() && NextEntryOffset() < restarts_) {
     // Keep skipping
   }
 }

-void BlockIter::CorruptionError() {
+template <class TValue>
+void BlockIter<TValue>::CorruptionError() {
   current_ = restarts_;
   restart_index_ = num_restarts_;
   status_ = Status::Corruption("bad entry in block");
@@ -213,10 +493,14 @@ void BlockIter::CorruptionError() {
   value_.clear();
 }

-bool BlockIter::ParseNextKey() {
+template <typename DecodeEntryFunc>
+bool DataBlockIter::ParseNextDataKey(const char* limit) {
   current_ = NextEntryOffset();
   const char* p = data_ + current_;
-  const char* limit = data_ + restarts_;  // Restarts come right after data
+  if (!limit) {
+    limit = data_ + restarts_;  // Restarts come right after data
+  }
+
   if (p >= limit) {
     // No more entries to return.  Mark as invalid.
     current_ = restarts_;
@@ -226,7 +510,7 @@ bool BlockIter::ParseNextKey() {

   // Decode next entry
   uint32_t shared, non_shared, value_length;
-  p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+  p = DecodeEntryFunc()(p, limit, &shared, &non_shared, &value_length);
   if (p == nullptr || key_.Size() < shared) {
     CorruptionError();
     return false;
@@ -234,7 +518,7 @@ bool BlockIter::ParseNextKey() {
   if (shared == 0) {
     // If this key doesn't share any bytes with prev key then we don't need
     // to decode it and can use its address in the block directly.
-    key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
+    key_.SetKey(Slice(p, non_shared), false /* copy */);
     key_pinned_ = true;
   } else {
     // This key shares `shared` bytes with prev key, we need to decode it
@@ -245,13 +529,14 @@
   if (global_seqno_ != kDisableGlobalSequenceNumber) {
     // If we are reading a file with a global sequence number we should
     // expect that all encoded sequence numbers are zeros and any value
-    // type is kTypeValue, kTypeMerge or kTypeDeletion
+    // type is kTypeValue, kTypeMerge, kTypeDeletion, or kTypeRangeDeletion.
     assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

-    ValueType value_type = ExtractValueType(key_.GetInternalKey());
+    ValueType value_type = ExtractValueType(key_.GetKey());
     assert(value_type == ValueType::kTypeValue ||
            value_type == ValueType::kTypeMerge ||
-           value_type == ValueType::kTypeDeletion);
+           value_type == ValueType::kTypeDeletion ||
+           value_type == ValueType::kTypeRangeDeletion);

     if (key_pinned_) {
       // TODO(tec): Investigate updating the seqno in the loaded block
@@ -267,11 +552,97 @@
   }

   value_ = Slice(p + non_shared, value_length);
+  if (shared == 0) {
+    while (restart_index_ + 1 < num_restarts_ &&
+           GetRestartPoint(restart_index_ + 1) < current_) {
+      ++restart_index_;
+    }
+  }
+  // else we are in the middle of a restart interval and the restart_index_
+  // thus has not changed
+  return true;
+  }
+}
+
+bool IndexBlockIter::ParseNextIndexKey() {
+  current_ = NextEntryOffset();
+  const char* p = data_ + current_;
+  const char* limit = data_ + restarts_;  // Restarts come right after data
+  if (p >= limit) {
+    // No more entries to return.  Mark as invalid.
+    current_ = restarts_;
+    restart_index_ = num_restarts_;
+    return false;
+  }
+
+  // Decode next entry
+  uint32_t shared, non_shared, value_length;
+  if (value_delta_encoded_) {
+    p = DecodeKeyV4()(p, limit, &shared, &non_shared);
+    value_length = 0;
+  } else {
+    p = DecodeEntry()(p, limit, &shared, &non_shared, &value_length);
+  }
+  if (p == nullptr || key_.Size() < shared) {
+    CorruptionError();
+    return false;
+  }
+  if (shared == 0) {
+    // If this key doesn't share any bytes with prev key then we don't need
+    // to decode it and can use its address in the block directly.
+    key_.SetKey(Slice(p, non_shared), false /* copy */);
+    key_pinned_ = true;
+  } else {
+    // This key shares `shared` bytes with prev key, we need to decode it
+    key_.TrimAppend(shared, p, non_shared);
+    key_pinned_ = false;
+  }
+  value_ = Slice(p + non_shared, value_length);
+
+  if (shared == 0) {
     while (restart_index_ + 1 < num_restarts_ &&
            GetRestartPoint(restart_index_ + 1) < current_) {
       ++restart_index_;
     }
-    return true;
+  }
+  // else we are in the middle of a restart interval and the restart_index_
+  // thus has not changed
+  if (value_delta_encoded_) {
+    assert(value_length == 0);
+    DecodeCurrentValue(shared);
+  }
+  return true;
+}
+
+// The format:
+// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// ...
+// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
+// where, k is key, v is value, and its encoding is in parentheses.
+// The format of each key is (shared_size, non_shared_size, shared, non_shared)
+// The format of each value, i.e., block handle, is (offset, size) whenever the
+// shared_size is 0, which includes the first entry in each restart point.
+// Otherwise the format is delta-size = block handle size - size of last block
+// handle.
+void IndexBlockIter::DecodeCurrentValue(uint32_t shared) {
+  assert(value_delta_encoded_);
+  const char* limit = data_ + restarts_;
+  if (shared == 0) {
+    uint64_t o, s;
+    const char* newp = GetVarint64Ptr(value_.data(), limit, &o);
+    assert(newp);
+    newp = GetVarint64Ptr(newp, limit, &s);
+    assert(newp);
+    decoded_value_ = BlockHandle(o, s);
+    value_ = Slice(value_.data(), newp - value_.data());
+  } else {
+    uint64_t next_value_base =
+        decoded_value_.offset() + decoded_value_.size() + kBlockTrailerSize;
+    int64_t delta;
+    const char* newp = GetVarsignedint64Ptr(value_.data(), limit, &delta);
+    decoded_value_ =
+        BlockHandle(next_value_base, decoded_value_.size() + delta);
+    value_ = Slice(value_.data(), newp - value_.data());
   }
 }
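DecodeCurrentValue relies on the invariant that data blocks are laid out back to back: within a restart interval only the size delta of the next block handle is stored, and its offset is derived from the previous handle plus the fixed block trailer. A minimal sketch of that arithmetic (kBlockTrailerSize = 5 matches the 1-byte compression type + 4-byte CRC trailer; the handle struct is a stand-in):

```cpp
#include <cassert>
#include <cstdint>

constexpr uint64_t kBlockTrailerSize = 5;  // compression byte + crc32

struct BlockHandleSketch {
  uint64_t offset;
  uint64_t size;
};

// Reconstruct the next handle from the previous one plus a signed size delta,
// as IndexBlockIter::DecodeCurrentValue does for delta-encoded values.
BlockHandleSketch NextHandle(const BlockHandleSketch& prev, int64_t size_delta) {
  BlockHandleSketch next;
  next.offset = prev.offset + prev.size + kBlockTrailerSize;
  next.size =
      static_cast<uint64_t>(static_cast<int64_t>(prev.size) + size_delta);
  return next;
}

int main() {
  BlockHandleSketch first{0, 4096};
  BlockHandleSketch second = NextHandle(first, -128);
  assert(second.offset == 4101 && second.size == 3968);
  return 0;
}
```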
@@ -279,22 +650,25 @@ bool BlockIter::ParseNextKey() {
 // is either the last restart point with a key less than target,
 // which means the key of next restart point is larger than target, or
 // the first restart point with a key = target
-bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
-                           uint32_t* index) {
+template <class TValue>
+template <typename DecodeKeyFunc>
+bool BlockIter<TValue>::BinarySeek(const Slice& target, uint32_t left,
+                                   uint32_t right, uint32_t* index,
+                                   const Comparator* comp) {
   assert(left <= right);

   while (left < right) {
     uint32_t mid = (left + right + 1) / 2;
     uint32_t region_offset = GetRestartPoint(mid);
-    uint32_t shared, non_shared, value_length;
-    const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
-                                      &shared, &non_shared, &value_length);
+    uint32_t shared, non_shared;
+    const char* key_ptr = DecodeKeyFunc()(
+        data_ + region_offset, data_ + restarts_, &shared, &non_shared);
     if (key_ptr == nullptr || (shared != 0)) {
       CorruptionError();
       return false;
     }
     Slice mid_key(key_ptr, non_shared);
-    int cmp = Compare(mid_key, target);
+    int cmp = comp->Compare(mid_key, target);
     if (cmp < 0) {
       // Key at "mid" is smaller than "target". Therefore all
       // blocks before "mid" are uninteresting.
@@ -314,11 +688,15 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,

 // Compare target key and the block key of the block of `block_index`.
 // Return -1 if error.
-int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
+int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
   uint32_t region_offset = GetRestartPoint(block_index);
-  uint32_t shared, non_shared, value_length;
-  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
-                                    &shared, &non_shared, &value_length);
+  uint32_t shared, non_shared;
+  const char* key_ptr =
+      value_delta_encoded_
+          ? DecodeKeyV4()(data_ + region_offset, data_ + restarts_, &shared,
+                          &non_shared)
+          : DecodeKey()(data_ + region_offset, data_ + restarts_, &shared,
+                        &non_shared);
   if (key_ptr == nullptr || (shared != 0)) {
     CorruptionError();
     return 1;  // Return target is smaller
@@ -329,9 +707,9 @@ int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {

 // Binary search in block_ids to find the first block
 // with a key >= target
-bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
-                                     uint32_t left, uint32_t right,
-                                     uint32_t* index) {
+bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
+                                          uint32_t* block_ids, uint32_t left,
+                                          uint32_t right, uint32_t* index) {
   assert(left <= right);
   uint32_t left_bound = left;

@@ -379,22 +757,62 @@ bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
   }
 }

-bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
+bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
   assert(prefix_index_);
+  Slice seek_key = target;
+  if (!key_includes_seq_) {
+    seek_key = ExtractUserKey(target);
+  }
   uint32_t* block_ids = nullptr;
   uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

   if (num_blocks == 0) {
     current_ = restarts_;
     return false;
-  } else {
-    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
+  } else {
+    return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index);
   }
 }

 uint32_t Block::NumRestarts() const {
-  assert(size_ >= 2*sizeof(uint32_t));
-  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  assert(size_ >= 2 * sizeof(uint32_t));
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // In BlockBuilder, we have ensured a block with HashIndex is less than
+    // kMaxBlockSizeSupportedByHashIndex (64KiB).
+    //
+    // Therefore, if we encounter a block with a size > 64KiB, the block
+    // cannot have HashIndex. So the footer will be directly interpreted as
+    // num_restarts.
+    //
+    // Such a check is for backward compatibility. It ensures a legacy
+    // block with a very large num_restarts, i.e. >= 0x80000000, is
+    // interpreted correctly as having no HashIndex even though the MSB of
+    // num_restarts is set.
+    return num_restarts;
+  }
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return num_restarts;
+}
+
+BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
+  assert(size_ >= 2 * sizeof(uint32_t));
+  if (size_ > kMaxBlockSizeSupportedByHashIndex) {
+    // The check is for the same reason as that in NumRestarts()
+    return BlockBasedTableOptions::kDataBlockBinarySearch;
+  }
+  uint32_t block_footer = DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+  uint32_t num_restarts = block_footer;
+  BlockBasedTableOptions::DataBlockIndexType index_type;
+  UnPackIndexTypeAndNumRestarts(block_footer, &index_type, &num_restarts);
+  return index_type;
+}
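The NumRestarts()/IndexType() pair above decodes a packed footer: for blocks small enough to carry a hash index, the high bit of the 32-bit footer encodes the data-block index type and the remaining bits encode num_restarts. A minimal sketch of that packing scheme (the exact bit layout here is illustrative, chosen to match the MSB trick described in the comments):

```cpp
#include <cassert>
#include <cstdint>

enum DataBlockIndexTypeSketch : uint32_t { kBinarySearch = 0, kBinaryAndHash = 1 };

constexpr uint32_t kTypeShift = 31;
constexpr uint32_t kNumRestartsMask = (1u << kTypeShift) - 1;

uint32_t PackFooter(DataBlockIndexTypeSketch type, uint32_t num_restarts) {
  assert(num_restarts <= kNumRestartsMask);
  return (static_cast<uint32_t>(type) << kTypeShift) | num_restarts;
}

void UnpackFooter(uint32_t footer, DataBlockIndexTypeSketch* type,
                  uint32_t* num_restarts) {
  *type = static_cast<DataBlockIndexTypeSketch>(footer >> kTypeShift);
  *num_restarts = footer & kNumRestartsMask;
}

int main() {
  DataBlockIndexTypeSketch t;
  uint32_t n;
  UnpackFooter(PackFooter(kBinaryAndHash, 17), &t, &n);
  assert(t == kBinaryAndHash && n == 17);
  return 0;
}
```

The size guard in the real code exists because legacy blocks wrote a raw num_restarts with no type bit; restricting hash-indexed blocks to 64KiB keeps the two encodings distinguishable.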
+
+Block::~Block() {
+  // This sync point can be re-enabled if RocksDB can control the
+  // initialization order of any/all static options created by the user.
+  // TEST_SYNC_POINT("Block::~Block");
 }

 Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
@@ -402,16 +820,51 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
     : contents_(std::move(contents)),
       data_(contents_.data.data()),
       size_(contents_.data.size()),
+      restart_offset_(0),
+      num_restarts_(0),
       global_seqno_(_global_seqno) {
+  TEST_SYNC_POINT("Block::Block:0");
   if (size_ < sizeof(uint32_t)) {
     size_ = 0;  // Error marker
   } else {
-    restart_offset_ =
-        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
-    if (restart_offset_ > size_ - sizeof(uint32_t)) {
-      // The size is too small for NumRestarts() and therefore
-      // restart_offset_ wrapped around.
-      size_ = 0;
+    // Should only decode restart points for uncompressed blocks
+    num_restarts_ = NumRestarts();
+    switch (IndexType()) {
+      case BlockBasedTableOptions::kDataBlockBinarySearch:
+        restart_offset_ = static_cast<uint32_t>(size_) -
+                          (1 + num_restarts_) * sizeof(uint32_t);
+        if (restart_offset_ > size_ - sizeof(uint32_t)) {
+          // The size is too small for NumRestarts() and therefore
+          // restart_offset_ wrapped around.
+          size_ = 0;
+        }
+        break;
+      case BlockBasedTableOptions::kDataBlockBinaryAndHash:
+        if (size_ < sizeof(uint32_t) /* block footer */ +
+                        sizeof(uint16_t) /* NUM_BUCK */) {
+          size_ = 0;
+          break;
+        }
+
+        uint16_t map_offset;
+        data_block_hash_index_.Initialize(
+            contents.data.data(),
+            static_cast<uint16_t>(contents.data.size() -
+                                  sizeof(uint32_t)), /*chop off
+                                                       NUM_RESTARTS*/
+            &map_offset);
+
+        restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t);
+
+        if (restart_offset_ > map_offset) {
+          // map_offset is too small for NumRestarts() and
+          // therefore restart_offset_ wrapped around.
+          size_ = 0;
+          break;
+        }
+        break;
+      default:
+        size_ = 0;  // Error marker
     }
   }
   if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
@@ -420,37 +873,33 @@ Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
   }
 }

-InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
-                                     bool total_order_seek, Statistics* stats) {
-  if (size_ < 2*sizeof(uint32_t)) {
-    if (iter != nullptr) {
-      iter->SetStatus(Status::Corruption("bad block contents"));
-      return iter;
-    } else {
-      return NewErrorInternalIterator(Status::Corruption("bad block contents"));
-    }
+template <>
+DataBlockIter* Block::NewIterator(const Comparator* cmp, const Comparator* ucmp,
+                                  DataBlockIter* iter, Statistics* stats,
+                                  bool /*total_order_seek*/,
+                                  bool /*key_includes_seq*/,
+                                  bool /*value_is_full*/,
+                                  bool block_contents_pinned,
+                                  BlockPrefixIndex* /*prefix_index*/) {
+  DataBlockIter* ret_iter;
+  if (iter != nullptr) {
+    ret_iter = iter;
+  } else {
+    ret_iter = new DataBlockIter;
   }
-  const uint32_t num_restarts = NumRestarts();
-  if (num_restarts == 0) {
-    if (iter != nullptr) {
-      iter->SetStatus(Status::OK());
-      return iter;
-    } else {
-      return NewEmptyInternalIterator();
-    }
+  if (size_ < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    return ret_iter;
+  }
+  if (num_restarts_ == 0) {
+    // Empty block.
+    ret_iter->Invalidate(Status::OK());
+    return ret_iter;
   } else {
+    ret_iter->Initialize(
+        cmp, ucmp, data_, restart_offset_, num_restarts_, global_seqno_,
+        read_amp_bitmap_.get(), block_contents_pinned,
+        data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr);
    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
@@ -459,17 +908,51 @@ InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
      }
    }

-  return iter;
+  return ret_iter;
 }
-void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
-  prefix_index_.reset(prefix_index);
-
-  if (iter != nullptr) {
-    iter->Initialize(cmp, data_, restart_offset_, num_restarts,
-                     prefix_index_ptr, global_seqno_, read_amp_bitmap_.get());
-  } else {
-    iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
-                         prefix_index_ptr, global_seqno_,
-                         read_amp_bitmap_.get());
-  }
+template <>
+IndexBlockIter* Block::NewIterator(const Comparator* cmp,
+                                   const Comparator* ucmp, IndexBlockIter* iter,
+                                   Statistics* /*stats*/, bool total_order_seek,
+                                   bool key_includes_seq, bool value_is_full,
+                                   bool block_contents_pinned,
+                                   BlockPrefixIndex* prefix_index) {
+  IndexBlockIter* ret_iter;
+  if (iter != nullptr) {
+    ret_iter = iter;
+  } else {
+    ret_iter = new IndexBlockIter;
+  }
+  if (size_ < 2 * sizeof(uint32_t)) {
+    ret_iter->Invalidate(Status::Corruption("bad block contents"));
+    return ret_iter;
+  }
+  if (num_restarts_ == 0) {
+    // Empty block.
+    ret_iter->Invalidate(Status::OK());
+    return ret_iter;
+  } else {
+    BlockPrefixIndex* prefix_index_ptr =
+        total_order_seek ? nullptr : prefix_index;
+    ret_iter->Initialize(cmp, ucmp, data_, restart_offset_, num_restarts_,
+                         prefix_index_ptr, key_includes_seq, value_is_full,
+                         block_contents_pinned,
+                         nullptr /* data_block_hash_index */);
+  }
+
+  return ret_iter;
 }

 size_t Block::ApproximateMemoryUsage() const {
   size_t usage = usable_size();
-  if (prefix_index_) {
-    usage += prefix_index_->ApproximateMemoryUsage();
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+  usage += malloc_usable_size((void*)this);
+#else
+  usage += sizeof(*this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+  if (read_amp_bitmap_) {
+    usage += read_amp_bitmap_->ApproximateMemoryUsage();
   }
   return usage;
 }
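Before the block.h changes that follow: the BlockReadAmpBitmap being threaded through these iterators maps every 2^k bytes of a block to one bit and marks bits as ranges are read, so "useful bytes read / bytes loaded" can be estimated. A simplified, single-threaded sketch of the idea (no atomics or statistics hookup, unlike the real class):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

class ReadAmpBitmapSketch {
 public:
  ReadAmpBitmapSketch(size_t block_size, size_t bytes_per_bit_pow)
      : pow_(bytes_per_bit_pow),
        // num_bits_needed = ceil(block_size / bytes_per_bit)
        bits_(((block_size - 1) >> bytes_per_bit_pow) + 1, false) {}

  // Mark the byte range [start_offset, end_offset] as read.
  void Mark(uint32_t start_offset, uint32_t end_offset) {
    for (size_t b = start_offset >> pow_; b <= (end_offset >> pow_); ++b) {
      bits_[b] = true;
    }
  }

  size_t MarkedBits() const {
    size_t n = 0;
    for (bool b : bits_) n += b;
    return n;
  }

 private:
  size_t pow_;
  std::vector<bool> bits_;
};

int main() {
  ReadAmpBitmapSketch bm(4096, 5);  // one bit per 32 bytes
  bm.Mark(0, 63);                   // touches the first two 32-byte chunks
  assert(bm.MarkedBits() == 2);
  return 0;
}
```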
diff --git a/thirdparty/rocksdb/table/block.h b/thirdparty/rocksdb/table/block.h
index 59dc167433..737874abdf 100644
--- a/thirdparty/rocksdb/table/block.h
+++ b/thirdparty/rocksdb/table/block.h
@@ -22,20 +22,25 @@

 #include "db/dbformat.h"
 #include "db/pinned_iterators_manager.h"
+#include "format.h"
 #include "rocksdb/iterator.h"
 #include "rocksdb/options.h"
 #include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
 #include "table/block_prefix_index.h"
+#include "table/data_block_hash_index.h"
 #include "table/internal_iterator.h"
 #include "util/random.h"
 #include "util/sync_point.h"
-#include "format.h"

 namespace rocksdb {

 struct BlockContents;
 class Comparator;
+template <class TValue>
 class BlockIter;
+class DataBlockIter;
+class IndexBlockIter;
 class BlockPrefixIndex;

 // BlockReadAmpBitmap is a bitmap that map the rocksdb::Block data bytes to
@@ -48,8 +53,8 @@ class BlockReadAmpBitmap {
       : bitmap_(nullptr),
         bytes_per_bit_pow_(0),
         statistics_(statistics),
-        rnd_(
-            Random::GetTLSInstance()->Uniform(static_cast<uint32_t>(bytes_per_bit))) {
+        rnd_(Random::GetTLSInstance()->Uniform(
+            static_cast<uint32_t>(bytes_per_bit))) {
     TEST_SYNC_POINT_CALLBACK("BlockReadAmpBitmap:rnd", &rnd_);
     assert(block_size > 0 && bytes_per_bit > 0);

@@ -59,8 +64,7 @@ class BlockReadAmpBitmap {
   }

   // num_bits_needed = ceil(block_size / bytes_per_bit)
-  size_t num_bits_needed =
-    ((block_size - 1) >> bytes_per_bit_pow_) + 1;
+  size_t num_bits_needed = ((block_size - 1) >> bytes_per_bit_pow_) + 1;
   assert(num_bits_needed > 0);

   // bitmap_size = ceil(num_bits_needed / kBitsPerEntry)
@@ -104,6 +108,13 @@ class BlockReadAmpBitmap {

   uint32_t GetBytesPerBit() { return 1 << bytes_per_bit_pow_; }

+  size_t ApproximateMemoryUsage() const {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+    return malloc_usable_size((void*)this);
+#endif  // ROCKSDB_MALLOC_USABLE_SIZE
+    return sizeof(*this);
+  }
+
  private:
   // Get the current value of bit at `bit_idx` and set it to 1
   inline bool GetAndSet(uint32_t bit_idx) {
@@ -137,42 +148,53 @@ class Block {
         size_t read_amp_bytes_per_bit = 0,
         Statistics* statistics = nullptr);

-  ~Block() = default;
+  ~Block();

   size_t size() const { return size_; }
   const char* data() const { return data_; }
-  bool cachable() const { return contents_.cachable; }
-  size_t usable_size() const {
-#ifdef ROCKSDB_MALLOC_USABLE_SIZE
-    if (contents_.allocation.get() != nullptr) {
-      return malloc_usable_size(contents_.allocation.get());
-    }
-#endif  // ROCKSDB_MALLOC_USABLE_SIZE
-    return size_;
-  }
+  // The additional memory space taken by the block data.
+  size_t usable_size() const { return contents_.usable_size(); }
   uint32_t NumRestarts() const;
-  CompressionType compression_type() const {
-    return contents_.compression_type;
-  }
+  bool own_bytes() const { return contents_.own_bytes(); }

-  // If hash index lookup is enabled and `use_hash_index` is true. This block
-  // will do hash lookup for the key prefix.
-  //
-  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
-  // the iterator will simply be set as "invalid", rather than returning
-  // the key that is just pass the target key.
+  BlockBasedTableOptions::DataBlockIndexType IndexType() const;
+
+  // If comparator is InternalKeyComparator, user_comparator is its user
+  // comparator; they are equal otherwise.
   //
   // If iter is null, return new Iterator
   // If iter is not null, update this one and return it as Iterator*
   //
-  // If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
-  // This option only applies for index block. For data block, hash_index_
-  // and prefix_index_ are null, so this option does not matter.
-  InternalIterator* NewIterator(const Comparator* comparator,
-                                BlockIter* iter = nullptr,
-                                bool total_order_seek = true,
-                                Statistics* stats = nullptr);
-  void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
+  // key_includes_seq, default true, means that the keys are in internal key
+  // format.
+  // value_is_full, default true, means that no delta encoding is
+  // applied to values.
+  //
+  // NewIterator<DataBlockIter>
+  // Same as above but also updates read_amp_bitmap_ if it is not nullptr.
+  //
+  // NewIterator<IndexBlockIter>
+  // If `prefix_index` is not nullptr this block will do hash lookup for the
+  // key prefix. If total_order_seek is true, prefix_index_ is ignored.
+  //
+  // If `block_contents_pinned` is true, the caller will guarantee that when
+  // the cleanup functions are transferred from the iterator to other
+  // classes, e.g. PinnableSlice, the pointer to the bytes will still be
+  // valid. Either the iterator holds a cache handle or ownership of some
+  // resource and releases them in a release function, or the caller is sure
+  // that the data will not go away (for example, it's from an mmapped file
+  // which will not be closed).
+  //
+  // NOTE: for the hash based lookup, if a key prefix doesn't match any key,
+  // the iterator will simply be set as "invalid", rather than returning
+  // the key that is just past the target key.
+  template <typename TBlockIter>
+  TBlockIter* NewIterator(
+      const Comparator* comparator, const Comparator* user_comparator,
+      TBlockIter* iter = nullptr, Statistics* stats = nullptr,
+      bool total_order_seek = true, bool key_includes_seq = true,
+      bool value_is_full = true, bool block_contents_pinned = false,
+      BlockPrefixIndex* prefix_index = nullptr);

   // Report an approximation of how much memory has been used.
   size_t ApproximateMemoryUsage() const;
@@ -181,50 +203,30 @@

  private:
   BlockContents contents_;
-  const char* data_;            // contents_.data.data()
-  size_t size_;                 // contents_.data.size()
-  uint32_t restart_offset_;     // Offset in data_ of restart array
-  std::unique_ptr<BlockPrefixIndex> prefix_index_;
+  const char* data_;         // contents_.data.data()
+  size_t size_;              // contents_.data.size()
+  uint32_t restart_offset_;  // Offset in data_ of restart array
+  uint32_t num_restarts_;
   std::unique_ptr<BlockReadAmpBitmap> read_amp_bitmap_;
   // All keys in the block will have seqno = global_seqno_, regardless of
   // the encoded value (kDisableGlobalSequenceNumber means disabled)
   const SequenceNumber global_seqno_;

+  DataBlockHashIndex data_block_hash_index_;
+
   // No copying allowed
-  Block(const Block&);
-  void operator=(const Block&);
+  Block(const Block&) = delete;
+  void operator=(const Block&) = delete;
 };

-class BlockIter : public InternalIterator {
+template <class TValue>
+class BlockIter : public InternalIteratorBase<TValue> {
  public:
-  BlockIter()
-      : comparator_(nullptr),
-        data_(nullptr),
-        restarts_(0),
-        num_restarts_(0),
-        current_(0),
-        restart_index_(0),
-        status_(Status::OK()),
-        prefix_index_(nullptr),
-        key_pinned_(false),
-        global_seqno_(kDisableGlobalSequenceNumber),
-        read_amp_bitmap_(nullptr),
-        last_bitmap_offset_(0) {}
-
-  BlockIter(const Comparator* comparator, const char* data, uint32_t restarts,
-            uint32_t num_restarts, BlockPrefixIndex* prefix_index,
-            SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap)
-      : BlockIter() {
-    Initialize(comparator, data, restarts, num_restarts, prefix_index,
-               global_seqno, read_amp_bitmap);
-  }
-
-  void Initialize(const Comparator* comparator, const char* data,
-                  uint32_t restarts, uint32_t num_restarts,
-                  BlockPrefixIndex* prefix_index, SequenceNumber global_seqno,
-                  BlockReadAmpBitmap* read_amp_bitmap) {
-    assert(data_ == nullptr);  // Ensure it is called only once
-    assert(num_restarts > 0);  // Ensure the param is valid
+  void InitializeBase(const Comparator* comparator, const char* data,
+                      uint32_t restarts, uint32_t num_restarts,
+                      SequenceNumber global_seqno, bool block_contents_pinned) {
+    assert(data_ == nullptr);  // Ensure it is called only once
+    assert(num_restarts > 0);  // Ensure the param is valid

     comparator_ = comparator;
     data_ = data;
@@ -232,47 +234,34 @@
     num_restarts_ = num_restarts;
     current_ = restarts_;
     restart_index_ = num_restarts_;
-    prefix_index_ = prefix_index;
     global_seqno_ = global_seqno;
-    read_amp_bitmap_ = read_amp_bitmap;
-    last_bitmap_offset_ = current_ + 1;
+    block_contents_pinned_ = block_contents_pinned;
   }

-  void SetStatus(Status s) {
+  // Makes Valid() return false, status() return `s`, and Seek()/Prev()/etc do
+  // nothing. Calls cleanup functions.
+  void InvalidateBase(Status s) {
+    // Assert that the BlockIter is never deleted while Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || + (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); + + data_ = nullptr; + current_ = restarts_; status_ = s; + + // Call cleanup callbacks. + Cleanable::Reset(); } virtual bool Valid() const override { return current_ < restarts_; } virtual Status status() const override { return status_; } virtual Slice key() const override { assert(Valid()); - return key_.GetInternalKey(); - } - virtual Slice value() const override { - assert(Valid()); - if (read_amp_bitmap_ && current_ < restarts_ && - current_ != last_bitmap_offset_) { - read_amp_bitmap_->Mark(current_ /* current entry offset */, - NextEntryOffset() - 1); - last_bitmap_offset_ = current_; - } - return value_; + return key_.GetKey(); } - virtual void Next() override; - - virtual void Prev() override; - - virtual void Seek(const Slice& target) override; - - virtual void SeekForPrev(const Slice& target) override; - - virtual void SeekToFirst() override; - - virtual void SeekToLast() override; - #ifndef NDEBUG - ~BlockIter() { + virtual ~BlockIter() { // Assert that the BlockIter is never deleted while Pinning is Enabled. assert(!pinned_iters_mgr_ || (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); @@ -284,9 +273,11 @@ class BlockIter : public InternalIterator { PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; #endif - virtual bool IsKeyPinned() const override { return key_pinned_; } + virtual bool IsKeyPinned() const override { + return block_contents_pinned_ && key_pinned_; + } - virtual bool IsValuePinned() const override { return true; } + virtual bool IsValuePinned() const override { return block_contents_pinned_; } size_t TEST_CurrentEntrySize() { return NextEntryOffset() - current_; } @@ -294,27 +285,142 @@ class BlockIter : public InternalIterator { return static_cast(value_.data() - data_); } - private: + protected: + // Note: The type could be changed to InternalKeyComparator but we see a weird + // performance drop by that. const Comparator* comparator_; const char* data_; // underlying block contents - uint32_t restarts_; // Offset of restart array (list of fixed32) uint32_t num_restarts_; // Number of uint32_t entries in restart array + // Index of restart block in which current_ or current_-1 falls + uint32_t restart_index_; + uint32_t restarts_; // Offset of restart array (list of fixed32) // current_ is offset in data_ of current entry. >= restarts_ if !Valid uint32_t current_; - uint32_t restart_index_; // Index of restart block in which current_ falls IterKey key_; Slice value_; Status status_; - BlockPrefixIndex* prefix_index_; bool key_pinned_; + // Whether the block data is guaranteed to outlive this iterator, and + // as long as the cleanup functions are transferred to another class, + // e.g. PinnableSlice, the pointer to the bytes will still be valid. + bool block_contents_pinned_; SequenceNumber global_seqno_; + public: + // Return the offset in data_ just past the end of the current entry. 
+ inline uint32_t NextEntryOffset() const { + // NOTE: We don't support blocks bigger than 2GB + return static_cast((value_.data() + value_.size()) - data_); + } + + uint32_t GetRestartPoint(uint32_t index) { + assert(index < num_restarts_); + return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + } + + void SeekToRestartPoint(uint32_t index) { + key_.Clear(); + restart_index_ = index; + // current_ will be fixed by ParseNextKey(); + + // ParseNextKey() starts at the end of value_, so set value_ accordingly + uint32_t offset = GetRestartPoint(index); + value_ = Slice(data_ + offset, 0); + } + + void CorruptionError(); + + template + inline bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, + uint32_t* index, const Comparator* comp); +}; + +class DataBlockIter final : public BlockIter { + public: + DataBlockIter() + : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} + DataBlockIter(const Comparator* comparator, const Comparator* user_comparator, + const char* data, uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) + : DataBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + global_seqno, read_amp_bitmap, block_contents_pinned, + data_block_hash_index); + } + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + SequenceNumber global_seqno, + BlockReadAmpBitmap* read_amp_bitmap, + bool block_contents_pinned, + DataBlockHashIndex* data_block_hash_index) { + InitializeBase(comparator, data, restarts, num_restarts, global_seqno, + block_contents_pinned); + user_comparator_ = user_comparator; + key_.SetIsUserKey(false); + read_amp_bitmap_ = read_amp_bitmap; + last_bitmap_offset_ = current_ + 1; + data_block_hash_index_ = data_block_hash_index; + } + + virtual Slice value() const override { + assert(Valid()); + if (read_amp_bitmap_ && current_ < restarts_ && + current_ != last_bitmap_offset_) { + read_amp_bitmap_->Mark(current_ /* current entry offset */, + NextEntryOffset() - 1); + last_bitmap_offset_ = current_; + } + return value_; + } + + virtual void Seek(const Slice& target) override; + + inline bool SeekForGet(const Slice& target) { + if (!data_block_hash_index_) { + Seek(target); + return true; + } + + return SeekForGetImpl(target); + } + + virtual void SeekForPrev(const Slice& target) override; + + virtual void Prev() override; + + virtual void Next() override; + + // Try to advance to the next entry in the block. If there is data corruption + // or error, report it to the caller instead of aborting the process. May + // incur higher CPU overhead because we need to perform check on every entry. + void NextOrReport(); + + virtual void SeekToFirst() override; + + // Try to seek to the first entry in the block. If there is data corruption + // or error, report it to caller instead of aborting the process. May incur + // higher CPU overhead because we need to perform check on every entry. + void SeekToFirstOrReport(); + + virtual void SeekToLast() override; + + void Invalidate(Status s) { + InvalidateBase(s); + // Clear prev entries cache. 
+ prev_entries_keys_buff_.clear(); + prev_entries_.clear(); + prev_entries_idx_ = -1; + } + + private: // read-amp bitmap BlockReadAmpBitmap* read_amp_bitmap_; // last `current_` value we report to read-amp bitmap mutable uint32_t last_bitmap_offset_; - struct CachedPrevEntry { explicit CachedPrevEntry(uint32_t _offset, const char* _key_ptr, size_t _key_offset, size_t _key_size, Slice _value) @@ -339,46 +445,124 @@ class BlockIter : public InternalIterator { std::vector<CachedPrevEntry> prev_entries_; int32_t prev_entries_idx_ = -1; - inline int Compare(const Slice& a, const Slice& b) const { - return comparator_->Compare(a, b); + DataBlockHashIndex* data_block_hash_index_; + const Comparator* user_comparator_; + + template <typename DecodeEntryFunc> + inline bool ParseNextDataKey(const char* limit = nullptr); + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetInternalKey(), b); } - // Return the offset in data_ just past the end of the current entry. - inline uint32_t NextEntryOffset() const { - // NOTE: We don't support blocks bigger than 2GB - return static_cast<uint32_t>((value_.data() + value_.size()) - data_); + bool SeekForGetImpl(const Slice& target); +}; + +class IndexBlockIter final : public BlockIter<BlockHandle> { + public: + IndexBlockIter() : BlockIter(), prefix_index_(nullptr) {} + + virtual Slice key() const override { + assert(Valid()); + return key_.GetKey(); + } + // key_includes_seq, default true, means that the keys are in internal key + // format. + // value_is_full, default true, means that no delta encoding is + // applied to values. + IndexBlockIter(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned) + : IndexBlockIter() { + Initialize(comparator, user_comparator, data, restarts, num_restarts, + prefix_index, key_includes_seq, block_contents_pinned, + value_is_full, nullptr /* data_block_hash_index */); } - uint32_t GetRestartPoint(uint32_t index) { - assert(index < num_restarts_); - return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); + void Initialize(const Comparator* comparator, + const Comparator* user_comparator, const char* data, + uint32_t restarts, uint32_t num_restarts, + BlockPrefixIndex* prefix_index, bool key_includes_seq, + bool value_is_full, bool block_contents_pinned, + DataBlockHashIndex* /*data_block_hash_index*/) { + InitializeBase(key_includes_seq ?
comparator : user_comparator, data, + restarts, num_restarts, kDisableGlobalSequenceNumber, + block_contents_pinned); + key_includes_seq_ = key_includes_seq; + key_.SetIsUserKey(!key_includes_seq_); + prefix_index_ = prefix_index; + value_delta_encoded_ = !value_is_full; } - void SeekToRestartPoint(uint32_t index) { - key_.Clear(); - restart_index_ = index; - // current_ will be fixed by ParseNextKey(); + virtual BlockHandle value() const override { + assert(Valid()); + if (value_delta_encoded_) { + return decoded_value_; + } else { + BlockHandle handle; + Slice v = value_; + Status decode_s __attribute__((__unused__)) = handle.DecodeFrom(&v); + assert(decode_s.ok()); + return handle; + } + } - // ParseNextKey() starts at the end of value_, so set value_ accordingly - uint32_t offset = GetRestartPoint(index); - value_ = Slice(data_ + offset, 0); + virtual void Seek(const Slice& target) override; + + virtual void SeekForPrev(const Slice&) override { + assert(false); + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::InvalidArgument( + "RocksDB internal error: should never call SeekForPrev() on index " + "blocks"); + key_.Clear(); + value_.clear(); } - void CorruptionError(); + virtual void Prev() override; - bool ParseNextKey(); + virtual void Next() override; - bool BinarySeek(const Slice& target, uint32_t left, uint32_t right, - uint32_t* index); + virtual void SeekToFirst() override; - int CompareBlockKey(uint32_t block_index, const Slice& target); + virtual void SeekToLast() override; - bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, - uint32_t left, uint32_t right, - uint32_t* index); + void Invalidate(Status s) { InvalidateBase(s); } + + private: + // Key is in InternalKey format + bool key_includes_seq_; + bool value_delta_encoded_; + BlockPrefixIndex* prefix_index_; + // Whether the value is delta encoded. In that case the value is assumed to be + // a BlockHandle. The first value in each restart interval is the full encoded + // BlockHandle; the rest carry only the delta-encoded size part of the + // BlockHandle. The offset of a delta encoded BlockHandle is computed by + // adding the sizes of the previous delta encoded values in the same restart + // interval to the offset of the first value in that restart interval. + BlockHandle decoded_value_; bool PrefixSeek(const Slice& target, uint32_t* index); + bool BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, + uint32_t left, uint32_t right, uint32_t* index); + inline int CompareBlockKey(uint32_t block_index, const Slice& target); + + inline int Compare(const Slice& a, const Slice& b) const { + return comparator_->Compare(a, b); + } + + inline int Compare(const IterKey& ikey, const Slice& b) const { + return comparator_->Compare(ikey.GetKey(), b); + } + + inline bool ParseNextIndexKey(); + // When value_delta_encoded_ is enabled this decodes the value, which is + // assumed to be a BlockHandle, and puts it into decoded_value_ + inline void DecodeCurrentValue(uint32_t shared); }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_based_filter_block.cc b/thirdparty/rocksdb/table/block_based_filter_block.cc index 697c11a42f..81087b243b 100644 --- a/thirdparty/rocksdb/table/block_based_filter_block.cc +++ b/thirdparty/rocksdb/table/block_based_filter_block.cc @@ -53,7 +53,6 @@ void AppendItem(std::string* props, const TKey& key, const std::string& value) { } } // namespace - // See doc/table_format.txt for an explanation of the filter block format.
// Generate new filter every 2KB of data @@ -67,7 +66,8 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( prefix_extractor_(prefix_extractor), whole_key_filtering_(table_opt.whole_key_filtering), prev_prefix_start_(0), - prev_prefix_size_(0) { + prev_prefix_size_(0), + num_added_(0) { assert(policy_); } @@ -91,6 +91,7 @@ void BlockBasedFilterBlockBuilder::Add(const Slice& key) { // Add key to filter if needed inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + num_added_++; start_.push_back(entries_.size()); entries_.append(key.data(), key.size()); } @@ -106,14 +107,13 @@ inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); // insert prefix only when it's different from the previous prefix. if (prev.size() == 0 || prefix != prev) { - start_.push_back(entries_.size()); prev_prefix_start_ = entries_.size(); prev_prefix_size_ = prefix.size(); - entries_.append(prefix.data(), prefix.size()); + AddKey(prefix); } } -Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& tmp, +Slice BlockBasedFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, Status* status) { // In this impl we ignore BlockHandle *status = Status::OK(); @@ -185,8 +185,9 @@ BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( } bool BlockBasedFilterBlockReader::KeyMayMatch( - const Slice& key, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + const Slice& key, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset != kNotValid); if (!whole_key_filtering_) { return true; @@ -195,12 +196,10 @@ bool BlockBasedFilterBlockReader::KeyMayMatch( } bool BlockBasedFilterBlockReader::PrefixMayMatch( - const Slice& prefix, uint64_t block_offset, const bool no_io, - const Slice* const const_ikey_ptr) { + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { assert(block_offset != kNotValid); - if (!prefix_extractor_) { - return true; - } return MayMatch(prefix, block_offset); } @@ -233,7 +232,7 @@ size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { } std::string BlockBasedFilterBlockReader::ToString() const { - std::string result, filter_meta; + std::string result; result.reserve(1024); std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); diff --git a/thirdparty/rocksdb/table/block_based_filter_block.h b/thirdparty/rocksdb/table/block_based_filter_block.h index 52b79fea50..d1ff585462 100644 --- a/thirdparty/rocksdb/table/block_based_filter_block.h +++ b/thirdparty/rocksdb/table/block_based_filter_block.h @@ -15,8 +15,8 @@ #include #include -#include #include +#include #include #include "rocksdb/options.h" #include "rocksdb/slice.h" @@ -26,7 +26,6 @@ namespace rocksdb { - // A BlockBasedFilterBlockBuilder is used to construct all of the filters for a // particular Table. It generates a single string which is stored as // a special block in the Table. 
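// [Editor's sketch, not part of the upstream patch] "Every 2KB" above means
// the filter responsible for a data block is chosen by file offset alone.
// Upstream RocksDB hard-codes kFilterBaseLg = 11 (so kFilterBase = 2048);
// assuming that constant, the mapping is simply:
//
//   #include <cstdint>
//
//   constexpr uint32_t kFilterBaseLg = 11;   // 2 KB of file offset per filter
//   constexpr uint32_t kFilterBase = 1 << kFilterBaseLg;
//
//   inline uint64_t FilterIndexForOffset(uint64_t block_offset) {
//     return block_offset >> kFilterBaseLg;  // offsets 0..2047 -> filter 0
//   }
//
// This matches the MultiChunk tests in this file: offsets 0 and 2000 hit the
// first filter, 3100 the second, 4100 the (empty) third, and 9000 the last.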
@@ -36,11 +35,12 @@ namespace rocksdb { class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { public: BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt); virtual bool IsBlockBased() override { return true; } virtual void StartBlock(uint64_t block_offset) override; virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; @@ -65,6 +65,7 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; + size_t num_added_; // Number of keys added // No copying allowed BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); @@ -81,13 +82,14 @@ class BlockBasedFilterBlockReader : public FilterBlockReader { bool whole_key_filtering, BlockContents&& contents, Statistics* statistics); virtual bool IsBlockBased() override { return true; } + virtual bool KeyMayMatch( - const Slice& key, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual bool PrefixMayMatch( - const Slice& prefix, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual size_t ApproximateMemoryUsage() const override; diff --git a/thirdparty/rocksdb/table/block_based_filter_block_test.cc b/thirdparty/rocksdb/table/block_based_filter_block_test.cc index f666ba2524..6b352b2f6b 100644 --- a/thirdparty/rocksdb/table/block_based_filter_block_test.cc +++ b/thirdparty/rocksdb/table/block_based_filter_block_test.cc @@ -21,18 +21,16 @@ namespace rocksdb { // For testing: emit an array with one hash value per key class TestHashFilter : public FilterPolicy { public: - virtual const char* Name() const override { return "TestHashFilter"; } + const char* Name() const override { return "TestHashFilter"; } - virtual void CreateFilter(const Slice* keys, int n, - std::string* dst) const override { + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { for (int i = 0; i < n; i++) { uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); PutFixed32(dst, h); } } - virtual bool KeyMayMatch(const Slice& key, - const Slice& filter) const override { + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { uint32_t h = Hash(key.data(), key.size(), 1); for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { if (h == DecodeFixed32(filter.data() + i)) { @@ -55,16 +53,17 @@ class FilterBlockTest : public testing::Test { TEST_F(FilterBlockTest, EmptyBuilder) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); - BlockContents block(builder.Finish(), false, kNoCompression); + BlockContents block(builder.Finish()); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); + 
ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100000)); } TEST_F(FilterBlockTest, SingleChunk) { BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + ASSERT_EQ(0, builder.NumAdded()); builder.StartBlock(100); builder.Add("foo"); builder.Add("bar"); @@ -73,16 +72,17 @@ TEST_F(FilterBlockTest, SingleChunk) { builder.Add("box"); builder.StartBlock(300); builder.Add("hello"); - BlockContents block(builder.Finish(), false, kNoCompression); + ASSERT_EQ(5, builder.NumAdded()); + BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); - ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); - ASSERT_TRUE(reader.KeyMayMatch("box", 100)); - ASSERT_TRUE(reader.KeyMayMatch("hello", 100)); - ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); - ASSERT_TRUE(!reader.KeyMayMatch("missing", 100)); - ASSERT_TRUE(!reader.KeyMayMatch("other", 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr, 100)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr, 100)); } TEST_F(FilterBlockTest, MultiChunk) { @@ -105,33 +105,33 @@ TEST_F(FilterBlockTest, MultiChunk) { builder.Add("box"); builder.Add("hello"); - BlockContents block(builder.Finish(), false, kNoCompression); + BlockContents block(builder.Finish()); BlockBasedFilterBlockReader reader(nullptr, table_options_, true, std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader.KeyMayMatch("bar", 2000)); - ASSERT_TRUE(!reader.KeyMayMatch("box", 0)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 0)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr, 2000)); + ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, uint64_t{0})); + ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, uint64_t{0})); // Check second filter - ASSERT_TRUE(reader.KeyMayMatch("box", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 3100)); // Check third filter (empty) - ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("box", 4100)); - ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("box", nullptr, 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", nullptr, 4100)); // Check last filter - ASSERT_TRUE(reader.KeyMayMatch("box", 9000)); - ASSERT_TRUE(reader.KeyMayMatch("hello", 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000)); - ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr, 9000)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr, 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", nullptr, 9000)); + 
ASSERT_TRUE(!reader.KeyMayMatch("bar", nullptr, 9000)); } // Test for block based filter block @@ -144,26 +144,26 @@ class BlockBasedFilterBlockTest : public testing::Test { table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); } - ~BlockBasedFilterBlockTest() {} + ~BlockBasedFilterBlockTest() override {} }; TEST_F(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); - BlockContents block(builder->Finish(), false, kNoCompression); + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); + BlockContents block(builder->Finish()); ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100000)); delete builder; delete reader; } TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); builder->StartBlock(100); builder->Add("foo"); builder->Add("bar"); @@ -172,24 +172,24 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { builder->Add("box"); builder->StartBlock(300); builder->Add("hello"); - BlockContents block(builder->Finish(), false, kNoCompression); + BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); - ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); - ASSERT_TRUE(reader->KeyMayMatch("box", 100)); - ASSERT_TRUE(reader->KeyMayMatch("hello", 100)); - ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); - ASSERT_TRUE(!reader->KeyMayMatch("missing", 100)); - ASSERT_TRUE(!reader->KeyMayMatch("other", 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("bar", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, 100)); + ASSERT_TRUE(!reader->KeyMayMatch("missing", nullptr, 100)); + ASSERT_TRUE(!reader->KeyMayMatch("other", nullptr, 100)); delete builder; delete reader; } TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { - FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( - nullptr, table_options_); + FilterBlockBuilder* builder = + new BlockBasedFilterBlockBuilder(nullptr, table_options_); // First filter builder->StartBlock(0); @@ -208,33 +208,33 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { builder->Add("box"); builder->Add("hello"); - BlockContents block(builder->Finish(), false, kNoCompression); + BlockContents block(builder->Finish()); FilterBlockReader* reader = new BlockBasedFilterBlockReader( nullptr, table_options_, true, std::move(block), nullptr); // Check first filter - ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); - ASSERT_TRUE(reader->KeyMayMatch("bar", 2000)); - ASSERT_TRUE(!reader->KeyMayMatch("box", 0)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 0)); + ASSERT_TRUE(reader->KeyMayMatch("foo", nullptr, uint64_t{0})); + ASSERT_TRUE(reader->KeyMayMatch("bar", 
nullptr, 2000)); + ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, uint64_t{0})); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, uint64_t{0})); // Check second filter - ASSERT_TRUE(reader->KeyMayMatch("box", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100)); + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 3100)); // Check third filter (empty) - ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("box", 4100)); - ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("box", nullptr, 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", nullptr, 4100)); // Check last filter - ASSERT_TRUE(reader->KeyMayMatch("box", 9000)); - ASSERT_TRUE(reader->KeyMayMatch("hello", 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000)); - ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000)); + ASSERT_TRUE(reader->KeyMayMatch("box", nullptr, 9000)); + ASSERT_TRUE(reader->KeyMayMatch("hello", nullptr, 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", nullptr, 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", nullptr, 9000)); delete builder; delete reader; diff --git a/thirdparty/rocksdb/table/block_based_table_builder.cc b/thirdparty/rocksdb/table/block_based_table_builder.cc index e82f91aec7..479311f5b0 100644 --- a/thirdparty/rocksdb/table/block_based_table_builder.cc +++ b/thirdparty/rocksdb/table/block_based_table_builder.cc @@ -37,14 +37,14 @@ #include "table/filter_block.h" #include "table/format.h" #include "table/full_filter_block.h" -#include "table/meta_blocks.h" #include "table/table_builder.h" -#include "util/string_util.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/memory_allocator.h" #include "util/stop_watch.h" +#include "util/string_util.h" #include "util/xxhash.h" #include "table/index_builder.h" @@ -62,14 +62,17 @@ namespace { // Create a filter block builder based on its type. FilterBlockBuilder* CreateFilterBlockBuilder( - const ImmutableCFOptions& opt, const BlockBasedTableOptions& table_opt, + const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, + const BlockBasedTableOptions& table_opt, + const bool use_delta_encoding_for_index_values, PartitionedIndexBuilder* const p_index_builder) { if (table_opt.filter_policy == nullptr) return nullptr; FilterBitsBuilder* filter_bits_builder = table_opt.filter_policy->GetFilterBitsBuilder(); if (filter_bits_builder == nullptr) { - return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt); + return new BlockBasedFilterBlockBuilder(mopt.prefix_extractor.get(), + table_opt); } else { if (table_opt.partition_filters) { assert(p_index_builder != nullptr); @@ -77,16 +80,18 @@ FilterBlockBuilder* CreateFilterBlockBuilder( // until index builder actually cuts the partition, we take the lower bound // as partition size.
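// [Editor's sketch, not part of the upstream patch] The "+ 99) / 100" in the
// replacement code just below is integer ceiling division: it computes
// ceil(metadata_block_size * (100 - block_size_deviation) / 100), so the
// percentage-based lower bound rounds up instead of truncating. Roughly
// (names hypothetical):
//
//   #include <algorithm>
//   #include <cstdint>
//
//   uint32_t PartitionSizeLowerBound(uint64_t metadata_block_size,
//                                    uint64_t deviation_pct /* 0..100 */) {
//     uint64_t scaled = metadata_block_size * (100 - deviation_pct);
//     uint32_t size = static_cast<uint32_t>((scaled + 99) / 100);  // ceil
//     return std::max(size, 1u);  // never allow a zero-sized partition
//   }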
assert(table_opt.block_size_deviation <= 100); - auto partition_size = static_cast( - table_opt.metadata_block_size * - (100 - table_opt.block_size_deviation)); + auto partition_size = + static_cast(((table_opt.metadata_block_size * + (100 - table_opt.block_size_deviation)) + + 99) / + 100); partition_size = std::max(partition_size, static_cast(1)); return new PartitionedFilterBlockBuilder( - opt.prefix_extractor, table_opt.whole_key_filtering, + mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size); } else { - return new FullFilterBlockBuilder(opt.prefix_extractor, + return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder); } @@ -98,84 +103,105 @@ bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { return compressed_size < raw_size - (raw_size / 8u); } -} // namespace - -// format_version is the block format as defined in include/rocksdb/table.h -Slice CompressBlock(const Slice& raw, - const CompressionOptions& compression_options, - CompressionType* type, uint32_t format_version, - const Slice& compression_dict, - std::string* compressed_output) { - if (*type == kNoCompression) { - return raw; - } - +bool CompressBlockInternal(const Slice& raw, + const CompressionInfo& compression_info, + uint32_t format_version, + std::string* compressed_output) { // Will return compressed block contents if (1) the compression method is // supported in this platform and (2) the compression rate is "good enough". - switch (*type) { + switch (compression_info.type()) { case kSnappyCompression: - if (Snappy_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. + return Snappy_Compress(compression_info, raw.data(), raw.size(), + compressed_output); case kZlibCompression: - if (Zlib_Compress( - compression_options, - GetCompressFormatForVersion(kZlibCompression, format_version), - raw.data(), raw.size(), compressed_output, compression_dict) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. + return Zlib_Compress( + compression_info, + GetCompressFormatForVersion(kZlibCompression, format_version), + raw.data(), raw.size(), compressed_output); case kBZip2Compression: - if (BZip2_Compress( - compression_options, - GetCompressFormatForVersion(kBZip2Compression, format_version), - raw.data(), raw.size(), compressed_output) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. + return BZip2_Compress( + compression_info, + GetCompressFormatForVersion(kBZip2Compression, format_version), + raw.data(), raw.size(), compressed_output); case kLZ4Compression: - if (LZ4_Compress( - compression_options, - GetCompressFormatForVersion(kLZ4Compression, format_version), - raw.data(), raw.size(), compressed_output, compression_dict) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. 
+ return LZ4_Compress( + compression_info, + GetCompressFormatForVersion(kLZ4Compression, format_version), + raw.data(), raw.size(), compressed_output); case kLZ4HCCompression: - if (LZ4HC_Compress( - compression_options, - GetCompressFormatForVersion(kLZ4HCCompression, format_version), - raw.data(), raw.size(), compressed_output, compression_dict) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. + return LZ4HC_Compress( + compression_info, + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + raw.data(), raw.size(), compressed_output); case kXpressCompression: - if (XPRESS_Compress(raw.data(), raw.size(), - compressed_output) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; case kZSTD: case kZSTDNotFinalCompression: - if (ZSTD_Compress(compression_options, raw.data(), raw.size(), - compressed_output, compression_dict) && - GoodCompressionRatio(compressed_output->size(), raw.size())) { - return *compressed_output; - } - break; // fall back to no compression. - default: {} // Do not recognize this compression type + return ZSTD_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + default: + // Do not recognize this compression type + return false; + } +} + +} // namespace + +// format_version is the block format as defined in include/rocksdb/table.h +Slice CompressBlock(const Slice& raw, const CompressionInfo& info, + CompressionType* type, uint32_t format_version, + bool do_sample, std::string* compressed_output, + std::string* sampled_output_fast, + std::string* sampled_output_slow) { + *type = info.type(); + + if (info.type() == kNoCompression && !info.SampleForCompression()) { + return raw; } - // Compression method is not supported, or not good compression ratio, so just - // fall back to uncompressed form. + // If requested, we sample one in every N blocks with a + // fast and slow compression algorithm and report the stats. + // The users can use these stats to decide if it is worthwhile + // enabling compression and they also get a hint about which + // compression algorithm will be beneficial. + if (do_sample && info.SampleForCompression() && + Random::GetTLSInstance()->OneIn((int)info.SampleForCompression()) && + sampled_output_fast && sampled_output_slow) { + // Sampling with a fast compression algorithm + if (LZ4_Supported() || Snappy_Supported()) { + CompressionType c = + LZ4_Supported() ? kLZ4Compression : kSnappyCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + + CompressBlockInternal(raw, info_tmp, format_version, sampled_output_fast); + } + + // Sampling with a slow but high-compression algorithm + if (ZSTD_Supported() || Zlib_Supported()) { + CompressionType c = ZSTD_Supported() ?
kZSTD : kZlibCompression; + CompressionContext context(c); + CompressionOptions options; + CompressionInfo info_tmp(options, context, + CompressionDict::GetEmptyDict(), c, + info.SampleForCompression()); + CompressBlockInternal(raw, info_tmp, format_version, sampled_output_slow); + } + } + + // Actually compress the data + if (*type != kNoCompression) { + if (CompressBlockInternal(raw, info, format_version, compressed_output) && + GoodCompressionRatio(compressed_output->size(), raw.size())) { + return *compressed_output; + } + } + + // Compression method is not supported, or not good + // compression ratio, so just fall back to uncompressed form. *type = kNoCompression; return raw; } @@ -208,14 +234,22 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector whole_key_filtering_(whole_key_filtering), prefix_filtering_(prefix_filtering) {} - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override { + Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { // Intentionally left blank. Have no interest in collecting stats for // individual key/value pairs. return Status::OK(); } - virtual Status Finish(UserCollectedProperties* properties) override { + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. + return; + } + + Status Finish(UserCollectedProperties* properties) override { std::string val; PutFixed32(&val, static_cast(index_type_)); properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); @@ -227,11 +261,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector } // The name of the properties collector can be used for debugging purpose. - virtual const char* Name() const override { + const char* Name() const override { return "BlockBasedTablePropertiesCollector"; } - virtual UserCollectedProperties GetReadableProperties() const override { + UserCollectedProperties GetReadableProperties() const override { // Intentionally left blank. return UserCollectedProperties(); } @@ -244,12 +278,21 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector struct BlockBasedTableBuilder::Rep { const ImmutableCFOptions ioptions; + const MutableCFOptions moptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFileWriter* file; uint64_t offset = 0; Status status; + size_t alignment; BlockBuilder data_block; + // Buffers uncompressed data blocks and keys to replay later. Needed when + // compression dictionary is enabled so we can finalize the dictionary before + // compressing any data blocks. + // TODO(ajkr): ideally we don't buffer all keys and all uncompressed data + // blocks as it's redundant, but it's easier to implement for now. + std::vector>> + data_block_and_keys_buffers; BlockBuilder range_del_block; InternalKeySliceTransform internal_prefix_transform; @@ -257,13 +300,44 @@ struct BlockBasedTableBuilder::Rep { PartitionedIndexBuilder* p_index_builder_ = nullptr; std::string last_key; - const CompressionType compression_type; - const CompressionOptions compression_opts; - // Data for presetting the compression library's dictionary, or nullptr. 
- const std::string* compression_dict; + CompressionType compression_type; + uint64_t sample_for_compression; + CompressionOptions compression_opts; + std::unique_ptr compression_dict; + CompressionContext compression_ctx; + std::unique_ptr verify_ctx; + std::unique_ptr verify_dict; + + size_t data_begin_offset = 0; + TableProperties props; - bool closed = false; // Either Finish() or Abandon() has been called. + // States of the builder. + // + // - `kBuffered`: This is the initial state where zero or more data blocks are + // accumulated uncompressed in-memory. From this state, call + // `EnterUnbuffered()` to finalize the compression dictionary if enabled, + // compress/write out any buffered blocks, and proceed to the `kUnbuffered` + // state. + // + // - `kUnbuffered`: This is the state when compression dictionary is finalized + // either because it wasn't enabled in the first place or it's been created + // from sampling previously buffered data. In this state, blocks are simply + // compressed/written out as they fill up. From this state, call `Finish()` + // to complete the file (write meta-blocks, etc.), or `Abandon()` to delete + // the partially created file. + // + // - `kClosed`: This indicates either `Finish()` or `Abandon()` has been + // called, so the table builder is no longer usable. We must be in this + // state by the time the destructor runs. + enum class State { + kBuffered, + kUnbuffered, + kClosed, + }; + State state; + + const bool use_delta_encoding_for_index_values; std::unique_ptr filter_builder; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -276,53 +350,76 @@ struct BlockBasedTableBuilder::Rep { const std::string& column_family_name; uint64_t creation_time = 0; uint64_t oldest_key_time = 0; + const uint64_t target_file_size; std::vector> table_properties_collectors; - Rep(const ImmutableCFOptions& _ioptions, + Rep(const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, const BlockBasedTableOptions& table_opt, const InternalKeyComparator& icomparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t _column_family_id, WritableFileWriter* f, const CompressionType _compression_type, - const CompressionOptions& _compression_opts, - const std::string* _compression_dict, const bool skip_filters, + const uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, const bool skip_filters, const std::string& _column_family_name, const uint64_t _creation_time, - const uint64_t _oldest_key_time) + const uint64_t _oldest_key_time, const uint64_t _target_file_size) : ioptions(_ioptions), + moptions(_moptions), table_options(table_opt), internal_comparator(icomparator), file(f), + alignment(table_options.block_align + ? std::min(table_options.block_size, kDefaultPageSize) + : 0), data_block(table_options.block_restart_interval, - table_options.use_delta_encoding), - range_del_block(1), // TODO(andrewkr): restart_interval unnecessary - internal_prefix_transform(_ioptions.prefix_extractor), + table_options.use_delta_encoding, + false /* use_value_delta_encoding */, + icomparator.user_comparator() + ->CanKeysWithDifferentByteContentsBeEqual() + ? 
BlockBasedTableOptions::kDataBlockBinarySearch + : table_options.data_block_index_type, + table_options.data_block_hash_table_util_ratio), + range_del_block(1 /* block_restart_interval */), + internal_prefix_transform(_moptions.prefix_extractor.get()), compression_type(_compression_type), + sample_for_compression(_sample_for_compression), compression_opts(_compression_opts), - compression_dict(_compression_dict), + compression_dict(), + compression_ctx(_compression_type), + verify_dict(), + state((_compression_opts.max_dict_bytes > 0) ? State::kBuffered + : State::kUnbuffered), + use_delta_encoding_for_index_values(table_opt.format_version >= 4 && + !table_opt.block_align), + compressed_cache_key_prefix_size(0), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), column_family_id(_column_family_id), column_family_name(_column_family_name), creation_time(_creation_time), - oldest_key_time(_oldest_key_time) { + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size) { if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( - &internal_comparator, table_options); + &internal_comparator, use_delta_encoding_for_index_values, + table_options); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, - &this->internal_prefix_transform, table_options)); + &this->internal_prefix_transform, use_delta_encoding_for_index_values, + table_options)); } if (skip_filters) { filter_builder = nullptr; } else { - filter_builder.reset( - CreateFilterBlockBuilder(_ioptions, table_options, p_index_builder_)); + filter_builder.reset(CreateFilterBlockBuilder( + _ioptions, _moptions, table_options, + use_delta_encoding_for_index_values, p_index_builder_)); } for (auto& collector_factories : *int_tbl_prop_collector_factories) { @@ -332,22 +429,31 @@ struct BlockBasedTableBuilder::Rep { table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - _ioptions.prefix_extractor != nullptr)); + _moptions.prefix_extractor != nullptr)); + if (table_options.verify_compression) { + verify_ctx.reset(new UncompressionContext(UncompressionContext::NoCache(), + compression_type)); + } } + + Rep(const Rep&) = delete; + Rep& operator=(const Rep&) = delete; + + ~Rep() {} }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, - const std::string* compression_dict, const bool skip_filters, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, const std::string& column_family_name, const uint64_t creation_time, - const uint64_t oldest_key_time) { + const uint64_t oldest_key_time, const uint64_t target_file_size) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && sanitized_table_options.checksum != kCRC32c) { @@ -360,11 +466,11 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( 
sanitized_table_options.format_version = 1; } - rep_ = - new Rep(ioptions, sanitized_table_options, internal_comparator, - int_tbl_prop_collector_factories, column_family_id, file, - compression_type, compression_opts, compression_dict, - skip_filters, column_family_name, creation_time, oldest_key_time); + rep_ = new Rep( + ioptions, moptions, sanitized_table_options, internal_comparator, + int_tbl_prop_collector_factories, column_family_id, file, + compression_type, sample_for_compression, compression_opts, skip_filters, + column_family_name, creation_time, oldest_key_time, target_file_size); if (rep_->filter_builder != nullptr) { rep_->filter_builder->StartBlock(0); @@ -378,25 +484,33 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( } BlockBasedTableBuilder::~BlockBasedTableBuilder() { - assert(rep_->closed); // Catch errors where caller forgot to call Finish() + // Catch errors where caller forgot to call Finish() + assert(rep_->state == Rep::State::kClosed); delete rep_; } void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { Rep* r = rep_; - assert(!r->closed); + assert(rep_->state != Rep::State::kClosed); if (!ok()) return; ValueType value_type = ExtractValueType(key); if (IsValueType(value_type)) { - if (r->props.num_entries > 0) { +#ifndef NDEBUG + if (r->props.num_entries > r->props.num_range_deletions) { assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0); } +#endif // NDEBUG auto should_flush = r->flush_block_policy->Update(key, value); if (should_flush) { assert(!r->data_block.empty()); Flush(); + if (r->state == Rep::State::kBuffered && + r->data_begin_offset > r->target_file_size) { + EnterUnbuffered(); + } + // Add item to index block. // We do not emit the index entry for a block until we have seen the // first key for the next data block. This allows us to use shorter @@ -405,53 +519,61 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { // "the r" as the key for the index block entry since it is >= all // entries in the first block and < all entries in subsequent // blocks. - if (ok()) { + if (ok() && r->state == Rep::State::kUnbuffered) { r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle); } } // Note: PartitionedFilterBlockBuilder requires key being added to filter // builder after being added to index builder. - if (r->filter_builder != nullptr) { + if (r->state == Rep::State::kUnbuffered && r->filter_builder != nullptr) { r->filter_builder->Add(ExtractUserKey(key)); } r->last_key.assign(key.data(), key.size()); r->data_block.Add(key, value); - r->props.num_entries++; - r->props.raw_key_size += key.size(); - r->props.raw_value_size += value.size(); - - r->index_builder->OnKeyAdded(key); + if (r->state == Rep::State::kBuffered) { + // Buffer keys to be replayed during `Finish()` once compression + // dictionary has been finalized. 
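// [Editor's sketch, not part of the upstream patch] The buffering branch that
// follows implements the three-state lifecycle documented in Rep above;
// condensed, the transitions are (simplified from the real builder):
//
//   enum class State { kBuffered, kUnbuffered, kClosed };
//
//   // Start buffered only when a compression dictionary is requested:
//   State state = (compression_opts.max_dict_bytes > 0) ? State::kBuffered
//                                                       : State::kUnbuffered;
//
//   // kBuffered -> kUnbuffered: EnterUnbuffered(), triggered in Add() once
//   // data_begin_offset exceeds target_file_size; the dictionary is trained
//   // on the buffered blocks, which are then compressed and written out.
//   // kUnbuffered -> kClosed: Finish() or Abandon(); the builder is done.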
+ if (r->data_block_and_keys_buffers.empty() || should_flush) { + r->data_block_and_keys_buffers.emplace_back(); + } + r->data_block_and_keys_buffers.back().second.emplace_back(key.ToString()); + } else { + r->index_builder->OnKeyAdded(key); + } NotifyCollectTableCollectorsOnAdd(key, value, r->offset, r->table_properties_collectors, r->ioptions.info_log); } else if (value_type == kTypeRangeDeletion) { - // TODO(wanning&andrewkr) add num_tomestone to table properties r->range_del_block.Add(key, value); - ++r->props.num_entries; - r->props.raw_key_size += key.size(); - r->props.raw_value_size += value.size(); NotifyCollectTableCollectorsOnAdd(key, value, r->offset, r->table_properties_collectors, r->ioptions.info_log); } else { assert(false); } + + r->props.num_entries++; + r->props.raw_key_size += key.size(); + r->props.raw_value_size += value.size(); + if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion) { + r->props.num_deletions++; + } else if (value_type == kTypeRangeDeletion) { + r->props.num_deletions++; + r->props.num_range_deletions++; + } else if (value_type == kTypeMerge) { + r->props.num_merge_operands++; + } } void BlockBasedTableBuilder::Flush() { Rep* r = rep_; - assert(!r->closed); + assert(rep_->state != Rep::State::kClosed); if (!ok()) return; if (r->data_block.empty()) return; WriteBlock(&r->data_block, &r->pending_handle, true /* is_data_block */); - if (r->filter_builder != nullptr) { - r->filter_builder->StartBlock(r->offset); - } - r->props.data_size = r->offset; - ++r->props.num_data_blocks; } void BlockBasedTableBuilder::WriteBlock(BlockBuilder* block, @@ -472,32 +594,64 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Rep* r = rep_; auto type = r->compression_type; + uint64_t sample_for_compression = r->sample_for_compression; Slice block_contents; bool abort_compression = false; - StopWatchNano timer(r->ioptions.env, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + StopWatchNano timer( + r->ioptions.env, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)); + + if (r->state == Rep::State::kBuffered) { + assert(is_data_block); + assert(!r->data_block_and_keys_buffers.empty()); + r->data_block_and_keys_buffers.back().first = raw_block_contents.ToString(); + r->data_begin_offset += r->data_block_and_keys_buffers.back().first.size(); + return; + } if (raw_block_contents.size() < kCompressionSizeLimit) { - Slice compression_dict; - if (is_data_block && r->compression_dict && r->compression_dict->size()) { - compression_dict = *r->compression_dict; + const CompressionDict* compression_dict; + if (!is_data_block || r->compression_dict == nullptr) { + compression_dict = &CompressionDict::GetEmptyDict(); + } else { + compression_dict = r->compression_dict.get(); } - - block_contents = CompressBlock(raw_block_contents, r->compression_opts, - &type, r->table_options.format_version, - compression_dict, &r->compressed_output); + assert(compression_dict != nullptr); + CompressionInfo compression_info(r->compression_opts, r->compression_ctx, + *compression_dict, type, + sample_for_compression); + + std::string sampled_output_fast; + std::string sampled_output_slow; + block_contents = CompressBlock( + raw_block_contents, compression_info, &type, + r->table_options.format_version, is_data_block /* do_sample */, + &r->compressed_output, &sampled_output_fast, &sampled_output_slow); + + // notify collectors on block add + NotifyCollectTableCollectorsOnBlockAdd( + r->table_properties_collectors, 
raw_block_contents.size(), + sampled_output_fast.size(), sampled_output_slow.size()); // Some of the compression algorithms are known to be unreliable. If // the verify_compression flag is set then try to de-compress the // compressed data and compare to the input. if (type != kNoCompression && r->table_options.verify_compression) { // Retrieve the uncompressed contents into a new buffer + const UncompressionDict* verify_dict; + if (!is_data_block || r->verify_dict == nullptr) { + verify_dict = &UncompressionDict::GetEmptyDict(); + } else { + verify_dict = r->verify_dict.get(); + } + assert(verify_dict != nullptr); BlockContents contents; + UncompressionInfo uncompression_info(*r->verify_ctx, *verify_dict, + r->compression_type); Status stat = UncompressBlockContentsForCompressionType( - block_contents.data(), block_contents.size(), &contents, - r->table_options.format_version, compression_dict, type, - r->ioptions); + uncompression_info, block_contents.data(), block_contents.size(), + &contents, r->table_options.format_version, r->ioptions); if (stat.ok()) { bool compressed_ok = contents.data.compare(raw_block_contents) == 0; @@ -526,23 +680,33 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; - } else if (type != kNoCompression && - ShouldReportDetailedTime(r->ioptions.env, - r->ioptions.statistics)) { - MeasureTime(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, - timer.ElapsedNanos()); - MeasureTime(r->ioptions.statistics, BYTES_COMPRESSED, - raw_block_contents.size()); + } else if (type != kNoCompression) { + if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.statistics)) { + RecordTimeToHistogram(r->ioptions.statistics, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } + RecordInHistogram(r->ioptions.statistics, BYTES_COMPRESSED, + raw_block_contents.size()); RecordTick(r->ioptions.statistics, NUMBER_BLOCK_COMPRESSED); + } else if (type != r->compression_type) { + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); } - WriteRawBlock(block_contents, type, handle); + WriteRawBlock(block_contents, type, handle, is_data_block); r->compressed_output.clear(); + if (is_data_block) { + if (r->filter_builder != nullptr) { + r->filter_builder->StartBlock(r->offset); + } + r->props.data_size = r->offset; + ++r->props.num_data_blocks; + } } void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, - BlockHandle* handle) { + BlockHandle* handle, + bool is_data_block) { Rep* r = rep_; StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); @@ -571,26 +735,50 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); break; } + case kxxHash64: { + XXH64_state_t* const state = XXH64_createState(); + XXH64_reset(state, 0); + XXH64_update(state, block_contents.data(), + static_cast(block_contents.size())); + XXH64_update(state, trailer, 1); // Extend to cover block type + EncodeFixed32( + trailer_without_type, + static_cast(XXH64_digest(state) & // lower 32 bits + uint64_t{0xffffffff})); + XXH64_freeState(state); + break; + } } assert(r->status.ok()); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteRawBlock:TamperWithChecksum", + static_cast(trailer)); r->status = r->file->Append(Slice(trailer, kBlockTrailerSize)); if (r->status.ok()) { r->status = 
InsertBlockInCache(block_contents, type, handle); } if (r->status.ok()) { r->offset += block_contents.size() + kBlockTrailerSize; + if (r->table_options.block_align && is_data_block) { + size_t pad_bytes = + (r->alignment - ((block_contents.size() + kBlockTrailerSize) & + (r->alignment - 1))) & + (r->alignment - 1); + r->status = r->file->Pad(pad_bytes); + if (r->status.ok()) { + r->offset += pad_bytes; + } + } } } } -Status BlockBasedTableBuilder::status() const { - return rep_->status; -} +Status BlockBasedTableBuilder::status() const { return rep_->status; } -static void DeleteCachedBlock(const Slice& key, void* value) { - Block* block = reinterpret_cast(value); - delete block; +static void DeleteCachedBlockContents(const Slice& /*key*/, void* value) { + BlockContents* bc = reinterpret_cast(value); + delete bc; } // @@ -603,28 +791,31 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Cache* block_cache_compressed = r->table_options.block_cache_compressed.get(); if (type != kNoCompression && block_cache_compressed != nullptr) { - size_t size = block_contents.size(); - std::unique_ptr ubuf(new char[size + 1]); + auto ubuf = + AllocateBlock(size + 1, block_cache_compressed->memory_allocator()); memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; - BlockContents results(std::move(ubuf), size, true, type); - - Block* block = new Block(std::move(results), kDisableGlobalSequenceNumber); + BlockContents* block_contents_to_cache = + new BlockContents(std::move(ubuf), size); +#ifndef NDEBUG + block_contents_to_cache->is_raw_block = true; +#endif // NDEBUG // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( - r->compressed_cache_key_prefix + - r->compressed_cache_key_prefix_size, - handle->offset()); - Slice key(r->compressed_cache_key_prefix, static_cast - (end - r->compressed_cache_key_prefix)); + r->compressed_cache_key_prefix + r->compressed_cache_key_prefix_size, + handle->offset()); + Slice key(r->compressed_cache_key_prefix, + static_cast(end - r->compressed_cache_key_prefix)); // Insert into compressed block cache. - block_cache_compressed->Insert(key, block, block->usable_size(), - &DeleteCachedBlock); + block_cache_compressed->Insert( + key, block_contents_to_cache, + block_contents_to_cache->ApproximateMemoryUsage(), + &DeleteCachedBlockContents); // Invalidate OS cache. r->file->InvalidateCache(static_cast(r->offset), size); @@ -632,216 +823,343 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, return Status::OK(); } -Status BlockBasedTableBuilder::Finish() { - Rep* r = rep_; - bool empty_data_block = r->data_block.empty(); - Flush(); - assert(!r->closed); - r->closed = true; - - // To make sure properties block is able to keep the accurate size of index - // block, we will finish writing all index entries here and flush them - // to storage after metaindex block is written. 
-  // To make sure properties block is able to keep the accurate size of index
-  // block, we will finish writing all index entries here and flush them
-  // to storage after metaindex block is written.
-  if (ok() && !empty_data_block) {
-    r->index_builder->AddIndexEntry(
-        &r->last_key, nullptr /* no next data block */, r->pending_handle);
-  }
-
-  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle,
-      compression_dict_block_handle, range_del_block_handle;
-  // Write filter block
-  if (ok() && r->filter_builder != nullptr) {
+void BlockBasedTableBuilder::WriteFilterBlock(
+    MetaIndexBuilder* meta_index_builder) {
+  BlockHandle filter_block_handle;
+  bool empty_filter_block = (rep_->filter_builder == nullptr ||
+                             rep_->filter_builder->NumAdded() == 0);
+  if (ok() && !empty_filter_block) {
    Status s = Status::Incomplete();
-   while (s.IsIncomplete()) {
-     Slice filter_content = r->filter_builder->Finish(filter_block_handle, &s);
+   while (ok() && s.IsIncomplete()) {
+     Slice filter_content =
+         rep_->filter_builder->Finish(filter_block_handle, &s);
      assert(s.ok() || s.IsIncomplete());
-     r->props.filter_size += filter_content.size();
+     rep_->props.filter_size += filter_content.size();
      WriteRawBlock(filter_content, kNoCompression, &filter_block_handle);
    }
  }
+ if (ok() && !empty_filter_block) {
+   // Add mapping from "<filter_block_prefix>.Name" to location
+   // of filter data.
+   std::string key;
+   if (rep_->filter_builder->IsBlockBased()) {
+     key = BlockBasedTable::kFilterBlockPrefix;
+   } else {
+     key = rep_->table_options.partition_filters
+               ? BlockBasedTable::kPartitionedFilterBlockPrefix
+               : BlockBasedTable::kFullFilterBlockPrefix;
+   }
+   key.append(rep_->table_options.filter_policy->Name());
+   meta_index_builder->Add(key, filter_block_handle);
+ }
+}

+void BlockBasedTableBuilder::WriteIndexBlock(
+    MetaIndexBuilder* meta_index_builder, BlockHandle* index_block_handle) {
  IndexBuilder::IndexBlocks index_blocks;
- auto index_builder_status = r->index_builder->Finish(&index_blocks);
+ auto index_builder_status = rep_->index_builder->Finish(&index_blocks);
  if (index_builder_status.IsIncomplete()) {
    // If we have more than one index partition then meta_blocks are not
    // supported for the index. Currently meta_blocks are used only by
    // HashIndexBuilder which is not multi-partition.
    assert(index_blocks.meta_blocks.empty());
- } else if (!index_builder_status.ok()) {
-   return index_builder_status;
+ } else if (ok() && !index_builder_status.ok()) {
+   rep_->status = index_builder_status;
  }
-
-  // Write meta blocks and metaindex block with the following order.
-  //    1. [meta block: filter]
-  //    2. [meta block: properties]
-  //    3. [meta block: compression dictionary]
-  //    4. [meta block: range deletion tombstone]
-  //    5.
[metaindex block] - // write meta blocks - MetaIndexBuilder meta_index_builder; - for (const auto& item : index_blocks.meta_blocks) { - BlockHandle block_handle; - WriteBlock(item.second, &block_handle, false /* is_data_block */); - meta_index_builder.Add(item.first, block_handle); + if (ok()) { + for (const auto& item : index_blocks.meta_blocks) { + BlockHandle block_handle; + WriteBlock(item.second, &block_handle, false /* is_data_block */); + if (!ok()) { + break; + } + meta_index_builder->Add(item.first, block_handle); + } } + if (ok()) { + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + } + // If there are more index partitions, finish them and write them out + Status s = index_builder_status; + while (ok() && s.IsIncomplete()) { + s = rep_->index_builder->Finish(&index_blocks, *index_block_handle); + if (!s.ok() && !s.IsIncomplete()) { + rep_->status = s; + return; + } + if (rep_->table_options.enable_index_compression) { + WriteBlock(index_blocks.index_block_contents, index_block_handle, false); + } else { + WriteRawBlock(index_blocks.index_block_contents, kNoCompression, + index_block_handle); + } + // The last index_block_handle will be for the partition index block + } +} +void BlockBasedTableBuilder::WritePropertiesBlock( + MetaIndexBuilder* meta_index_builder) { + BlockHandle properties_block_handle; if (ok()) { - if (r->filter_builder != nullptr) { - // Add mapping from ".Name" to location - // of filter data. - std::string key; - if (r->filter_builder->IsBlockBased()) { - key = BlockBasedTable::kFilterBlockPrefix; - } else { - key = r->table_options.partition_filters - ? BlockBasedTable::kPartitionedFilterBlockPrefix - : BlockBasedTable::kFullFilterBlockPrefix; + PropertyBlockBuilder property_block_builder; + rep_->props.column_family_id = rep_->column_family_id; + rep_->props.column_family_name = rep_->column_family_name; + rep_->props.filter_policy_name = + rep_->table_options.filter_policy != nullptr + ? rep_->table_options.filter_policy->Name() + : ""; + rep_->props.index_size = + rep_->index_builder->IndexSize() + kBlockTrailerSize; + rep_->props.comparator_name = rep_->ioptions.user_comparator != nullptr + ? rep_->ioptions.user_comparator->Name() + : "nullptr"; + rep_->props.merge_operator_name = + rep_->ioptions.merge_operator != nullptr + ? rep_->ioptions.merge_operator->Name() + : "nullptr"; + rep_->props.compression_name = + CompressionTypeToString(rep_->compression_type); + rep_->props.compression_options = + CompressionOptionsToString(rep_->compression_opts); + rep_->props.prefix_extractor_name = + rep_->moptions.prefix_extractor != nullptr + ? rep_->moptions.prefix_extractor->Name() + : "nullptr"; + + std::string property_collectors_names = "["; + for (size_t i = 0; + i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { + if (i != 0) { + property_collectors_names += ","; } - key.append(r->table_options.filter_policy->Name()); - meta_index_builder.Add(key, filter_block_handle); + property_collectors_names += + rep_->ioptions.table_properties_collector_factories[i]->Name(); } - - // Write properties and compression dictionary blocks. 
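[Editorial note] The drain loop above leans on the two-phase contract of IndexBuilder::Finish(): each call emits one more partition and keeps returning Status::Incomplete() until the final call, whose handle ends up pointing at the top-level partition index. A toy model of that contract (all names illustrative, not RocksDB APIs):

  #include <iostream>
  #include <vector>

  // Toy builder: Finish() hands out one partition per call and signals
  // "more to come" through *incomplete, mirroring Status::IsIncomplete().
  struct ToyIndexBuilder {
    std::vector<int> partitions{10, 20, 30};
    size_t next = 0;
    int Finish(bool* incomplete) {
      int partition = partitions[next++];
      *incomplete = next < partitions.size();
      return partition;
    }
  };

  int main() {
    ToyIndexBuilder builder;
    bool incomplete = true;
    int handle = 0;
    while (incomplete) {
      handle = builder.Finish(&incomplete);  // write one partition
      std::cout << "wrote partition " << handle << "\n";
    }
    // Like index_block_handle above, the last handle written is for the
    // top-level partition index.
    std::cout << "top-level handle: " << handle << "\n";
  }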
+ property_collectors_names += "]"; + rep_->props.property_collectors_names = property_collectors_names; + if (rep_->table_options.index_type == + BlockBasedTableOptions::kTwoLevelIndexSearch) { + assert(rep_->p_index_builder_ != nullptr); + rep_->props.index_partitions = rep_->p_index_builder_->NumPartitions(); + rep_->props.top_level_index_size = + rep_->p_index_builder_->TopLevelIndexSize(rep_->offset); + } + rep_->props.index_key_is_user_key = + !rep_->index_builder->seperator_is_key_plus_seq(); + rep_->props.index_value_is_delta_encoded = + rep_->use_delta_encoding_for_index_values; + rep_->props.creation_time = rep_->creation_time; + rep_->props.oldest_key_time = rep_->oldest_key_time; + + // Add basic properties + property_block_builder.AddTableProperty(rep_->props); + + // Add use collected properties + NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, + rep_->ioptions.info_log, + &property_block_builder); + + WriteRawBlock(property_block_builder.Finish(), kNoCompression, + &properties_block_handle); + } + if (ok()) { +#ifndef NDEBUG { - PropertyBlockBuilder property_block_builder; - r->props.column_family_id = r->column_family_id; - r->props.column_family_name = r->column_family_name; - r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? - r->table_options.filter_policy->Name() : ""; - r->props.index_size = - r->index_builder->EstimatedSize() + kBlockTrailerSize; - r->props.comparator_name = r->ioptions.user_comparator != nullptr - ? r->ioptions.user_comparator->Name() - : "nullptr"; - r->props.merge_operator_name = r->ioptions.merge_operator != nullptr - ? r->ioptions.merge_operator->Name() - : "nullptr"; - r->props.compression_name = CompressionTypeToString(r->compression_type); - r->props.prefix_extractor_name = - r->ioptions.prefix_extractor != nullptr - ? 
r->ioptions.prefix_extractor->Name() - : "nullptr"; - - std::string property_collectors_names = "["; - property_collectors_names = "["; - for (size_t i = 0; - i < r->ioptions.table_properties_collector_factories.size(); ++i) { - if (i != 0) { - property_collectors_names += ","; - } - property_collectors_names += - r->ioptions.table_properties_collector_factories[i]->Name(); - } - property_collectors_names += "]"; - r->props.property_collectors_names = property_collectors_names; - if (r->table_options.index_type == - BlockBasedTableOptions::kTwoLevelIndexSearch) { - assert(r->p_index_builder_ != nullptr); - r->props.index_partitions = r->p_index_builder_->NumPartitions(); - r->props.top_level_index_size = - r->p_index_builder_->EstimateTopLevelIndexSize(r->offset); - } - r->props.creation_time = r->creation_time; - r->props.oldest_key_time = r->oldest_key_time; - - // Add basic properties - property_block_builder.AddTableProperty(r->props); - - // Add use collected properties - NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->ioptions.info_log, - &property_block_builder); - - BlockHandle properties_block_handle; - WriteRawBlock( - property_block_builder.Finish(), - kNoCompression, - &properties_block_handle - ); - meta_index_builder.Add(kPropertiesBlock, properties_block_handle); - - // Write compression dictionary block - if (r->compression_dict && r->compression_dict->size()) { - WriteRawBlock(*r->compression_dict, kNoCompression, - &compression_dict_block_handle); - meta_index_builder.Add(kCompressionDictBlock, - compression_dict_block_handle); + uint64_t props_block_offset = properties_block_handle.offset(); + uint64_t props_block_size = properties_block_handle.size(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset", + &props_block_offset); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize", + &props_block_size); + } +#endif // !NDEBUG + meta_index_builder->Add(kPropertiesBlock, properties_block_handle); + } +} + +void BlockBasedTableBuilder::WriteCompressionDictBlock( + MetaIndexBuilder* meta_index_builder) { + if (rep_->compression_dict != nullptr && + rep_->compression_dict->GetRawDict().size()) { + BlockHandle compression_dict_block_handle; + if (ok()) { + WriteRawBlock(rep_->compression_dict->GetRawDict(), kNoCompression, + &compression_dict_block_handle); +#ifndef NDEBUG + Slice compression_dict = rep_->compression_dict->GetRawDict(); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", + &compression_dict); +#endif // NDEBUG + } + if (ok()) { + meta_index_builder->Add(kCompressionDictBlock, + compression_dict_block_handle); + } + } +} + +void BlockBasedTableBuilder::WriteRangeDelBlock( + MetaIndexBuilder* meta_index_builder) { + if (ok() && !rep_->range_del_block.empty()) { + BlockHandle range_del_block_handle; + WriteRawBlock(rep_->range_del_block.Finish(), kNoCompression, + &range_del_block_handle); + meta_index_builder->Add(kRangeDelBlock, range_del_block_handle); + } +} + +void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle) { + Rep* r = rep_; + // No need to write out new footer if we're using default checksum. 
+  // We're writing legacy magic number because we want old versions of RocksDB
+  // to be able to read files generated with new release (just in case
+  // somebody wants to roll back after an upgrade)
+  // TODO(icanadi) at some point in the future, when we're absolutely sure
+  // nobody will roll back to RocksDB 2.x versions, retire the legacy magic
+  // number and always write new table files with new magic number
+  bool legacy = (r->table_options.format_version == 0);
+  // this is guaranteed by BlockBasedTableBuilder's constructor
+  assert(r->table_options.checksum == kCRC32c ||
+         r->table_options.format_version != 0);
+  Footer footer(
+      legacy ? kLegacyBlockBasedTableMagicNumber : kBlockBasedTableMagicNumber,
+      r->table_options.format_version);
+  footer.set_metaindex_handle(metaindex_block_handle);
+  footer.set_index_handle(index_block_handle);
+  footer.set_checksum(r->table_options.checksum);
+  std::string footer_encoding;
+  footer.EncodeTo(&footer_encoding);
+  assert(r->status.ok());
+  r->status = r->file->Append(footer_encoding);
+  if (r->status.ok()) {
+    r->offset += footer_encoding.size();
+  }
+}
+
+void BlockBasedTableBuilder::EnterUnbuffered() {
+  Rep* r = rep_;
+  assert(r->state == Rep::State::kBuffered);
+  r->state = Rep::State::kUnbuffered;
+  const size_t kSampleBytes = r->compression_opts.zstd_max_train_bytes > 0
+                                  ? r->compression_opts.zstd_max_train_bytes
+                                  : r->compression_opts.max_dict_bytes;
+  Random64 generator{r->creation_time};
+  std::string compression_dict_samples;
+  std::vector<size_t> compression_dict_sample_lens;
+  if (!r->data_block_and_keys_buffers.empty()) {
+    while (compression_dict_samples.size() < kSampleBytes) {
+      size_t rand_idx =
+          generator.Uniform(r->data_block_and_keys_buffers.size());
+      size_t copy_len =
+          std::min(kSampleBytes - compression_dict_samples.size(),
+                   r->data_block_and_keys_buffers[rand_idx].first.size());
+      compression_dict_samples.append(
+          r->data_block_and_keys_buffers[rand_idx].first, 0, copy_len);
+      compression_dict_sample_lens.emplace_back(copy_len);
+    }
+  }
+
+  // final data block flushed, now we can generate dictionary from the samples.
+  // OK if compression_dict_samples is empty, we'll just get empty dictionary.
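[Editorial note] The sampling loop above appends prefixes of randomly chosen buffered data blocks until kSampleBytes of material has been gathered. What happens next is a two-way choice, sketched here in isolation against the ZSTD_TrainDictionary wrapper that this patch calls (treat the exact signature as an assumption): with zstd_max_train_bytes > 0 the samples are distilled into a trained dictionary, otherwise the raw concatenated samples serve as the dictionary, as the code right after this note shows.

  #include <string>
  #include <vector>

  #include "util/compression.h"  // assumed to provide ZSTD_TrainDictionary

  // Sketch: derive the compression dictionary from the collected samples.
  std::string MakeDict(const std::string& samples,
                       const std::vector<size_t>& sample_lens,
                       size_t max_dict_bytes, size_t zstd_max_train_bytes) {
    if (zstd_max_train_bytes > 0) {
      // Training compacts the samples down to at most max_dict_bytes.
      return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes);
    }
    // No training requested: the samples themselves become the dictionary.
    return samples;
  }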
+ std::string dict; + if (r->compression_opts.zstd_max_train_bytes > 0) { + dict = ZSTD_TrainDictionary(compression_dict_samples, + compression_dict_sample_lens, + r->compression_opts.max_dict_bytes); + } else { + dict = std::move(compression_dict_samples); + } + r->compression_dict.reset(new CompressionDict(dict, r->compression_type, + r->compression_opts.level)); + r->verify_dict.reset(new UncompressionDict( + dict, r->compression_type == kZSTD || + r->compression_type == kZSTDNotFinalCompression)); + + for (size_t i = 0; ok() && i < r->data_block_and_keys_buffers.size(); ++i) { + const auto& data_block = r->data_block_and_keys_buffers[i].first; + auto& keys = r->data_block_and_keys_buffers[i].second; + assert(!data_block.empty()); + assert(!keys.empty()); + + for (const auto& key : keys) { + if (r->filter_builder != nullptr) { + r->filter_builder->Add(ExtractUserKey(key)); } - } // end of properties/compression dictionary block writing + r->index_builder->OnKeyAdded(key); + } + WriteBlock(Slice(data_block), &r->pending_handle, true /* is_data_block */); + if (ok() && i + 1 < r->data_block_and_keys_buffers.size()) { + Slice first_key_in_next_block = + r->data_block_and_keys_buffers[i + 1].second.front(); + Slice* first_key_in_next_block_ptr = &first_key_in_next_block; + r->index_builder->AddIndexEntry(&keys.back(), first_key_in_next_block_ptr, + r->pending_handle); + } + } + r->data_block_and_keys_buffers.clear(); +} - if (ok() && !r->range_del_block.empty()) { - WriteRawBlock(r->range_del_block.Finish(), kNoCompression, - &range_del_block_handle); - meta_index_builder.Add(kRangeDelBlock, range_del_block_handle); - } // range deletion tombstone meta block - } // meta blocks +Status BlockBasedTableBuilder::Finish() { + Rep* r = rep_; + assert(r->state != Rep::State::kClosed); + bool empty_data_block = r->data_block.empty(); + Flush(); + if (r->state == Rep::State::kBuffered) { + EnterUnbuffered(); + } + // To make sure properties block is able to keep the accurate size of index + // block, we will finish writing all index entries first. + if (ok() && !empty_data_block) { + r->index_builder->AddIndexEntry( + &r->last_key, nullptr /* no next data block */, r->pending_handle); + } - // Write index block + // Write meta blocks, metaindex block and footer in the following order. + // 1. [meta block: filter] + // 2. [meta block: index] + // 3. [meta block: compression dictionary] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: properties] + // 6. [metaindex block] + // 7. 
Footer + BlockHandle metaindex_block_handle, index_block_handle; + MetaIndexBuilder meta_index_builder; + WriteFilterBlock(&meta_index_builder); + WriteIndexBlock(&meta_index_builder, &index_block_handle); + WriteCompressionDictBlock(&meta_index_builder); + WriteRangeDelBlock(&meta_index_builder); + WritePropertiesBlock(&meta_index_builder); if (ok()) { // flush the meta index block WriteRawBlock(meta_index_builder.Finish(), kNoCompression, &metaindex_block_handle); - - const bool is_data_block = true; - WriteBlock(index_blocks.index_block_contents, &index_block_handle, - !is_data_block); - // If there are more index partitions, finish them and write them out - Status& s = index_builder_status; - while (s.IsIncomplete()) { - s = r->index_builder->Finish(&index_blocks, index_block_handle); - if (!s.ok() && !s.IsIncomplete()) { - return s; - } - WriteBlock(index_blocks.index_block_contents, &index_block_handle, - !is_data_block); - // The last index_block_handle will be for the partition index block - } } - - // Write footer if (ok()) { - // No need to write out new footer if we're using default checksum. - // We're writing legacy magic number because we want old versions of RocksDB - // be able to read files generated with new release (just in case if - // somebody wants to roll back after an upgrade) - // TODO(icanadi) at some point in the future, when we're absolutely sure - // nobody will roll back to RocksDB 2.x versions, retire the legacy magic - // number and always write new table files with new magic number - bool legacy = (r->table_options.format_version == 0); - // this is guaranteed by BlockBasedTableBuilder's constructor - assert(r->table_options.checksum == kCRC32c || - r->table_options.format_version != 0); - Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber - : kBlockBasedTableMagicNumber, - r->table_options.format_version); - footer.set_metaindex_handle(metaindex_block_handle); - footer.set_index_handle(index_block_handle); - footer.set_checksum(r->table_options.checksum); - std::string footer_encoding; - footer.EncodeTo(&footer_encoding); - assert(r->status.ok()); - r->status = r->file->Append(footer_encoding); - if (r->status.ok()) { - r->offset += footer_encoding.size(); - } + WriteFooter(metaindex_block_handle, index_block_handle); } - + r->state = Rep::State::kClosed; return r->status; } void BlockBasedTableBuilder::Abandon() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; + assert(rep_->state != Rep::State::kClosed); + rep_->state = Rep::State::kClosed; } uint64_t BlockBasedTableBuilder::NumEntries() const { return rep_->props.num_entries; } -uint64_t BlockBasedTableBuilder::FileSize() const { - return rep_->offset; -} +uint64_t BlockBasedTableBuilder::FileSize() const { return rep_->offset; } bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { diff --git a/thirdparty/rocksdb/table/block_based_table_builder.h b/thirdparty/rocksdb/table/block_based_table_builder.h index 36dfce1f0f..b10494e7b9 100644 --- a/thirdparty/rocksdb/table/block_based_table_builder.h +++ b/thirdparty/rocksdb/table/block_based_table_builder.h @@ -18,7 +18,9 @@ #include "rocksdb/listener.h" #include "rocksdb/options.h" #include "rocksdb/status.h" +#include "table/meta_blocks.h" #include "table/table_builder.h" +#include "util/compression.h" namespace rocksdb { @@ -35,24 +37,26 @@ class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in 
*file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - // @param compression_dict Data for presetting the compression library's - // dictionary, or nullptr. BlockBasedTableBuilder( - const ImmutableCFOptions& ioptions, + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, const CompressionType compression_type, - const CompressionOptions& compression_opts, - const std::string* compression_dict, const bool skip_filters, + const uint64_t sample_for_compression, + const CompressionOptions& compression_opts, const bool skip_filters, const std::string& column_family_name, const uint64_t creation_time = 0, - const uint64_t oldest_key_time = 0); + const uint64_t oldest_key_time = 0, const uint64_t target_file_size = 0); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); + // No copying allowed + BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete; + BlockBasedTableBuilder& operator=(const BlockBasedTableBuilder&) = delete; + // Add key,value to the table being constructed. // REQUIRES: key is after any previously added key according to comparator. // REQUIRES: Finish(), Abandon() have not been called @@ -88,6 +92,11 @@ class BlockBasedTableBuilder : public TableBuilder { private: bool ok() const { return status().ok(); } + // Transition state from buffered to unbuffered. See `Rep::State` API comment + // for details of the states. + // REQUIRES: `rep_->state == kBuffered` + void EnterUnbuffered(); + // Call block's Finish() method // and then write the compressed block contents to file. void WriteBlock(BlockBuilder* block, BlockHandle* handle, bool is_data_block); @@ -96,10 +105,21 @@ class BlockBasedTableBuilder : public TableBuilder { void WriteBlock(const Slice& block_contents, BlockHandle* handle, bool is_data_block); // Directly write data to the file. - void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle); + void WriteRawBlock(const Slice& data, CompressionType, BlockHandle* handle, + bool is_data_block = false); Status InsertBlockInCache(const Slice& block_contents, const CompressionType type, const BlockHandle* handle); + + void WriteFilterBlock(MetaIndexBuilder* meta_index_builder); + void WriteIndexBlock(MetaIndexBuilder* meta_index_builder, + BlockHandle* index_block_handle); + void WritePropertiesBlock(MetaIndexBuilder* meta_index_builder); + void WriteCompressionDictBlock(MetaIndexBuilder* meta_index_builder); + void WriteRangeDelBlock(MetaIndexBuilder* meta_index_builder); + void WriteFooter(BlockHandle& metaindex_block_handle, + BlockHandle& index_block_handle); + struct Rep; class BlockBasedTablePropertiesCollectorFactory; class BlockBasedTablePropertiesCollector; @@ -114,16 +134,12 @@ class BlockBasedTableBuilder : public TableBuilder { // Some compression libraries fail when the raw size is bigger than int. 
If
  // uncompressed size is bigger than kCompressionSizeLimit, don't compress it
  const uint64_t kCompressionSizeLimit = std::numeric_limits<int>::max();
-
-  // No copying allowed
-  BlockBasedTableBuilder(const BlockBasedTableBuilder&) = delete;
-  void operator=(const BlockBasedTableBuilder&) = delete;
 };

-Slice CompressBlock(const Slice& raw,
-                    const CompressionOptions& compression_options,
+Slice CompressBlock(const Slice& raw, const CompressionInfo& info,
                     CompressionType* type, uint32_t format_version,
-                    const Slice& compression_dict,
-                    std::string* compressed_output);
+                    bool do_sample, std::string* compressed_output,
+                    std::string* sampled_output_fast,
+                    std::string* sampled_output_slow);

 }  // namespace rocksdb
diff --git a/thirdparty/rocksdb/table/block_based_table_factory.cc b/thirdparty/rocksdb/table/block_based_table_factory.cc
index 0c6bbbcb64..cda8d1e271 100644
--- a/thirdparty/rocksdb/table/block_based_table_factory.cc
+++ b/thirdparty/rocksdb/table/block_based_table_factory.cc
@@ -9,9 +9,15 @@
 #include "table/block_based_table_factory.h"

+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include
+#include
+
 #include
 #include
-#include

 #include "options/options_helper.h"
 #include "port/port.h"
@@ -21,10 +27,141 @@
 #include "table/block_based_table_builder.h"
 #include "table/block_based_table_reader.h"
 #include "table/format.h"
+#include "util/mutexlock.h"
 #include "util/string_util.h"

 namespace rocksdb {

+void TailPrefetchStats::RecordEffectiveSize(size_t len) {
+  MutexLock l(&mutex_);
+  if (num_records_ < kNumTracked) {
+    num_records_++;
+  }
+  records_[next_++] = len;
+  if (next_ == kNumTracked) {
+    next_ = 0;
+  }
+}
+
+size_t TailPrefetchStats::GetSuggestedPrefetchSize() {
+  std::vector<size_t> sorted;
+  {
+    MutexLock l(&mutex_);
+
+    if (num_records_ == 0) {
+      return 0;
+    }
+    sorted.assign(records_, records_ + num_records_);
+  }
+
+  // Of the historic sizes, we find the maximum one that satisfies the
+  // condition that if prefetching all, less than 1/8 will be wasted.
+  std::sort(sorted.begin(), sorted.end());
+
+  // Assuming we have 5 data points, and after sorting it looks like this:
+  //
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // +---+ | | | |
+  // | | | | | |
+  // +---+ | | | | | |
+  // | | | | | | | |
+  // +---+ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // and we use every one of the values as a candidate, and estimate how much
+  // we wasted, compared to read. For example, when we use the 3rd record
+  // as the candidate. This area is what we read:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ***+ *** *** *** *** **
+  // * | | | | | |
+  // +---+ | | | | | *
+  // * | | | | | | | |
+  // +---+ | | | | | | | *
+  // * | | | | X | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // | | | | | | | | | *
+  // * | | | | | | | | |
+  // *** *** ***-*** ***--*** ***--*** +****
+  // which is (size of the record) X (number of records).
+  //
+  // While wasted is this area:
+  // +---+
+  // +---+ | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // | | | |
+  // *** *** *** ****---+ | | | |
+  // * * | | | | |
+  // * *-*** *** | | | | |
+  // * * | | | | | | |
+  // *--** *** | | | | | | |
+  // | | | | | X | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // Which can be calculated iteratively.
+  // The difference between wasted using the 4th and 3rd record will
+  // be the following area:
+  // +---+
+  // +--+ +-+ ++ +-+ +-+ +---+ | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // + xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // | xxxxxxxxxxxxxxxxxxxxxxxx | | | |
+  // +-+ +-+ +-+ ++ +---+ +--+ | | |
+  // | | | | | | |
+  // +---+ ++ | | | | | |
+  // | | | | | | X | | |
+  // +---+ ++ | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // | | | | | | | | | |
+  // +---+ +---+ +---+ +---+ +---+
+  //
+  // which will be the size difference between the 4th and 3rd record,
+  // times 3, which is the number of records before the 4th.
+  // Here we assume that all data within the prefetch range will be useful. In
+  // reality, it may not be the case when a partial block is inside the range,
+  // or there are data in the middle that is not read. We ignore those cases
+  // for simplicity.
+  assert(!sorted.empty());
+  size_t prev_size = sorted[0];
+  size_t max_qualified_size = sorted[0];
+  size_t wasted = 0;
+  for (size_t i = 1; i < sorted.size(); i++) {
+    size_t read = sorted[i] * sorted.size();
+    wasted += (sorted[i] - prev_size) * i;
+    if (wasted <= read / 8) {
+      max_qualified_size = sorted[i];
+    }
+    prev_size = sorted[i];
+  }
+  const size_t kMaxPrefetchSize = 512 * 1024;  // Never exceed 512KB
+  return std::min(kMaxPrefetchSize, max_qualified_size);
+}
+
 BlockBasedTableFactory::BlockBasedTableFactory(
     const BlockBasedTableOptions& _table_options)
     : table_options_(_table_options) {
@@ -57,45 +194,49 @@ BlockBasedTableFactory::BlockBasedTableFactory(

 Status BlockBasedTableFactory::NewTableReader(
     const TableReaderOptions& table_reader_options,
-    unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
-    unique_ptr<TableReader>* table_reader,
+    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+    std::unique_ptr<TableReader>* table_reader,
     bool prefetch_index_and_filter_in_cache) const {
   return BlockBasedTable::Open(
       table_reader_options.ioptions, table_reader_options.env_options,
       table_options_, table_reader_options.internal_comparator, std::move(file),
-      file_size, table_reader, prefetch_index_and_filter_in_cache,
-      table_reader_options.skip_filters, table_reader_options.level);
+      file_size, table_reader, table_reader_options.prefix_extractor,
+      prefetch_index_and_filter_in_cache, table_reader_options.skip_filters,
+      table_reader_options.level, table_reader_options.immortal,
+      table_reader_options.largest_seqno, &tail_prefetch_stats_);
 }

 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
     const TableBuilderOptions& table_builder_options,
     uint32_t column_family_id, WritableFileWriter* file) const {
   auto table_builder = new BlockBasedTableBuilder(
-      table_builder_options.ioptions, table_options_,
-      table_builder_options.internal_comparator,
+      table_builder_options.ioptions, table_builder_options.moptions,
+      table_options_, table_builder_options.internal_comparator,
       table_builder_options.int_tbl_prop_collector_factories, column_family_id,
       file, table_builder_options.compression_type,
+
table_builder_options.sample_for_compression, table_builder_options.compression_opts, - table_builder_options.compression_dict, table_builder_options.skip_filters, table_builder_options.column_family_name, table_builder_options.creation_time, - table_builder_options.oldest_key_time); + table_builder_options.oldest_key_time, + table_builder_options.target_file_size); return table_builder; } Status BlockBasedTableFactory::SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const { + const DBOptions& /*db_opts*/, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { - return Status::InvalidArgument("Hash index is specified for block-based " + return Status::InvalidArgument( + "Hash index is specified for block-based " "table, but prefix_extractor is not given"); } if (table_options_.cache_index_and_filter_blocks && table_options_.no_block_cache) { - return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " + return Status::InvalidArgument( + "Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } if (table_options_.pin_l0_filter_and_index_blocks_in_cache && @@ -109,6 +250,23 @@ Status BlockBasedTableFactory::SanitizeOptions( "Unsupported BlockBasedTable format_version. Please check " "include/rocksdb/table.h for more info"); } + if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { + return Status::InvalidArgument( + "Enable block_align, but compression " + "enabled"); + } + if (table_options_.block_align && + (table_options_.block_size & (table_options_.block_size - 1))) { + return Status::InvalidArgument( + "Block alignment requested but block size is not a power of 2"); + } + if (table_options_.data_block_index_type == + BlockBasedTableOptions::kDataBlockBinaryAndHash && + table_options_.data_block_hash_table_util_ratio <= 0) { + return Status::InvalidArgument( + "data_block_hash_table_util_ratio should be greater than 0 when " + "data_block_index_type is set to kDataBlockBinaryAndHash"); + } return Status::OK(); } @@ -133,14 +291,22 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { " pin_l0_filter_and_index_blocks_in_cache: %d\n", table_options_.pin_l0_filter_and_index_blocks_in_cache); ret.append(buffer); + snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", + table_options_.pin_top_level_index_and_filter); + ret.append(buffer); snprintf(buffer, kBufferSize, " index_type: %d\n", table_options_.index_type); ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", + table_options_.data_block_index_type); + ret.append(buffer); + snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", + table_options_.data_block_hash_table_util_ratio); + ret.append(buffer); snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", table_options_.hash_index_allow_collision); ret.append(buffer); - snprintf(buffer, kBufferSize, " checksum: %d\n", - table_options_.checksum); + snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); ret.append(buffer); snprintf(buffer, kBufferSize, " no_block_cache: %d\n", table_options_.no_block_cache); @@ -192,16 +358,38 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", table_options_.index_block_restart_interval); ret.append(buffer); + snprintf(buffer, kBufferSize, " 
metadata_block_size: %" PRIu64 "\n", + table_options_.metadata_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " partition_filters: %d\n", + table_options_.partition_filters); + ret.append(buffer); + snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", + table_options_.use_delta_encoding); + ret.append(buffer); snprintf(buffer, kBufferSize, " filter_policy: %s\n", - table_options_.filter_policy == nullptr ? - "nullptr" : table_options_.filter_policy->Name()); + table_options_.filter_policy == nullptr + ? "nullptr" + : table_options_.filter_policy->Name()); ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); ret.append(buffer); + snprintf(buffer, kBufferSize, " verify_compression: %d\n", + table_options_.verify_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", + table_options_.read_amp_bytes_per_bit); + ret.append(buffer); snprintf(buffer, kBufferSize, " format_version: %d\n", table_options_.format_version); ret.append(buffer); + snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", + table_options_.enable_index_compression); + ret.append(buffer); + snprintf(buffer, kBufferSize, " block_align: %d\n", + table_options_.block_align); + ret.append(buffer); return ret; } @@ -249,7 +437,7 @@ Status BlockBasedTableFactory::GetOptionString( } #else Status BlockBasedTableFactory::GetOptionString( - std::string* opt_string, const std::string& delimiter) const { + std::string* /*opt_string*/, const std::string& /*delimiter*/) const { return Status::OK(); } #endif // !ROCKSDB_LITE @@ -270,11 +458,31 @@ std::string ParseBlockBasedTableOption(const std::string& name, if (!input_strings_escaped) { // if the input string is not escaped, it means this function is // invoked from SetOptions, which takes the old format. - if (name == "block_cache") { - new_options->block_cache = NewLRUCache(ParseSizeT(value)); - return ""; - } else if (name == "block_cache_compressed") { - new_options->block_cache_compressed = NewLRUCache(ParseSizeT(value)); + if (name == "block_cache" || name == "block_cache_compressed") { + // cache options can be specified in the following format + // "block_cache={capacity=1M;num_shard_bits=4; + // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" + // To support backward compatibility, the following format + // is also supported. + // "block_cache=1M" + std::shared_ptr cache; + // block_cache is specified in format block_cache=. + if (value.find('=') == std::string::npos) { + cache = NewLRUCache(ParseSizeT(value)); + } else { + LRUCacheOptions cache_opts; + if (!ParseOptionHelper(reinterpret_cast(&cache_opts), + OptionType::kLRUCacheOptions, value)) { + return "Invalid cache options"; + } + cache = NewLRUCache(cache_opts); + } + + if (name == "block_cache") { + new_options->block_cache = cache; + } else { + new_options->block_cache_compressed = cache; + } return ""; } else if (name == "filter_policy") { // Expect the following format @@ -347,6 +555,8 @@ Status GetBlockBasedTableOptionsFromMap( (iter->second.verification != OptionVerificationType::kByName && iter->second.verification != OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && iter->second.verification != OptionVerificationType::kDeprecated)) { // Restore "new_options" to the default "base_options". 
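[Editorial note] Per the comment in the hunk above, both encodings should now be accepted for block_cache and block_cache_compressed (values here are illustrative):

  block_cache=1M
  block_cache={capacity=1M;num_shard_bits=4;strict_capacity_limit=true;high_pri_pool_ratio=0.5;}

The braced form is parsed into LRUCacheOptions, so future cache tunables can ride along without new top-level option names, while the bare-size form stays backward compatible.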
*new_table_options = table_options; diff --git a/thirdparty/rocksdb/table/block_based_table_factory.h b/thirdparty/rocksdb/table/block_based_table_factory.h index 39e3eac0b3..100bb0bc41 100644 --- a/thirdparty/rocksdb/table/block_based_table_factory.h +++ b/thirdparty/rocksdb/table/block_based_table_factory.h @@ -23,9 +23,24 @@ namespace rocksdb { struct EnvOptions; -using std::unique_ptr; class BlockBasedTableBuilder; +// A class used to track actual bytes written from the tail in the recent SST +// file opens, and provide a suggestion for following open. +class TailPrefetchStats { + public: + void RecordEffectiveSize(size_t len); + // 0 indicates no information to determine. + size_t GetSuggestedPrefetchSize(); + + private: + const static size_t kNumTracked = 32; + size_t records_[kNumTracked]; + port::Mutex mutex_; + size_t next_ = 0; + size_t num_records_ = 0; +}; + class BlockBasedTableFactory : public TableFactory { public: explicit BlockBasedTableFactory( @@ -37,8 +52,8 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( @@ -64,6 +79,7 @@ class BlockBasedTableFactory : public TableFactory { private: BlockBasedTableOptions table_options_; + mutable TailPrefetchStats tail_prefetch_stats_; }; extern const std::string kHashIndexPrefixesBlock; @@ -106,6 +122,14 @@ static std::unordered_map {"hash_index_allow_collision", {offsetof(struct BlockBasedTableOptions, hash_index_allow_collision), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"data_block_index_type", + {offsetof(struct BlockBasedTableOptions, data_block_index_type), + OptionType::kBlockBasedTableDataBlockIndexType, + OptionVerificationType::kNormal, false, 0}}, + {"data_block_hash_table_util_ratio", + {offsetof(struct BlockBasedTableOptions, + data_block_hash_table_util_ratio), + OptionType::kDouble, OptionVerificationType::kNormal, false, 0}}, {"checksum", {offsetof(struct BlockBasedTableOptions, checksum), OptionType::kChecksumType, OptionVerificationType::kNormal, false, @@ -152,6 +176,16 @@ static std::unordered_map OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"read_amp_bytes_per_bit", {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), - OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}}; + OptionType::kSizeT, OptionVerificationType::kNormal, false, 0}}, + {"enable_index_compression", + {offsetof(struct BlockBasedTableOptions, enable_index_compression), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"block_align", + {offsetof(struct BlockBasedTableOptions, block_align), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, + {"pin_top_level_index_and_filter", + {offsetof(struct BlockBasedTableOptions, + pin_top_level_index_and_filter), + OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}}; #endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_based_table_reader.cc b/thirdparty/rocksdb/table/block_based_table_reader.cc index d8c6d807c8..dc2d4263ee 100644 --- a/thirdparty/rocksdb/table/block_based_table_reader.cc +++ b/thirdparty/rocksdb/table/block_based_table_reader.cc @@ -9,6 +9,7 @@ #include "table/block_based_table_reader.h" #include +#include 
#include #include #include @@ -30,6 +31,7 @@ #include "table/block.h" #include "table/block_based_filter_block.h" #include "table/block_based_table_factory.h" +#include "table/block_fetcher.h" #include "table/block_prefix_index.h" #include "table/filter_block.h" #include "table/format.h" @@ -44,17 +46,18 @@ #include "monitoring/perf_context_imp.h" #include "util/coding.h" +#include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/sync_point.h" +#include "util/xxhash.h" namespace rocksdb { extern const uint64_t kBlockBasedTableMagicNumber; extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesMetadataBlock; -using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; @@ -63,24 +66,29 @@ BlockBasedTable::~BlockBasedTable() { delete rep_; } +std::atomic BlockBasedTable::next_cache_key_id_(0); + namespace { // Read the block identified by "handle" from "file". // The only relevant option is options.verify_checksums for now. // On failure return non-OK. // On success fill *result and return OK - caller owns *result -// @param compression_dict Data for presetting the compression library's +// @param uncompression_dict Data for presetting the compression library's // dictionary. Status ReadBlockFromFile( RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, const ImmutableCFOptions& ioptions, - bool do_uncompress, const Slice& compression_dict, + bool do_uncompress, bool maybe_compressed, + const UncompressionDict& uncompression_dict, const PersistentCacheOptions& cache_options, SequenceNumber global_seqno, - size_t read_amp_bytes_per_bit) { + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) { BlockContents contents; - Status s = ReadBlockContents(file, prefetch_buffer, footer, options, handle, - &contents, ioptions, do_uncompress, - compression_dict, cache_options); + BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle, + &contents, ioptions, do_uncompress, + maybe_compressed, uncompression_dict, + cache_options, memory_allocator); + Status s = block_fetcher.ReadBlockContents(); if (s.ok()) { result->reset(new Block(std::move(contents), global_seqno, read_amp_bytes_per_bit, ioptions.statistics)); @@ -89,21 +97,36 @@ Status ReadBlockFromFile( return s; } +inline MemoryAllocator* GetMemoryAllocator( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache.get() + ? table_options.block_cache->memory_allocator() + : nullptr; +} + +inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( + const BlockBasedTableOptions& table_options) { + return table_options.block_cache_compressed.get() + ? table_options.block_cache_compressed->memory_allocator() + : nullptr; +} + // Delete the resource that is held by the iterator. template -void DeleteHeldResource(void* arg, void* ignored) { +void DeleteHeldResource(void* arg, void* /*ignored*/) { delete reinterpret_cast(arg); } // Delete the entry resided in the cache. 
 template <class Entry>
-void DeleteCachedEntry(const Slice& key, void* value) {
+void DeleteCachedEntry(const Slice& /*key*/, void* value) {
   auto entry = reinterpret_cast<Entry*>(value);
   delete entry;
 }

 void DeleteCachedFilterEntry(const Slice& key, void* value);
 void DeleteCachedIndexEntry(const Slice& key, void* value);
+void DeleteCachedUncompressionDictEntry(const Slice& key, void* value);

 // Release the cached entry and decrement its ref count.
 void ReleaseCachedEntry(void* arg, void* h) {
@@ -112,6 +135,13 @@ void ReleaseCachedEntry(void* arg, void* h) {
   cache->Release(handle);
 }

+// Release the cached entry and decrement its ref count.
+void ForceReleaseCachedEntry(void* arg, void* h) {
+  Cache* cache = reinterpret_cast<Cache*>(arg);
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  cache->Release(handle, true /* force_erase */);
+}
+
 Slice GetCacheKeyFromOffset(const char* cache_key_prefix,
                             size_t cache_key_prefix_size, uint64_t offset,
                             char* cache_key) {
@@ -124,29 +154,72 @@ Slice GetCacheKeyFromOffset(const char* cache_key_prefix,
 }

 Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
-                                 Tickers block_cache_miss_ticker,
+                                 int level, Tickers block_cache_miss_ticker,
                                  Tickers block_cache_hit_ticker,
-                                 Statistics* statistics) {
+                                 uint64_t* block_cache_miss_stats,
+                                 uint64_t* block_cache_hit_stats,
+                                 Statistics* statistics,
+                                 GetContext* get_context) {
   auto cache_handle = block_cache->Lookup(key, statistics);
   if (cache_handle != nullptr) {
     PERF_COUNTER_ADD(block_cache_hit_count, 1);
-    // overall cache hit
-    RecordTick(statistics, BLOCK_CACHE_HIT);
-    // total bytes read from cache
-    RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
-               block_cache->GetUsage(cache_handle));
-    // block-type specific cache hit
-    RecordTick(statistics, block_cache_hit_ticker);
+    PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1,
+                              static_cast<uint32_t>(level));
+    if (get_context != nullptr) {
+      // overall cache hit
+      get_context->get_context_stats_.num_cache_hit++;
+      // total bytes read from cache
+      get_context->get_context_stats_.num_cache_bytes_read +=
+          block_cache->GetUsage(cache_handle);
+      // block-type specific cache hit
+      (*block_cache_hit_stats)++;
+    } else {
+      // overall cache hit
+      RecordTick(statistics, BLOCK_CACHE_HIT);
+      // total bytes read from cache
+      RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
+                 block_cache->GetUsage(cache_handle));
+      RecordTick(statistics, block_cache_hit_ticker);
+    }
   } else {
-    // overall cache miss
-    RecordTick(statistics, BLOCK_CACHE_MISS);
-    // block-type specific cache miss
-    RecordTick(statistics, block_cache_miss_ticker);
+    PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 1,
+                              static_cast<uint32_t>(level));
+    if (get_context != nullptr) {
+      // overall cache miss
+      get_context->get_context_stats_.num_cache_miss++;
+      // block-type specific cache miss
+      (*block_cache_miss_stats)++;
+    } else {
+      RecordTick(statistics, BLOCK_CACHE_MISS);
+      RecordTick(statistics, block_cache_miss_ticker);
+    }
   }

   return cache_handle;
 }

+// For hash based index, return true if prefix_extractor and
+// prefix_extractor_block mismatch, false otherwise. This flag will be used
+// as total_order_seek via NewIndexIterator
+bool PrefixExtractorChanged(const TableProperties* table_properties,
+                            const SliceTransform* prefix_extractor) {
+  // BlockBasedTableOptions::kHashSearch requires prefix_extractor to be set.
+  // Turn off hash index if prefix_extractor is not set; if prefix_extractor
+  // is set but prefix_extractor_block is not set, also disable hash index
+  if (prefix_extractor == nullptr || table_properties == nullptr ||
+      table_properties->prefix_extractor_name.empty()) {
+    return true;
+  }
+
+  // prefix_extractor and prefix_extractor_block are both non-empty
+  if (table_properties->prefix_extractor_name.compare(
+          prefix_extractor->Name()) != 0) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 }  // namespace

 // Index that allows binary search lookup in a two-level index structure.
@@ -163,137 +236,164 @@ class PartitionIndexReader : public IndexReader, public Cleanable {
                        const InternalKeyComparator* icomparator,
                        IndexReader** index_reader,
                        const PersistentCacheOptions& cache_options,
-                       const int level) {
+                       const int level, const bool index_key_includes_seq,
+                       const bool index_value_is_full,
+                       MemoryAllocator* memory_allocator) {
    std::unique_ptr<Block> index_block;
    auto s = ReadBlockFromFile(
        file, prefetch_buffer, footer, ReadOptions(), index_handle,
        &index_block, ioptions, true /* decompress */,
-       Slice() /*compression dict*/, cache_options,
-       kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */);
+       true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
+       cache_options, kDisableGlobalSequenceNumber,
+       0 /* read_amp_bytes_per_bit */, memory_allocator);

    if (s.ok()) {
-     *index_reader =
-         new PartitionIndexReader(table, icomparator, std::move(index_block),
-                                  ioptions.statistics, level);
+     *index_reader = new PartitionIndexReader(
+         table, icomparator, std::move(index_block), ioptions.statistics,
+         level, index_key_includes_seq, index_value_is_full);
    }

    return s;
  }

  // return a two-level iterator: first level is on the partition index
- virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
-                                       bool dont_care = true) override {
+ InternalIteratorBase<BlockHandle>* NewIterator(
+     IndexBlockIter* /*iter*/ = nullptr, bool /*dont_care*/ = true,
+     bool fill_cache = true) override {
+   Statistics* kNullStats = nullptr;
   // Filters are already checked before seeking the index
-  const bool skip_filters = true;
-  const bool is_index = true;
-  return NewTwoLevelIterator(
-      new BlockBasedTable::BlockEntryIteratorState(
-          table_, ReadOptions(), icomparator_, skip_filters, is_index,
-          partition_map_.size() ? &partition_map_ : nullptr),
-      index_block_->NewIterator(icomparator_, nullptr, true));
+  if (!partition_map_.empty()) {
+    // We don't return pinned data from index blocks, so no need
+    // to set `block_contents_pinned`.
+    return NewTwoLevelIterator(
+        new BlockBasedTable::PartitionedIndexIteratorState(
+            table_, &partition_map_, index_key_includes_seq_,
+            index_value_is_full_),
+        index_block_->NewIterator<IndexBlockIter>(
+            icomparator_, icomparator_->user_comparator(), nullptr,
+            kNullStats, true, index_key_includes_seq_, index_value_is_full_));
+  } else {
+    auto ro = ReadOptions();
+    ro.fill_cache = fill_cache;
+    bool kIsIndex = true;
+    // We don't return pinned data from index blocks, so no need
+    // to set `block_contents_pinned`.
+    return new BlockBasedTableIterator<IndexBlockIter, BlockHandle>(
+        table_, ro, *icomparator_,
+        index_block_->NewIterator<IndexBlockIter>(
+            icomparator_, icomparator_->user_comparator(), nullptr,
+            kNullStats, true, index_key_includes_seq_, index_value_is_full_),
+        false, true, /* prefix_extractor */ nullptr, kIsIndex,
+        index_key_includes_seq_, index_value_is_full_);
+  }
   // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
   // on-stack BlockIter while the state is on heap. Currently it assumes
-  // the first level iter is always on heap and will attempt to delete it
-  // in its destructor.
- }
-
- virtual void CacheDependencies(bool pin) override {
+ void CacheDependencies(bool pin) override {
   // Before read partitions, prefetch them to avoid lots of IOs
   auto rep = table_->rep_;
-  BlockIter biter;
+  IndexBlockIter biter;
   BlockHandle handle;
-  index_block_->NewIterator(icomparator_, &biter, true);
+  Statistics* kNullStats = nullptr;
+  // We don't return pinned data from index blocks, so no need
+  // to set `block_contents_pinned`.
+  index_block_->NewIterator<IndexBlockIter>(
+      icomparator_, icomparator_->user_comparator(), &biter, kNullStats, true,
+      index_key_includes_seq_, index_value_is_full_);
   // Index partitions are assumed to be consecutive. Prefetch them all.
   // Read the first block offset
   biter.SeekToFirst();
-  Slice input = biter.value();
-  Status s = handle.DecodeFrom(&input);
-  assert(s.ok());
-  if (!s.ok()) {
-    ROCKS_LOG_WARN(rep->ioptions.info_log,
-                   "Could not read first index partition");
+  if (!biter.Valid()) {
+    // Empty index.
    return;
  }
+  handle = biter.value();
  uint64_t prefetch_off = handle.offset();

  // Read the last block's offset
  biter.SeekToLast();
-  input = biter.value();
-  s = handle.DecodeFrom(&input);
-  assert(s.ok());
-  if (!s.ok()) {
-    ROCKS_LOG_WARN(rep->ioptions.info_log,
-                   "Could not read last index partition");
+  if (!biter.Valid()) {
+    // Empty index.
    return;
  }
+  handle = biter.value();
  uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize;
  uint64_t prefetch_len = last_off - prefetch_off;
  std::unique_ptr<FilePrefetchBuffer> prefetch_buffer;
  auto& file = table_->rep_->file;
  prefetch_buffer.reset(new FilePrefetchBuffer());
-  s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len);
+  Status s = prefetch_buffer->Prefetch(file.get(), prefetch_off,
+                                       static_cast<size_t>(prefetch_len));

  // After prefetch, read the partitions one by one
  biter.SeekToFirst();
  auto ro = ReadOptions();
  Cache* block_cache = rep->table_options.block_cache.get();
  for (; biter.Valid(); biter.Next()) {
-    input = biter.value();
-    s = handle.DecodeFrom(&input);
-    assert(s.ok());
-    if (!s.ok()) {
-      ROCKS_LOG_WARN(rep->ioptions.info_log,
-                     "Could not read index partition");
-      continue;
-    }
-
+    handle = biter.value();
    BlockBasedTable::CachableEntry<Block> block;
-    Slice compression_dict;
-    if (rep->compression_dict_block) {
-      compression_dict = rep->compression_dict_block->data;
-    }
    const bool is_index = true;
-    s = table_->MaybeLoadDataBlockToCache(prefetch_buffer.get(), rep, ro,
-                                          handle, compression_dict, &block,
-                                          is_index);
+    // TODO: Support counter batch update for partitioned index and
+    // filter blocks
+    s = table_->MaybeReadBlockAndLoadToCache(
+        prefetch_buffer.get(), rep, ro, handle,
+        UncompressionDict::GetEmptyDict(), &block, is_index,
+        nullptr /* get_context */);

    assert(s.ok() || block.value == nullptr);
    if (s.ok() && block.value != nullptr) {
-      assert(block.cache_handle != nullptr);
-      if (pin) {
-        partition_map_[handle.offset()] = block;
-        RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle);
+      if (block.cache_handle != nullptr) {
+        if (pin) {
+          partition_map_[handle.offset()] = block;
+          RegisterCleanup(&ReleaseCachedEntry, block_cache,
+                          block.cache_handle);
+        } else {
+          block_cache->Release(block.cache_handle);
+        }
      } else {
-        block_cache->Release(block.cache_handle);
+        delete block.value;
      }
    }
  }
 }

- virtual size_t size() const override { return index_block_->size(); }
- virtual size_t usable_size() const override {
-   return index_block_->usable_size();
- } + size_t size() const override { return index_block_->size(); } + size_t usable_size() const override { return index_block_->usable_size(); } - virtual size_t ApproximateMemoryUsage() const override { + size_t ApproximateMemoryUsage() const override { assert(index_block_); - return index_block_->ApproximateMemoryUsage(); + size_t usage = index_block_->ApproximateMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + // TODO(myabandeh): more accurate estimate of partition_map_ mem usage + return usage; } private: PartitionIndexReader(BlockBasedTable* table, const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, Statistics* stats, - const int level) + const int /*level*/, const bool index_key_includes_seq, + const bool index_value_is_full) : IndexReader(icomparator, stats), table_(table), - index_block_(std::move(index_block)) { + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } BlockBasedTable* table_; std::unique_ptr index_block_; std::unordered_map> partition_map_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that allows binary search lookup for the first key of each block. @@ -311,67 +411,89 @@ class BinarySearchIndexReader : public IndexReader { const ImmutableCFOptions& ioptions, const InternalKeyComparator* icomparator, IndexReader** index_reader, - const PersistentCacheOptions& cache_options) { + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, + const bool index_value_is_full, + MemoryAllocator* memory_allocator) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, &index_block, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, memory_allocator); if (s.ok()) { *index_reader = new BinarySearchIndexReader( - icomparator, std::move(index_block), ioptions.statistics); + icomparator, std::move(index_block), ioptions.statistics, + index_key_includes_seq, index_value_is_full); } return s; } - virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, - bool dont_care = true) override { - return index_block_->NewIterator(icomparator_, iter, true); + InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool /*dont_care*/ = true, + bool /*dont_care*/ = true) override { + Statistics* kNullStats = nullptr; + // We don't return pinned datat from index blocks, so no need + // to set `block_contents_pinned`. 
+ return index_block_->NewIterator( + icomparator_, icomparator_->user_comparator(), iter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); } - virtual size_t size() const override { return index_block_->size(); } - virtual size_t usable_size() const override { - return index_block_->usable_size(); - } + size_t size() const override { return index_block_->size(); } + size_t usable_size() const override { return index_block_->usable_size(); } - virtual size_t ApproximateMemoryUsage() const override { + size_t ApproximateMemoryUsage() const override { assert(index_block_); - return index_block_->ApproximateMemoryUsage(); + size_t usage = index_block_->ApproximateMemoryUsage(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } private: BinarySearchIndexReader(const InternalKeyComparator* icomparator, std::unique_ptr&& index_block, - Statistics* stats) - : IndexReader(icomparator, stats), index_block_(std::move(index_block)) { + Statistics* stats, const bool index_key_includes_seq, + const bool index_value_is_full) + : IndexReader(icomparator, stats), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } std::unique_ptr index_block_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Index that leverages an internal hash table to quicken the lookup for a given // key. class HashIndexReader : public IndexReader { public: - static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const ImmutableCFOptions& ioptions, - const InternalKeyComparator* icomparator, - const BlockHandle& index_handle, - InternalIterator* meta_index_iter, - IndexReader** index_reader, - bool hash_index_allow_collision, - const PersistentCacheOptions& cache_options) { + static Status Create( + const SliceTransform* hash_key_extractor, const Footer& footer, + RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icomparator, const BlockHandle& index_handle, + InternalIterator* meta_index_iter, IndexReader** index_reader, + bool /*hash_index_allow_collision*/, + const PersistentCacheOptions& cache_options, + const bool index_key_includes_seq, const bool index_value_is_full, + MemoryAllocator* memory_allocator) { std::unique_ptr index_block; auto s = ReadBlockFromFile( file, prefetch_buffer, footer, ReadOptions(), index_handle, &index_block, ioptions, true /* decompress */, - Slice() /*compression dict*/, cache_options, - kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */); + true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, kDisableGlobalSequenceNumber, + 0 /* read_amp_bytes_per_bit */, memory_allocator); if (!s.ok()) { return s; @@ -381,9 +503,9 @@ class HashIndexReader : public IndexReader { // hard error. We can still fall back to the original binary search index. // So, Create will succeed regardless, from this point on. 
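[Editorial note] The comment above describes a deliberate degrade-gracefully policy: once the binary-search-capable reader exists, a failure to load the prefix structures only costs the hash acceleration. A toy model of that shape (all names illustrative, not RocksDB APIs):

  #include <iostream>
  #include <optional>
  #include <string>

  // The index block is mandatory; the prefix index is best-effort.
  struct ToyIndexReader {
    std::string index_block;
    std::optional<std::string> prefix_index;
  };

  bool ReadIndexBlock(std::string* out) { *out = "index"; return true; }
  bool ReadPrefixesBlocks(std::string* out) { return false; /* simulated miss */ }

  bool Create(ToyIndexReader* reader) {
    if (!ReadIndexBlock(&reader->index_block)) {
      return false;  // only this failure is fatal
    }
    std::string prefixes;
    if (ReadPrefixesBlocks(&prefixes)) {
      reader->prefix_index = prefixes;  // attach the accelerator
    }  // otherwise fall back silently: "Create will succeed regardless"
    return true;
  }

  int main() {
    ToyIndexReader reader;
    std::cout << (Create(&reader)
                      ? (reader.prefix_index ? "hash index" : "binary search")
                      : "error")
              << "\n";  // prints "binary search"
  }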
- auto new_index_reader =
- new HashIndexReader(icomparator, std::move(index_block),
- ioptions.statistics);
+ auto new_index_reader = new HashIndexReader(
+ icomparator, std::move(index_block), ioptions.statistics,
+ index_key_includes_seq, index_value_is_full);
*index_reader = new_index_reader;
// Get prefixes block
@@ -406,18 +528,22 @@ class HashIndexReader : public IndexReader {
// Read contents for the blocks
BlockContents prefixes_contents;
- s = ReadBlockContents(file, prefetch_buffer, footer, ReadOptions(),
- prefixes_handle, &prefixes_contents, ioptions,
- true /* decompress */, Slice() /*compression dict*/,
- cache_options);
+ BlockFetcher prefixes_block_fetcher(
+ file, prefetch_buffer, footer, ReadOptions(), prefixes_handle,
+ &prefixes_contents, ioptions, true /*decompress*/,
+ true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
+ cache_options, memory_allocator);
+ s = prefixes_block_fetcher.ReadBlockContents();
if (!s.ok()) {
return s;
}
BlockContents prefixes_meta_contents;
- s = ReadBlockContents(file, prefetch_buffer, footer, ReadOptions(),
- prefixes_meta_handle, &prefixes_meta_contents,
- ioptions, true /* decompress */,
- Slice() /*compression dict*/, cache_options);
+ BlockFetcher prefixes_meta_block_fetcher(
+ file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle,
+ &prefixes_meta_contents, ioptions, true /*decompress*/,
+ true /*maybe_compressed*/, UncompressionDict::GetEmptyDict(),
+ cache_options, memory_allocator);
+ s = prefixes_meta_block_fetcher.ReadBlockContents();
if (!s.ok()) {
// TODO: log error
return Status::OK();
@@ -428,40 +554,61 @@ class HashIndexReader : public IndexReader {
prefixes_meta_contents.data, &prefix_index);
// TODO: log error
if (s.ok()) {
- new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index);
+ new_index_reader->prefix_index_.reset(prefix_index);
}
return Status::OK();
}
- virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
- bool total_order_seek = true) override {
- return index_block_->NewIterator(icomparator_, iter, total_order_seek);
+ InternalIteratorBase<BlockHandle>* NewIterator(
+ IndexBlockIter* iter = nullptr, bool total_order_seek = true,
+ bool /*dont_care*/ = true) override {
+ Statistics* kNullStats = nullptr;
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ return index_block_->NewIterator( + icomparator_, icomparator_->user_comparator(), iter, kNullStats, + total_order_seek, index_key_includes_seq_, index_value_is_full_, + false /* block_contents_pinned */, prefix_index_.get()); } - virtual size_t size() const override { return index_block_->size(); } - virtual size_t usable_size() const override { - return index_block_->usable_size(); - } + size_t size() const override { return index_block_->size(); } + size_t usable_size() const override { return index_block_->usable_size(); } - virtual size_t ApproximateMemoryUsage() const override { + size_t ApproximateMemoryUsage() const override { assert(index_block_); - return index_block_->ApproximateMemoryUsage() + - prefixes_contents_.data.size(); + size_t usage = index_block_->ApproximateMemoryUsage(); + usage += prefixes_contents_.usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); +#else + if (prefix_index_) { + usage += prefix_index_->ApproximateMemoryUsage(); + } + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } private: HashIndexReader(const InternalKeyComparator* icomparator, - std::unique_ptr&& index_block, Statistics* stats) - : IndexReader(icomparator, stats), index_block_(std::move(index_block)) { + std::unique_ptr&& index_block, Statistics* stats, + const bool index_key_includes_seq, + const bool index_value_is_full) + : IndexReader(icomparator, stats), + index_block_(std::move(index_block)), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { assert(index_block_ != nullptr); } - ~HashIndexReader() { - } + ~HashIndexReader() override {} std::unique_ptr index_block_; + std::unique_ptr prefix_index_; BlockContents prefixes_contents_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; }; // Helper function to setup the cache key's prefix for the Table. @@ -488,9 +635,8 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { } } -void BlockBasedTable::GenerateCachePrefix(Cache* cc, - RandomAccessFile* file, char* buffer, size_t* size) { - +void BlockBasedTable::GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size) { // generate an id from the file *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); @@ -502,9 +648,8 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -void BlockBasedTable::GenerateCachePrefix(Cache* cc, - WritableFile* file, char* buffer, size_t* size) { - +void BlockBasedTable::GenerateCachePrefix(Cache* cc, WritableFile* file, + char* buffer, size_t* size) { // generate an id from the file *size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize); @@ -535,51 +680,78 @@ bool IsFeatureSupported(const TableProperties& table_properties, return true; } -SequenceNumber GetGlobalSequenceNumber(const TableProperties& table_properties, - Logger* info_log) { - auto& props = table_properties.user_collected_properties; - - auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); - auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); +// Caller has to ensure seqno is not nullptr. 
+Status GetGlobalSequenceNumber(const TableProperties& table_properties, + SequenceNumber largest_seqno, + SequenceNumber* seqno) { + const auto& props = table_properties.user_collected_properties; + const auto version_pos = props.find(ExternalSstFilePropertyNames::kVersion); + const auto seqno_pos = props.find(ExternalSstFilePropertyNames::kGlobalSeqno); + *seqno = kDisableGlobalSequenceNumber; if (version_pos == props.end()) { if (seqno_pos != props.end()) { + std::array msg_buf; // This is not an external sst file, global_seqno is not supported. - assert(false); - ROCKS_LOG_ERROR( - info_log, + snprintf( + msg_buf.data(), msg_buf.max_size(), "A non-external sst file have global seqno property with value %s", seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } uint32_t version = DecodeFixed32(version_pos->second.c_str()); if (version < 2) { if (seqno_pos != props.end() || version != 1) { + std::array msg_buf; // This is a v1 external sst file, global_seqno is not supported. - assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %s", - version, seqno_pos->second.c_str()); + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno " + "property with value %s", + version, seqno_pos->second.c_str()); + return Status::Corruption(msg_buf.data()); } - return kDisableGlobalSequenceNumber; + return Status::OK(); } - SequenceNumber global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + // Since we have a plan to deprecate global_seqno, we do not return failure + // if seqno_pos == props.end(). We rely on version_pos to detect whether the + // SST is external. + SequenceNumber global_seqno(0); + if (seqno_pos != props.end()) { + global_seqno = DecodeFixed64(seqno_pos->second.c_str()); + } + // SstTableReader open table reader with kMaxSequenceNumber as largest_seqno + // to denote it is unknown. 
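For example, a file ingested at sequence number 42 whose stored property still reads 0 comes out as 42. A compact restatement of the rules the code below applies (illustrative, assuming a v2+ external SST):

// stored == 0                 -> *seqno = largest_seqno   (repair)
// stored == largest_seqno     -> *seqno = stored          (consistent)
// stored != largest_seqno     -> Status::Corruption       (mismatch)
// stored > kMaxSequenceNumber -> Status::Corruption       (out of range)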
+ if (largest_seqno < kMaxSequenceNumber) { + if (global_seqno == 0) { + global_seqno = largest_seqno; + } + if (global_seqno != largest_seqno) { + std::array msg_buf; + snprintf( + msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %s, while largest seqno in the file is %llu", + version, seqno_pos->second.c_str(), + static_cast(largest_seqno)); + return Status::Corruption(msg_buf.data()); + } + } + *seqno = global_seqno; if (global_seqno > kMaxSequenceNumber) { - assert(false); - ROCKS_LOG_ERROR( - info_log, - "An external sst file with version %u have global seqno property " - "with value %llu, which is greater than kMaxSequenceNumber", - version, global_seqno); + std::array msg_buf; + snprintf(msg_buf.data(), msg_buf.max_size(), + "An external sst file with version %u have global seqno property " + "with value %llu, which is greater than kMaxSequenceNumber", + version, static_cast(global_seqno)); + return Status::Corruption(msg_buf.data()); } - return global_seqno; + return Status::OK(); } } // namespace @@ -599,36 +771,36 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + std::unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor, const bool prefetch_index_and_filter_in_cache, - const bool skip_filters, const int level) { + const bool skip_filters, const int level, + const bool immortal_table, + const SequenceNumber largest_seqno, + TailPrefetchStats* tail_prefetch_stats) { table_reader->reset(); + Status s; Footer footer; - std::unique_ptr prefetch_buffer; - // Before read footer, readahead backwards to prefetch data - const size_t kTailPrefetchSize = 512 * 1024; - size_t prefetch_off; - size_t prefetch_len; - if (file_size < kTailPrefetchSize) { - prefetch_off = 0; - prefetch_len = file_size; - } else { - prefetch_off = file_size - kTailPrefetchSize; - prefetch_len = kTailPrefetchSize; - } - Status s; - // TODO should not have this special logic in the future. - if (!file->use_direct_io()) { - s = file->Prefetch(prefetch_off, prefetch_len); - } else { - prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); - } + // prefetch both index and filters, down to all partitions + const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; + const bool preload_all = !table_options.cache_index_and_filter_blocks; + + s = PrefetchTail(file.get(), file_size, tail_prefetch_stats, prefetch_all, + preload_all, &prefetch_buffer); + + // Read in the following order: + // 1. Footer + // 2. [metaindex block] + // 3. [meta block: properties] + // 4. [meta block: range deletion tombstone] + // 5. [meta block: compression dictionary] + // 6. [meta block: index] + // 7. [meta block: filter] s = ReadFooterFromFile(file.get(), prefetch_buffer.get(), file_size, &footer, kBlockBasedTableMagicNumber); if (!s.ok()) { @@ -645,7 +817,8 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // raw pointer will be used to create HashIndexReader, whose reset may // access a dangling pointer. 
Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters); + internal_comparator, skip_filters, level, + immortal_table); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -653,18 +826,18 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. rep->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep->ioptions.prefix_extractor)); + new InternalKeySliceTransform(prefix_extractor)); SetupCacheKeyPrefix(rep, file_size); - unique_ptr new_table(new BlockBasedTable(rep)); + std::unique_ptr new_table(new BlockBasedTable(rep)); // page cache options rep->persistent_cache_options = PersistentCacheOptions(rep->table_options.persistent_cache, std::string(rep->persistent_cache_key_prefix, rep->persistent_cache_key_prefix_size), - rep->ioptions.statistics); + rep->ioptions.statistics); - // Read meta index + // Read metaindex std::unique_ptr meta; std::unique_ptr meta_iter; s = ReadMetaBlock(rep, prefetch_buffer.get(), &meta, &meta_iter); @@ -672,38 +845,147 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, return s; } - // Find filter handle and filter type - if (rep->filter_policy) { - for (auto filter_type : - {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, - Rep::FilterType::kBlockFilter}) { - std::string prefix; - switch (filter_type) { - case Rep::FilterType::kFullFilter: - prefix = kFullFilterBlockPrefix; - break; - case Rep::FilterType::kPartitionedFilter: - prefix = kPartitionedFilterBlockPrefix; - break; - case Rep::FilterType::kBlockFilter: - prefix = kFilterBlockPrefix; - break; - default: - assert(0); - } - std::string filter_block_key = prefix; - filter_block_key.append(rep->filter_policy->Name()); - if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle) - .ok()) { - rep->filter_type = filter_type; - break; - } + s = ReadPropertiesBlock(rep, prefetch_buffer.get(), meta_iter.get(), + largest_seqno); + if (!s.ok()) { + return s; + } + s = ReadRangeDelBlock(rep, prefetch_buffer.get(), meta_iter.get(), + internal_comparator); + if (!s.ok()) { + return s; + } + s = PrefetchIndexAndFilterBlocks(rep, prefetch_buffer.get(), meta_iter.get(), + new_table.get(), prefix_extractor, + prefetch_all, table_options, level, + prefetch_index_and_filter_in_cache); + + if (s.ok()) { + // Update tail prefetch stats + assert(prefetch_buffer.get() != nullptr); + if (tail_prefetch_stats != nullptr) { + assert(prefetch_buffer->min_offset_read() < file_size); + tail_prefetch_stats->RecordEffectiveSize( + static_cast(file_size) - prefetch_buffer->min_offset_read()); } + + *table_reader = std::move(new_table); + } + + return s; +} + +Status BlockBasedTable::PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr* prefetch_buffer) { + size_t tail_prefetch_size = 0; + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. 
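A quick worked example of the sizing below: with no prefetch history, tail_prefetch_size is 512 KiB when index/filter blocks will be read up front (prefetch_all or preload_all) and 4 KiB otherwise, and the offset arithmetic then clamps to the file:

// Assuming the 512 KiB branch:
//   file_size = 100 KiB -> prefetch_off = 0,       prefetch_len = 100 KiB
//   file_size = 4 MiB   -> prefetch_off = 3.5 MiB, prefetch_len = 512 KiB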
+ // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the properties, + // at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + } + size_t prefetch_off; + size_t prefetch_len; + if (file_size < tail_prefetch_size) { + prefetch_off = 0; + prefetch_len = static_cast(file_size); + } else { + prefetch_off = static_cast(file_size - tail_prefetch_size); + prefetch_len = tail_prefetch_size; + } + TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", + &tail_prefetch_size); + Status s; + // TODO should not have this special logic in the future. + if (!file->use_direct_io()) { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, false, true)); + s = file->Prefetch(prefetch_off, prefetch_len); + } else { + prefetch_buffer->reset(new FilePrefetchBuffer(nullptr, 0, 0, true, true)); + s = (*prefetch_buffer)->Prefetch(file, prefetch_off, prefetch_len); + } + return s; +} + +Status VerifyChecksum(const ChecksumType type, const char* buf, size_t len, + uint32_t expected) { + Status s; + uint32_t actual = 0; + switch (type) { + case kNoChecksum: + break; + case kCRC32c: + expected = crc32c::Unmask(expected); + actual = crc32c::Value(buf, len); + break; + case kxxHash: + actual = XXH32(buf, static_cast(len), 0); + break; + case kxxHash64: + actual = static_cast(XXH64(buf, static_cast(len), 0) & + uint64_t{0xffffffff}); + break; + default: + s = Status::Corruption("unknown checksum type"); + } + if (s.ok() && actual != expected) { + s = Status::Corruption("properties block checksum mismatched"); + } + return s; +} + +Status BlockBasedTable::TryReadPropertiesWithGlobalSeqno( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties) { + assert(table_properties != nullptr); + // If this is an external SST file ingested with write_global_seqno set to + // true, then we expect the checksum mismatch because checksum was written + // by SstFileWriter, but its global seqno in the properties block may have + // been changed during ingestion. In this case, we read the properties + // block, copy it to a memory buffer, change the global seqno to its + // original value, i.e. 0, and verify the checksum again. 
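Sketched out, the recovery path below has three steps (the block layout assumed here is payload, then a 1-byte type tag, then a 4-byte checksum, which is what the offsets in the code imply):

// 1. Re-read the raw properties block into tmp_buf, skipping verification.
// 2. Overwrite the 8-byte global-seqno slot with its pre-ingestion value:
//      EncodeFixed64(tmp_buf.get() + seqno_offset - block_offset, 0);
// 3. Recompute the checksum over block_size + 1 bytes (payload + type tag)
//    and compare it against the stored 4-byte value.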
+ BlockHandle props_block_handle; + CacheAllocationPtr tmp_buf; + Status s = ReadProperties(handle_value, rep->file.get(), prefetch_buffer, + rep->footer, rep->ioptions, table_properties, + false /* verify_checksum */, &props_block_handle, + &tmp_buf, false /* compression_type_missing */, + nullptr /* memory_allocator */); + if (s.ok() && tmp_buf) { + const auto seqno_pos_iter = + (*table_properties) + ->properties_offsets.find( + ExternalSstFilePropertyNames::kGlobalSeqno); + size_t block_size = props_block_handle.size(); + if (seqno_pos_iter != (*table_properties)->properties_offsets.end()) { + uint64_t global_seqno_offset = seqno_pos_iter->second; + EncodeFixed64( + tmp_buf.get() + global_seqno_offset - props_block_handle.offset(), 0); + } + uint32_t value = DecodeFixed32(tmp_buf.get() + block_size + 1); + s = rocksdb::VerifyChecksum(rep->footer.checksum(), tmp_buf.get(), + block_size + 1, value); } + return s; +} - // Read the properties +Status BlockBasedTable::ReadPropertiesBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const SequenceNumber largest_seqno) { bool found_properties_block = true; - s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); + Status s; + s = SeekToPropertiesBlock(meta_iter, &found_properties_block); if (!s.ok()) { ROCKS_LOG_WARN(rep->ioptions.info_log, @@ -713,9 +995,20 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, s = meta_iter->status(); TableProperties* table_properties = nullptr; if (s.ok()) { - s = ReadProperties(meta_iter->value(), rep->file.get(), - prefetch_buffer.get(), rep->footer, rep->ioptions, - &table_properties); + s = ReadProperties( + meta_iter->value(), rep->file.get(), prefetch_buffer, rep->footer, + rep->ioptions, &table_properties, true /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + false /* compression_type_missing */, nullptr /* memory_allocator */); + } + + if (s.IsCorruption()) { + s = TryReadPropertiesWithGlobalSeqno( + rep, prefetch_buffer, meta_iter->value(), &table_properties); + } + std::unique_ptr props_guard; + if (table_properties != nullptr) { + props_guard.reset(table_properties); } if (!s.ok()) { @@ -724,32 +1017,98 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, "block %s", s.ToString().c_str()); } else { - rep->table_properties.reset(table_properties); + assert(table_properties != nullptr); + rep->table_properties.reset(props_guard.release()); + rep->blocks_maybe_compressed = rep->table_properties->compression_name != + CompressionTypeToString(kNoCompression); + rep->blocks_definitely_zstd_compressed = + (rep->table_properties->compression_name == + CompressionTypeToString(kZSTD) || + rep->table_properties->compression_name == + CompressionTypeToString(kZSTDNotFinalCompression)); } } else { ROCKS_LOG_ERROR(rep->ioptions.info_log, "Cannot find Properties block from file."); } +#ifndef ROCKSDB_LITE + if (rep->table_properties) { + ParseSliceTransform(rep->table_properties->prefix_extractor_name, + &(rep->table_prefix_extractor)); + } +#endif // ROCKSDB_LITE + + // Read the table properties, if provided. 
+ if (rep->table_properties) { + rep->whole_key_filtering &= + IsFeatureSupported(*(rep->table_properties), + BlockBasedTablePropertyNames::kWholeKeyFiltering, + rep->ioptions.info_log); + rep->prefix_filtering &= IsFeatureSupported( + *(rep->table_properties), + BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log); + + s = GetGlobalSequenceNumber(*(rep->table_properties), largest_seqno, + &(rep->global_seqno)); + if (!s.ok()) { + ROCKS_LOG_ERROR(rep->ioptions.info_log, "%s", s.ToString().c_str()); + } + } + return s; +} - // Read the compression dictionary meta block - bool found_compression_dict; - s = SeekToCompressionDictBlock(meta_iter.get(), &found_compression_dict); +Status BlockBasedTable::ReadRangeDelBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator) { + Status s; + bool found_range_del_block; + BlockHandle range_del_handle; + s = SeekToRangeDelBlock(meta_iter, &found_range_del_block, &range_del_handle); if (!s.ok()) { ROCKS_LOG_WARN( rep->ioptions.info_log, - "Error when seeking to compression dictionary block from file: %s", + "Error when seeking to range delete tombstones block from file: %s", s.ToString().c_str()); - } else if (found_compression_dict) { - // TODO(andrewkr): Add to block cache if cache_index_and_filter_blocks is - // true. - unique_ptr compression_dict_block{new BlockContents()}; - // TODO(andrewkr): ReadMetaBlock repeats SeekToCompressionDictBlock(). - // maybe decode a handle from meta_iter - // and do ReadBlockContents(handle) instead - s = rocksdb::ReadMetaBlock(rep->file.get(), prefetch_buffer.get(), - file_size, kBlockBasedTableMagicNumber, - rep->ioptions, rocksdb::kCompressionDictBlock, - compression_dict_block.get()); + } else if (found_range_del_block && !range_del_handle.IsNull()) { + ReadOptions read_options; + std::unique_ptr iter(NewDataBlockIterator( + rep, read_options, range_del_handle, nullptr /* input_iter */, + false /* is_index */, true /* key_includes_seq */, + true /* index_key_is_full */, nullptr /* get_context */, Status(), + prefetch_buffer)); + assert(iter != nullptr); + s = iter->status(); + if (!s.ok()) { + ROCKS_LOG_WARN( + rep->ioptions.info_log, + "Encountered error while reading data from range del block %s", + s.ToString().c_str()); + } else { + rep->fragmented_range_dels = + std::make_shared(std::move(iter), + internal_comparator); + } + } + return s; +} + +Status BlockBasedTable::ReadCompressionDictBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block) { + assert(compression_dict_block != nullptr); + Status s; + if (!rep->compression_dict_handle.IsNull()) { + std::unique_ptr compression_dict_cont{new BlockContents()}; + PersistentCacheOptions cache_options; + ReadOptions read_options; + read_options.verify_checksums = true; + BlockFetcher compression_block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, read_options, + rep->compression_dict_handle, compression_dict_cont.get(), + rep->ioptions, false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options); + s = compression_block_fetcher.ReadBlockContents(); + if (!s.ok()) { ROCKS_LOG_WARN( rep->ioptions.info_log, @@ -757,124 +1116,178 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, "block %s", s.ToString().c_str()); } else { - rep->compression_dict_block = std::move(compression_dict_block); + *compression_dict_block = std::move(compression_dict_cont); } } + 
return s; +} - // Read the range del meta block - bool found_range_del_block; - s = SeekToRangeDelBlock(meta_iter.get(), &found_range_del_block, - &rep->range_del_handle); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Error when seeking to range delete tombstones block from file: %s", - s.ToString().c_str()); - } else { - if (found_range_del_block && !rep->range_del_handle.IsNull()) { - ReadOptions read_options; - s = MaybeLoadDataBlockToCache( - prefetch_buffer.get(), rep, read_options, rep->range_del_handle, - Slice() /* compression_dict */, &rep->range_del_entry); - if (!s.ok()) { - ROCKS_LOG_WARN( - rep->ioptions.info_log, - "Encountered error while reading data from range del block %s", - s.ToString().c_str()); +Status BlockBasedTable::PrefetchIndexAndFilterBlocks( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, + BlockBasedTable* new_table, const SliceTransform* prefix_extractor, + bool prefetch_all, const BlockBasedTableOptions& table_options, + const int level, const bool prefetch_index_and_filter_in_cache) { + Status s; + + // Find filter handle and filter type + if (rep->filter_policy) { + for (auto filter_type : + {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter, + Rep::FilterType::kBlockFilter}) { + std::string prefix; + switch (filter_type) { + case Rep::FilterType::kFullFilter: + prefix = kFullFilterBlockPrefix; + break; + case Rep::FilterType::kPartitionedFilter: + prefix = kPartitionedFilterBlockPrefix; + break; + case Rep::FilterType::kBlockFilter: + prefix = kFilterBlockPrefix; + break; + default: + assert(0); + } + std::string filter_block_key = prefix; + filter_block_key.append(rep->filter_policy->Name()); + if (FindMetaBlock(meta_iter, filter_block_key, &rep->filter_handle) + .ok()) { + rep->filter_type = filter_type; + break; } } } - // Determine whether whole key filtering is supported. 
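As a concrete example of the probe order above: with the built-in Bloom policy, whose Name() is "rocksdb.BuiltinBloomFilter", the metaindex keys tried are, in order (prefix constants as declared elsewhere in this file):

// "fullfilter.rocksdb.BuiltinBloomFilter"         -> kFullFilter
// "partitionedfilter.rocksdb.BuiltinBloomFilter"  -> kPartitionedFilter
// "filter.rocksdb.BuiltinBloomFilter"             -> kBlockFilter
// The first key FindMetaBlock() locates fixes rep->filter_type.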
- if (rep->table_properties) {
- rep->whole_key_filtering &=
- IsFeatureSupported(*(rep->table_properties),
- BlockBasedTablePropertyNames::kWholeKeyFiltering,
- rep->ioptions.info_log);
- rep->prefix_filtering &= IsFeatureSupported(
- *(rep->table_properties),
- BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log);
-
- rep->global_seqno = GetGlobalSequenceNumber(*(rep->table_properties),
- rep->ioptions.info_log);
+ {
+ // Find compression dictionary handle
+ bool found_compression_dict;
+ s = SeekToCompressionDictBlock(meta_iter, &found_compression_dict,
+ &rep->compression_dict_handle);
}
- const bool pin =
+ bool need_upper_bound_check =
+ PrefixExtractorChanged(rep->table_properties.get(), prefix_extractor);
+
+ BlockBasedTableOptions::IndexType index_type = new_table->UpdateIndexType();
+ // prefetch the first level of index
+ const bool prefetch_index =
+ prefetch_all ||
+ (table_options.pin_top_level_index_and_filter &&
+ index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+ // prefetch the first level of filter
+ const bool prefetch_filter =
+ prefetch_all || (table_options.pin_top_level_index_and_filter &&
+ rep->filter_type == Rep::FilterType::kPartitionedFilter);
+ // Partition filters cannot be enabled without partition indexes
+ assert(!prefetch_filter || prefetch_index);
+ // pin both index and filters, down to all partitions
+ const bool pin_all =
rep->table_options.pin_l0_filter_and_index_blocks_in_cache && level == 0;
+ // pin the first level of index
+ const bool pin_index =
+ pin_all || (table_options.pin_top_level_index_and_filter &&
+ index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
+ // pin the first level of filter
+ const bool pin_filter =
+ pin_all || (table_options.pin_top_level_index_and_filter &&
+ rep->filter_type == Rep::FilterType::kPartitionedFilter);
// pre-fetching of blocks is turned on
- // Will use block cache for index/filter blocks access
+ // Will use block cache for meta-blocks access
// Always prefetch index and filter for level 0
+ // TODO(ajkr): also prefetch compression dictionary block
if (table_options.cache_index_and_filter_blocks) {
- if (prefetch_index_and_filter_in_cache || level == 0) {
- assert(table_options.block_cache != nullptr);
+ assert(table_options.block_cache != nullptr);
+ if (prefetch_index) {
// Hack: Call NewIndexIterator() to implicitly add index to the
// block_cache
- CachableEntry<IndexReader> index_entry;
- unique_ptr<InternalIterator> iter(
- new_table->NewIndexIterator(ReadOptions(), nullptr, &index_entry));
- index_entry.value->CacheDependencies(pin);
- if (pin) {
- rep->index_entry = std::move(index_entry);
- } else {
- index_entry.Release(table_options.block_cache.get());
+ // check prefix_extractor match only if hash based index is used
+ bool disable_prefix_seek =
+ rep->index_type == BlockBasedTableOptions::kHashSearch &&
+ need_upper_bound_check;
+ if (s.ok()) {
+ std::unique_ptr<InternalIteratorBase<BlockHandle>> iter(
+ new_table->NewIndexIterator(ReadOptions(), disable_prefix_seek,
+ nullptr, &index_entry));
+ s = iter->status();
}
- s = iter->status();
- if (s.ok()) {
- // Hack: Call GetFilter() to implicitly add filter to the block_cache
- auto filter_entry = new_table->GetFilter();
- if (filter_entry.value != nullptr) {
- filter_entry.value->CacheDependencies(pin);
+ // This is the first call to NewIndexIterator() since we're in Open().
+ // On success it should give us ownership of the `CachableEntry` by
+ // populating `index_entry`.
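The four booleans above reduce to a small decision table; restated (illustrative):

// prefetch_all = prefetch_index_and_filter_in_cache || level == 0
// pin_all      = pin_l0_filter_and_index_blocks_in_cache && level == 0
// partitioned  = two-level index, or a partitioned filter
// prefetch_index / prefetch_filter =
//     prefetch_all || (pin_top_level_index_and_filter && partitioned)
// pin_index / pin_filter =
//     pin_all || (pin_top_level_index_and_filter && partitioned)
// e.g. an L0 file with a partitioned index and filter, and both pin options
// set, prefetches and pins the top level of each.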
+ assert(index_entry.value != nullptr);
+ if (prefetch_all) {
+ index_entry.value->CacheDependencies(pin_all);
}
- // if pin_l0_filter_and_index_blocks_in_cache is true, and this is
- // a level0 file, then save it in rep_->filter_entry; it will be
- // released in the destructor only, hence it will be pinned in the
- // cache while this reader is alive
- if (pin) {
- rep->filter_entry = filter_entry;
+ if (pin_index) {
+ rep->index_entry = std::move(index_entry);
} else {
- filter_entry.Release(table_options.block_cache.get());
+ index_entry.Release(table_options.block_cache.get());
}
}
}
+ if (s.ok() && prefetch_filter) {
+ // Hack: Call GetFilter() to implicitly add filter to the block_cache
+ auto filter_entry =
+ new_table->GetFilter(rep->table_prefix_extractor.get());
+ if (filter_entry.value != nullptr && prefetch_all) {
+ filter_entry.value->CacheDependencies(
+ pin_all, rep->table_prefix_extractor.get());
+ }
+ // if pin_filter is true then save it in rep_->filter_entry; it will be
+ // released in the destructor only, hence it will be pinned in the
+ // cache while this reader is alive
+ if (pin_filter) {
+ rep->filter_entry = filter_entry;
+ } else {
+ filter_entry.Release(table_options.block_cache.get());
+ }
+ }
} else {
- // If we don't use block cache for index/filter blocks access, we'll
- // pre-load these blocks, which will kept in member variables in Rep
- // and with a same life-time as this table object.
+ // If we don't use block cache for meta-block access, we'll pre-load these
+ // blocks, which will be kept in member variables in Rep with the same
+ // lifetime as this table object.
IndexReader* index_reader = nullptr;
- s = new_table->CreateIndexReader(prefetch_buffer.get(), &index_reader,
- meta_iter.get(), level);
+ if (s.ok()) {
+ s = new_table->CreateIndexReader(prefetch_buffer, &index_reader,
+ meta_iter, level);
+ }
+ std::unique_ptr<BlockContents> compression_dict_block;
if (s.ok()) {
rep->index_reader.reset(index_reader);
// The partitions of partitioned index are always stored in cache.
They
// hence follow the configuration for pin and prefetch regardless of
// the value of cache_index_and_filter_blocks
if (prefetch_index_and_filter_in_cache || level == 0) {
- rep->index_reader->CacheDependencies(pin);
+ rep->index_reader->CacheDependencies(pin_all);
}
// Set filter block
if (rep->filter_policy) {
const bool is_a_filter_partition = true;
- auto filter = new_table->ReadFilter(
- prefetch_buffer.get(), rep->filter_handle, !is_a_filter_partition);
+ auto filter = new_table->ReadFilter(prefetch_buffer, rep->filter_handle,
+ !is_a_filter_partition,
+ rep->table_prefix_extractor.get());
rep->filter.reset(filter);
// Refer to the comment above about partitioned indexes always being
// cached
if (filter && (prefetch_index_and_filter_in_cache || level == 0)) {
- filter->CacheDependencies(pin);
+ filter->CacheDependencies(pin_all, rep->table_prefix_extractor.get());
}
}
+ s = ReadCompressionDictBlock(rep, prefetch_buffer,
+ &compression_dict_block);
} else {
delete index_reader;
}
+ if (s.ok() && !rep->compression_dict_handle.IsNull()) {
+ assert(compression_dict_block != nullptr);
+ // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy
+ rep->uncompression_dict.reset(new UncompressionDict(
+ compression_dict_block->data.ToString(),
+ rep->blocks_definitely_zstd_compressed, rep->ioptions.statistics));
+ }
}
-
- if (s.ok()) {
- *table_reader = std::move(new_table);
- }
-
return s;
}
@@ -909,6 +1322,9 @@ size_t BlockBasedTable::ApproximateMemoryUsage() const {
if (rep_->index_reader) {
usage += rep_->index_reader->ApproximateMemoryUsage();
}
+ if (rep_->uncompression_dict) {
+ usage += rep_->uncompression_dict->ApproximateMemoryUsage();
+ }
return usage;
}
@@ -924,9 +1340,10 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
Status s = ReadBlockFromFile(
rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(),
rep->footer.metaindex_handle(), &meta, rep->ioptions,
- true /* decompress */, Slice() /*compression dict*/,
- rep->persistent_cache_options, kDisableGlobalSequenceNumber,
- 0 /* read_amp_bytes_per_bit */);
+ true /* decompress */, true /*maybe_compressed*/,
+ UncompressionDict::GetEmptyDict(), rep->persistent_cache_options,
+ kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */,
+ GetMemoryAllocator(rep->table_options));
if (!s.ok()) {
ROCKS_LOG_ERROR(rep->ioptions.info_log,
@@ -938,28 +1355,38 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep,
*meta_block = std::move(meta);
// meta block uses bytewise comparator.
- iter->reset(meta_block->get()->NewIterator(BytewiseComparator())); + iter->reset(meta_block->get()->NewIterator( + BytewiseComparator(), BytewiseComparator())); return Status::OK(); } Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ImmutableCFOptions& ioptions, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, - bool is_index) { + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, + const UncompressionDict& uncompression_dict, size_t read_amp_bytes_per_bit, + bool is_index, GetContext* get_context) { Status s; - Block* compressed_block = nullptr; + BlockContents* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; - Statistics* statistics = ioptions.statistics; + Statistics* statistics = rep->ioptions.statistics; // Lookup uncompressed cache first if (block_cache != nullptr) { block->cache_handle = GetEntryFromCache( - block_cache, block_cache_key, + block_cache, block_cache_key, rep->level, is_index ? BLOCK_CACHE_INDEX_MISS : BLOCK_CACHE_DATA_MISS, - is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, statistics); + is_index ? BLOCK_CACHE_INDEX_HIT : BLOCK_CACHE_DATA_HIT, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_miss + : &get_context->get_context_stats_.num_cache_data_miss) + : nullptr, + get_context + ? (is_index ? &get_context->get_context_stats_.num_cache_index_hit + : &get_context->get_context_stats_.num_cache_data_hit) + : nullptr, + statistics, get_context); if (block->cache_handle != nullptr) { block->value = reinterpret_cast(block_cache->Value(block->cache_handle)); @@ -986,44 +1413,62 @@ Status BlockBasedTable::GetDataBlockFromCache( // found compressed block RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); - compressed_block = reinterpret_cast( + compressed_block = reinterpret_cast( block_cache_compressed->Value(block_cache_compressed_handle)); - assert(compressed_block->compression_type() != kNoCompression); + CompressionType compression_type = compressed_block->get_compression_type(); + assert(compression_type != kNoCompression); // Retrieve the uncompressed contents into a new buffer BlockContents contents; - s = UncompressBlockContents(compressed_block->data(), - compressed_block->size(), &contents, - format_version, compression_dict, - ioptions); + UncompressionContext context(compression_type); + UncompressionInfo info(context, uncompression_dict, compression_type); + s = UncompressBlockContents(info, compressed_block->data.data(), + compressed_block->data.size(), &contents, + rep->table_options.format_version, rep->ioptions, + GetMemoryAllocator(rep->table_options)); // Insert uncompressed block into block cache if (s.ok()) { block->value = - new Block(std::move(contents), compressed_block->global_seqno(), + new Block(std::move(contents), rep->get_global_seqno(is_index), read_amp_bytes_per_bit, statistics); // uncompressed block - assert(block->value->compression_type() == kNoCompression); - if (block_cache != nullptr && block->value->cachable() && + if (block_cache != nullptr && block->value->own_bytes() && read_options.fill_cache) { - s = block_cache->Insert( - block_cache_key, block->value, block->value->usable_size(), - &DeleteCachedEntry, &(block->cache_handle)); - 
block_cache->TEST_mark_as_data_block(block_cache_key, - block->value->usable_size()); + size_t charge = block->value->ApproximateMemoryUsage(); + s = block_cache->Insert(block_cache_key, block->value, charge, + &DeleteCachedEntry, + &(block->cache_handle)); +#ifndef NDEBUG + block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG if (s.ok()) { - RecordTick(statistics, BLOCK_CACHE_ADD); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); + } if (is_index) { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, - block->value->usable_size()); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + } } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, - block->value->usable_size()); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); + } } - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - block->value->usable_size()); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); delete block->value; @@ -1040,80 +1485,109 @@ Status BlockBasedTable::GetDataBlockFromCache( Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, - const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, Block* raw_block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, bool is_index, - Cache::Priority priority) { - assert(raw_block->compression_type() == kNoCompression || + const ReadOptions& /*read_options*/, const ImmutableCFOptions& ioptions, + CachableEntry* cached_block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool is_index, Cache::Priority priority, GetContext* get_context) { + assert(raw_block_comp_type == kNoCompression || block_cache_compressed != nullptr); Status s; // Retrieve the uncompressed contents into a new buffer - BlockContents contents; + BlockContents uncompressed_block_contents; Statistics* statistics = ioptions.statistics; - if (raw_block->compression_type() != kNoCompression) { - s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents, - format_version, compression_dict, ioptions); + if (raw_block_comp_type != kNoCompression) { + UncompressionContext context(raw_block_comp_type); + UncompressionInfo info(context, uncompression_dict, raw_block_comp_type); + s = UncompressBlockContents(info, raw_block_contents->data.data(), + raw_block_contents->data.size(), + &uncompressed_block_contents, format_version, + ioptions, memory_allocator); } if (!s.ok()) { - delete raw_block; return s; } - if 
(raw_block->compression_type() != kNoCompression) { - block->value = new Block(std::move(contents), raw_block->global_seqno(), - read_amp_bytes_per_bit, - statistics); // uncompressed block + if (raw_block_comp_type != kNoCompression) { + cached_block->value = new Block(std::move(uncompressed_block_contents), + seq_no, read_amp_bytes_per_bit, + statistics); // uncompressed block } else { - block->value = raw_block; - raw_block = nullptr; + cached_block->value = + new Block(std::move(*raw_block_contents), seq_no, + read_amp_bytes_per_bit, ioptions.statistics); } // Insert compressed block into compressed block cache. // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed != nullptr && raw_block != nullptr && - raw_block->cachable()) { - s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block, - raw_block->usable_size(), - &DeleteCachedEntry); + if (block_cache_compressed != nullptr && + raw_block_comp_type != kNoCompression && raw_block_contents != nullptr && + raw_block_contents->own_bytes()) { +#ifndef NDEBUG + assert(raw_block_contents->is_raw_block); +#endif // NDEBUG + + // We cannot directly put raw_block_contents because this could point to + // an object in the stack. + BlockContents* block_cont_for_comp_cache = + new BlockContents(std::move(*raw_block_contents)); + s = block_cache_compressed->Insert( + compressed_block_cache_key, block_cont_for_comp_cache, + block_cont_for_comp_cache->ApproximateMemoryUsage(), + &DeleteCachedEntry); if (s.ok()) { // Avoid the following code to delete this cached block. - raw_block = nullptr; RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); } else { RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + delete block_cont_for_comp_cache; } } - delete raw_block; // insert into uncompressed block cache - assert((block->value->compression_type() == kNoCompression)); - if (block_cache != nullptr && block->value->cachable()) { - s = block_cache->Insert( - block_cache_key, block->value, block->value->usable_size(), - &DeleteCachedEntry, &(block->cache_handle), priority); - block_cache->TEST_mark_as_data_block(block_cache_key, - block->value->usable_size()); + if (block_cache != nullptr && cached_block->value->own_bytes()) { + size_t charge = cached_block->value->ApproximateMemoryUsage(); + s = block_cache->Insert(block_cache_key, cached_block->value, charge, + &DeleteCachedEntry, + &(cached_block->cache_handle), priority); +#ifndef NDEBUG + block_cache->TEST_mark_as_data_block(block_cache_key, charge); +#endif // NDEBUG if (s.ok()) { - assert(block->cache_handle != nullptr); - RecordTick(statistics, BLOCK_CACHE_ADD); + assert(cached_block->cache_handle != nullptr); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge); + } if (is_index) { - RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); - RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, - block->value->usable_size()); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_index_add++; + get_context->get_context_stats_.num_cache_index_bytes_insert += + charge; + } else { + RecordTick(statistics, BLOCK_CACHE_INDEX_ADD); + RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge); + } } else { - RecordTick(statistics, BLOCK_CACHE_DATA_ADD); - RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, - 
block->value->usable_size()); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_data_add++; + get_context->get_context_stats_.num_cache_data_bytes_insert += charge; + } else { + RecordTick(statistics, BLOCK_CACHE_DATA_ADD); + RecordTick(statistics, BLOCK_CACHE_DATA_BYTES_INSERT, charge); + } } - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - block->value->usable_size()); - assert(reinterpret_cast( - block_cache->Value(block->cache_handle)) == block->value); + assert(reinterpret_cast(block_cache->Value( + cached_block->cache_handle)) == cached_block->value); } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - delete block->value; - block->value = nullptr; + delete cached_block->value; + cached_block->value = nullptr; } } @@ -1122,7 +1596,8 @@ Status BlockBasedTable::PutDataBlockToCache( FilterBlockReader* BlockBasedTable::ReadFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, - const bool is_a_filter_partition) const { + const bool is_a_filter_partition, + const SliceTransform* prefix_extractor) const { auto& rep = rep_; // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. @@ -1130,11 +1605,15 @@ FilterBlockReader* BlockBasedTable::ReadFilter( return nullptr; } BlockContents block; - if (!ReadBlockContents(rep->file.get(), prefetch_buffer, rep->footer, - ReadOptions(), filter_handle, &block, rep->ioptions, - false /* decompress */, Slice() /*compression dict*/, - rep->persistent_cache_options) - .ok()) { + + BlockFetcher block_fetcher( + rep->file.get(), prefetch_buffer, rep->footer, ReadOptions(), + filter_handle, &block, rep->ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + rep->persistent_cache_options, GetMemoryAllocator(rep->table_options)); + Status s = block_fetcher.ReadBlockContents(); + + if (!s.ok()) { // Error reading the block return nullptr; } @@ -1150,14 +1629,18 @@ FilterBlockReader* BlockBasedTable::ReadFilter( switch (filter_type) { case Rep::FilterType::kPartitionedFilter: { return new PartitionedFilterBlockReader( - rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr, + rep->prefix_filtering ? prefix_extractor : nullptr, rep->whole_key_filtering, std::move(block), nullptr, - rep->ioptions.statistics, rep->internal_comparator, this); + rep->ioptions.statistics, rep->internal_comparator, this, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0); } case Rep::FilterType::kBlockFilter: return new BlockBasedFilterBlockReader( - rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr, + rep->prefix_filtering ? prefix_extractor : nullptr, rep->table_options, rep->whole_key_filtering, std::move(block), rep->ioptions.statistics); @@ -1166,7 +1649,7 @@ FilterBlockReader* BlockBasedTable::ReadFilter( rep->filter_policy->GetFilterBitsReader(block.data); assert(filter_bits_reader != nullptr); return new FullFilterBlockReader( - rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr, + rep->prefix_filtering ? 
prefix_extractor : nullptr, rep->whole_key_filtering, std::move(block), filter_bits_reader, rep->ioptions.statistics); } @@ -1180,16 +1663,18 @@ FilterBlockReader* BlockBasedTable::ReadFilter( } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - FilePrefetchBuffer* prefetch_buffer, bool no_io) const { + const SliceTransform* prefix_extractor, FilePrefetchBuffer* prefetch_buffer, + bool no_io, GetContext* get_context) const { const BlockHandle& filter_blk_handle = rep_->filter_handle; const bool is_a_filter_partition = true; return GetFilter(prefetch_buffer, filter_blk_handle, !is_a_filter_partition, - no_io); + no_io, get_context, prefix_extractor); } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io) const { + const bool is_a_filter_partition, bool no_io, GetContext* get_context, + const SliceTransform* prefix_extractor) const { // If cache_index_and_filter_blocks is false, filter should be pre-populated. // We will return rep_->filter anyway. rep_->filter can be nullptr if filter // read fails at Open() time. We don't want to reload again since it will @@ -1217,32 +1702,47 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( filter_blk_handle, cache_key); Statistics* statistics = rep_->ioptions.statistics; - auto cache_handle = - GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, - BLOCK_CACHE_FILTER_HIT, statistics); + auto cache_handle = GetEntryFromCache( + block_cache, key, rep_->level, BLOCK_CACHE_FILTER_MISS, + BLOCK_CACHE_FILTER_HIT, + get_context ? &get_context->get_context_stats_.num_cache_filter_miss + : nullptr, + get_context ? &get_context->get_context_stats_.num_cache_filter_hit + : nullptr, + statistics, get_context); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - filter = reinterpret_cast( - block_cache->Value(cache_handle)); + PERF_COUNTER_ADD(block_cache_filter_hit_count, 1); + filter = + reinterpret_cast(block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. return CachableEntry(); } else { - filter = - ReadFilter(prefetch_buffer, filter_blk_handle, is_a_filter_partition); + filter = ReadFilter(prefetch_buffer, filter_blk_handle, + is_a_filter_partition, prefix_extractor); if (filter != nullptr) { - assert(filter->size() > 0); + size_t usage = filter->ApproximateMemoryUsage(); Status s = block_cache->Insert( - key, filter, filter->size(), &DeleteCachedFilterEntry, &cache_handle, + key, filter, usage, &DeleteCachedFilterEntry, &cache_handle, rep_->table_options.cache_index_and_filter_blocks_with_high_priority ? 
Cache::Priority::HIGH : Cache::Priority::LOW); if (s.ok()) { - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); - RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, filter->size()); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter->size()); + PERF_COUNTER_ADD(filter_block_read_count, 1); + if (get_context != nullptr) { + get_context->get_context_stats_.num_cache_add++; + get_context->get_context_stats_.num_cache_bytes_write += usage; + get_context->get_context_stats_.num_cache_filter_add++; + get_context->get_context_stats_.num_cache_filter_bytes_insert += + usage; + } else { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usage); + RecordTick(statistics, BLOCK_CACHE_FILTER_ADD); + RecordTick(statistics, BLOCK_CACHE_FILTER_BYTES_INSERT, usage); + } } else { RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); delete filter; @@ -1251,21 +1751,108 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( } } - return { filter, cache_handle }; + return {filter, cache_handle}; +} + +BlockBasedTable::CachableEntry +BlockBasedTable::GetUncompressionDict(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, + bool no_io, GetContext* get_context) { + if (!rep->table_options.cache_index_and_filter_blocks) { + // block cache is either disabled or not used for meta-blocks. In either + // case, BlockBasedTableReader is the owner of the uncompression dictionary. + return {rep->uncompression_dict.get(), nullptr /* cache handle */}; + } + if (rep->compression_dict_handle.IsNull()) { + return {nullptr, nullptr}; + } + char cache_key_buf[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + auto cache_key = + GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + rep->compression_dict_handle, cache_key_buf); + auto cache_handle = GetEntryFromCache( + rep->table_options.block_cache.get(), cache_key, rep->level, + BLOCK_CACHE_COMPRESSION_DICT_MISS, BLOCK_CACHE_COMPRESSION_DICT_HIT, + get_context + ? &get_context->get_context_stats_.num_cache_compression_dict_miss + : nullptr, + get_context + ? &get_context->get_context_stats_.num_cache_compression_dict_hit + : nullptr, + rep->ioptions.statistics, get_context); + UncompressionDict* dict = nullptr; + if (cache_handle != nullptr) { + dict = reinterpret_cast( + rep->table_options.block_cache->Value(cache_handle)); + } else if (no_io) { + // Do not invoke any io. + } else { + std::unique_ptr compression_dict_block; + Status s = + ReadCompressionDictBlock(rep, prefetch_buffer, &compression_dict_block); + size_t usage = 0; + if (s.ok()) { + assert(compression_dict_block != nullptr); + // TODO(ajkr): find a way to avoid the `compression_dict_block` data copy + dict = new UncompressionDict(compression_dict_block->data.ToString(), + rep->blocks_definitely_zstd_compressed, + rep->ioptions.statistics); + usage = dict->ApproximateMemoryUsage(); + s = rep->table_options.block_cache->Insert( + cache_key, dict, usage, &DeleteCachedUncompressionDictEntry, + &cache_handle, + rep->table_options.cache_index_and_filter_blocks_with_high_priority + ? 
Cache::Priority::HIGH
+ : Cache::Priority::LOW);
+ }
+ if (s.ok()) {
+ PERF_COUNTER_ADD(compression_dict_block_read_count, 1);
+ if (get_context != nullptr) {
+ get_context->get_context_stats_.num_cache_add++;
+ get_context->get_context_stats_.num_cache_bytes_write += usage;
+ get_context->get_context_stats_.num_cache_compression_dict_add++;
+ get_context->get_context_stats_
+ .num_cache_compression_dict_bytes_insert += usage;
+ } else {
+ RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD);
+ RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_WRITE, usage);
+ RecordTick(rep->ioptions.statistics, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ RecordTick(rep->ioptions.statistics,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, usage);
+ }
+ } else {
+ // There should be no way to get here if block cache insertion succeeded.
+ // Though it is still possible something failed earlier.
+ RecordTick(rep->ioptions.statistics, BLOCK_CACHE_ADD_FAILURES);
+ delete dict;
+ dict = nullptr;
+ assert(cache_handle == nullptr);
+ }
+ }
+ return {dict, cache_handle};
}
-InternalIterator* BlockBasedTable::NewIndexIterator(
- const ReadOptions& read_options, BlockIter* input_iter,
- CachableEntry<IndexReader>* index_entry) {
+// disable_prefix_seek should be set to true when prefix_extractor found in SST
+// differs from the one in mutable_cf_options and index type is HashBasedIndex
+InternalIteratorBase<BlockHandle>* BlockBasedTable::NewIndexIterator(
+ const ReadOptions& read_options, bool disable_prefix_seek,
+ IndexBlockIter* input_iter, CachableEntry<IndexReader>* index_entry,
+ GetContext* get_context) {
// index reader has already been pre-populated.
if (rep_->index_reader) {
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
return rep_->index_reader->NewIterator(
- input_iter, read_options.total_order_seek);
+ input_iter, read_options.total_order_seek || disable_prefix_seek,
+ read_options.fill_cache);
}
// we have a pinned index block
if (rep_->index_entry.IsSet()) {
- return rep_->index_entry.value->NewIterator(input_iter,
- read_options.total_order_seek);
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
+ return rep_->index_entry.value->NewIterator(
+ input_iter, read_options.total_order_seek || disable_prefix_seek,
+ read_options.fill_cache);
}
PERF_TIMER_GUARD(read_index_block_nanos);
@@ -1277,21 +1864,28 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->dummy_index_reader_offset, cache_key);
Statistics* statistics = rep_->ioptions.statistics;
- auto cache_handle =
- GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
- BLOCK_CACHE_INDEX_HIT, statistics);
+ auto cache_handle = GetEntryFromCache(
+ block_cache, key, rep_->level, BLOCK_CACHE_INDEX_MISS,
+ BLOCK_CACHE_INDEX_HIT,
+ get_context ? &get_context->get_context_stats_.num_cache_index_miss
+ : nullptr,
+ get_context ?
&get_context->get_context_stats_.num_cache_index_hit
+ : nullptr,
+ statistics, get_context);
if (cache_handle == nullptr && no_io) {
if (input_iter != nullptr) {
- input_iter->SetStatus(Status::Incomplete("no blocking io"));
+ input_iter->Invalidate(Status::Incomplete("no blocking io"));
return input_iter;
} else {
- return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
+ return NewErrorInternalIterator<BlockHandle>(
+ Status::Incomplete("no blocking io"));
}
}
IndexReader* index_reader = nullptr;
if (cache_handle != nullptr) {
+ PERF_COUNTER_ADD(block_cache_index_hit_count, 1);
index_reader =
reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
} else {
@@ -1302,22 +1896,28 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:1");
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread2:3");
TEST_SYNC_POINT("BlockBasedTable::NewIndexIterator::thread1:4");
+ size_t charge = 0;
if (s.ok()) {
assert(index_reader != nullptr);
+ charge = index_reader->ApproximateMemoryUsage();
s = block_cache->Insert(
- key, index_reader, index_reader->usable_size(),
- &DeleteCachedIndexEntry, &cache_handle,
+ key, index_reader, charge, &DeleteCachedIndexEntry, &cache_handle,
rep_->table_options.cache_index_and_filter_blocks_with_high_priority
? Cache::Priority::HIGH
: Cache::Priority::LOW);
}
if (s.ok()) {
- size_t usable_size = index_reader->usable_size();
- RecordTick(statistics, BLOCK_CACHE_ADD);
+ if (get_context != nullptr) {
+ get_context->get_context_stats_.num_cache_add++;
+ get_context->get_context_stats_.num_cache_bytes_write += charge;
+ } else {
+ RecordTick(statistics, BLOCK_CACHE_ADD);
+ RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, charge);
+ }
+ PERF_COUNTER_ADD(index_block_read_count, 1);
RecordTick(statistics, BLOCK_CACHE_INDEX_ADD);
- RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, usable_size);
- RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, usable_size);
+ RecordTick(statistics, BLOCK_CACHE_INDEX_BYTES_INSERT, charge);
} else {
if (index_reader != nullptr) {
delete index_reader;
@@ -1325,18 +1925,19 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES);
// make sure if something goes wrong, index_reader shall remain intact.
if (input_iter != nullptr) {
- input_iter->SetStatus(s);
+ input_iter->Invalidate(s);
return input_iter;
} else {
- return NewErrorInternalIterator(s);
+ return NewErrorInternalIterator<BlockHandle>(s);
}
}
-
}
assert(cache_handle);
+ // We don't return pinned data from index blocks, so no need
+ // to set `block_contents_pinned`.
auto* iter = index_reader->NewIterator(
- input_iter, read_options.total_order_seek);
+ input_iter, read_options.total_order_seek || disable_prefix_seek);
// the caller would like to take ownership of the index block
// don't call RegisterCleanup() in this case, the caller will take care of it
@@ -1349,102 +1950,153 @@ InternalIterator* BlockBasedTable::NewIndexIterator(
return iter;
}
-InternalIterator* BlockBasedTable::NewDataBlockIterator(
- Rep* rep, const ReadOptions& ro, const Slice& index_value,
- BlockIter* input_iter, bool is_index) {
- BlockHandle handle;
- Slice input = index_value;
- // We intentionally allow extra stuff in index_value so that we
- // can add more features in the future.
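The leniency that removed comment describes comes from the handle encoding: a BlockHandle is two varint64s, offset then size, and DecodeFrom() consumes only those, so trailing bytes in index_value are simply left behind. A sketch:

BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);  // reads the offset and size varints
// `input` now points at whatever extra bytes followed the handle.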
- Status s = handle.DecodeFrom(&input); - return NewDataBlockIterator(rep, ro, handle, input_iter, is_index, s); -} - // Convert an index iterator value (i.e., an encoded BlockHandle) // into an iterator over the contents of the corresponding block. // If input_iter is null, a new iterator is created and returned // If input_iter is not null, update this iter and return it -InternalIterator* BlockBasedTable::NewDataBlockIterator( +template <typename TBlockIter> +TBlockIter* BlockBasedTable::NewDataBlockIterator( Rep* rep, const ReadOptions& ro, const BlockHandle& handle, - BlockIter* input_iter, bool is_index, Status s) { + TBlockIter* input_iter, bool is_index, bool key_includes_seq, + bool index_key_is_full, GetContext* get_context, Status s, + FilePrefetchBuffer* prefetch_buffer) { PERF_TIMER_GUARD(new_table_block_iter_nanos); - const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep->table_options.block_cache.get(); CachableEntry<Block> block; - Slice compression_dict; - if (s.ok()) { - if (rep->compression_dict_block) { - compression_dict = rep->compression_dict_block->data; + TBlockIter* iter; + { + const bool no_io = (ro.read_tier == kBlockCacheTier); + auto uncompression_dict_storage = + GetUncompressionDict(rep, prefetch_buffer, no_io, get_context); + const UncompressionDict& uncompression_dict = + uncompression_dict_storage.value == nullptr + ? UncompressionDict::GetEmptyDict() + : *uncompression_dict_storage.value; + if (s.ok()) { + s = MaybeReadBlockAndLoadToCache(prefetch_buffer, rep, ro, handle, + uncompression_dict, &block, is_index, + get_context); } - s = MaybeLoadDataBlockToCache(nullptr /*prefetch_buffer*/, rep, ro, handle, - compression_dict, &block, is_index); - } - // Didn't get any data from block caches. - if (s.ok() && block.value == nullptr) { - if (no_io) { - // Could not read from block_cache and can't do IO - if (input_iter != nullptr) { - input_iter->SetStatus(Status::Incomplete("no blocking io")); - return input_iter; - } else { - return NewErrorInternalIterator(Status::Incomplete("no blocking io")); + if (input_iter != nullptr) { + iter = input_iter; + } else { + iter = new TBlockIter; + } + // Didn't get any data from block caches. + if (s.ok() && block.value == nullptr) { + if (no_io) { + // Could not read from block_cache and can't do IO + iter->Invalidate(Status::Incomplete("no blocking io")); + return iter; + } + std::unique_ptr<Block> block_value; + { + StopWatch sw(rep->ioptions.env, rep->ioptions.statistics, + READ_BLOCK_GET_MICROS); + s = ReadBlockFromFile( + rep->file.get(), prefetch_buffer, rep->footer, ro, handle, + &block_value, rep->ioptions, + rep->blocks_maybe_compressed /*do_decompress*/, + rep->blocks_maybe_compressed, uncompression_dict, + rep->persistent_cache_options, + is_index ? kDisableGlobalSequenceNumber : rep->global_seqno, + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options)); + } + if (s.ok()) { + block.value = block_value.release(); + } } - std::unique_ptr<Block> block_value; - s = ReadBlockFromFile(rep->file.get(), nullptr /* prefetch_buffer */, - rep->footer, ro, handle, &block_value, rep->ioptions, - true /* compress */, compression_dict, - rep->persistent_cache_options, rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit); - if (s.ok()) { - block.value = block_value.release(); - } + // TODO(ajkr): also pin compression dictionary block when + // `pin_l0_filter_and_index_blocks_in_cache == true`.
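Structurally, the rewritten NewDataBlockIterator() above is the usual cache-first read path: serve from the block cache when possible, fail fast with Status::Incomplete when the caller requested cache-only reads (kBlockCacheTier), and otherwise fall through to a file read that also warms the cache. A compact C++17 sketch of just that control flow, with invented stand-ins (Block, ReadTier, an unordered_map as the cache) rather than the real RocksDB types:

#include <cstdint>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>

enum class ReadTier { kReadAll, kBlockCacheTier };
struct Block { std::string contents; };

// Cache-first lookup: a hit is served without I/O; a miss either reports
// "incomplete" (cache-only mode) or reads from the file and warms the cache.
std::optional<Block> ReadBlockPreferCache(
    std::unordered_map<uint64_t, Block>& cache, uint64_t handle, ReadTier tier,
    const std::function<Block(uint64_t)>& read_from_file, bool* incomplete) {
  *incomplete = false;
  auto it = cache.find(handle);
  if (it != cache.end()) {
    return it->second;  // cache hit, no I/O
  }
  if (tier == ReadTier::kBlockCacheTier) {
    *incomplete = true;  // mirrors iter->Invalidate(Status::Incomplete(...))
    return std::nullopt;
  }
  Block b = read_from_file(handle);  // cache miss: do the read
  cache.emplace(handle, b);          // and populate the cache for next time
  return b;
}

int main() {
  std::unordered_map<uint64_t, Block> cache;
  const auto from_file = [](uint64_t) { return Block{"payload"}; };
  bool incomplete = false;
  // A cache-only probe of a cold cache misses without touching the "file"...
  ReadBlockPreferCache(cache, 7, ReadTier::kBlockCacheTier, from_file, &incomplete);
  // ...while a normal read succeeds and leaves the block cached behind it.
  ReadBlockPreferCache(cache, 7, ReadTier::kReadAll, from_file, &incomplete);
  return cache.count(7) == 1 ? 0 : 1;
}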
+ uncompression_dict_storage.Release(block_cache); } - InternalIterator* iter; if (s.ok()) { assert(block.value != nullptr); - iter = block.value->NewIterator(&rep->internal_comparator, input_iter, true, - rep->ioptions.statistics); + const bool kTotalOrderSeek = true; + // Block contents are pinned and it is still pinned after the iterator + // is destroyed as long as cleanup functions are moved to another object, + // when: + // 1. block cache handle is set to be released in cleanup function, or + // 2. it's pointing to immortal source. If own_bytes is true then we are + // not reading data from the original source, whether immortal or not. + // Otherwise, the block is pinned iff the source is immortal. + bool block_contents_pinned = + (block.cache_handle != nullptr || + (!block.value->own_bytes() && rep->immortal_table)); + iter = block.value->NewIterator( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + iter, rep->ioptions.statistics, kTotalOrderSeek, key_includes_seq, + index_key_is_full, block_contents_pinned); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, block.cache_handle); } else { + if (!ro.fill_cache && rep->cache_key_prefix_size != 0) { + // insert a dummy record to block cache to track the memory usage + Cache::Handle* cache_handle; + // There are two other types of cache keys: 1) SST cache key added in + // `MaybeReadBlockAndLoadToCache` 2) dummy cache key added in + // `write_buffer_manager`. Use longer prefix (41 bytes) to differentiate + // from SST cache key(31 bytes), and use non-zero prefix to + // differentiate from `write_buffer_manager` + const size_t kExtraCacheKeyPrefix = kMaxVarint64Length * 4 + 1; + char cache_key[kExtraCacheKeyPrefix + kMaxVarint64Length]; + // Prefix: use rep->cache_key_prefix padded by 0s + memset(cache_key, 0, kExtraCacheKeyPrefix + kMaxVarint64Length); + assert(rep->cache_key_prefix_size != 0); + assert(rep->cache_key_prefix_size <= kExtraCacheKeyPrefix); + memcpy(cache_key, rep->cache_key_prefix, rep->cache_key_prefix_size); + char* end = EncodeVarint64(cache_key + kExtraCacheKeyPrefix, + next_cache_key_id_++); + assert(end - cache_key <= + static_cast(kExtraCacheKeyPrefix + kMaxVarint64Length)); + Slice unique_key = + Slice(cache_key, static_cast(end - cache_key)); + s = block_cache->Insert(unique_key, nullptr, + block.value->ApproximateMemoryUsage(), nullptr, + &cache_handle); + if (s.ok()) { + if (cache_handle != nullptr) { + iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache, + cache_handle); + } + } + } iter->RegisterCleanup(&DeleteHeldResource, block.value, nullptr); } } else { assert(block.value == nullptr); - if (input_iter != nullptr) { - input_iter->SetStatus(s); - iter = input_iter; - } else { - iter = NewErrorInternalIterator(s); - } + iter->Invalidate(s); } return iter; } -Status BlockBasedTable::MaybeLoadDataBlockToCache( +Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, Slice compression_dict, - CachableEntry* block_entry, bool is_index) { + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry* block_entry, bool is_index, GetContext* get_context) { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep->table_options.block_cache.get(); + + // No point to cache compressed blocks if it never goes away Cache* block_cache_compressed = - 
rep->table_options.block_cache_compressed.get(); + rep->immortal_table ? nullptr + : rep->table_options.block_cache_compressed.get(); + // First, try to get the block from the cache + // // If either block cache is enabled, we'll try to read from it. Status s; + char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + Slice key /* key to the block cache */; + Slice ckey /* key to the compressed block cache */; if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->ioptions.statistics; - char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - Slice key, /* key to the block cache */ - ckey /* key to the compressed block cache */; - // create key for block cache if (block_cache != nullptr) { key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, @@ -1457,33 +2109,47 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( compressed_cache_key); } - s = GetDataBlockFromCache( - key, ckey, block_cache, block_cache_compressed, rep->ioptions, ro, - block_entry, rep->table_options.format_version, compression_dict, - rep->table_options.read_amp_bytes_per_bit, is_index); + s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, + rep, ro, block_entry, uncompression_dict, + rep->table_options.read_amp_bytes_per_bit, + is_index, get_context); + // Can't find the block from the cache. If I/O is allowed, read from the + // file. if (block_entry->value == nullptr && !no_io && ro.fill_cache) { - std::unique_ptr raw_block; + Statistics* statistics = rep->ioptions.statistics; + bool do_decompress = + block_cache_compressed == nullptr && rep->blocks_maybe_compressed; + CompressionType raw_block_comp_type; + BlockContents raw_block_contents; { StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); - s = ReadBlockFromFile( + BlockFetcher block_fetcher( rep->file.get(), prefetch_buffer, rep->footer, ro, handle, - &raw_block, rep->ioptions, block_cache_compressed == nullptr, - compression_dict, rep->persistent_cache_options, rep->global_seqno, - rep->table_options.read_amp_bytes_per_bit); + &raw_block_contents, rep->ioptions, + do_decompress /* do uncompress */, rep->blocks_maybe_compressed, + uncompression_dict, rep->persistent_cache_options, + GetMemoryAllocator(rep->table_options), + GetMemoryAllocatorForCompressedBlock(rep->table_options)); + s = block_fetcher.ReadBlockContents(); + raw_block_comp_type = block_fetcher.get_compression_type(); } if (s.ok()) { + SequenceNumber seq_no = rep->get_global_seqno(is_index); + // If filling cache is allowed and a cache is configured, try to put the + // block to the cache. s = PutDataBlockToCache( key, ckey, block_cache, block_cache_compressed, ro, rep->ioptions, - block_entry, raw_block.release(), rep->table_options.format_version, - compression_dict, rep->table_options.read_amp_bytes_per_bit, - is_index, - is_index && - rep->table_options - .cache_index_and_filter_blocks_with_high_priority + block_entry, &raw_block_contents, raw_block_comp_type, + rep->table_options.format_version, uncompression_dict, seq_no, + rep->table_options.read_amp_bytes_per_bit, + GetMemoryAllocator(rep->table_options), is_index, + is_index && rep->table_options + .cache_index_and_filter_blocks_with_high_priority ? 
Cache::Priority::HIGH - : Cache::Priority::LOW); + : Cache::Priority::LOW, + get_context); } } } @@ -1491,65 +2157,44 @@ Status BlockBasedTable::MaybeLoadDataBlockToCache( return s; } -BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState( - BlockBasedTable* table, const ReadOptions& read_options, - const InternalKeyComparator* icomparator, bool skip_filters, bool is_index, - std::unordered_map>* block_map) - : TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor != nullptr), - table_(table), - read_options_(read_options), - icomparator_(icomparator), - skip_filters_(skip_filters), - is_index_(is_index), - block_map_(block_map) {} - -InternalIterator* -BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator( - const Slice& index_value) { +BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState( + BlockBasedTable* table, + std::unordered_map>* block_map, + bool index_key_includes_seq, bool index_key_is_full) + : table_(table), + block_map_(block_map), + index_key_includes_seq_(index_key_includes_seq), + index_key_is_full_(index_key_is_full) {} + +template +const size_t BlockBasedTableIterator::kMaxReadaheadSize = + 256 * 1024; + +InternalIteratorBase* +BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( + const BlockHandle& handle) { // Return a block iterator on the index partition - BlockHandle handle; - Slice input = index_value; - Status s = handle.DecodeFrom(&input); - auto rep = table_->rep_; - if (block_map_) { - auto block = block_map_->find(handle.offset()); - // This is a possible scenario since block cache might not have had space - // for the partition - if (block != block_map_->end()) { - PERF_COUNTER_ADD(block_cache_hit_count, 1); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); - Cache* block_cache = rep->table_options.block_cache.get(); - assert(block_cache); - RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, - block_cache->GetUsage(block->second.cache_handle)); - return block->second.value->NewIterator( - &rep->internal_comparator, nullptr, true, rep->ioptions.statistics); - } - } - return NewDataBlockIterator(rep, read_options_, handle, nullptr, is_index_, - s); -} - -bool BlockBasedTable::BlockEntryIteratorState::PrefixMayMatch( - const Slice& internal_key) { - if (read_options_.total_order_seek || skip_filters_) { - return true; - } - return table_->PrefixMayMatch(internal_key); -} - -bool BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound( - const Slice& internal_key) { - bool reached_upper_bound = read_options_.iterate_upper_bound != nullptr && - icomparator_ != nullptr && - icomparator_->user_comparator()->Compare( - ExtractUserKey(internal_key), - *read_options_.iterate_upper_bound) >= 0; - TEST_SYNC_POINT_CALLBACK( - "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", - &reached_upper_bound); - return reached_upper_bound; + auto rep = table_->get_rep(); + auto block = block_map_->find(handle.offset()); + // This is a possible scenario since block cache might not have had space + // for the partition + if (block != block_map_->end()) { + PERF_COUNTER_ADD(block_cache_hit_count, 1); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_INDEX_HIT); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_HIT); + Cache* block_cache = rep->table_options.block_cache.get(); + assert(block_cache); + RecordTick(rep->ioptions.statistics, BLOCK_CACHE_BYTES_READ, + 
block_cache->GetUsage(block->second.cache_handle)); + Statistics* kNullStats = nullptr; + // We don't return pinned data from index blocks, so no need + // to set `block_contents_pinned`. + return block->second.value->NewIterator<IndexBlockIter>( + &rep->internal_comparator, rep->internal_comparator.user_comparator(), + nullptr, kNullStats, true, index_key_includes_seq_, index_key_is_full_); + } + // Create an empty iterator + return new IndexBlockIter(); } // This will be broken if the user specifies an unusual implementation @@ -1564,32 +2209,52 @@ bool BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound( // Otherwise, this method guarantees no I/O will be incurred. // // REQUIRES: this method shouldn't be called while the DB lock is held. -bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { +bool BlockBasedTable::PrefixMayMatch( + const Slice& internal_key, const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check) { if (!rep_->filter_policy) { return true; } - assert(rep_->ioptions.prefix_extractor != nullptr); + const SliceTransform* prefix_extractor; + + if (rep_->table_prefix_extractor == nullptr) { + if (need_upper_bound_check) { + return true; + } + prefix_extractor = options_prefix_extractor; + } else { + prefix_extractor = rep_->table_prefix_extractor.get(); + } auto user_key = ExtractUserKey(internal_key); - if (!rep_->ioptions.prefix_extractor->InDomain(user_key) || - rep_->table_properties->prefix_extractor_name.compare( - rep_->ioptions.prefix_extractor->Name()) != 0) { + if (!prefix_extractor->InDomain(user_key)) { return true; } - auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key); bool may_match = true; Status s; // First, try to check with the full filter - auto filter_entry = GetFilter(); + auto filter_entry = GetFilter(prefix_extractor); FilterBlockReader* filter = filter_entry.value; + bool filter_checked = true; if (filter != nullptr) { if (!filter->IsBlockBased()) { const Slice* const const_ikey_ptr = &internal_key; - may_match = - filter->PrefixMayMatch(prefix, kNotValid, false, const_ikey_ptr); + may_match = filter->RangeMayExist( + read_options.iterate_upper_bound, user_key, prefix_extractor, + rep_->internal_comparator.user_comparator(), const_ikey_ptr, + &filter_checked, need_upper_bound_check); } else { + // if prefix_extractor changed for block based filter, skip filter + if (need_upper_bound_check) { + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return true; + } + auto prefix = prefix_extractor->Transform(user_key); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1600,7 +2265,11 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { no_io_read_options.read_tier = kBlockCacheTier; // Then, try to find it within each block - unique_ptr<InternalIterator> iiter(NewIndexIterator(no_io_read_options)); + // we already know prefix_extractor and prefix_extractor_name must match + // because `CheckPrefixMayMatch` first checks `check_filter_ == true` + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter( + NewIndexIterator(no_io_read_options, + /* need_upper_bound_check */ false)); iiter->Seek(internal_prefix); if (!iiter->Valid()) { @@ -1609,7 +2278,10 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // and we're not really sure that we're past the end // of the file may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()) + } else if
((rep_->table_properties && + rep_->table_properties->index_key_is_user_key + ? iiter->key() + : ExtractUserKey(iiter->key())) .starts_with(ExtractUserKey(internal_prefix))) { // we need to check for this subtle case because our only // guarantee is that "the key is a string >= last key in that data @@ -1628,19 +2300,19 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // after the data block corresponding to iiter->key() cannot // possibly contain the key. Thus, the corresponding data block // is the only one that could potentially contain the prefix. - Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - may_match = filter->PrefixMayMatch(prefix, handle.offset()); + BlockHandle handle = iiter->value(); + may_match = + filter->PrefixMayMatch(prefix, prefix_extractor, handle.offset()); } } } - Statistics* statistics = rep_->ioptions.statistics; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + if (filter_checked) { + Statistics* statistics = rep_->ioptions.statistics; + RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); + if (!may_match) { + RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); + } } // if rep_->filter_entry is not set, we should call Release(); otherwise @@ -1649,121 +2321,375 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { if (!rep_->filter_entry.IsSet()) { filter_entry.Release(rep_->table_options.block_cache.get()); } - return may_match; } -InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, - Arena* arena, - bool skip_filters) { - return NewTwoLevelIterator( - new BlockEntryIteratorState(this, read_options, - &rep_->internal_comparator, skip_filters), - NewIndexIterator(read_options), arena); +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::Seek(const Slice& target) { + is_out_of_bound_ = false; + if (!CheckPrefixMayMatch(target)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + + InitDataBlock(); + + block_iter_.Seek(target); + + FindKeyForward(); + assert( + !block_iter_.Valid() || + (key_includes_seq_ && icomp_.Compare(target, block_iter_.key()) <= 0) || + (!key_includes_seq_ && user_comparator_.Compare(ExtractUserKey(target), + block_iter_.key()) <= 0)); +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::SeekForPrev( + const Slice& target) { + is_out_of_bound_ = false; + if (!CheckPrefixMayMatch(target)) { + ResetDataIter(); + return; + } + + SavePrevIndexValue(); + + // Call Seek() rather than SeekForPrev() in the index block, because the + // target data block will likely contain the position for `target`, the + // same as Seek(), rather than before. + // For example, if we have three data blocks, each containing two keys: + // [2, 4] [6, 8] [10, 12] + // (the keys in the index block would be [4, 8, 12]) + // and the user calls SeekForPrev(7), we need to go to the second block, + // just like if they call Seek(7). + // The only case where the block is different is when they seek to a position + // at the boundary. For example, if they SeekForPrev(5), we should go to the + // first block, rather than the second. However, we don't have the information + // to distinguish the two unless we read the second block. In this case, we'll + // end up reading two blocks.
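The block-choice example in the comment above can be checked mechanically: with index keys {4, 8, 12} (the last key of each data block), an index Seek() is effectively a lower_bound over those keys, so SeekForPrev(7) and SeekForPrev(5) both land on the second block, and only the latter has to step back after reading it. A small sketch under exactly those assumptions:

#include <algorithm>
#include <cassert>
#include <vector>

// Index keys are the last key of each data block:
// blocks [2,4] [6,8] [10,12]  ->  index keys {4, 8, 12}.
// An index Seek(target) picks the first block whose last key is >= target.
size_t SeekBlockIndex(const std::vector<int>& index_keys, int target) {
  auto it = std::lower_bound(index_keys.begin(), index_keys.end(), target);
  return it == index_keys.end() ? index_keys.size() - 1
                                : static_cast<size_t>(it - index_keys.begin());
}

int main() {
  const std::vector<int> index_keys = {4, 8, 12};
  // SeekForPrev(7): block [6,8] contains 6 <= 7, so the Seek choice is final.
  assert(SeekBlockIndex(index_keys, 7) == 1);
  // SeekForPrev(5): Seek also lands on block [6,8], but its smallest key is 6,
  // so only after reading it does the iterator step back to block [2,4]; that
  // is the two-block read the comment above describes.
  assert(SeekBlockIndex(index_keys, 5) == 1);
  return 0;
}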
+ index_iter_->Seek(target); + + if (!index_iter_->Valid()) { + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + block_iter_points_to_real_block_ = false; + return; + } + } + + InitDataBlock(); + + block_iter_.SeekForPrev(target); + + FindKeyBackward(); + assert(!block_iter_.Valid() || + icomp_.Compare(target, block_iter_.key()) >= 0); +} + +template +void BlockBasedTableIterator::SeekToFirst() { + is_out_of_bound_ = false; + SavePrevIndexValue(); + index_iter_->SeekToFirst(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToFirst(); + FindKeyForward(); +} + +template +void BlockBasedTableIterator::SeekToLast() { + is_out_of_bound_ = false; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + InitDataBlock(); + block_iter_.SeekToLast(); + FindKeyBackward(); +} + +template +void BlockBasedTableIterator::Next() { + assert(block_iter_points_to_real_block_); + block_iter_.Next(); + FindKeyForward(); +} + +template +void BlockBasedTableIterator::Prev() { + assert(block_iter_points_to_real_block_); + block_iter_.Prev(); + FindKeyBackward(); +} + +template +void BlockBasedTableIterator::InitDataBlock() { + BlockHandle data_block_handle = index_iter_->value(); + if (!block_iter_points_to_real_block_ || + data_block_handle.offset() != prev_index_value_.offset() || + // if previous attempt of reading the block missed cache, try again + block_iter_.status().IsIncomplete()) { + if (block_iter_points_to_real_block_) { + ResetDataIter(); + } + auto* rep = table_->get_rep(); + + // Automatically prefetch additional data when a range scan (iterator) does + // more than 2 sequential IOs. This is enabled only for user reads and when + // ReadOptions.readahead_size is 0. + if (!for_compaction_ && read_options_.readahead_size == 0) { + num_file_reads_++; + if (num_file_reads_ > 2) { + if (!rep->file->use_direct_io() && + (data_block_handle.offset() + + static_cast(data_block_handle.size()) + + kBlockTrailerSize > + readahead_limit_)) { + // Buffered I/O + // Discarding the return status of Prefetch calls intentionally, as we + // can fallback to reading from disk if Prefetch fails. + rep->file->Prefetch(data_block_handle.offset(), readahead_size_); + readahead_limit_ = + static_cast(data_block_handle.offset() + readahead_size_); + // Keep exponentially increasing readahead size until + // kMaxReadaheadSize. + readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2); + } else if (rep->file->use_direct_io() && !prefetch_buffer_) { + // Direct I/O + // Let FilePrefetchBuffer take care of the readahead. + prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), kInitReadaheadSize, kMaxReadaheadSize)); + } + } + } + + Status s; + BlockBasedTable::NewDataBlockIterator( + rep, read_options_, data_block_handle, &block_iter_, is_index_, + key_includes_seq_, index_key_is_full_, + /* get_context */ nullptr, s, prefetch_buffer_.get()); + block_iter_points_to_real_block_ = true; + } +} + +template +void BlockBasedTableIterator::FindKeyForward() { + assert(!is_out_of_bound_); + // TODO the while loop inherits from two-level-iterator. We don't know + // whether a block can be empty so it can be replaced by an "if". + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + ResetDataIter(); + // We used to check the current index key for upperbound. 
+ // It will only save a data read for a small percentage of use cases, + // so for code simplicity, we removed it. We can add it back if there is a + // significant performance regression. + index_iter_->Next(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToFirst(); + } else { + return; + } + } + + // Check upper bound on the current key + bool reached_upper_bound = + (read_options_.iterate_upper_bound != nullptr && + block_iter_points_to_real_block_ && block_iter_.Valid() && + user_comparator_.Compare(ExtractUserKey(block_iter_.key()), + *read_options_.iterate_upper_bound) >= 0); + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTable::BlockEntryIteratorState::KeyReachedUpperBound", + &reached_upper_bound); + if (reached_upper_bound) { + is_out_of_bound_ = true; + return; + } +} + +template <class TBlockIter, typename TValue> +void BlockBasedTableIterator<TBlockIter, TValue>::FindKeyBackward() { + assert(!is_out_of_bound_); + while (!block_iter_.Valid()) { + if (!block_iter_.status().ok()) { + return; + } + + ResetDataIter(); + index_iter_->Prev(); + + if (index_iter_->Valid()) { + InitDataBlock(); + block_iter_.SeekToLast(); + } else { + return; + } + } + + // We could have checked the lower bound here too, but we opt not to do it + // for code simplicity. } -InternalIterator* BlockBasedTable::NewRangeTombstoneIterator( +InternalIterator* BlockBasedTable::NewIterator( + const ReadOptions& read_options, const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, bool for_compaction) { + bool need_upper_bound_check = + PrefixExtractorChanged(rep_->table_properties.get(), prefix_extractor); + const bool kIsNotIndex = false; + if (arena == nullptr) { + return new BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator( + read_options, + need_upper_bound_check && + rep_->index_type == BlockBasedTableOptions::kHashSearch), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + } else { + auto* mem = + arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>)); + return new (mem) BlockBasedTableIterator<DataBlockIter>( + this, read_options, rep_->internal_comparator, + NewIndexIterator(read_options, need_upper_bound_check), + !skip_filters && !read_options.total_order_seek && + prefix_extractor != nullptr, + need_upper_bound_check, prefix_extractor, kIsNotIndex, + true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction); + } +} + +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( const ReadOptions& read_options) { - if (rep_->range_del_handle.IsNull()) { - // The block didn't exist, nullptr indicates no range tombstones. + if (rep_->fragmented_range_dels == nullptr) { return nullptr; } - if (rep_->range_del_entry.cache_handle != nullptr) { - // We have a handle to an uncompressed block cache entry that's held for - // this table's lifetime. Increment its refcount before returning an - // iterator based on it since the returned iterator may outlive this table - // reader.
- assert(rep_->range_del_entry.value != nullptr); - Cache* block_cache = rep_->table_options.block_cache.get(); - assert(block_cache != nullptr); - if (block_cache->Ref(rep_->range_del_entry.cache_handle)) { - auto iter = rep_->range_del_entry.value->NewIterator( - &rep_->internal_comparator, nullptr /* iter */, - true /* total_order_seek */, rep_->ioptions.statistics); - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, - rep_->range_del_entry.cache_handle); - return iter; - } + SequenceNumber snapshot = kMaxSequenceNumber; + if (read_options.snapshot != nullptr) { + snapshot = read_options.snapshot->GetSequenceNumber(); } - std::string str; - rep_->range_del_handle.EncodeTo(&str); - // The meta-block exists but isn't in uncompressed block cache (maybe because - // it is disabled), so go through the full lookup process. - return NewDataBlockIterator(rep_, read_options, Slice(str)); + return new FragmentedRangeTombstoneIterator( + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); } -bool BlockBasedTable::FullFilterKeyMayMatch(const ReadOptions& read_options, - FilterBlockReader* filter, - const Slice& internal_key, - const bool no_io) const { +bool BlockBasedTable::FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& internal_key, const bool no_io, + const SliceTransform* prefix_extractor) const { if (filter == nullptr || filter->IsBlockBased()) { return true; } Slice user_key = ExtractUserKey(internal_key); const Slice* const const_ikey_ptr = &internal_key; + bool may_match = true; if (filter->whole_key_filtering()) { - return filter->KeyMayMatch(user_key, kNotValid, no_io, const_ikey_ptr); - } - if (!read_options.total_order_seek && rep_->ioptions.prefix_extractor && - rep_->table_properties->prefix_extractor_name.compare( - rep_->ioptions.prefix_extractor->Name()) == 0 && - rep_->ioptions.prefix_extractor->InDomain(user_key) && - !filter->PrefixMayMatch( - rep_->ioptions.prefix_extractor->Transform(user_key), kNotValid, - false, const_ikey_ptr)) { - return false; + may_match = filter->KeyMayMatch(user_key, prefix_extractor, kNotValid, + no_io, const_ikey_ptr); + } else if (!read_options.total_order_seek && prefix_extractor && + rep_->table_properties->prefix_extractor_name.compare( + prefix_extractor->Name()) == 0 && + prefix_extractor->InDomain(user_key) && + !filter->PrefixMayMatch(prefix_extractor->Transform(user_key), + prefix_extractor, kNotValid, false, + const_ikey_ptr)) { + may_match = false; + } + if (may_match) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); } - return true; + return may_match; } Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context, bool skip_filters) { + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters) { + assert(key.size() >= 8); // key must be internal key Status s; const bool no_io = read_options.read_tier == kBlockCacheTier; CachableEntry filter_entry; if (!skip_filters) { - filter_entry = GetFilter(/*prefetch_buffer*/ nullptr, - read_options.read_tier == kBlockCacheTier); + filter_entry = + GetFilter(prefix_extractor, /*prefetch_buffer*/ nullptr, + read_options.read_tier == kBlockCacheTier, get_context); } FilterBlockReader* filter = filter_entry.value; // First check the full filter // If full filter not useful, Then go into each block - if (!FullFilterKeyMayMatch(read_options, filter, key, no_io)) { 
+ if (!FullFilterKeyMayMatch(read_options, filter, key, no_io, + prefix_extractor)) { RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); } else { - BlockIter iiter_on_stack; - auto iiter = NewIndexIterator(read_options, &iiter_on_stack); - std::unique_ptr<InternalIterator> iiter_unique_ptr; + IndexBlockIter iiter_on_stack; + // if prefix_extractor found in block differs from options, disable + // BlockPrefixIndex. Only do this check when index_type is kHashSearch. + bool need_upper_bound_check = false; + if (rep_->index_type == BlockBasedTableOptions::kHashSearch) { + need_upper_bound_check = PrefixExtractorChanged( + rep_->table_properties.get(), prefix_extractor); + } + auto iiter = + NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack, + /* index_entry */ nullptr, get_context); + std::unique_ptr<InternalIteratorBase<BlockHandle>> iiter_unique_ptr; if (iiter != &iiter_on_stack) { iiter_unique_ptr.reset(iiter); } + bool matched = false; // if such user key matched a key in SST bool done = false; for (iiter->Seek(key); iiter->Valid() && !done; iiter->Next()) { - Slice handle_value = iiter->value(); + BlockHandle handle = iiter->value(); - BlockHandle handle; bool not_exist_in_filter = filter != nullptr && filter->IsBlockBased() == true && - handle.DecodeFrom(&handle_value).ok() && - !filter->KeyMayMatch(ExtractUserKey(key), handle.offset(), no_io); + !filter->KeyMayMatch(ExtractUserKey(key), prefix_extractor, + handle.offset(), no_io); if (not_exist_in_filter) { // Not found // TODO: think about interaction with Merge. If a user key cannot // cross one data block, we should be fine. RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); break; } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter->value(), &biter); + DataBlockIter biter; + NewDataBlockIterator<DataBlockIter>( + rep_, read_options, iiter->value(), &biter, false, + true /* key_includes_seq */, true /* index_key_is_full */, + get_context); if (read_options.read_tier == kBlockCacheTier && biter.status().IsIncomplete()) { // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for whether - // we can guarantee the key is not there when "no_io" is set + // Update Saver.state to Found because we are only looking for + // whether we can guarantee the key is not there when "no_io" is set get_context->MarkKeyMayExist(); break; } @@ -1772,14 +2698,25 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, break; } + bool may_exist = biter.SeekForGet(key); + if (!may_exist) { + // HashSeek cannot find the key in this block, and the iter is not + // at the end of the block, i.e. the key cannot be in the following + // blocks either. In this case, the seek_key cannot be found, so we + // break from the top level for-loop. + break; + } + // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { + for (; biter.Valid(); biter.Next()) { ParsedInternalKey parsed_key; if (!ParseInternalKey(biter.key(), &parsed_key)) { s = Status::Corruption(Slice()); } - if (!get_context->SaveValue(parsed_key, biter.value(), &biter)) { + if (!get_context->SaveValue( + parsed_key, biter.value(), &matched, + biter.IsValuePinned() ?
&biter : nullptr)) { done = true; break; } @@ -1791,6 +2728,11 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, break; } } + if (matched && filter != nullptr && !filter->IsBlockBased()) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, + rep_->level); + } if (s.ok()) { s = iiter->status(); } @@ -1808,16 +2750,18 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, Status BlockBasedTable::Prefetch(const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; + auto user_comparator = comparator.user_comparator(); // pre-condition if (begin && end && comparator.Compare(*begin, *end) > 0) { return Status::InvalidArgument(*begin, *end); } - BlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + IndexBlockIter iiter_on_stack; + auto iiter = NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { @@ -1830,9 +2774,13 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, for (begin ? iiter->Seek(*begin) : iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) { - Slice block_handle = iiter->value(); - - if (end && comparator.Compare(iiter->key(), *end) >= 0) { + BlockHandle block_handle = iiter->value(); + const bool is_user_key = rep_->table_properties && + rep_->table_properties->index_key_is_user_key > 0; + if (end && + ((!is_user_key && comparator.Compare(iiter->key(), *end) >= 0) || + (is_user_key && + user_comparator->Compare(iiter->key(), ExtractUserKey(*end)) >= 0))) { if (prefetching_boundary_page) { break; } @@ -1843,8 +2791,9 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } // Load the block specified by the block_handle into the block cache - BlockIter biter; - NewDataBlockIterator(rep_, ReadOptions(), block_handle, &biter); + DataBlockIter biter; + NewDataBlockIterator(rep_, ReadOptions(), block_handle, + &biter); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -1862,7 +2811,7 @@ Status BlockBasedTable::VerifyChecksum() { std::unique_ptr meta_iter; s = ReadMetaBlock(rep_, nullptr /* prefetch buffer */, &meta, &meta_iter); if (s.ok()) { - s = VerifyChecksumInBlocks(meta_iter.get()); + s = VerifyChecksumInMetaBlocks(meta_iter.get()); if (!s.ok()) { return s; } @@ -1870,11 +2819,13 @@ Status BlockBasedTable::VerifyChecksum() { return s; } // Check Data blocks - BlockIter iiter_on_stack; - InternalIterator* iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack); - std::unique_ptr iiter_unique_ptr; + IndexBlockIter iiter_on_stack; + InternalIteratorBase* iiter = + NewIndexIterator(ReadOptions(), false, &iiter_on_stack); + std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { - iiter_unique_ptr = std::unique_ptr(iiter); + iiter_unique_ptr = + std::unique_ptr>(iiter); } if (!iiter->status().ok()) { // error opening index iterator @@ -1884,25 +2835,54 @@ Status BlockBasedTable::VerifyChecksum() { return s; } -Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { +Status BlockBasedTable::VerifyChecksumInBlocks( + InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = 
index_iter->status(); if (!s.ok()) { break; } - BlockHandle handle; - Slice input = index_iter->value(); - s = handle.DecodeFrom(&input); + BlockHandle handle = index_iter->value(); + BlockContents contents; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (!s.ok()) { + break; + } + } + return s; +} + +Status BlockBasedTable::VerifyChecksumInMetaBlocks( + InternalIteratorBase* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); if (!s.ok()) { break; } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); BlockContents contents; - s = ReadBlockContents(rep_->file.get(), nullptr /* prefetch buffer */, - rep_->footer, ReadOptions(), handle, &contents, - rep_->ioptions, false /* decompress */, - Slice() /*compression dict*/, - rep_->persistent_cache_options); + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, + ReadOptions(), handle, &contents, rep_->ioptions, + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (s.IsCorruption() && index_iter->key() == kPropertiesBlock) { + TableProperties* table_properties; + s = TryReadPropertiesWithGlobalSeqno(rep_, nullptr /* prefetch_buffer */, + index_iter->value(), + &table_properties); + delete table_properties; + } if (!s.ok()) { break; } @@ -1912,30 +2892,41 @@ Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { - std::unique_ptr iiter(NewIndexIterator(options)); + std::unique_ptr> iiter( + NewIndexIterator(options)); iiter->Seek(key); assert(iiter->Valid()); CachableEntry block; - BlockHandle handle; - Slice input = iiter->value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); + BlockHandle handle = iiter->value(); Cache* block_cache = rep_->table_options.block_cache.get(); assert(block_cache != nullptr); char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - handle, cache_key_storage); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, + cache_key_storage); Slice ckey; - s = GetDataBlockFromCache( - cache_key, ckey, block_cache, nullptr, rep_->ioptions, options, &block, - rep_->table_options.format_version, - rep_->compression_dict_block ? 
rep_->compression_dict_block->data - : Slice(), - 0 /* read_amp_bytes_per_bit */); + Status s; + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr compression_dict_block; + s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + &compression_dict_block); + if (s.ok()) { + assert(compression_dict_block != nullptr); + UncompressionDict uncompression_dict( + compression_dict_block->data.ToString(), + rep_->blocks_definitely_zstd_compressed); + s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, rep_, + options, &block, uncompression_dict, + 0 /* read_amp_bytes_per_bit */); + } + } else { + s = GetDataBlockFromCache( + cache_key, ckey, block_cache, nullptr, rep_, options, &block, + UncompressionDict::GetEmptyDict(), 0 /* read_amp_bytes_per_bit */); + } assert(s.ok()); bool in_cache = block.value != nullptr; if (in_cache) { @@ -1944,50 +2935,66 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, return in_cache; } -// REQUIRES: The following fields of rep_ should have already been populated: -// 1. file -// 2. index_handle, -// 3. options -// 4. internal_comparator -// 5. index_type -Status BlockBasedTable::CreateIndexReader( - FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, - InternalIterator* preloaded_meta_index_iter, int level) { +BlockBasedTableOptions::IndexType BlockBasedTable::UpdateIndexType() { // Some old version of block-based tables don't have index type present in // table properties. If that's the case we can safely use the kBinarySearch. - auto index_type_on_file = BlockBasedTableOptions::kBinarySearch; + BlockBasedTableOptions::IndexType index_type_on_file = + BlockBasedTableOptions::kBinarySearch; if (rep_->table_properties) { auto& props = rep_->table_properties->user_collected_properties; auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); if (pos != props.end()) { index_type_on_file = static_cast( DecodeFixed32(pos->second.c_str())); + // update index_type with the true type + rep_->index_type = index_type_on_file; } } + return index_type_on_file; +} + +// REQUIRES: The following fields of rep_ should have already been populated: +// 1. file +// 2. index_handle, +// 3. options +// 4. internal_comparator +// 5. index_type +Status BlockBasedTable::CreateIndexReader( + FilePrefetchBuffer* prefetch_buffer, IndexReader** index_reader, + InternalIterator* preloaded_meta_index_iter, int level) { + auto index_type_on_file = UpdateIndexType(); auto file = rep_->file.get(); const InternalKeyComparator* icomparator = &rep_->internal_comparator; const Footer& footer = rep_->footer; - if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->ioptions.prefix_extractor == nullptr) { - ROCKS_LOG_WARN(rep_->ioptions.info_log, - "BlockBasedTableOptions::kHashSearch requires " - "options.prefix_extractor to be set." - " Fall back to binary search index."); - index_type_on_file = BlockBasedTableOptions::kBinarySearch; - } + + // kHashSearch requires non-empty prefix_extractor but bypass checking + // prefix_extractor here since we have no access to MutableCFOptions. + // Add need_upper_bound_check flag in BlockBasedTable::NewIndexIterator. 
+ // If prefix_extractor does not match prefix_extractor_name from table + // properties, turn off Hash Index by setting total_order_seek to true switch (index_type_on_file) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create( this, file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options, level); + rep_->persistent_cache_options, level, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } case BlockBasedTableOptions::kBinarySearch: { return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, - icomparator, index_reader, rep_->persistent_cache_options); + icomparator, index_reader, rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } case BlockBasedTableOptions::kHashSearch: { std::unique_ptr meta_guard; @@ -2005,7 +3012,12 @@ Status BlockBasedTable::CreateIndexReader( return BinarySearchIndexReader::Create( file, prefetch_buffer, footer, footer.index_handle(), rep_->ioptions, icomparator, index_reader, - rep_->persistent_cache_options); + rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } meta_index_iter = meta_iter_guard.get(); } @@ -2014,7 +3026,12 @@ Status BlockBasedTable::CreateIndexReader( rep_->internal_prefix_transform.get(), footer, file, prefetch_buffer, rep_->ioptions, icomparator, footer.index_handle(), meta_index_iter, index_reader, rep_->hash_index_allow_collision, - rep_->persistent_cache_options); + rep_->persistent_cache_options, + rep_->table_properties == nullptr || + rep_->table_properties->index_key_is_user_key == 0, + rep_->table_properties == nullptr || + rep_->table_properties->index_value_is_delta_encoded == 0, + GetMemoryAllocator(rep_->table_options)); } default: { std::string error_message = @@ -2025,22 +3042,14 @@ Status BlockBasedTable::CreateIndexReader( } uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) { - unique_ptr index_iter(NewIndexIterator(ReadOptions())); + std::unique_ptr> index_iter( + NewIndexIterator(ReadOptions())); index_iter->Seek(key); uint64_t result; if (index_iter->Valid()) { - BlockHandle handle; - Slice input = index_iter->value(); - Status s = handle.DecodeFrom(&input); - if (s.ok()) { - result = handle.offset(); - } else { - // Strange: we can't decode the block handle in the index block. - // We'll just return the offset of the metaindex block, which is - // close to the whole file size for this case. - result = rep_->footer.metaindex_handle().offset(); - } + BlockHandle handle = index_iter->value(); + result = handle.offset(); } else { // key is past the last key in the file. 
If table_properties is not // available, approximate the offset by returning the offset of the @@ -2067,7 +3076,7 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const { Status BlockBasedTable::GetKVPairsFromDataBlocks( std::vector<KVPairBlock>* kv_pair_blocks) { - std::unique_ptr<InternalIterator> blockhandles_iter( + std::unique_ptr<InternalIteratorBase<BlockHandle>> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); @@ -2085,8 +3094,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( } std::unique_ptr<InternalIterator> datablock_iter; - datablock_iter.reset( - NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value())); + datablock_iter.reset(NewDataBlockIterator<DataBlockIter>( + rep_, ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { @@ -2115,7 +3124,8 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( return Status::OK(); } -Status BlockBasedTable::DumpTable(WritableFile* out_file) { +Status BlockBasedTable::DumpTable(WritableFile* out_file, + const SliceTransform* prefix_extractor) { // Output Footer out_file->Append( "Footer Details:\n" "--------------------------------------\n" @@ -2173,30 +3183,32 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { " "); out_file->Append(table_properties->ToString("\n ", ": ").c_str()); out_file->Append("\n"); - } - // Output Filter blocks - if (!rep_->filter && !table_properties->filter_policy_name.empty()) { - // Support only BloomFilter as off now - rocksdb::BlockBasedTableOptions table_options; - table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); - if (table_properties->filter_policy_name.compare( - table_options.filter_policy->Name()) == 0) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(table_properties->filter_policy_name); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { - BlockContents block; - if (ReadBlockContents(rep_->file.get(), nullptr /* prefetch_buffer */, - rep_->footer, ReadOptions(), handle, &block, - rep_->ioptions, false /*decompress*/, - Slice() /*compression dict*/, - rep_->persistent_cache_options) - .ok()) { - rep_->filter.reset(new BlockBasedFilterBlockReader( - rep_->ioptions.prefix_extractor, table_options, - table_options.whole_key_filtering, std::move(block), - rep_->ioptions.statistics)); + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as of now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if (table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + BlockFetcher block_fetcher( + rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, + ReadOptions(), handle, &block, rep_->ioptions, + false /*decompress*/, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), + rep_->persistent_cache_options); + s = block_fetcher.ReadBlockContents(); + if (s.ok()) { + rep_->filter.reset(new BlockBasedFilterBlockReader( + prefix_extractor, table_options, + table_options.whole_key_filtering, std::move(block), + rep_->ioptions.statistics)); + } } } } @@ -2217,8 +3229,15 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output compression dictionary - if (rep_->compression_dict_block != nullptr) { - auto
compression_dict = rep_->compression_dict_block->data; + if (!rep_->compression_dict_handle.IsNull()) { + std::unique_ptr compression_dict_block; + s = ReadCompressionDictBlock(rep_, nullptr /* prefetch_buffer */, + &compression_dict_block); + if (!s.ok()) { + return s; + } + assert(compression_dict_block != nullptr); + auto compression_dict = compression_dict_block->data; out_file->Append( "Compression Dictionary:\n" "--------------------------------------\n"); @@ -2256,22 +3275,36 @@ void BlockBasedTable::Close() { if (rep_->closed) { return; } - rep_->filter_entry.Release(rep_->table_options.block_cache.get()); - rep_->index_entry.Release(rep_->table_options.block_cache.get()); - rep_->range_del_entry.Release(rep_->table_options.block_cache.get()); - // cleanup index and filter blocks to avoid accessing dangling pointer + + Cache* const cache = rep_->table_options.block_cache.get(); + + rep_->filter_entry.Release(cache); + rep_->index_entry.Release(cache); + + // cleanup index, filter, and compression dictionary blocks + // to avoid accessing dangling pointers if (!rep_->table_options.no_block_cache) { char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; + // Get the filter block key auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->filter_handle, cache_key); - rep_->table_options.block_cache.get()->Erase(key); + cache->Erase(key); + // Get the index block key key = GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->dummy_index_reader_offset, cache_key); - rep_->table_options.block_cache.get()->Erase(key); + cache->Erase(key); + + if (!rep_->compression_dict_handle.IsNull()) { + // Get the compression dictionary block key + key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->compression_dict_handle, cache_key); + cache->Erase(key); + } } + rep_->closed = true; } @@ -2279,8 +3312,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" "--------------------------------------\n"); - - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2297,16 +3329,23 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { break; } Slice key = blockhandles_iter->key(); + Slice user_key; InternalKey ikey; - ikey.DecodeFrom(key); + if (rep_->table_properties && + rep_->table_properties->index_key_is_user_key != 0) { + user_key = key; + } else { + ikey.DecodeFrom(key); + user_key = ikey.user_key(); + } out_file->Append(" HEX "); - out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(user_key.ToString(true).c_str()); out_file->Append(": "); out_file->Append(blockhandles_iter->value().ToString(true).c_str()); out_file->Append("\n"); - std::string str_key = ikey.user_key().ToString(); + std::string str_key = user_key.ToString(); std::string res_key(""); char cspace = ' '; for (size_t i = 0; i < str_key.size(); i++) { @@ -2322,7 +3361,7 @@ Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { } Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { - std::unique_ptr blockhandles_iter( + std::unique_ptr> blockhandles_iter( NewIndexIterator(ReadOptions())); Status s = blockhandles_iter->status(); if (!s.ok()) { @@ -2342,9 +3381,7 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { break; } - Slice bh_val = blockhandles_iter->value(); - BlockHandle bh; - bh.DecodeFrom(&bh_val); + 
BlockHandle bh = blockhandles_iter->value(); uint64_t datablock_size = bh.size(); datablock_size_min = std::min(datablock_size_min, datablock_size); datablock_size_max = std::max(datablock_size_max, datablock_size); @@ -2358,8 +3395,8 @@ Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { out_file->Append("--------------------------------------\n"); std::unique_ptr datablock_iter; - datablock_iter.reset( - NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value())); + datablock_iter.reset(NewDataBlockIterator( + rep_, ReadOptions(), blockhandles_iter->value())); s = datablock_iter->status(); if (!s.ok()) { @@ -2415,11 +3452,19 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, std::string res_key(""), res_value(""); char cspace = ' '; for (size_t i = 0; i < str_key.size(); i++) { - res_key.append(&str_key[i], 1); + if (str_key[i] == '\0') { + res_key.append("\\0", 2); + } else { + res_key.append(&str_key[i], 1); + } res_key.append(1, cspace); } for (size_t i = 0; i < str_value.size(); i++) { - res_value.append(&str_value[i], 1); + if (str_value[i] == '\0') { + res_value.append("\\0", 2); + } else { + res_value.append(&str_value[i], 1); + } res_value.append(1, cspace); } @@ -2432,24 +3477,31 @@ void BlockBasedTable::DumpKeyValue(const Slice& key, const Slice& value, namespace { -void DeleteCachedFilterEntry(const Slice& key, void* value) { +void DeleteCachedFilterEntry(const Slice& /*key*/, void* value) { FilterBlockReader* filter = reinterpret_cast(value); if (filter->statistics() != nullptr) { RecordTick(filter->statistics(), BLOCK_CACHE_FILTER_BYTES_EVICT, - filter->size()); + filter->ApproximateMemoryUsage()); } delete filter; } -void DeleteCachedIndexEntry(const Slice& key, void* value) { +void DeleteCachedIndexEntry(const Slice& /*key*/, void* value) { IndexReader* index_reader = reinterpret_cast(value); if (index_reader->statistics() != nullptr) { RecordTick(index_reader->statistics(), BLOCK_CACHE_INDEX_BYTES_EVICT, - index_reader->usable_size()); + index_reader->ApproximateMemoryUsage()); } delete index_reader; } +void DeleteCachedUncompressionDictEntry(const Slice& /*key*/, void* value) { + UncompressionDict* dict = reinterpret_cast(value); + RecordTick(dict->statistics(), BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, + dict->ApproximateMemoryUsage()); + delete dict; +} + } // anonymous namespace } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_based_table_reader.h b/thirdparty/rocksdb/table/block_based_table_reader.h index a5426cdedf..f0b5cdb1bc 100644 --- a/thirdparty/rocksdb/table/block_based_table_reader.h +++ b/thirdparty/rocksdb/table/block_based_table_reader.h @@ -16,12 +16,15 @@ #include #include +#include "db/range_tombstone_fragmenter.h" #include "options/cf_options.h" #include "rocksdb/options.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" #include "table/filter_block.h" #include "table/format.h" #include "table/persistent_cache_helper.h" @@ -30,11 +33,10 @@ #include "table/two_level_iterator.h" #include "util/coding.h" #include "util/file_reader_writer.h" +#include "util/user_comparator_wrapper.h" namespace rocksdb { -class Block; -class BlockIter; class BlockHandle; class Cache; class FilterBlockReader; @@ -51,9 +53,6 @@ struct BlockBasedTableOptions; struct EnvOptions; struct ReadOptions; class GetContext; -class InternalIterator; - -using std::unique_ptr; 
typedef std::vector> KVPairBlock; @@ -89,27 +88,38 @@ class BlockBasedTable : public TableReader { const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, - unique_ptr&& file, - uint64_t file_size, unique_ptr* table_reader, + std::unique_ptr&& file, + uint64_t file_size, + std::unique_ptr* table_reader, + const SliceTransform* prefix_extractor = nullptr, bool prefetch_index_and_filter_in_cache = true, - bool skip_filters = false, int level = -1); + bool skip_filters = false, int level = -1, + const bool immortal_table = false, + const SequenceNumber largest_seqno = 0, + TailPrefetchStats* tail_prefetch_stats = nullptr); - bool PrefixMayMatch(const Slice& internal_key); + bool PrefixMayMatch(const Slice& internal_key, + const ReadOptions& read_options, + const SliceTransform* options_prefix_extractor, + const bool need_upper_bound_check); // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). // @param skip_filters Disables loading/accessing the filter block - InternalIterator* NewIterator( - const ReadOptions&, Arena* arena = nullptr, - bool skip_filters = false) override; + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; - InternalIterator* NewRangeTombstoneIterator( + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; // @param skip_filters Disables loading/accessing the filter block Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, bool skip_filters = false) override; + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; // Pre-fetch the disk blocks that correspond to the key range specified by // (kbegin, kend). The call will return error status in the event of @@ -137,7 +147,8 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; // convert SST file to a human readable form - Status DumpTable(WritableFile* out_file) override; + Status DumpTable(WritableFile* out_file, + const SliceTransform* prefix_extractor = nullptr) override; Status VerifyChecksum() override; @@ -167,8 +178,9 @@ class BlockBasedTable : public TableReader { // to // a different object then iter and the callee has the ownership of the // returned object. - virtual InternalIterator* NewIterator(BlockIter* iter = nullptr, - bool total_order_seek = true) = 0; + virtual InternalIteratorBase* NewIterator( + IndexBlockIter* iter = nullptr, bool total_order_seek = true, + bool fill_cache = true) = 0; // The size of the index. virtual size_t size() const = 0; @@ -201,29 +213,40 @@ class BlockBasedTable : public TableReader { // The key retrieved are internal keys. 
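The reader API changes above thread a `const SliceTransform*` (prefix extractor) through `Open`, `PrefixMayMatch`, `NewIterator`, and `Get` rather than relying on one baked into the table at open time. A small standalone sketch of how a prefix extractor behaves, using the stock fixed-prefix transform from the public RocksDB API:

```cpp
#include <cassert>
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"

int main() {
  // Stock transform: the first 4 bytes of a key are its prefix.
  const rocksdb::SliceTransform* pe = rocksdb::NewFixedPrefixTransform(4);
  rocksdb::Slice key("user1234|payload");
  if (pe->InDomain(key)) {  // true: the key is long enough to carry a prefix
    rocksdb::Slice prefix = pe->Transform(key);
    assert(prefix == rocksdb::Slice("user"));
  }
  delete pe;
  return 0;
}
```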
Status GetKVPairsFromDataBlocks(std::vector<KVPairBlock>* kv_pair_blocks); - class BlockEntryIteratorState; + template <class TValue> + struct CachableEntry; + struct Rep; + + Rep* get_rep() { return rep_; } + + // input_iter: if it is not null, update this one and return it as Iterator + template <typename TBlockIter> + static TBlockIter* NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const Slice& index_value, + TBlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr); + template <typename TBlockIter> + static TBlockIter* NewDataBlockIterator( + Rep* rep, const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter = nullptr, bool is_index = false, + bool key_includes_seq = true, bool index_key_is_full = true, + GetContext* get_context = nullptr, Status s = Status(), + FilePrefetchBuffer* prefetch_buffer = nullptr); + + class PartitionedIndexIteratorState; friend class PartitionIndexReader; protected: - template <class TValue> - struct CachableEntry; - struct Rep; Rep* rep_; explicit BlockBasedTable(Rep* rep) : rep_(rep) {} private: friend class MockedBlockBasedTable; - // input_iter: if it is not null, update this one and return it as Iterator - static InternalIterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, - const Slice& index_value, - BlockIter* input_iter = nullptr, - bool is_index = false); - static InternalIterator* NewDataBlockIterator(Rep* rep, const ReadOptions& ro, - const BlockHandle& block_hanlde, - BlockIter* input_iter = nullptr, - bool is_index = false, - Status s = Status()); + static std::atomic<uint64_t> next_cache_key_id_; + // If block cache enabled (compressed or uncompressed), looks for the block // identified by handle in (1) uncompressed cache, (2) compressed cache, and // then (3) file. If found, inserts into the cache(s) that were searched @@ -233,21 +256,27 @@ class BlockBasedTable : public TableReader { // @param block_entry value is set to the uncompressed block if found. If // in uncompressed block cache, also sets cache_handle to reference that // block. - static Status MaybeLoadDataBlockToCache(FilePrefetchBuffer* prefetch_buffer, - Rep* rep, const ReadOptions& ro, - const BlockHandle& handle, - Slice compression_dict, - CachableEntry<Block>* block_entry, - bool is_index = false); + static Status MaybeReadBlockAndLoadToCache( + FilePrefetchBuffer* prefetch_buffer, Rep* rep, const ReadOptions& ro, + const BlockHandle& handle, const UncompressionDict& uncompression_dict, + CachableEntry<Block>* block_entry, bool is_index = false, + GetContext* get_context = nullptr); // For the following two functions: // if `no_io == true`, we will not try to read filter/index from sst file // were they not present in cache yet. CachableEntry<FilterBlockReader> GetFilter( - FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false) const; + const SliceTransform* prefix_extractor = nullptr, + FilePrefetchBuffer* prefetch_buffer = nullptr, bool no_io = false, + GetContext* get_context = nullptr) const; virtual CachableEntry<FilterBlockReader> GetFilter( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_blk_handle, - const bool is_a_filter_partition, bool no_io) const; + const bool is_a_filter_partition, bool no_io, GetContext* get_context, + const SliceTransform* prefix_extractor = nullptr) const; + + static CachableEntry<UncompressionDict> GetUncompressionDict( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, bool no_io, + GetContext* get_context); // Get the iterator from the index reader.
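`NewDataBlockIterator` above is now a template over the iterator type and can update a caller-provided `input_iter` in place instead of heap-allocating. A generic sketch of that reuse pattern, with hypothetical type and function names:

```cpp
// Hypothetical stand-in for a block iterator type (the TBlockIter parameter).
struct FakeBlockIter {
  void Initialize(/* block contents, comparator, ... */) {}
};

// If the caller passes an iterator, refresh it and return it (caller keeps
// ownership); otherwise heap-allocate one that the caller must delete.
template <typename TBlockIter>
TBlockIter* NewOrReuseIter(TBlockIter* input_iter) {
  TBlockIter* iter = (input_iter != nullptr) ? input_iter : new TBlockIter();
  iter->Initialize();
  return iter;
}

int main() {
  FakeBlockIter stack_iter;
  FakeBlockIter* same = NewOrReuseIter(&stack_iter);    // no allocation
  FakeBlockIter* fresh = NewOrReuseIter<FakeBlockIter>(nullptr);
  delete fresh;
  (void)same;
  return 0;
}
```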
// If input_iter is not set, return new Iterator @@ -259,23 +288,26 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator( - const ReadOptions& read_options, BlockIter* input_iter = nullptr, - CachableEntry* index_entry = nullptr); + InternalIteratorBase* NewIndexIterator( + const ReadOptions& read_options, bool need_upper_bound_check = false, + IndexBlockIter* input_iter = nullptr, + CachableEntry* index_entry = nullptr, + GetContext* get_context = nullptr); // Read block cache from block caches (if set): block_cache and // block_cache_compressed. // On success, Status::OK with be returned and @block will be populated with // pointer to the block as well as its block handle. - // @param compression_dict Data for presetting the compression library's + // @param uncompression_dict Data for presetting the compression library's // dictionary. static Status GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, - Cache* block_cache, Cache* block_cache_compressed, - const ImmutableCFOptions& ioptions, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, - bool is_index = false); + Cache* block_cache, Cache* block_cache_compressed, Rep* rep, + const ReadOptions& read_options, + BlockBasedTable::CachableEntry* block, + const UncompressionDict& uncompression_dict, + size_t read_amp_bytes_per_bit, bool is_index = false, + GetContext* get_context = nullptr); // Put a raw block (maybe compressed) to the corresponding block caches. // This method will perform decompression against raw_block if needed and then @@ -283,17 +315,20 @@ class BlockBasedTable : public TableReader { // On success, Status::OK will be returned; also @block will be populated with // uncompressed block and its cache handle. // - // REQUIRES: raw_block is heap-allocated. PutDataBlockToCache() will be - // responsible for releasing its memory if error occurs. - // @param compression_dict Data for presetting the compression library's + // Allocated memory managed by raw_block_contents will be transferred to + // PutDataBlockToCache(). After the call, the object will be invalid. + // @param uncompression_dict Data for presetting the compression library's // dictionary. static Status PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, const ImmutableCFOptions& ioptions, - CachableEntry* block, Block* raw_block, uint32_t format_version, - const Slice& compression_dict, size_t read_amp_bytes_per_bit, - bool is_index = false, Cache::Priority pri = Cache::Priority::LOW); + CachableEntry* block, BlockContents* raw_block_contents, + CompressionType raw_block_comp_type, uint32_t format_version, + const UncompressionDict& uncompression_dict, SequenceNumber seq_no, + size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator, + bool is_index = false, Cache::Priority pri = Cache::Priority::LOW, + GetContext* get_context = nullptr); // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. 
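`GetDataBlockFromCache`/`PutDataBlockToCache` above consult up to two caches before any file I/O: the uncompressed block cache first, then the compressed one (decompressing and promoting on a hit), and only then the file. A toy sketch of that lookup order, with `std::map`s standing in for the caches and a stubbed decompressor (all names hypothetical):

```cpp
#include <map>
#include <string>

std::map<std::string, std::string> uncompressed_cache;
std::map<std::string, std::string> compressed_cache;

std::string Decompress(const std::string& raw) { return raw; }        // stub
std::string ReadBlockFromFile(const std::string&) { return "file"; }  // stub

std::string GetBlock(const std::string& cache_key) {
  auto it = uncompressed_cache.find(cache_key);
  if (it != uncompressed_cache.end()) return it->second;  // (1) hit, done
  auto cit = compressed_cache.find(cache_key);
  if (cit != compressed_cache.end()) {
    std::string block = Decompress(cit->second);          // (2) decompress
    uncompressed_cache[cache_key] = block;                // promote
    return block;
  }
  return ReadBlockFromFile(cache_key);                    // (3) fall back to I/O
}
```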
@@ -303,6 +338,9 @@ class BlockBasedTable : public TableReader { void ReadMeta(const Footer& footer); + // Figure the index type, update it in rep_, and also return it. + BlockBasedTableOptions::IndexType UpdateIndexType(); + // Create a index reader based on the index type stored in the table. // Optionally, user can pass a preloaded meta_index_iter for the index that // need to access extra meta blocks for index construction. This parameter @@ -312,29 +350,56 @@ class BlockBasedTable : public TableReader { InternalIterator* preloaded_meta_index_iter = nullptr, const int level = -1); - bool FullFilterKeyMayMatch(const ReadOptions& read_options, - FilterBlockReader* filter, const Slice& user_key, - const bool no_io) const; + bool FullFilterKeyMayMatch( + const ReadOptions& read_options, FilterBlockReader* filter, + const Slice& user_key, const bool no_io, + const SliceTransform* prefix_extractor = nullptr) const; - // Read the meta block from sst. + static Status PrefetchTail( + RandomAccessFileReader* file, uint64_t file_size, + TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, + const bool preload_all, + std::unique_ptr* prefetch_buffer); static Status ReadMetaBlock(Rep* rep, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* meta_block, std::unique_ptr* iter); - - Status VerifyChecksumInBlocks(InternalIterator* index_iter); + static Status TryReadPropertiesWithGlobalSeqno( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, const Slice& handle_value, + TableProperties** table_properties); + static Status ReadPropertiesBlock(Rep* rep, + FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const SequenceNumber largest_seqno); + static Status ReadRangeDelBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, + const InternalKeyComparator& internal_comparator); + static Status ReadCompressionDictBlock( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + std::unique_ptr* compression_dict_block); + static Status PrefetchIndexAndFilterBlocks( + Rep* rep, FilePrefetchBuffer* prefetch_buffer, + InternalIterator* meta_iter, BlockBasedTable* new_table, + const SliceTransform* prefix_extractor, bool prefetch_all, + const BlockBasedTableOptions& table_options, const int level, + const bool prefetch_index_and_filter_in_cache); + + Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); // Create the filter from the filter block. 
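The open path is refactored above into per-meta-block helpers (`ReadPropertiesBlock`, `ReadRangeDelBlock`, `ReadCompressionDictBlock`, `PrefetchIndexAndFilterBlocks`). The control flow is a chain of early-returning `Status` calls; roughly, as in this simplified sketch with a stand-in `Status` type and stubbed steps:

```cpp
#include <string>

struct Status {
  bool ok_ = true;
  std::string msg;
  bool ok() const { return ok_; }
};

Status ReadProperties()            { return {}; }
Status ReadRangeDeletions()        { return {}; }
Status ReadCompressionDictionary() { return {}; }
Status PrefetchIndexAndFilter()    { return {}; }

// Each step runs only if every previous step succeeded; the first failure
// aborts the open and is surfaced to the caller.
Status OpenTable() {
  Status s = ReadProperties();
  if (!s.ok()) return s;
  s = ReadRangeDeletions();
  if (!s.ok()) return s;
  s = ReadCompressionDictionary();
  if (!s.ok()) return s;
  return PrefetchIndexAndFilter();
}
```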
- FilterBlockReader* ReadFilter(FilePrefetchBuffer* prefetch_buffer, - const BlockHandle& filter_handle, - const bool is_a_filter_partition) const; + virtual FilterBlockReader* ReadFilter( + FilePrefetchBuffer* prefetch_buffer, const BlockHandle& filter_handle, + const bool is_a_filter_partition, + const SliceTransform* prefix_extractor = nullptr) const; static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); // Generate a cache key prefix from the file - static void GenerateCachePrefix(Cache* cc, - RandomAccessFile* file, char* buffer, size_t* size); - static void GenerateCachePrefix(Cache* cc, - WritableFile* file, char* buffer, size_t* size); + static void GenerateCachePrefix(Cache* cc, RandomAccessFile* file, + char* buffer, size_t* size); + static void GenerateCachePrefix(Cache* cc, WritableFile* file, char* buffer, + size_t* size); // Helper functions for DumpTable() Status DumpIndexBlock(WritableFile* out_file); @@ -351,27 +416,22 @@ class BlockBasedTable : public TableReader { }; // Maintaining state of a two-level iteration on a partitioned index structure -class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { +class BlockBasedTable::PartitionedIndexIteratorState + : public TwoLevelIteratorState { public: - BlockEntryIteratorState( - BlockBasedTable* table, const ReadOptions& read_options, - const InternalKeyComparator* icomparator, bool skip_filters, - bool is_index = false, - std::unordered_map<uint64_t, CachableEntry<Block>>* block_map = nullptr); - InternalIterator* NewSecondaryIterator(const Slice& index_value) override; - bool PrefixMayMatch(const Slice& internal_key) override; - bool KeyReachedUpperBound(const Slice& internal_key) override; + PartitionedIndexIteratorState( + BlockBasedTable* table, + std::unordered_map<uint64_t, CachableEntry<Block>>* block_map, + const bool index_key_includes_seq, const bool index_key_is_full); + InternalIteratorBase<BlockHandle>* NewSecondaryIterator( + const BlockHandle& index_value) override; private: // Don't own table_ BlockBasedTable* table_; - const ReadOptions read_options_; - const InternalKeyComparator* icomparator_; - bool skip_filters_; - // true if the 2nd level iterator is on indexes instead of on user data. - bool is_index_; std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_; - port::RWMutex cleaner_mu; + bool index_key_includes_seq_; + bool index_key_is_full_; }; // CachableEntry represents the entries that *may* be fetched from block cache. @@ -400,25 +460,29 @@ struct BlockBasedTable::CachableEntry { struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, - const InternalKeyComparator& _internal_comparator, bool skip_filters) + const InternalKeyComparator& _internal_comparator, bool skip_filters, + int _level, const bool _immortal_table) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), filter_policy(skip_filters ?
nullptr : _table_opt.filter_policy.get()), internal_comparator(_internal_comparator), filter_type(FilterType::kNoFilter), + index_type(BlockBasedTableOptions::IndexType::kBinarySearch), + hash_index_allow_collision(false), whole_key_filtering(_table_opt.whole_key_filtering), prefix_filtering(true), - range_del_handle(BlockHandle::NullBlockHandle()), - global_seqno(kDisableGlobalSequenceNumber) {} + global_seqno(kDisableGlobalSequenceNumber), + level(_level), + immortal_table(_immortal_table) {} const ImmutableCFOptions& ioptions; const EnvOptions& env_options; - const BlockBasedTableOptions& table_options; + const BlockBasedTableOptions table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; - unique_ptr file; + std::unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t cache_key_prefix_size = 0; char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -431,11 +495,15 @@ struct BlockBasedTable::Rep { // Footer contains the fixed table information Footer footer; - // index_reader and filter will be populated and used only when - // options.block_cache is nullptr; otherwise we will get the index block via - // the block cache. - unique_ptr index_reader; - unique_ptr filter; + // `index_reader`, `filter`, and `uncompression_dict` will be populated (i.e., + // non-nullptr) and used only when options.block_cache is nullptr or when + // `cache_index_and_filter_blocks == false`. Otherwise, we will get the index, + // filter, and compression dictionary blocks via the block cache. In that case + // `dummy_index_reader_offset`, `filter_handle`, and `compression_dict_handle` + // are used to lookup these meta-blocks in block cache. + std::unique_ptr index_reader; + std::unique_ptr filter; + std::unique_ptr uncompression_dict; enum class FilterType { kNoFilter, @@ -445,13 +513,9 @@ struct BlockBasedTable::Rep { }; FilterType filter_type; BlockHandle filter_handle; + BlockHandle compression_dict_handle; std::shared_ptr table_properties; - // Block containing the data for the compression dictionary. We take ownership - // for the entire block struct, even though we only use its Slice member. This - // is easier because the Slice member depends on the continued existence of - // another member ("allocation"). - std::unique_ptr compression_dict_block; BlockBasedTableOptions::IndexType index_type; bool hash_index_allow_collision; bool whole_key_filtering; @@ -460,19 +524,18 @@ struct BlockBasedTable::Rep { // module should not be relying on db module. However to make things easier // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. 
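Note the ownership model `Rep` documents above: when the block cache is not used for metadata (or `cache_index_and_filter_blocks == false`), the reader owns the index, filter, and dictionary outright via `unique_ptr`; otherwise it keeps only a `BlockHandle` and the parsed object lives in the block cache. A compact sketch of that either/or, with hypothetical names:

```cpp
#include <cstdint>
#include <memory>

struct IndexReader { /* parsed index block */ };

struct TableRep {
  // Owned directly when the block cache is not used for metadata...
  std::unique_ptr<IndexReader> owned_index;
  // ...otherwise only the on-disk handle is kept, and the reader is
  // fetched from (and owned by) the block cache on demand.
  uint64_t index_handle_offset = 0;
  uint64_t index_handle_size = 0;

  bool MetadataPinned() const { return owned_index != nullptr; }
};
```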
- unique_ptr internal_prefix_transform; - - // only used in level 0 files: - // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the - // LRU cache, but we always keep the filter & idndex block's handle checked - // out here (=we don't call Release()), plus the parsed out objects - // the LRU cache will never push flush them out, hence they're pinned + std::unique_ptr internal_prefix_transform; + std::shared_ptr table_prefix_extractor; + + // only used in level 0 files when pin_l0_filter_and_index_blocks_in_cache is + // true or in all levels when pin_top_level_index_and_filter is set in + // combination with partitioned index/filters: then we do use the LRU cache, + // but we always keep the filter & index block's handle checked out here (=we + // don't call Release()), plus the parsed out objects the LRU cache will never + // push flush them out, hence they're pinned CachableEntry filter_entry; CachableEntry index_entry; - // range deletion meta-block is pinned through reader's lifetime when LRU - // cache is enabled. - CachableEntry range_del_entry; - BlockHandle range_del_handle; + std::shared_ptr fragmented_range_dels; // If global_seqno is used, all Keys in this file will have the same // seqno with value `global_seqno`. @@ -480,7 +543,168 @@ struct BlockBasedTable::Rep { // A value of kDisableGlobalSequenceNumber means that this feature is disabled // and every key have it's own seqno. SequenceNumber global_seqno; + + // the level when the table is opened, could potentially change when trivial + // move is involved + int level; + + // If false, blocks in this file are definitely all uncompressed. Knowing this + // before reading individual blocks enables certain optimizations. + bool blocks_maybe_compressed = true; + + // If true, data blocks in this file are definitely ZSTD compressed. If false + // they might not be. When false we skip creating a ZSTD digested + // uncompression dictionary. Even if we get a false negative, things should + // still work, just not as quickly. + bool blocks_definitely_zstd_compressed = false; + bool closed = false; + const bool immortal_table; + + SequenceNumber get_global_seqno(bool is_index) const { + return is_index ? 
kDisableGlobalSequenceNumber : global_seqno; + } +}; + +template +class BlockBasedTableIterator : public InternalIteratorBase { + public: + BlockBasedTableIterator(BlockBasedTable* table, + const ReadOptions& read_options, + const InternalKeyComparator& icomp, + InternalIteratorBase* index_iter, + bool check_filter, bool need_upper_bound_check, + const SliceTransform* prefix_extractor, bool is_index, + bool key_includes_seq = true, + bool index_key_is_full = true, + bool for_compaction = false) + : table_(table), + read_options_(read_options), + icomp_(icomp), + user_comparator_(icomp.user_comparator()), + index_iter_(index_iter), + pinned_iters_mgr_(nullptr), + block_iter_points_to_real_block_(false), + check_filter_(check_filter), + need_upper_bound_check_(need_upper_bound_check), + prefix_extractor_(prefix_extractor), + is_index_(is_index), + key_includes_seq_(key_includes_seq), + index_key_is_full_(index_key_is_full), + for_compaction_(for_compaction) {} + + ~BlockBasedTableIterator() { delete index_iter_; } + + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; + bool Valid() const override { + return !is_out_of_bound_ && block_iter_points_to_real_block_ && + block_iter_.Valid(); + } + Slice key() const override { + assert(Valid()); + return block_iter_.key(); + } + TValue value() const override { + assert(Valid()); + return block_iter_.value(); + } + Status status() const override { + if (!index_iter_->status().ok()) { + return index_iter_->status(); + } else if (block_iter_points_to_real_block_) { + return block_iter_.status(); + } else { + return Status::OK(); + } + } + + bool IsOutOfBound() override { return is_out_of_bound_; } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + } + bool IsKeyPinned() const override { + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_ && block_iter_.IsKeyPinned(); + } + bool IsValuePinned() const override { + // BlockIter::IsValuePinned() is always true. No need to check + return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && + block_iter_points_to_real_block_; + } + + bool CheckPrefixMayMatch(const Slice& ikey) { + if (check_filter_ && + !table_->PrefixMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_)) { + // TODO remember the iterator is invalidated because of prefix + // match. This can avoid the upper level file iterator to falsely + // believe the position is the end of the SST file and move to + // the first key of the next file. + ResetDataIter(); + return false; + } + return true; + } + + void ResetDataIter() { + if (block_iter_points_to_real_block_) { + if (pinned_iters_mgr_ != nullptr && pinned_iters_mgr_->PinningEnabled()) { + block_iter_.DelegateCleanupsTo(pinned_iters_mgr_); + } + block_iter_.Invalidate(Status::OK()); + block_iter_points_to_real_block_ = false; + } + } + + void SavePrevIndexValue() { + if (block_iter_points_to_real_block_) { + // Reseek. If they end up with the same data block, we shouldn't re-fetch + // the same data block. 
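`BlockBasedTableIterator` above is a two-level iterator: a first-level index iterator yields block handles, and a second-level data-block iterator is (re)positioned inside each block. The core loop pattern, reduced to plain containers (all names hypothetical):

```cpp
#include <iostream>
#include <string>
#include <vector>

// First level: "index" entries, each pointing at a block of keys.
// Second level: the keys inside the current block.
using Block = std::vector<std::string>;

void ScanAll(const std::vector<Block>& index) {
  for (const Block& block : index) {        // index_iter_: block to block
    for (const std::string& key : block) {  // block_iter_: key to key
      std::cout << key << '\n';
    }
  }
}

int main() {
  ScanAll({{"a", "b"}, {"c"}, {"d", "e"}});
  return 0;
}
```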
+ prev_index_value_ = index_iter_->value(); + } + } + + void InitDataBlock(); + void FindKeyForward(); + void FindKeyBackward(); + + private: + BlockBasedTable* table_; + const ReadOptions read_options_; + const InternalKeyComparator& icomp_; + UserComparatorWrapper user_comparator_; + InternalIteratorBase* index_iter_; + PinnedIteratorsManager* pinned_iters_mgr_; + TBlockIter block_iter_; + bool block_iter_points_to_real_block_; + bool is_out_of_bound_ = false; + bool check_filter_; + // TODO(Zhongyi): pick a better name + bool need_upper_bound_check_; + const SliceTransform* prefix_extractor_; + // If the blocks over which we iterate are index blocks + bool is_index_; + // If the keys in the blocks over which we iterate include 8 byte sequence + bool key_includes_seq_; + bool index_key_is_full_; + // If this iterator is created for compaction + bool for_compaction_; + BlockHandle prev_index_value_; + + static const size_t kInitReadaheadSize = 8 * 1024; + // Found that 256 KB readahead size provides the best performance, based on + // experiments. + static const size_t kMaxReadaheadSize; + size_t readahead_size_ = kInitReadaheadSize; + size_t readahead_limit_ = 0; + int num_file_reads_ = 0; + std::unique_ptr prefetch_buffer_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_builder.cc b/thirdparty/rocksdb/table/block_builder.cc index 39bfffe511..c14b4f6d3e 100644 --- a/thirdparty/rocksdb/table/block_builder.cc +++ b/thirdparty/rocksdb/table/block_builder.cc @@ -33,46 +33,78 @@ #include "table/block_builder.h" -#include #include -#include "rocksdb/comparator.h" +#include #include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "table/data_block_footer.h" #include "util/coding.h" namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, bool use_delta_encoding) +BlockBuilder::BlockBuilder( + int block_restart_interval, bool use_delta_encoding, + bool use_value_delta_encoding, + BlockBasedTableOptions::DataBlockIndexType index_type, + double data_block_hash_table_util_ratio) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), + use_value_delta_encoding_(use_value_delta_encoding), restarts_(), counter_(0), finished_(false) { + switch (index_type) { + case BlockBasedTableOptions::kDataBlockBinarySearch: + break; + case BlockBasedTableOptions::kDataBlockBinaryAndHash: + data_block_hash_index_builder_.Initialize( + data_block_hash_table_util_ratio); + break; + default: + assert(0); + } assert(block_restart_interval_ >= 1); - restarts_.push_back(0); // First restart point is at offset 0 + restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); } void BlockBuilder::Reset() { buffer_.clear(); restarts_.clear(); - restarts_.push_back(0); // First restart point is at offset 0 + restarts_.push_back(0); // First restart point is at offset 0 estimate_ = sizeof(uint32_t) + sizeof(uint32_t); counter_ = 0; finished_ = false; last_key_.clear(); + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Reset(); + } } -size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) - const { +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, + const Slice& value) const { size_t estimate = CurrentSizeEstimate(); - estimate += key.size() + value.size(); + // Note: this is an imprecise estimate as it accounts for the whole key size + // instead of non-shared key size. 
+ estimate += key.size(); + // In value delta encoding we estimate the value delta size as half the full + // value size since only the size field of block handle is encoded. + estimate += + !use_value_delta_encoding_ || (counter_ >= block_restart_interval_) + ? value.size() + : value.size() / 2; + if (counter_ >= block_restart_interval_) { - estimate += sizeof(uint32_t); // a new restart entry. + estimate += sizeof(uint32_t); // a new restart entry. } - estimate += sizeof(int32_t); // varint for shared prefix length. - estimate += VarintLength(key.size()); // varint for key length. - estimate += VarintLength(value.size()); // varint for value length. + estimate += sizeof(int32_t); // varint for shared prefix length. + // Note: this is an imprecise estimate as we will have to encoded size, one + // for shared key and one for non-shared key. + estimate += VarintLength(key.size()); // varint for key length. + if (!use_value_delta_encoding_ || (counter_ >= block_restart_interval_)) { + estimate += VarintLength(value.size()); // varint for value length. + } return estimate; } @@ -82,14 +114,29 @@ Slice BlockBuilder::Finish() { for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } - PutFixed32(&buffer_, static_cast(restarts_.size())); + + uint32_t num_restarts = static_cast(restarts_.size()); + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch; + if (data_block_hash_index_builder_.Valid() && + CurrentSizeEstimate() <= kMaxBlockSizeSupportedByHashIndex) { + data_block_hash_index_builder_.Finish(buffer_); + index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } + + // footer is a packed format of data_block_index_type and num_restarts + uint32_t block_footer = PackIndexTypeAndNumRestarts(index_type, num_restarts); + + PutFixed32(&buffer_, block_footer); finished_ = true; return Slice(buffer_); } -void BlockBuilder::Add(const Slice& key, const Slice& value) { +void BlockBuilder::Add(const Slice& key, const Slice& value, + const Slice* const delta_value) { assert(!finished_); assert(counter_ <= block_restart_interval_); + assert(!use_value_delta_encoding_ || delta_value); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -115,14 +162,32 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { const size_t non_shared = key.size() - shared; const size_t curr_size = buffer_.size(); - // Add "" to buffer_ - PutVarint32Varint32Varint32(&buffer_, static_cast(shared), - static_cast(non_shared), - static_cast(value.size())); + if (use_value_delta_encoding_) { + // Add "" to buffer_ + PutVarint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared)); + } else { + // Add "" to buffer_ + PutVarint32Varint32Varint32(&buffer_, static_cast(shared), + static_cast(non_shared), + static_cast(value.size())); + } // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); - buffer_.append(value.data(), value.size()); + // Use value delta encoding only when the key has shared bytes. This would + // simplify the decoding, where it can figure which decoding to use simply by + // looking at the shared bytes size. 
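`BlockBuilder::Add` above delta-encodes keys against the previous key: it stores only the byte count shared with the predecessor plus the non-shared suffix (and, with value delta encoding, a delta for the value as well). A standalone sketch of the key part (fixed 32-bit sizes are used here for brevity; the real builder writes varints):

```cpp
#include <algorithm>
#include <cstdint>
#include <string>

// Append <shared><non_shared><suffix> for `key`, given the previous key.
void AddDeltaEncoded(std::string* buffer, std::string* last_key,
                     const std::string& key) {
  size_t shared = 0;
  const size_t min_len = std::min(last_key->size(), key.size());
  while (shared < min_len && (*last_key)[shared] == key[shared]) {
    shared++;
  }
  const uint32_t shared32 = static_cast<uint32_t>(shared);
  const uint32_t non_shared = static_cast<uint32_t>(key.size() - shared);
  buffer->append(reinterpret_cast<const char*>(&shared32), 4);
  buffer->append(reinterpret_cast<const char*>(&non_shared), 4);
  buffer->append(key.data() + shared, non_shared);  // suffix only
  *last_key = key;  // the next Add() diffs against this key
}
```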
+ if (shared != 0 && use_value_delta_encoding_) { + buffer_.append(delta_value->data(), delta_value->size()); + } else { + buffer_.append(value.data(), value.size()); + } + + if (data_block_hash_index_builder_.Valid()) { + data_block_hash_index_builder_.Add(ExtractUserKey(key), + restarts_.size() - 1); + } counter_++; estimate_ += buffer_.size() - curr_size; diff --git a/thirdparty/rocksdb/table/block_builder.h b/thirdparty/rocksdb/table/block_builder.h index 6b5297d041..0576279f50 100644 --- a/thirdparty/rocksdb/table/block_builder.h +++ b/thirdparty/rocksdb/table/block_builder.h @@ -12,6 +12,8 @@ #include #include "rocksdb/slice.h" +#include "rocksdb/table.h" +#include "table/data_block_hash_index.h" namespace rocksdb { @@ -21,14 +23,19 @@ class BlockBuilder { void operator=(const BlockBuilder&) = delete; explicit BlockBuilder(int block_restart_interval, - bool use_delta_encoding = true); + bool use_delta_encoding = true, + bool use_value_delta_encoding = false, + BlockBasedTableOptions::DataBlockIndexType index_type = + BlockBasedTableOptions::kDataBlockBinarySearch, + double data_block_hash_table_util_ratio = 0.75); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); // REQUIRES: Finish() has not been called since the last call to Reset(). // REQUIRES: key is larger than any previously added key - void Add(const Slice& key, const Slice& value); + void Add(const Slice& key, const Slice& value, + const Slice* const delta_value = nullptr); // Finish building the block and return a slice that refers to the // block contents. The returned slice will remain valid for the @@ -37,26 +44,32 @@ class BlockBuilder { // Returns an estimate of the current (uncompressed) size of the block // we are building. - inline size_t CurrentSizeEstimate() const { return estimate_; } + inline size_t CurrentSizeEstimate() const { + return estimate_ + (data_block_hash_index_builder_.Valid() + ? data_block_hash_index_builder_.EstimateSize() + : 0); + } // Returns an estimated block size after appending key and value. size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; // Return true iff no entries have been added since the last Reset() - bool empty() const { - return buffer_.empty(); - } + bool empty() const { return buffer_.empty(); } private: - const int block_restart_interval_; - const bool use_delta_encoding_; + const int block_restart_interval_; + // TODO(myabandeh): put it into a separate IndexBlockBuilder + const bool use_delta_encoding_; + // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values + const bool use_value_delta_encoding_; - std::string buffer_; // Destination buffer + std::string buffer_; // Destination buffer std::vector restarts_; // Restart points - size_t estimate_; - int counter_; // Number of entries emitted since restart - bool finished_; // Has Finish() been called? - std::string last_key_; + size_t estimate_; + int counter_; // Number of entries emitted since restart + bool finished_; // Has Finish() been called? + std::string last_key_; + DataBlockHashIndexBuilder data_block_hash_index_builder_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_fetcher.cc b/thirdparty/rocksdb/table/block_fetcher.cc new file mode 100644 index 0000000000..1f209210c1 --- /dev/null +++ b/thirdparty/rocksdb/table/block_fetcher.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_fetcher.h" + +#include +#include + +#include "monitoring/perf_context_imp.h" +#include "monitoring/statistics.h" +#include "rocksdb/env.h" +#include "table/block.h" +#include "table/block_based_table_reader.h" +#include "table/format.h" +#include "table/persistent_cache_helper.h" +#include "util/coding.h" +#include "util/compression.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/logging.h" +#include "util/memory_allocator.h" +#include "util/stop_watch.h" +#include "util/string_util.h" +#include "util/xxhash.h" + +namespace rocksdb { + +inline void BlockFetcher::CheckBlockChecksum() { + // Check the crc of the type and the block contents + if (read_options_.verify_checksums) { + const char* data = slice_.data(); // Pointer to where Read put the data + PERF_TIMER_GUARD(block_checksum_time); + uint32_t value = DecodeFixed32(data + block_size_ + 1); + uint32_t actual = 0; + switch (footer_.checksum()) { + case kNoChecksum: + break; + case kCRC32c: + value = crc32c::Unmask(value); + actual = crc32c::Value(data, block_size_ + 1); + break; + case kxxHash: + actual = XXH32(data, static_cast(block_size_) + 1, 0); + break; + case kxxHash64: + actual = static_cast( + XXH64(data, static_cast(block_size_) + 1, 0) & + uint64_t{0xffffffff}); + break; + default: + status_ = Status::Corruption( + "unknown checksum type " + ToString(footer_.checksum()) + " in " + + file_->file_name() + " offset " + ToString(handle_.offset()) + + " size " + ToString(block_size_)); + } + if (status_.ok() && actual != value) { + status_ = Status::Corruption( + "block checksum mismatch: expected " + ToString(actual) + ", got " + + ToString(value) + " in " + file_->file_name() + " offset " + + ToString(handle_.offset()) + " size " + ToString(block_size_)); + } + } +} + +inline bool BlockFetcher::TryGetUncompressBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + Status status = PersistentCacheHelper::LookupUncompressedPage( + cache_options_, handle_, contents_); + if (status.ok()) { + // uncompressed page is found for the block handle + return true; + } else { + // uncompressed page is not found + if (ioptions_.info_log && !status.IsNotFound()) { + assert(!status.ok()); + ROCKS_LOG_INFO(ioptions_.info_log, + "Error reading from persistent cache. 
%s", + status.ToString().c_str()); + } + } + } + return false; +} + +inline bool BlockFetcher::TryGetFromPrefetchBuffer() { + if (prefetch_buffer_ != nullptr && + prefetch_buffer_->TryReadFromCache( + handle_.offset(), + static_cast(handle_.size()) + kBlockTrailerSize, &slice_)) { + block_size_ = static_cast(handle_.size()); + CheckBlockChecksum(); + if (!status_.ok()) { + return true; + } + got_from_prefetch_buffer_ = true; + used_buf_ = const_cast(slice_.data()); + } + return got_from_prefetch_buffer_; +} + +inline bool BlockFetcher::TryGetCompressedBlockFromPersistentCache() { + if (cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + // lookup uncompressed cache mode p-cache + std::unique_ptr raw_data; + status_ = PersistentCacheHelper::LookupRawPage( + cache_options_, handle_, &raw_data, block_size_ + kBlockTrailerSize); + if (status_.ok()) { + heap_buf_ = CacheAllocationPtr(raw_data.release()); + used_buf_ = heap_buf_.get(); + slice_ = Slice(heap_buf_.get(), block_size_); + return true; + } else if (!status_.IsNotFound() && ioptions_.info_log) { + assert(!status_.ok()); + ROCKS_LOG_INFO(ioptions_.info_log, + "Error reading from persistent cache. %s", + status_.ToString().c_str()); + } + } + return false; +} + +inline void BlockFetcher::PrepareBufferForBlockFromFile() { + // cache miss read from device + if (do_uncompress_ && + block_size_ + kBlockTrailerSize < kDefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf_ = &stack_buf_[0]; + } else if (maybe_compressed_ && !do_uncompress_) { + compressed_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, + memory_allocator_compressed_); + used_buf_ = compressed_buf_.get(); + } else { + heap_buf_ = + AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + used_buf_ = heap_buf_.get(); + } +} + +inline void BlockFetcher::InsertCompressedBlockToPersistentCacheIfNeeded() { + if (status_.ok() && read_options_.fill_cache && + cache_options_.persistent_cache && + cache_options_.persistent_cache->IsCompressed()) { + // insert to raw cache + PersistentCacheHelper::InsertRawPage(cache_options_, handle_, used_buf_, + block_size_ + kBlockTrailerSize); + } +} + +inline void BlockFetcher::InsertUncompressedBlockToPersistentCacheIfNeeded() { + if (status_.ok() && !got_from_prefetch_buffer_ && read_options_.fill_cache && + cache_options_.persistent_cache && + !cache_options_.persistent_cache->IsCompressed()) { + // insert to uncompressed cache + PersistentCacheHelper::InsertUncompressedPage(cache_options_, handle_, + *contents_); + } +} + +inline void BlockFetcher::CopyBufferToHeap() { + assert(used_buf_ != heap_buf_.get()); + heap_buf_ = AllocateBlock(block_size_ + kBlockTrailerSize, memory_allocator_); + memcpy(heap_buf_.get(), used_buf_, block_size_ + kBlockTrailerSize); +} + +inline void BlockFetcher::GetBlockContents() { + if (slice_.data() != used_buf_) { + // the slice content is not the buffer provided + *contents_ = BlockContents(Slice(slice_.data(), block_size_)); + } else { + // page can be either uncompressed or compressed, the buffer either stack + // or heap provided. 
Refer to https://github.com/facebook/rocksdb/pull/4096 + if (got_from_prefetch_buffer_ || used_buf_ == &stack_buf_[0]) { + CopyBufferToHeap(); + } else if (used_buf_ == compressed_buf_.get()) { + if (compression_type_ == kNoCompression && + memory_allocator_ != memory_allocator_compressed_) { + CopyBufferToHeap(); + } else { + heap_buf_ = std::move(compressed_buf_); + } + } + *contents_ = BlockContents(std::move(heap_buf_), block_size_); + } +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif +} + +Status BlockFetcher::ReadBlockContents() { + block_size_ = static_cast(handle_.size()); + + if (TryGetUncompressBlockFromPersistentCache()) { + compression_type_ = kNoCompression; +#ifndef NDEBUG + contents_->is_raw_block = true; +#endif // NDEBUG + return Status::OK(); + } + if (TryGetFromPrefetchBuffer()) { + if (!status_.ok()) { + return status_; + } + } else if (!TryGetCompressedBlockFromPersistentCache()) { + PrepareBufferForBlockFromFile(); + Status s; + + { + PERF_TIMER_GUARD(block_read_time); + // Actual file read + status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize, + &slice_, used_buf_); + } + PERF_COUNTER_ADD(block_read_count, 1); + PERF_COUNTER_ADD(block_read_byte, block_size_ + kBlockTrailerSize); + if (!status_.ok()) { + return status_; + } + + if (slice_.size() != block_size_ + kBlockTrailerSize) { + return Status::Corruption("truncated block read from " + + file_->file_name() + " offset " + + ToString(handle_.offset()) + ", expected " + + ToString(block_size_ + kBlockTrailerSize) + + " bytes, got " + ToString(slice_.size())); + } + + CheckBlockChecksum(); + if (status_.ok()) { + InsertCompressedBlockToPersistentCacheIfNeeded(); + } else { + return status_; + } + } + + PERF_TIMER_GUARD(block_decompress_time); + + compression_type_ = get_block_compression_type(slice_.data(), block_size_); + + if (do_uncompress_ && compression_type_ != kNoCompression) { + // compressed page, uncompress, update cache + UncompressionContext context(compression_type_); + UncompressionInfo info(context, uncompression_dict_, compression_type_); + status_ = UncompressBlockContents(info, slice_.data(), block_size_, + contents_, footer_.version(), ioptions_, + memory_allocator_); + compression_type_ = kNoCompression; + } else { + GetBlockContents(); + } + + InsertUncompressedBlockToPersistentCacheIfNeeded(); + + return status_; +} + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_fetcher.h b/thirdparty/rocksdb/table/block_fetcher.h new file mode 100644 index 0000000000..b5fee94159 --- /dev/null +++ b/thirdparty/rocksdb/table/block_fetcher.h @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "table/block.h" +#include "table/format.h" +#include "util/memory_allocator.h" + +namespace rocksdb { +class BlockFetcher { + public: + // Read the block identified by "handle" from "file". + // The only relevant option is options.verify_checksums for now. + // On failure return non-OK. 
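In `CheckBlockChecksum`/`ReadBlockContents` above, stored CRC32C values are "masked" (as in LevelDB) because computing a CRC over data that itself contains CRCs weakens the check. The mask/unmask pair is small enough to show in full; this is the scheme `crc32c::Unmask` implements:

```cpp
#include <cstdint>

static const uint32_t kMaskDelta = 0xa282ead8u;

// Rotate the CRC right by 15 bits and add a constant.
uint32_t Mask(uint32_t crc) {
  return ((crc >> 15) | (crc << 17)) + kMaskDelta;
}

// Exact inverse of Mask(): subtract the constant, rotate back.
uint32_t Unmask(uint32_t masked) {
  uint32_t rot = masked - kMaskDelta;
  return ((rot >> 17) | (rot << 15));
}
```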
+ // On success fill *result and return OK - caller owns *result + // @param uncompression_dict Data for presetting the compression library's + // dictionary. + BlockFetcher(RandomAccessFileReader* file, + FilePrefetchBuffer* prefetch_buffer, const Footer& footer, + const ReadOptions& read_options, const BlockHandle& handle, + BlockContents* contents, const ImmutableCFOptions& ioptions, + bool do_uncompress, bool maybe_compressed, + const UncompressionDict& uncompression_dict, + const PersistentCacheOptions& cache_options, + MemoryAllocator* memory_allocator = nullptr, + MemoryAllocator* memory_allocator_compressed = nullptr) + : file_(file), + prefetch_buffer_(prefetch_buffer), + footer_(footer), + read_options_(read_options), + handle_(handle), + contents_(contents), + ioptions_(ioptions), + do_uncompress_(do_uncompress), + maybe_compressed_(maybe_compressed), + uncompression_dict_(uncompression_dict), + cache_options_(cache_options), + memory_allocator_(memory_allocator), + memory_allocator_compressed_(memory_allocator_compressed) {} + Status ReadBlockContents(); + CompressionType get_compression_type() const { return compression_type_; } + + private: + static const uint32_t kDefaultStackBufferSize = 5000; + + RandomAccessFileReader* file_; + FilePrefetchBuffer* prefetch_buffer_; + const Footer& footer_; + const ReadOptions read_options_; + const BlockHandle& handle_; + BlockContents* contents_; + const ImmutableCFOptions& ioptions_; + bool do_uncompress_; + bool maybe_compressed_; + const UncompressionDict& uncompression_dict_; + const PersistentCacheOptions& cache_options_; + MemoryAllocator* memory_allocator_; + MemoryAllocator* memory_allocator_compressed_; + Status status_; + Slice slice_; + char* used_buf_ = nullptr; + size_t block_size_; + CacheAllocationPtr heap_buf_; + CacheAllocationPtr compressed_buf_; + char stack_buf_[kDefaultStackBufferSize]; + bool got_from_prefetch_buffer_ = false; + rocksdb::CompressionType compression_type_; + + // return true if found + bool TryGetUncompressBlockFromPersistentCache(); + // return true if found + bool TryGetFromPrefetchBuffer(); + bool TryGetCompressedBlockFromPersistentCache(); + void PrepareBufferForBlockFromFile(); + // Copy content from used_buf_ to new heap buffer. 
+ void CopyBufferToHeap(); + void GetBlockContents(); + void InsertCompressedBlockToPersistentCacheIfNeeded(); + void InsertUncompressedBlockToPersistentCacheIfNeeded(); + void CheckBlockChecksum(); +}; +} // namespace rocksdb diff --git a/thirdparty/rocksdb/table/block_prefix_index.cc b/thirdparty/rocksdb/table/block_prefix_index.cc index df37b5fc2b..67c749d4c3 100644 --- a/thirdparty/rocksdb/table/block_prefix_index.cc +++ b/thirdparty/rocksdb/table/block_prefix_index.cc @@ -41,9 +41,7 @@ inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) { const uint32_t kNoneBlock = 0x7FFFFFFF; const uint32_t kBlockArrayMask = 0x80000000; -inline bool IsNone(uint32_t block_id) { - return block_id == kNoneBlock; -} +inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; } inline bool IsBlockId(uint32_t block_id) { return (block_id & kBlockArrayMask) == 0; @@ -74,10 +72,9 @@ class BlockPrefixIndex::Builder { explicit Builder(const SliceTransform* internal_prefix_extractor) : internal_prefix_extractor_(internal_prefix_extractor) {} - void Add(const Slice& key_prefix, uint32_t start_block, - uint32_t num_blocks) { + void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) { PrefixRecord* record = reinterpret_cast( - arena_.AllocateAligned(sizeof(PrefixRecord))); + arena_.AllocateAligned(sizeof(PrefixRecord))); record->prefix = key_prefix; record->start_block = start_block; record->end_block = start_block + num_blocks - 1; @@ -169,7 +166,6 @@ class BlockPrefixIndex::Builder { Arena arena_; }; - Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, const Slice& prefixes, const Slice& prefix_meta, BlockPrefixIndex** prefix_index) { @@ -191,7 +187,7 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, } if (pos + prefix_size > prefixes.size()) { s = Status::Corruption( - "Corrupted prefix meta block: size inconsistency."); + "Corrupted prefix meta block: size inconsistency."); break; } Slice prefix(prefixes.data() + pos, prefix_size); @@ -211,8 +207,7 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); @@ -226,7 +221,7 @@ uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, } else { uint32_t index = DecodeIndex(block_id); assert(index < num_block_array_buffer_entries_); - *blocks = &block_array_buffer_[index+1]; + *blocks = &block_array_buffer_[index + 1]; uint32_t num_blocks = block_array_buffer_[index]; assert(num_blocks > 1); assert(index + num_blocks < num_block_array_buffer_entries_); diff --git a/thirdparty/rocksdb/table/block_prefix_index.h b/thirdparty/rocksdb/table/block_prefix_index.h index dd4282d17b..105606db20 100644 --- a/thirdparty/rocksdb/table/block_prefix_index.h +++ b/thirdparty/rocksdb/table/block_prefix_index.h @@ -19,7 +19,6 @@ class SliceTransform; // that index block. class BlockPrefixIndex { public: - // Maps a key to a list of data blocks that could potentially contain // the key, based on the prefix. 
// Returns the total number of relevant blocks, 0 means the key does @@ -28,7 +27,7 @@ class BlockPrefixIndex { size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + - (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); + (num_block_array_buffer_entries_ + num_buckets_) * sizeof(uint32_t); } // Create hash index by reading from the metadata blocks. @@ -48,8 +47,7 @@ class BlockPrefixIndex { friend Builder; BlockPrefixIndex(const SliceTransform* internal_prefix_extractor, - uint32_t num_buckets, - uint32_t* buckets, + uint32_t num_buckets, uint32_t* buckets, uint32_t num_block_array_buffer_entries, uint32_t* block_array_buffer) : internal_prefix_extractor_(internal_prefix_extractor), diff --git a/thirdparty/rocksdb/table/block_test.cc b/thirdparty/rocksdb/table/block_test.cc index f5c543975f..3e0ff3eab5 100644 --- a/thirdparty/rocksdb/table/block_test.cc +++ b/thirdparty/rocksdb/table/block_test.cc @@ -12,13 +12,13 @@ #include #include "db/dbformat.h" -#include "db/write_batch_internal.h" #include "db/memtable.h" +#include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" -#include "rocksdb/table.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" #include "table/block.h" #include "table/block_builder.h" #include "table/format.h" @@ -28,7 +28,7 @@ namespace rocksdb { -static std::string RandomString(Random* rnd, int len) { +static std::string RandomString(Random *rnd, int len) { std::string r; test::RandomString(rnd, len, &r); return r; @@ -68,6 +68,29 @@ void GenerateRandomKVs(std::vector *keys, } } +// Same as GenerateRandomKVs but the values are BlockHandle +void GenerateRandomKBHs(std::vector *keys, + std::vector *values, const int from, + const int len, const int step = 1, + const int padding_size = 0, + const int keys_share_prefix = 1) { + Random rnd(302); + uint64_t offset = 0; + + // generate different prefix + for (int i = from; i < from + len; i += step) { + // generate keys that shares the prefix + for (int j = 0; j < keys_share_prefix; ++j) { + keys->emplace_back(GenerateKey(i, j, padding_size, &rnd)); + + uint64_t size = rnd.Uniform(1024 * 16); + BlockHandle handle(offset, size); + offset += size + kBlockTrailerSize; + values->emplace_back(handle); + } + } +} + class BlockTest : public testing::Test {}; // block test @@ -94,14 +117,13 @@ TEST_F(BlockTest, SimpleTest) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = false; Block reader(std::move(contents), kDisableGlobalSequenceNumber); // read contents of block sequentially int count = 0; - InternalIterator *iter = reader.NewIterator(options.comparator); - for (iter->SeekToFirst();iter->Valid(); count++, iter->Next()) { - + InternalIterator *iter = + reader.NewIterator(options.comparator, options.comparator); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); Slice v = iter->value(); @@ -113,9 +135,9 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = reader.NewIterator(options.comparator); + iter = + reader.NewIterator(options.comparator, options.comparator); for (int i = 0; i < num_records; i++) { - // find a random key in the lookaside array int index = rnd.Uniform(num_records); Slice k(keys[index]); @@ -129,11 +151,88 @@ TEST_F(BlockTest, SimpleTest) { delete iter; } +TEST_F(BlockTest, ValueDeltaEncodingTest) { + Random rnd(301); + Options options = Options(); + 
std::unique_ptr ic; + ic.reset(new test::PlainInternalKeyComparator(options.comparator)); + + std::vector keys; + std::vector values; + const bool kUseDeltaEncoding = true; + const bool kUseValueDeltaEncoding = true; + BlockBuilder builder(16, kUseDeltaEncoding, kUseValueDeltaEncoding); + int num_records = 100; + + GenerateRandomKBHs(&keys, &values, 0, num_records); + // add a bunch of records to a block + BlockHandle last_encoded_handle; + for (int i = 0; i < num_records; i++) { + auto block_handle = values[i]; + std::string handle_encoding; + block_handle.EncodeTo(&handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle.size()); + last_encoded_handle = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + builder.Add(keys[i], handle_encoding, &handle_delta_encoding_slice); + } + + // read serialized contents of the block + Slice rawblock = builder.Finish(); + + // create block reader + BlockContents contents; + contents.data = rawblock; + Block reader(std::move(contents), kDisableGlobalSequenceNumber); + + const bool kTotalOrderSeek = true; + const bool kIncludesSeq = true; + const bool kValueIsFull = !kUseValueDeltaEncoding; + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + int count = 0; + InternalIteratorBase *iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { + // read kv from block + Slice k = iter->key(); + BlockHandle handle = iter->value(); + + // compare with lookaside array + ASSERT_EQ(k.ToString().compare(keys[count]), 0); + + ASSERT_EQ(values[count].offset(), handle.offset()); + ASSERT_EQ(values[count].size(), handle.size()); + } + delete iter; + + // read block contents randomly + iter = reader.NewIterator( + options.comparator, options.comparator, kNullIter, kNullStats, + kTotalOrderSeek, kIncludesSeq, kValueIsFull); + for (int i = 0; i < num_records; i++) { + // find a random key in the lookaside array + int index = rnd.Uniform(num_records); + Slice k(keys[index]); + + // search in block for this key + iter->Seek(k); + ASSERT_TRUE(iter->Valid()); + BlockHandle handle = iter->value(); + ASSERT_EQ(values[index].offset(), handle.offset()); + ASSERT_EQ(values[index].size(), handle.size()); + } + delete iter; +} // return the block contents BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, const std::vector &values, - const int prefix_group_size = 1) { + const int /*prefix_group_size*/ = 1) { builder->reset(new BlockBuilder(1 /* restart interval */)); // Add only half of the keys @@ -144,7 +243,6 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; - contents.cachable = false; return contents; } @@ -154,8 +252,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &values) { const size_t prefix_size = 6; // create block reader - BlockContents contents_ref(contents.data, contents.cachable, - contents.compression_type); + BlockContents contents_ref(contents.data); Block reader1(std::move(contents), kDisableGlobalSequenceNumber); Block reader2(std::move(contents_ref), kDisableGlobalSequenceNumber); @@ -163,7 +260,8 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); 
std::unique_ptr regular_iter( - reader2.NewIterator(BytewiseComparator())); + reader2.NewIterator(BytewiseComparator(), + BytewiseComparator())); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -229,40 +327,67 @@ class BlockReadAmpBitmapSlowAndAccurate { marked_ranges_.emplace(end_offset, start_offset); } + void ResetCheckSequence() { iter_valid_ = false; } + // Return true if any byte in this range was Marked + // This does linear search from the previous position. When calling + // multiple times, `offset` needs to be incremental to get correct results. + // Call ResetCheckSequence() to reset it. bool IsPinMarked(size_t offset) { - auto it = marked_ranges_.lower_bound( + if (iter_valid_) { + // Has existing iterator, try linear search from + // the iterator. + for (int i = 0; i < 64; i++) { + if (offset < iter_->second) { + return false; + } + if (offset <= iter_->first) { + return true; + } + + iter_++; + if (iter_ == marked_ranges_.end()) { + iter_valid_ = false; + return false; + } + } + } + // Initial call or have linear searched too many times. + // Do binary search. + iter_ = marked_ranges_.lower_bound( std::make_pair(offset, static_cast(0))); - if (it == marked_ranges_.end()) { + if (iter_ == marked_ranges_.end()) { + iter_valid_ = false; return false; } - return offset <= it->first && offset >= it->second; + iter_valid_ = true; + return offset <= iter_->first && offset >= iter_->second; } private: std::set> marked_ranges_; + std::set>::iterator iter_; + bool iter_valid_ = false; }; TEST_F(BlockTest, BlockReadAmpBitmap) { uint32_t pin_offset = 0; SyncPoint::GetInstance()->SetCallBack( - "BlockReadAmpBitmap:rnd", [&pin_offset](void* arg) { - pin_offset = *(static_cast(arg)); - }); + "BlockReadAmpBitmap:rnd", [&pin_offset](void *arg) { + pin_offset = *(static_cast(arg)); + }); SyncPoint::GetInstance()->EnableProcessing(); std::vector block_sizes = { - 1, // 1 byte - 32, // 32 bytes - 61, // 61 bytes - 64, // 64 bytes - 512, // 0.5 KB - 1024, // 1 KB - 1024 * 4, // 4 KB - 1024 * 10, // 10 KB - 1024 * 50, // 50 KB - 1024 * 1024, // 1 MB - 1024 * 1024 * 4, // 4 MB - 1024 * 1024 * 50, // 10 MB + 1, // 1 byte + 32, // 32 bytes + 61, // 61 bytes + 64, // 64 bytes + 512, // 0.5 KB + 1024, // 1 KB + 1024 * 4, // 4 KB + 1024 * 10, // 10 KB + 1024 * 50, // 50 KB + 1024 * 1024 * 4, // 5 MB 777, 124653, }; @@ -278,10 +403,6 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { if (block_size % kBytesPerBit != 0) { needed_bits++; } - size_t bitmap_size = needed_bits / 32; - if (needed_bits % 32 != 0) { - bitmap_size++; - } ASSERT_EQ(stats->getTickerCount(READ_AMP_TOTAL_READ_BYTES), block_size); @@ -309,6 +430,7 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { } for (size_t i = 0; i < random_entries.size(); i++) { + read_amp_slow_and_accurate.ResetCheckSequence(); auto ¤t_entry = random_entries[rnd.Next() % random_entries.size()]; read_amp_bitmap.Mark(static_cast(current_entry.first), @@ -319,11 +441,11 @@ TEST_F(BlockTest, BlockReadAmpBitmap) { size_t total_bits = 0; for (size_t bit_idx = 0; bit_idx < needed_bits; bit_idx++) { total_bits += read_amp_slow_and_accurate.IsPinMarked( - bit_idx * kBytesPerBit + pin_offset); + bit_idx * kBytesPerBit + pin_offset); } size_t expected_estimate_useful = total_bits * kBytesPerBit; size_t got_estimate_useful = - stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES); + stats->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES); ASSERT_EQ(expected_estimate_useful, got_estimate_useful); } } @@ -358,14 +480,14 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // 
create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); // read contents of block sequentially size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + DataBlockIter *iter = + static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, stats.get())); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { iter->value(); read_bytes += iter->TEST_CurrentEntrySize(); @@ -392,13 +514,13 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + DataBlockIter *iter = + static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, stats.get())); for (int i = 0; i < num_records; i++) { Slice k(keys[i]); @@ -428,13 +550,13 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { // create block reader BlockContents contents; contents.data = rawblock; - contents.cachable = true; Block reader(std::move(contents), kDisableGlobalSequenceNumber, kBytesPerBit, stats.get()); size_t read_bytes = 0; - BlockIter *iter = static_cast( - reader.NewIterator(options.comparator, nullptr, true, stats.get())); + DataBlockIter *iter = + static_cast(reader.NewIterator( + options.comparator, options.comparator, nullptr, stats.get())); std::unordered_set read_keys; for (int i = 0; i < num_records; i++) { int index = rnd.Uniform(num_records); diff --git a/thirdparty/rocksdb/table/bloom_block.h b/thirdparty/rocksdb/table/bloom_block.h index 9ff610badd..483fa25d93 100644 --- a/thirdparty/rocksdb/table/bloom_block.h +++ b/thirdparty/rocksdb/table/bloom_block.h @@ -15,8 +15,7 @@ class BloomBlockBuilder { public: static const std::string kBloomBlock; - explicit BloomBlockBuilder(uint32_t num_probes = 6) - : bloom_(num_probes, nullptr) {} + explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes) {} void SetTotalBits(Allocator* allocator, uint32_t total_bits, uint32_t locality, size_t huge_page_tlb_size, diff --git a/thirdparty/rocksdb/table/cuckoo_table_builder.cc b/thirdparty/rocksdb/table/cuckoo_table_builder.cc index e3ed314b36..f590e6ad40 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_builder.cc +++ b/thirdparty/rocksdb/table/cuckoo_table_builder.cc @@ -164,9 +164,9 @@ bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { Slice CuckooTableBuilder::GetKey(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + return Slice(&deleted_keys_[static_cast((idx - num_values_) * key_size_)], static_cast(key_size_)); } - return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_))], static_cast(key_size_)); } Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { @@ -177,17 +177,17 @@ Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { Slice CuckooTableBuilder::GetValue(uint64_t idx) const { assert(closed_); if (IsDeletedKey(idx)) { - static std::string empty_value(value_size_, 'a'); + static std::string empty_value(static_cast(value_size_), 'a'); return Slice(empty_value); } - return Slice(&kvs_[idx 
* (key_size_ + value_size_) + key_size_], value_size_); + return Slice(&kvs_[static_cast(idx * (key_size_ + value_size_) + key_size_)], static_cast(value_size_)); } Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); + buckets->resize(static_cast(hash_table_size_ + cuckoo_block_size_ - 1)); uint32_t make_space_for_key_call_id = 0; for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { - uint64_t bucket_id; + uint64_t bucket_id = 0; bool bucket_found = false; autovector hash_vals; Slice user_key = GetUserKey(vector_idx); @@ -200,13 +200,13 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { // stop searching and proceed for next hash function. for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_id = hash_val; bucket_found = true; break; } else { if (ucomp_->Compare(user_key, - GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { + GetUserKey((*buckets)[static_cast(hash_val)].vector_idx)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -226,7 +226,7 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { - if ((*buckets)[hash_val].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(hash_val)].vector_idx == kMaxVectorIdx) { bucket_found = true; bucket_id = hash_val; break; @@ -235,7 +235,7 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { } } } - (*buckets)[bucket_id].vector_idx = vector_idx; + (*buckets)[static_cast(bucket_id)].vector_idx = vector_idx; } return Status::OK(); } @@ -289,13 +289,14 @@ Status CuckooTableBuilder::Finish() { } } properties_.num_entries = num_entries_; + properties_.num_deletions = num_entries_ - num_values_; properties_.fixed_key_len = key_size_; properties_.user_collected_properties[ CuckooTablePropertyNames::kValueLength].assign( reinterpret_cast(&value_size_), sizeof(value_size_)); uint64_t bucket_size = key_size_ + value_size_; - unused_bucket.resize(bucket_size, 'a'); + unused_bucket.resize(static_cast(bucket_size), 'a'); // Write the table. uint32_t num_added = 0; for (auto& bucket : buckets) { @@ -320,7 +321,7 @@ Status CuckooTableBuilder::Finish() { uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; - unused_bucket.resize(properties_.fixed_key_len); + unused_bucket.resize(static_cast(properties_.fixed_key_len)); properties_.user_collected_properties[ CuckooTablePropertyNames::kEmptyKey] = unused_bucket; properties_.user_collected_properties[ @@ -456,7 +457,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( // no. of times this will be called is <= max_num_hash_func_ + num_entries_. 
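/* MakeSpaceForKey below searches for a displacement path: it grows a tree of
   candidate buckets breadth-first, and once an empty bucket turns up it walks
   the parent chain backwards, shifting each occupant down one level so the
   root bucket becomes free for the new key. A condensed sketch of that idea
   on a toy in-memory table (hypothetical helper names; the real code also
   stamps visited buckets with make_space_for_key_call_id, iterates inside
   cuckoo blocks, and bounds the depth with max_search_depth_): */
#include <cstddef>
#include <functional>
#include <vector>

static bool MakeSpaceSketch(
    std::vector<int>* buckets,  // occupant key id per bucket, -1 == empty
    const std::vector<std::function<size_t(int /*key*/, size_t /*n*/)>>& hash_fns,
    const std::vector<size_t>& candidate_buckets,  // new key's buckets; all full
    size_t* freed_bucket) {
  struct Node {
    size_t bucket;
    int parent;  // index into `tree`, -1 for roots
  };
  const size_t kMaxNodes = 8192;  // stand-in for the real depth bound
  std::vector<Node> tree;
  for (size_t b : candidate_buckets) {
    tree.push_back({b, -1});
  }
  int empty_pos = -1;
  // Breadth-first search: expand each occupant's alternative buckets until
  // an empty bucket is found or the frontier is exhausted.
  for (size_t head = 0;
       head < tree.size() && empty_pos < 0 && tree.size() < kMaxNodes; ++head) {
    int occupant = (*buckets)[tree[head].bucket];
    for (const auto& h : hash_fns) {
      size_t child = h(occupant, buckets->size());
      tree.push_back({child, static_cast<int>(head)});
      if ((*buckets)[child] == -1) {
        empty_pos = static_cast<int>(tree.size()) - 1;
        break;
      }
    }
  }
  if (empty_pos < 0) {
    return false;  // no displacement path within the searched frontier
  }
  // Walk the parent chain backwards, shifting each occupant into the empty
  // slot below it; the root bucket ends up free for the new key.
  int pos = empty_pos;
  while (tree[pos].parent >= 0) {
    int parent = tree[pos].parent;
    (*buckets)[tree[pos].bucket] = (*buckets)[tree[parent].bucket];
    pos = parent;
  }
  *freed_bucket = tree[pos].bucket;
  (*buckets)[*freed_bucket] = -1;
  return true;
}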
for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t bid = hash_vals[hash_cnt]; - (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id; + (*buckets)[static_cast(bid)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(bid, 0, 0)); } bool null_found = false; @@ -467,7 +468,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( if (curr_depth >= max_search_depth_) { break; } - CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; + CuckooBucket& curr_bucket = (*buckets)[static_cast(curr_node.bucket_id)]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), @@ -476,15 +477,15 @@ bool CuckooTableBuilder::MakeSpaceForKey( // Iterate inside Cuckoo Block. for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { - if ((*buckets)[child_bucket_id].make_space_for_key_call_id == + if ((*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id == make_space_for_key_call_id) { continue; } - (*buckets)[child_bucket_id].make_space_for_key_call_id = + (*buckets)[static_cast(child_bucket_id)].make_space_for_key_call_id = make_space_for_key_call_id; tree.push_back(CuckooNode(child_bucket_id, curr_depth + 1, curr_pos)); - if ((*buckets)[child_bucket_id].vector_idx == kMaxVectorIdx) { + if ((*buckets)[static_cast(child_bucket_id)].vector_idx == kMaxVectorIdx) { null_found = true; break; } @@ -502,8 +503,8 @@ bool CuckooTableBuilder::MakeSpaceForKey( uint32_t bucket_to_replace_pos = static_cast(tree.size()) - 1; while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; - (*buckets)[curr_node.bucket_id] = - (*buckets)[tree[curr_node.parent_pos].bucket_id]; + (*buckets)[static_cast(curr_node.bucket_id)] = + (*buckets)[static_cast(tree[curr_node.parent_pos].bucket_id)]; bucket_to_replace_pos = curr_node.parent_pos; } *bucket_id = tree[bucket_to_replace_pos].bucket_id; diff --git a/thirdparty/rocksdb/table/cuckoo_table_builder_test.cc b/thirdparty/rocksdb/table/cuckoo_table_builder_test.cc index 93daaca472..c1e350327f 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_builder_test.cc +++ b/thirdparty/rocksdb/table/cuckoo_table_builder_test.cc @@ -23,7 +23,7 @@ namespace { std::unordered_map> hash_map; uint64_t GetSliceHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets) { + uint64_t /*max_num_buckets*/) { return hash_map[s.ToString()][index]; } } // namespace @@ -43,23 +43,31 @@ class CuckooBuilderTest : public testing::Test { std::string expected_unused_bucket, uint64_t expected_table_size, uint32_t expected_num_hash_func, bool expected_is_last_level, uint32_t expected_cuckoo_block_size = 1) { + uint64_t num_deletions = 0; + for (const auto& key : keys) { + ParsedInternalKey parsed; + if (ParseInternalKey(key, &parsed) && parsed.type == kTypeDeletion) { + num_deletions++; + } + } // Read file - unique_ptr read_file; + std::unique_ptr read_file; ASSERT_OK(env_->NewRandomAccessFile(fname, &read_file, env_options_)); uint64_t read_file_size; ASSERT_OK(env_->GetFileSize(fname, &read_file_size)); + // @lint-ignore TXT2 T25377293 Grandfathered in Options options; options.allow_mmap_reads = true; ImmutableCFOptions ioptions(options); // Assert Table Properties. 
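/* The num_deletions counting above relies on the internal-key layout: a
   RocksDB internal key is the user key followed by an 8-byte little-endian
   trailer that packs (sequence << 8) | type, where kTypeDeletion is 0x0 and
   kTypeValue is 0x1. A small sketch of that packing (helper names here are
   illustrative, not the real coding utilities): */
#include <cstdint>
#include <string>

static std::string PackInternalKeySketch(const std::string& user_key,
                                         uint64_t seq, uint8_t type) {
  uint64_t packed = (seq << 8) | type;  // sequence fits in 56 bits
  std::string ikey = user_key;
  for (int i = 0; i < 8; i++) {  // fixed64, little-endian, lowest byte first
    ikey.push_back(static_cast<char>((packed >> (8 * i)) & 0xff));
  }
  return ikey;
}

static uint8_t TypeOfSketch(const std::string& ikey) {
  // The type byte is the lowest byte of the trailing fixed64, i.e. the first
  // byte of the 8-byte trailer.
  return static_cast<uint8_t>(ikey[ikey.size() - 8]);
}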
TableProperties* props = nullptr; - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, kCuckooTableMagicNumber, ioptions, - &props)); + &props, true /* compression_type_missing */)); // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -89,6 +97,7 @@ class CuckooBuilderTest : public testing::Test { ASSERT_EQ(expected_is_last_level, is_last_level_found); ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->num_deletions, num_deletions); ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); ASSERT_EQ(props->data_size, expected_unused_bucket.size() * (expected_table_size + expected_cuckoo_block_size - 1)); @@ -125,9 +134,10 @@ class CuckooBuilderTest : public testing::Test { } } - std::string GetInternalKey(Slice user_key, bool zero_seqno) { + std::string GetInternalKey(Slice user_key, bool zero_seqno, + ValueType type = kTypeValue) { IterKey ikey; - ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, kTypeValue); + ikey.SetInternalKey(user_key, zero_seqno ? 0 : 1000, type); return ikey.GetInternalKey().ToString(); } @@ -151,11 +161,11 @@ class CuckooBuilderTest : public testing::Test { }; TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { - unique_ptr writable_file; - fname = test::TmpDir() + "/EmptyFile"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("EmptyFile"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -168,50 +178,57 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { } TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { - uint32_t num_hash_fun = 4; - std::vector user_keys = {"key01", "key02", "key03", "key04"}; - std::vector values = {"v01", "v02", "v03", "v04"}; - // Need to have a temporary variable here as VS compiler does not currently - // support operator= with initializer_list as a parameter - std::unordered_map> hm = { - {user_keys[0], {0, 1, 2, 3}}, - {user_keys[1], {1, 2, 3, 4}}, - {user_keys[2], {2, 3, 4, 5}}, - {user_keys[3], {3, 4, 5, 6}}}; - hash_map = std::move(hm); - - std::vector expected_locations = {0, 1, 2, 3}; - std::vector keys; - for (auto& user_key : user_keys) { - keys.push_back(GetInternalKey(user_key, false)); - } - uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - - unique_ptr writable_file; - fname = test::TmpDir() + "/NoCollisionFullKey"; - ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); - CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, - 100, BytewiseComparator(), 1, false, false, - GetSliceHash, 0 /* column_family_id */, - kDefaultColumnFamilyName); - ASSERT_OK(builder.status()); - for (uint32_t i = 0; i < user_keys.size(); i++) { - builder.Add(Slice(keys[i]), Slice(values[i])); - ASSERT_EQ(builder.NumEntries(), i + 1); + for (auto type : {kTypeValue, kTypeDeletion}) { + uint32_t num_hash_fun = 4; + std::vector user_keys = {"key01", "key02", "key03", "key04"}; + std::vector 
values; + if (type == kTypeValue) { + values = {"v01", "v02", "v03", "v04"}; + } else { + values = {"", "", "", ""}; + } + // Need to have a temporary variable here as VS compiler does not currently + // support operator= with initializer_list as a parameter + std::unordered_map> hm = { + {user_keys[0], {0, 1, 2, 3}}, + {user_keys[1], {1, 2, 3, 4}}, + {user_keys[2], {2, 3, 4, 5}}, + {user_keys[3], {3, 4, 5, 6}}}; + hash_map = std::move(hm); + + std::vector expected_locations = {0, 1, 2, 3}; + std::vector keys; + for (auto& user_key : user_keys) { + keys.push_back(GetInternalKey(user_key, false, type)); + } + uint64_t expected_table_size = GetExpectedTableSize(keys.size()); + + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("NoCollisionFullKey"); + ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash, 0 /* column_family_id */, + kDefaultColumnFamilyName); ASSERT_OK(builder.status()); + for (uint32_t i = 0; i < user_keys.size(); i++) { + builder.Add(Slice(keys[i]), Slice(values[i])); + ASSERT_EQ(builder.NumEntries(), i + 1); + ASSERT_OK(builder.status()); + } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); + ASSERT_OK(builder.Finish()); + ASSERT_OK(file_writer->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); + + std::string expected_unused_bucket = GetInternalKey("key00", true); + expected_unused_bucket += std::string(values[0].size(), 'a'); + CheckFileContents(keys, values, expected_locations, expected_unused_bucket, + expected_table_size, 2, false); } - size_t bucket_size = keys[0].size() + values[0].size(); - ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); - ASSERT_OK(builder.Finish()); - ASSERT_OK(file_writer->Close()); - ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - - std::string expected_unused_bucket = GetInternalKey("key00", true); - expected_unused_bucket += std::string(values[0].size(), 'a'); - CheckFileContents(keys, values, expected_locations, - expected_unused_bucket, expected_table_size, 2, false); } TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { @@ -235,11 +252,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionFullKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -283,12 +300,12 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; + std::unique_ptr writable_file; uint32_t cuckoo_block_size = 2; - fname = test::TmpDir() + "/WithCollisionFullKey2"; + fname = 
test::PerThreadDBPath("WithCollisionFullKey2"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder( file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, false, false, GetSliceHash, @@ -337,11 +354,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionPathFullKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionPathFullKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -387,11 +404,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { } uint64_t expected_table_size = GetExpectedTableSize(keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionPathFullKeyAndCuckooBlock"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash, 0 /* column_family_id */, @@ -430,11 +447,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/NoCollisionUserKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("NoCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -474,11 +491,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { std::vector expected_locations = {0, 1, 2, 3}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionUserKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, 
BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -520,11 +537,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { std::vector expected_locations = {0, 1, 3, 4, 2}; uint64_t expected_table_size = GetExpectedTableSize(user_keys.size()); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionPathUserKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -565,11 +582,11 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { }; hash_map = std::move(hm); - unique_ptr writable_file; - fname = test::TmpDir() + "/WithCollisionPathUserKey"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("WithCollisionPathUserKey"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -593,11 +610,11 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { uint32_t num_hash_fun = 4; std::string user_key = "repeatedkey"; - unique_ptr writable_file; - fname = test::TmpDir() + "/FailWhenSameKeyInserted"; + std::unique_ptr writable_file; + fname = test::PerThreadDBPath("FailWhenSameKeyInserted"); ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), EnvOptions())); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, EnvOptions())); CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash, 0 /* column_family_id */, @@ -624,7 +641,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/table/cuckoo_table_factory.cc b/thirdparty/rocksdb/table/cuckoo_table_factory.cc index 2325bcf77c..74d18d5121 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_factory.cc +++ b/thirdparty/rocksdb/table/cuckoo_table_factory.cc @@ -14,9 +14,9 @@ namespace rocksdb { Status CuckooTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, + std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const { + bool /*prefetch_index_and_filter_in_cache*/) const { std::unique_ptr new_reader(new CuckooTableReader( table_reader_options.ioptions, std::move(file), file_size, table_reader_options.internal_comparator.user_comparator(), nullptr)); diff --git a/thirdparty/rocksdb/table/cuckoo_table_factory.h b/thirdparty/rocksdb/table/cuckoo_table_factory.h index db860c3d00..eb3c5e5176 100644 --- 
a/thirdparty/rocksdb/table/cuckoo_table_factory.h +++ b/thirdparty/rocksdb/table/cuckoo_table_factory.h @@ -24,6 +24,8 @@ static inline uint64_t CuckooHash( if (get_slice_hash != nullptr) { return get_slice_hash(user_key, hash_cnt, table_size_); } +#else + (void)get_slice_hash; #endif uint64_t value = 0; @@ -58,8 +60,8 @@ class CuckooTableFactory : public TableFactory { Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( @@ -67,8 +69,9 @@ class CuckooTableFactory : public TableFactory { uint32_t column_family_id, WritableFileWriter* file) const override; // Sanitizes the specified DB Options. - Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } @@ -76,8 +79,8 @@ class CuckooTableFactory : public TableFactory { void* GetOptions() override { return &table_options_; } - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override { + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { return Status::OK(); } diff --git a/thirdparty/rocksdb/table/cuckoo_table_reader.cc b/thirdparty/rocksdb/table/cuckoo_table_reader.cc index 9cecebaebb..f4df2467fd 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_reader.cc +++ b/thirdparty/rocksdb/table/cuckoo_table_reader.cc @@ -38,6 +38,18 @@ CuckooTableReader::CuckooTableReader( const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : file_(std::move(file)), + is_last_level_(false), + identity_as_first_hash_(false), + use_module_hash_(false), + num_hash_func_(0), + unused_key_(""), + key_length_(0), + user_key_length_(0), + value_length_(0), + bucket_length_(0), + cuckoo_block_size_(0), + cuckoo_block_bytes_minus_one_(0), + table_size_(0), ucomp_(comparator), get_slice_hash_(get_slice_hash) { if (!ioptions.allow_mmap_reads) { @@ -45,7 +57,7 @@ CuckooTableReader::CuckooTableReader( } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - ioptions, &props); + ioptions, &props, true /* compression_type_missing */); if (!status_.ok()) { return; } @@ -124,11 +136,13 @@ CuckooTableReader::CuckooTableReader( cuckoo_block_size_ = *reinterpret_cast( cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; - status_ = file_->Read(0, file_size, &file_data_, nullptr); + status_ = file_->Read(0, static_cast(file_size), &file_data_, nullptr); } -Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, bool skip_filters) { +Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, + const Slice& key, GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { assert(key.size() == key_length_ + (is_last_level_ ? 
8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { @@ -157,7 +171,8 @@ Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; ParseInternalKey(full_key, &found_ikey); - get_context->SaveValue(found_ikey, value); + bool dont_care __attribute__((__unused__)); + get_context->SaveValue(found_ikey, value, &dont_care); } // We don't support merge operations. So, we return here. return Status::OK(); @@ -182,7 +197,7 @@ void CuckooTableReader::Prepare(const Slice& key) { class CuckooTableIterator : public InternalIterator { public: explicit CuckooTableIterator(CuckooTableReader* reader); - ~CuckooTableIterator() {} + ~CuckooTableIterator() override {} bool Valid() const override; void SeekToFirst() override; void SeekToLast() override; @@ -192,7 +207,7 @@ class CuckooTableIterator : public InternalIterator { void Prev() override; Slice key() const override; Slice value() const override; - Status status() const override { return status_; } + Status status() const override { return Status::OK(); } void InitIfNeeded(); private: @@ -227,7 +242,6 @@ class CuckooTableIterator : public InternalIterator { void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; bool initialized_; - Status status_; // Contains a map of keys to bucket_id sorted in key order. std::vector sorted_bucket_ids_; // We assume that the number of items can be stored in uint32 (4 Billion). @@ -254,7 +268,7 @@ void CuckooTableIterator::InitIfNeeded() { if (initialized_) { return; } - sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); + sorted_bucket_ids_.reserve(static_cast(reader_->GetTableProperties()->num_entries)); uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; assert(num_buckets < kInvalidIndex); const char* bucket = reader_->file_data_.data(); @@ -299,7 +313,7 @@ void CuckooTableIterator::Seek(const Slice& target) { PrepareKVAtCurrIdx(); } -void CuckooTableIterator::SeekForPrev(const Slice& target) { +void CuckooTableIterator::SeekForPrev(const Slice& /*target*/) { // Not supported assert(false); } @@ -360,13 +374,12 @@ Slice CuckooTableIterator::value() const { return curr_value_; } -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); - InternalIterator* CuckooTableReader::NewIterator( - const ReadOptions& read_options, Arena* arena, bool skip_filters) { + const ReadOptions& /*read_options*/, + const SliceTransform* /* prefix_extractor */, Arena* arena, + bool /*skip_filters*/, bool /*for_compaction*/) { if (!status().ok()) { - return NewErrorInternalIterator( + return NewErrorInternalIterator( Status::Corruption("CuckooTableReader status is not okay."), arena); } CuckooTableIterator* iter; diff --git a/thirdparty/rocksdb/table/cuckoo_table_reader.h b/thirdparty/rocksdb/table/cuckoo_table_reader.h index 4beac8f9d0..b37d46373e 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_reader.h +++ b/thirdparty/rocksdb/table/cuckoo_table_reader.h @@ -25,7 +25,6 @@ namespace rocksdb { class Arena; class TableReader; -class InternalIterator; class CuckooTableReader: public TableReader { public: @@ -42,19 +41,22 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } - Status Get(const ReadOptions& read_options, const Slice& key, - GetContext* get_context, bool skip_filters = false) override; + Status Get(const ReadOptions& readOptions, const Slice& key, + 
GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; - InternalIterator* NewIterator( - const ReadOptions&, Arena* arena = nullptr, - bool skip_filters = false) override; + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; void Prepare(const Slice& target) override; // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } void SetupForCompaction() override {} // End of methods not implemented. diff --git a/thirdparty/rocksdb/table/cuckoo_table_reader_test.cc b/thirdparty/rocksdb/table/cuckoo_table_reader_test.cc index 7e131e56e3..74fb52e6c7 100644 --- a/thirdparty/rocksdb/table/cuckoo_table_reader_test.cc +++ b/thirdparty/rocksdb/table/cuckoo_table_reader_test.cc @@ -18,24 +18,24 @@ int main() { #endif #include -#include #include #include #include -#include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" -#include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_factory.h" +#include "table/cuckoo_table_reader.h" #include "table/get_context.h" +#include "table/meta_blocks.h" #include "util/arena.h" +#include "util/gflags_compat.h" #include "util/random.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" -using GFLAGS::ParseCommandLineFlags; -using GFLAGS::SetUsageMessage; +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; DEFINE_string(file_dir, "", "Directory where the files will be created" " for benchmark. Added for using tmpfs."); @@ -61,7 +61,7 @@ void AddHashLookups(const std::string& s, uint64_t bucket_id, } uint64_t GetSliceHash(const Slice& s, uint32_t index, - uint64_t max_num_buckets) { + uint64_t /*max_num_buckets*/) { return hash_map[s.ToString()][index]; } } // namespace @@ -95,8 +95,8 @@ class CuckooReaderTest : public testing::Test { const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, 2, false, false, @@ -115,7 +115,7 @@ class CuckooReaderTest : public testing::Test { // Check reader now. 
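/* The reader-side counterpart of the builder's hashing: CuckooTableReader::Get
   probes at most num_hash_func_ * cuckoo_block_size_ fixed-size buckets and
   compares user keys in place, so a miss is definitive without any filter. A
   compact sketch of that probe loop over an in-memory table (hypothetical
   types; the real reader works over the mmapped file buffer and also handles
   the last-level key format): */
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

struct BucketSketch {
  std::string key;
  std::string value;
};

static const BucketSketch* CuckooLookupSketch(
    const std::vector<BucketSketch>& table,
    const std::vector<std::function<uint64_t(const std::string&, uint64_t)>>&
        hash_fns,
    uint32_t cuckoo_block_size, const std::string& user_key) {
  for (const auto& h : hash_fns) {
    uint64_t bucket = h(user_key, table.size());
    // A key displaced at build time may sit anywhere inside its cuckoo block.
    for (uint32_t i = 0;
         i < cuckoo_block_size && bucket + i < table.size(); i++) {
      const BucketSketch& b = table[bucket + i];
      if (b.key == user_key) {
        return &b;
      }
    }
  }
  return nullptr;  // every candidate bucket was checked: definitely absent
}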
std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, @@ -127,7 +127,8 @@ class CuckooReaderTest : public testing::Test { GetContext get_context(ucomp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(user_keys[i]), &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context)); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(keys[i]), &get_context, nullptr)); ASSERT_STREQ(values[i].c_str(), value.data()); } } @@ -143,13 +144,14 @@ class CuckooReaderTest : public testing::Test { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); - InternalIterator* it = reader.NewIterator(ReadOptions(), nullptr); + InternalIterator* it = + reader.NewIterator(ReadOptions(), nullptr, nullptr, false); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->SeekToFirst(); @@ -188,7 +190,7 @@ class CuckooReaderTest : public testing::Test { delete it; Arena arena; - it = reader.NewIterator(ReadOptions(), &arena); + it = reader.NewIterator(ReadOptions(), nullptr, &arena); ASSERT_OK(it->status()); ASSERT_TRUE(!it->Valid()); it->Seek(keys[num_items/2]); @@ -213,7 +215,7 @@ class CuckooReaderTest : public testing::Test { TEST_F(CuckooReaderTest, WhenKeyExists) { SetUp(kNumHashFunc); - fname = test::TmpDir() + "/CuckooReader_WhenKeyExists"; + fname = test::PerThreadDBPath("CuckooReader_WhenKeyExists"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i] = "key" + NumToStr(i); ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); @@ -240,7 +242,7 @@ TEST_F(CuckooReaderTest, WhenKeyExists) { TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { SetUp(kNumHashFunc); - fname = test::TmpDir() + "/CuckooReaderUint64_WhenKeyExists"; + fname = test::PerThreadDBPath("CuckooReaderUint64_WhenKeyExists"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i].resize(8); memcpy(&user_keys[i][0], static_cast(&i), 8); @@ -268,7 +270,7 @@ TEST_F(CuckooReaderTest, WhenKeyExistsWithUint64Comparator) { TEST_F(CuckooReaderTest, CheckIterator) { SetUp(2*kNumHashFunc); - fname = test::TmpDir() + "/CuckooReader_CheckIterator"; + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i] = "key" + NumToStr(i); ParsedInternalKey ikey(user_keys[i], 1000, kTypeValue); @@ -287,7 +289,7 @@ TEST_F(CuckooReaderTest, CheckIterator) { TEST_F(CuckooReaderTest, CheckIteratorUint64) { SetUp(2*kNumHashFunc); - fname = test::TmpDir() + "/CuckooReader_CheckIterator"; + fname = test::PerThreadDBPath("CuckooReader_CheckIterator"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i].resize(8); memcpy(&user_keys[i][0], static_cast(&i), 8); @@ -308,7 +310,7 @@ TEST_F(CuckooReaderTest, CheckIteratorUint64) { TEST_F(CuckooReaderTest, WhenKeyNotFound) { // Add keys with colliding hash values. 
SetUp(kNumHashFunc); - fname = test::TmpDir() + "/CuckooReader_WhenKeyNotFound"; + fname = test::PerThreadDBPath("CuckooReader_WhenKeyNotFound"); for (uint64_t i = 0; i < num_items; i++) { user_keys[i] = "key" + NumToStr(i); ParsedInternalKey ikey(user_keys[i], i + 1000, kTypeValue); @@ -321,7 +323,7 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, @@ -337,7 +339,8 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key), &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context)); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(not_found_key), &get_context, nullptr)); ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Search for a key with an independent hash value. @@ -350,7 +353,8 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { GetContext get_context2(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(not_found_key2), &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2)); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2, nullptr)); ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); @@ -365,7 +369,8 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { GetContext get_context3(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, Slice(unused_key), &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3)); + ASSERT_OK( + reader.Get(ReadOptions(), Slice(unused_key), &get_context3, nullptr)); ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); } @@ -390,8 +395,8 @@ std::string GetFileName(uint64_t num) { if (FLAGS_file_dir.empty()) { FLAGS_file_dir = test::TmpDir(); } - return FLAGS_file_dir + "/cuckoo_read_benchmark" + - ToString(num/1000000) + "Mkeys"; + return test::PerThreadDBPath(FLAGS_file_dir, "cuckoo_read_benchmark") + + ToString(num / 1000000) + "Mkeys"; } // Create last level file as we are interested in measuring performance of @@ -406,8 +411,8 @@ void WriteFile(const std::vector& keys, std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - unique_ptr file_writer( - new WritableFileWriter(std::move(writable_file), env_options)); + std::unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), fname, env_options)); CuckooTableBuilder builder( file_writer.get(), hash_ratio, 64, 1000, test::Uint64Comparator(), 5, false, FLAGS_identity_as_first_hash, nullptr, 0 /* column_family_id */, @@ -427,7 +432,7 @@ void WriteFile(const std::vector& keys, env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); @@ -443,7 +448,7 @@ void WriteFile(const std::vector& keys, for (uint64_t i = 0; i < num; ++i) { value.Reset(); value.clear(); - ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context)); + 
ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context, nullptr)); ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); } } @@ -459,7 +464,7 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(read_file), fname)); const ImmutableCFOptions ioptions(options); @@ -496,13 +501,13 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), - &get_context); + &get_context, nullptr); } } } else { for (uint64_t i = 0; i < num; i++) { reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), - &get_context); + &get_context, nullptr); } } float time_per_op = (env->NowMicros() - start_time) * 1.0f / num; @@ -560,7 +565,7 @@ int main(int argc, char** argv) { #else #include -int main(int argc, char** argv) { +int main(int /*argc*/, char** /*argv*/) { fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); return 0; } diff --git a/thirdparty/rocksdb/table/data_block_footer.cc b/thirdparty/rocksdb/table/data_block_footer.cc new file mode 100644 index 0000000000..cb9e143815 --- /dev/null +++ b/thirdparty/rocksdb/table/data_block_footer.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "data_block_footer.h" + +#include "rocksdb/table.h" + +namespace rocksdb { + +const int kDataBlockIndexTypeBitShift = 31; + +// 0x7FFFFFFF +const uint32_t kMaxNumRestarts = (1u << kDataBlockIndexTypeBitShift) - 1u; + +// 0x7FFFFFFF +const uint32_t kNumRestartsMask = (1u << kDataBlockIndexTypeBitShift) - 1u; + +uint32_t PackIndexTypeAndNumRestarts( + BlockBasedTableOptions::DataBlockIndexType index_type, + uint32_t num_restarts) { + if (num_restarts > kMaxNumRestarts) { + assert(0); // mute travis "unused" warning + } + + uint32_t block_footer = num_restarts; + if (index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash) { + block_footer |= 1u << kDataBlockIndexTypeBitShift; + } else if (index_type != BlockBasedTableOptions::kDataBlockBinarySearch) { + assert(0); + } + + return block_footer; +} + +void UnPackIndexTypeAndNumRestarts( + uint32_t block_footer, + BlockBasedTableOptions::DataBlockIndexType* index_type, + uint32_t* num_restarts) { + if (index_type) { + if (block_footer & 1u << kDataBlockIndexTypeBitShift) { + *index_type = BlockBasedTableOptions::kDataBlockBinaryAndHash; + } else { + *index_type = BlockBasedTableOptions::kDataBlockBinarySearch; + } + } + + if (num_restarts) { + *num_restarts = block_footer & kNumRestartsMask; + assert(*num_restarts <= kMaxNumRestarts); + } +} + +} // namespace rocksdb diff --git a/thirdparty/rocksdb/table/data_block_footer.h b/thirdparty/rocksdb/table/data_block_footer.h new file mode 100644 index 0000000000..e6ff20bccb --- /dev/null +++ b/thirdparty/rocksdb/table/data_block_footer.h @@ -0,0 +1,25 @@ +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "rocksdb/table.h"
+
+namespace rocksdb {
+
+uint32_t PackIndexTypeAndNumRestarts(
+    BlockBasedTableOptions::DataBlockIndexType index_type,
+    uint32_t num_restarts);
+
+void UnPackIndexTypeAndNumRestarts(
+    uint32_t block_footer,
+    BlockBasedTableOptions::DataBlockIndexType* index_type,
+    uint32_t* num_restarts);
+
+} // namespace rocksdb
diff --git a/thirdparty/rocksdb/table/data_block_hash_index.cc b/thirdparty/rocksdb/table/data_block_hash_index.cc
new file mode 100644
index 0000000000..adb1d7b8c2
--- /dev/null
+++ b/thirdparty/rocksdb/table/data_block_hash_index.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+#include "table/data_block_hash_index.h"
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace rocksdb {
+
+void DataBlockHashIndexBuilder::Add(const Slice& key,
+                                    const size_t restart_index) {
+  assert(Valid());
+  if (restart_index > kMaxRestartSupportedByHashIndex) {
+    valid_ = false;
+    return;
+  }
+
+  uint32_t hash_value = GetSliceHash(key);
+  hash_and_restart_pairs_.emplace_back(hash_value,
+                                       static_cast<uint8_t>(restart_index));
+  estimated_num_buckets_ += bucket_per_key_;
+}
+
+void DataBlockHashIndexBuilder::Finish(std::string& buffer) {
+  assert(Valid());
+  uint16_t num_buckets = static_cast<uint16_t>(estimated_num_buckets_);
+
+  if (num_buckets == 0) {
+    num_buckets = 1;  // sanity check
+  }
+
+  // The built-in hash cannot distribute strings well across buckets
+  // when num_buckets is a power of two, resulting in high hash
+  // collision rates.
+  // We make num_buckets odd to avoid this issue.
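/* One way to see the effect the comment above describes: count how many
   distinct buckets a set of keys lands in under an even (power-of-two)
   modulus versus an odd one. Illustrative sketch only; std::hash stands in
   for GetSliceHash, and a weakly mixed hash shows the gap far more clearly: */
#include <cstddef>
#include <functional>
#include <set>
#include <string>

static size_t DistinctBucketsSketch(size_t num_buckets) {
  std::set<size_t> used;
  for (int i = 0; i < 1000; i++) {
    size_t h = std::hash<std::string>{}("key" + std::to_string(i));
    used.insert(h % num_buckets);
  }
  return used.size();  // closer to min(1000, num_buckets) is better
}
// e.g. compare DistinctBucketsSketch(256) against DistinctBucketsSketch(257).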
+  num_buckets |= 1;
+
+  std::vector<uint8_t> buckets(num_buckets, kNoEntry);
+  // write the restart_index array
+  for (auto& entry : hash_and_restart_pairs_) {
+    uint32_t hash_value = entry.first;
+    uint8_t restart_index = entry.second;
+    uint16_t buck_idx = static_cast<uint16_t>(hash_value % num_buckets);
+    if (buckets[buck_idx] == kNoEntry) {
+      buckets[buck_idx] = restart_index;
+    } else if (buckets[buck_idx] != restart_index) {
+      // one bucket cannot store two different restart indexes, so mark a
+      // collision
+      buckets[buck_idx] = kCollision;
+    }
+  }
+
+  for (uint8_t restart_index : buckets) {
+    buffer.append(
+        const_cast<char*>(reinterpret_cast<const char*>(&restart_index)),
+        sizeof(restart_index));
+  }
+
+  // write NUM_BUCK
+  PutFixed16(&buffer, num_buckets);
+
+  assert(buffer.size() <= kMaxBlockSizeSupportedByHashIndex);
+}
+
+void DataBlockHashIndexBuilder::Reset() {
+  estimated_num_buckets_ = 0;
+  valid_ = true;
+  hash_and_restart_pairs_.clear();
+}
+
+void DataBlockHashIndex::Initialize(const char* data, uint16_t size,
+                                    uint16_t* map_offset) {
+  assert(size >= sizeof(uint16_t));  // NUM_BUCKETS
+  num_buckets_ = DecodeFixed16(data + size - sizeof(uint16_t));
+  assert(num_buckets_ > 0);
+  assert(size > num_buckets_ * sizeof(uint8_t));
+  *map_offset = static_cast<uint16_t>(size - sizeof(uint16_t) -
+                                      num_buckets_ * sizeof(uint8_t));
+}
+
+uint8_t DataBlockHashIndex::Lookup(const char* data, uint32_t map_offset,
+                                   const Slice& key) const {
+  uint32_t hash_value = GetSliceHash(key);
+  uint16_t idx = static_cast<uint16_t>(hash_value % num_buckets_);
+  const char* bucket_table = data + map_offset;
+  return static_cast<uint8_t>(*(bucket_table + idx * sizeof(uint8_t)));
+}
+
+} // namespace rocksdb
diff --git a/thirdparty/rocksdb/table/data_block_hash_index.h b/thirdparty/rocksdb/table/data_block_hash_index.h
new file mode 100644
index 0000000000..0af8b257c2
--- /dev/null
+++ b/thirdparty/rocksdb/table/data_block_hash_index.h
@@ -0,0 +1,136 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace rocksdb {
+// This is an experimental feature aiming to reduce the CPU utilization of
+// point-lookup within a data-block. It is only used in data blocks, and not
+// in meta-data blocks or per-table index blocks.
+//
+// It is only used to support BlockBasedTable::Get().
+//
+// A serialized hash index is appended to the data-block. The new block data
+// format is as follows:
+//
+// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
+//
+// RI:       Restart Interval (the same as the default data-block format)
+// RI_IDX:   Restart Interval index (the same as the default data-block format)
+// HASH_IDX: The new data-block hash index feature.
+// FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
+//           the flag indicating if this hash index is in use. Note that
+//           given a data block < 32KB, the MSB is never used. So we can
+//           borrow the MSB as the hash index flag. Therefore, this format is
+//           compatible with the legacy data-blocks with num_restarts < 32768,
+//           as the MSB is 0.
+//
+// The format of the data-block hash index is as follows:
+//
+// HASH_IDX: [B B B ... B NUM_BUCK]
+//
+// B:        bucket, an array of restart indexes. Each bucket is a uint8_t.
+// NUM_BUCK: Number of buckets, which is the length of the bucket array.
+//
+// We reserve two special flags:
+//   kNoEntry=255,
+//   kCollision=254.
+//
+// Therefore, the max number of restarts this hash index can support is 253.
+//
+// Buckets are initialized to be kNoEntry.
+//
+// When storing a key in the hash index, the key is first hashed to a bucket.
+// If the bucket is empty (kNoEntry), the restart index is stored in
+// the bucket. If there is already a restart index there, we will update the
+// existing restart index to a collision marker (kCollision). If the
+// bucket is already marked as a collision, we do not store the restart
+// index either.
+//
+// During a query, a key is first hashed to a bucket. If the bucket stores
+// kNoEntry, the key is not in the block. If the bucket stores kCollision,
+// we fall back to the regular binary seek over the restart intervals.
+// Otherwise, the bucket stores the restart index of the key, and we go
+// directly to that restart interval to search for the key.
+//
+// Note that we only support blocks with fewer than 254 restart intervals.
+// If a block has more restart intervals than that, a hash index will not be
+// created for it.

+const uint8_t kNoEntry = 255;
+const uint8_t kCollision = 254;
+const uint8_t kMaxRestartSupportedByHashIndex = 253;
+
+// Because we use uint16_t addresses, we only support blocks of no more than
+// 64KB
+const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
+const double kDefaultUtilRatio = 0.75;
+
+class DataBlockHashIndexBuilder {
+ public:
+  DataBlockHashIndexBuilder()
+      : bucket_per_key_(-1 /*uninitialized marker*/),
+        estimated_num_buckets_(0),
+        valid_(false) {}
+
+  void Initialize(double util_ratio) {
+    if (util_ratio <= 0) {
+      util_ratio = kDefaultUtilRatio;  // sanity check
+    }
+    bucket_per_key_ = 1 / util_ratio;
+    valid_ = true;
+  }
+
+  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
+  void Add(const Slice& key, const size_t restart_index);
+  void Finish(std::string& buffer);
+  void Reset();
+  inline size_t EstimateSize() const {
+    uint16_t estimated_num_buckets =
+        static_cast<uint16_t>(estimated_num_buckets_);
+
+    // Matching the num_buckets computation in
+    // DataBlockHashIndexBuilder::Finish.
+    estimated_num_buckets |= 1;
+
+    return sizeof(uint16_t) +
+           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
+  }
+
+ private:
+  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
+  double estimated_num_buckets_;
+
+  // Now the only usage for `valid_` is to mark it false when an inserted
+  // restart_index is larger than supported. In this case the HashIndex is not
+  // appended to the block content.
+  bool valid_;
+
+  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
+  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
+};
+
+class DataBlockHashIndex {
+ public:
+  DataBlockHashIndex() : num_buckets_(0) {}
+
+  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);
+
+  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;
+
+  inline bool Valid() { return num_buckets_ != 0; }
+
+ private:
+  // To make the serialized hash index compact and to save the space overhead,
+  // here all the data fields persisted in the block are in uint16 format.
+  // We find that a uint16 is large enough to index every offset of a 64KiB
+  // block.
+  // So in other words, DataBlockHashIndex does not support block sizes equal
+  // to or greater than 64KiB.
+  uint16_t num_buckets_;
+};
+
+} // namespace rocksdb
diff --git a/thirdparty/rocksdb/table/data_block_hash_index_test.cc b/thirdparty/rocksdb/table/data_block_hash_index_test.cc
new file mode 100644
index 0000000000..11226648ef
--- /dev/null
+++ b/thirdparty/rocksdb/table/data_block_hash_index_test.cc
@@ -0,0 +1,724 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include
+#include
+#include
+
+#include "db/table_properties_collector.h"
+#include "rocksdb/slice.h"
+#include "table/block.h"
+#include "table/block_based_table_reader.h"
+#include "table/block_builder.h"
+#include "table/data_block_hash_index.h"
+#include "table/get_context.h"
+#include "table/table_builder.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace rocksdb {
+
+bool SearchForOffset(DataBlockHashIndex& index, const char* data,
+                     uint16_t map_offset, const Slice& key,
+                     uint8_t& restart_point) {
+  uint8_t entry = index.Lookup(data, map_offset, key);
+  if (entry == kCollision) {
+    return true;
+  }
+
+  if (entry == kNoEntry) {
+    return false;
+  }
+
+  return entry == restart_point;
+}
+
+// Random KV generator similar to block_test
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+std::string GenerateKey(int primary_key, int secondary_key, int padding_size,
+                        Random* rnd) {
+  char buf[50];
+  char* p = &buf[0];
+  snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key);
+  std::string k(p);
+  if (padding_size) {
+    k += RandomString(rnd, padding_size);
+  }
+
+  return k;
+}
+
+// Generate random key-value pairs.
+// The generated keys will be sorted. You can tune the parameters to generate
+// different kinds of test key/value pairs for different scenarios.
+void GenerateRandomKVs(std::vector<std::string>* keys,
+                       std::vector<std::string>* values, const int from,
+                       const int len, const int step = 1,
+                       const int padding_size = 0,
+                       const int keys_share_prefix = 1) {
+  Random rnd(302);
+
+  // generate different prefixes
+  for (int i = from; i < from + len; i += step) {
+    // generating keys that share the prefix
+    for (int j = 0; j < keys_share_prefix; ++j) {
+      keys->emplace_back(GenerateKey(i, j, padding_size, &rnd));
+
+      // 100-byte values
+      values->emplace_back(RandomString(&rnd, 100));
+    }
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestSmall) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  for (int j = 0; j < 5; j++) {
+    for (uint8_t i = 0; i < 2 + j; i++) {
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      builder.Add(key, restart_point);
+    }
+
+    size_t estimated_size = builder.EstimateSize();
+
+    std::string buffer("fake"), buffer2;
+    size_t original_size = buffer.size();
+    estimated_size += original_size;
+    builder.Finish(buffer);
+
+    ASSERT_EQ(buffer.size(), estimated_size);
+
+    buffer2 = buffer;  // test for the correctness of relative offset
+
+    Slice s(buffer2);
+    DataBlockHashIndex index;
+    uint16_t map_offset;
+    index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+    // the additional hash map should start at the end of the buffer
+    ASSERT_EQ(original_size, map_offset);
+    for (uint8_t i = 0; i < 2; i++) {
+      std::string key("key" + std::to_string(i));
+      uint8_t restart_point = i;
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
+    }
+    builder.Reset();
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTest) {
+  // bucket_num = 200, #keys = 100. 50% utilization
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("fake content"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
+  // bucket_num = 2. There will be intense hash collisions
+TEST(DataBlockHashIndex, DataBlockHashTestCollision) {
+  // bucket_num = 2. There will be intense hash collisions
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("some other fake content to take up space"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key("key" + std::to_string(i));
+    uint8_t restart_point = i;
+    ASSERT_TRUE(
+        SearchForOffset(index, s.data(), map_offset, key, restart_point));
+  }
+}
+
+TEST(DataBlockHashIndex, DataBlockHashTestLarge) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  std::unordered_map<std::string, uint8_t> m;
+
+  for (uint8_t i = 0; i < 100; i++) {
+    if (i % 2) {
+      continue;  // leave half of the keys out
+    }
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+    m[key] = restart_point;
+  }
+
+  size_t estimated_size = builder.EstimateSize();
+
+  std::string buffer("filling stuff"), buffer2;
+  size_t original_size = buffer.size();
+  estimated_size += original_size;
+  builder.Finish(buffer);
+
+  ASSERT_EQ(buffer.size(), estimated_size);
+
+  buffer2 = buffer;  // test for the correctness of relative offset
+
+  Slice s(buffer2);
+  DataBlockHashIndex index;
+  uint16_t map_offset;
+  index.Initialize(s.data(), static_cast<uint16_t>(s.size()), &map_offset);
+
+  // the additional hash map should start at the end of the buffer
+  ASSERT_EQ(original_size, map_offset);
+  for (uint8_t i = 0; i < 100; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    if (m.count(key)) {
+      ASSERT_TRUE(m[key] == restart_point);
+      ASSERT_TRUE(
+          SearchForOffset(index, s.data(), map_offset, key, restart_point));
+    } else {
+      // We allow false positives, so don't test the non-existing keys: when
+      // a false positive happens, the search continues into the restart
+      // interval to see whether the key really exists.
+    }
+  }
+}
+
+TEST(DataBlockHashIndex, RestartIndexExceedMax) {
+  DataBlockHashIndexBuilder builder;
+  builder.Initialize(0.75 /*util_ratio*/);
+  std::unordered_map<std::string, uint8_t> m;
+
+  for (uint8_t i = 0; i <= 253; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+  ASSERT_TRUE(builder.Valid());
+
+  builder.Reset();
+
+  for (uint8_t i = 0; i <= 254; i++) {
+    std::string key = "key" + std::to_string(i);
+    uint8_t restart_point = i;
+    builder.Add(key, restart_point);
+  }
+
+  ASSERT_FALSE(builder.Valid());
+
+  builder.Reset();
+  ASSERT_TRUE(builder.Valid());
+}
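The 253-versus-254 cutoff exercised above follows from the one-byte lookup result: two values are reserved as sentinels (the same kNoEntry and kCollision consulted by SearchForOffset at the top of this file), which caps the largest addressable restart index. A sketch of that arithmetic, assuming the reserved values 255 and 254 used by the data-block hash index:

```cpp
#include <cstdint>

// Assumed sentinel values, matching the constants SearchForOffset() checks:
// 255 = bucket empty, 254 = more than one distinct key hashed to the bucket.
constexpr uint8_t kNoEntry = 255;
constexpr uint8_t kCollision = 254;

// Everything below the smallest sentinel is a usable restart index, so the
// builder can address restart points 0..253 and must mark itself invalid as
// soon as a 254th index would be needed -- exactly what the test asserts.
constexpr bool IsAddressableRestartIndex(uint32_t restart_index) {
  return restart_index < kCollision;  // i.e. restart_index <= 253
}

static_assert(IsAddressableRestartIndex(253), "largest valid restart index");
static_assert(!IsAddressableRestartIndex(254), "collides with a sentinel");
```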
+TEST(DataBlockHashIndex, BlockRestartIndexExceedMax) {
+  Options options = Options();
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  // #restarts <= 253. HashIndex is valid
+  for (int i = 0; i <= 253; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  // #restarts > 253. HashIndex is not used
+  for (int i = 0; i <= 254; i++) {
+    std::string ukey = "key" + std::to_string(i);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), "value");
+  }
+
+  {
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockSizeExceedMax) {
+  Options options = Options();
+  std::string ukey(10, 'k');
+  InternalKey ikey(ukey, 0, kTypeValue);
+
+  BlockBuilder builder(1 /* block_restart_interval */,
+                       false /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  {
+    // insert a large value. The block size plus HashIndex is 65536.
+    std::string value(65502, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  }
+
+  builder.Reset();
+
+  {
+    // insert a large value. The block size plus HashIndex would be 65537,
+    // which exceeds the max block size supported by HashIndex (65536),
+    // so no HashIndex will be created for the block when the build finishes.
+    std::string value(65503, 'v');
+
+    builder.Add(ikey.Encode().ToString(), value);
+
+    // read serialized contents of the block
+    Slice rawblock = builder.Finish();
+    ASSERT_LE(rawblock.size(), kMaxBlockSizeSupportedByHashIndex);
+    std::cerr << "block size: " << rawblock.size() << std::endl;
+
+    // create block reader
+    BlockContents contents;
+    contents.data = rawblock;
+    Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+    // the index type has fallen back to binary search when the build finished.
+    ASSERT_EQ(reader.IndexType(),
+              BlockBasedTableOptions::kDataBlockBinarySearch);
+  }
+}
+
+TEST(DataBlockHashIndex, BlockTestSingleKey) {
+  Options options = Options();
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+
+  std::string ukey("gopher");
+  std::string value("gold");
+  InternalKey ikey(ukey, 10, kTypeValue);
+  builder.Add(ikey.Encode().ToString(), value /*value*/);
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+
+  const InternalKeyComparator icmp(BytewiseComparator());
+  auto iter = reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator());
+  bool may_exist;
+  // search in block for the key just inserted
+  {
+    InternalKey seek_ikey(ukey, 10, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(
+        options.comparator->Compare(iter->key(), ikey.Encode().ToString()), 0);
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // search in block for the existing ukey, but with a higher seqno
+  {
+    InternalKey seek_ikey(ukey, 20, kValueTypeForSeek);
+
+    // HashIndex should be able to set the iter correctly
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+
+    // user key should match
+    ASSERT_EQ(options.comparator->Compare(ExtractUserKey(iter->key()), ukey),
+              0);
+
+    // seek_ikey's seqno should be greater than that of the iter result
+    ASSERT_GT(GetInternalKeySeqno(seek_ikey.Encode()),
+              GetInternalKeySeqno(iter->key()));
+
+    ASSERT_EQ(iter->value(), value);
+  }
+
+  // Search in block for the existing ukey, but with a lower seqno.
+  // In this case, the hash can find the only occurrence of the user_key, but
+  // ParseNextDataKey() will skip it as it does not have an older seqno.
+  // Here, SeekForGet() still locates the user_key, and
+  // iter->Valid() == false indicates that we have reached the end of
+  // the block and the caller should continue searching the next block.
+  {
+    InternalKey seek_ikey(ukey, 5, kValueTypeForSeek);
+    may_exist = iter->SeekForGet(seek_ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_FALSE(iter->Valid());  // should have reached the end of the block
+  }
+
+  delete iter;
+}
+
+TEST(DataBlockHashIndex, BlockTestLarge) {
+  Random rnd(1019);
+  Options options = Options();
+  std::vector<std::string> keys;
+  std::vector<std::string> values;
+
+  BlockBuilder builder(16 /* block_restart_interval */,
+                       true /* use_delta_encoding */,
+                       false /* use_value_delta_encoding */,
+                       BlockBasedTableOptions::kDataBlockBinaryAndHash);
+  int num_records = 500;
+
+  GenerateRandomKVs(&keys, &values, 0, num_records);
+
+  // Generate keys, adding a trailing "1" to mark existent keys.
+  // Later we will seek keys with a trailing "0" to test seeking
+  // non-existent keys.
+  for (int i = 0; i < num_records; i++) {
+    std::string ukey(keys[i] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+    builder.Add(ikey.Encode().ToString(), values[i]);
+  }
+
+  // read serialized contents of the block
+  Slice rawblock = builder.Finish();
+
+  // create block reader
+  BlockContents contents;
+  contents.data = rawblock;
+  Block reader(std::move(contents), kDisableGlobalSequenceNumber);
+  const InternalKeyComparator icmp(BytewiseComparator());
+
+  // random seek existent keys
+  for (int i = 0; i < num_records; i++) {
+    auto iter =
+        reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator());
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "1" /* existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    ASSERT_TRUE(may_exist);
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(values[index], iter->value());
+
+    delete iter;
+  }
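Before the non-existent-key half of the test, it is worth spelling out how a caller combines the two signals. A hedged sketch of the Get()-side decision logic implied by the case analysis below (LookupInBlock is a hypothetical helper, not RocksDB's actual read path):

```cpp
#include "db/dbformat.h"  // ExtractUserKey
#include "table/block.h"  // DataBlockIter

namespace rocksdb {

// Interpreting SeekForGet()'s (may_exist, iter->Valid()) pair, per the A/B/C
// cases described in the comment that follows this sketch.
bool LookupInBlock(DataBlockIter* iter, const Slice& internal_target,
                   const Comparator* ucmp, bool* try_next_block) {
  const bool may_exist = iter->SeekForGet(internal_target);
  if (!may_exist) {
    *try_next_block = false;  // case B: the block proves the key is absent
    return false;
  }
  if (!iter->Valid()) {
    *try_next_block = true;   // cases A/C: the key may sit in the next block
    return false;
  }
  *try_next_block = false;    // iter points at a candidate; confirm the key
  return ucmp->Compare(ExtractUserKey(iter->key()),
                       ExtractUserKey(internal_target)) == 0;
}

}  // namespace rocksdb
```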
+  // random seek non-existent user keys
+  // In case A), the user_key cannot be found in the HashIndex. The key may
+  // exist in the next block, so the iter is set invalid to tell the caller
+  // to search the next block. This test mostly exercises case A).
+  //
+  // Note that for non-existent keys there is a possibility of false
+  // positives, i.e. the key is still hashed into some restart interval.
+  // Two additional outcomes are then possible:
+  // B) a linear seek through the restart interval finds nothing, and the
+  //    iter stops at the start of the next restart interval. The key does
+  //    not exist anywhere.
+  // C) a linear seek through the restart interval finds nothing, and the
+  //    iter stops at the end of the block, i.e. restarts_. The key may
+  //    exist in the next block.
+  // So these combinations are possible when searching a non-existent
+  // user_key:
+  //
+  // case#    may_exist    iter->Valid()
+  //   A        true          false
+  //   B        false         true
+  //   C        true          false
+
+  for (int i = 0; i < num_records; i++) {
+    auto iter =
+        reader.NewIterator<DataBlockIter>(&icmp, icmp.user_comparator());
+    // find a random key in the lookaside array
+    int index = rnd.Uniform(num_records);
+    std::string ukey(keys[index] + "0" /* non-existing key marker */);
+    InternalKey ikey(ukey, 0, kTypeValue);
+
+    // search in block for this key
+    bool may_exist = iter->SeekForGet(ikey.Encode().ToString());
+    if (!may_exist) {
+      ASSERT_TRUE(iter->Valid());
+    }
+    if (!iter->Valid()) {
+      ASSERT_TRUE(may_exist);
+    }
+
+    delete iter;
+  }
+}
+
+// helper routine for DataBlockHashIndex.BlockBoundary
+void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2,
+                  std::string& v2, InternalKey& seek_ikey,
+                  GetContext& get_context, Options& options) {
+  std::unique_ptr<WritableFileWriter> file_writer;
+  std::unique_ptr<RandomAccessFileReader> file_reader;
+  std::unique_ptr<TableReader> table_reader;
+  int level_ = -1;
+
+  std::vector<std::string> keys;
+  const ImmutableCFOptions ioptions(options);
+  const MutableCFOptions moptions(options);
+  const InternalKeyComparator internal_comparator(options.comparator);
+
+  EnvOptions soptions;
+
+  soptions.use_mmap_reads = ioptions.allow_mmap_reads;
+  file_writer.reset(
+      test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */));
+  std::unique_ptr<TableBuilder> builder;
+  std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
+      int_tbl_prop_collector_factories;
+  std::string column_family_name;
+  builder.reset(ioptions.table_factory->NewTableBuilder(
+      TableBuilderOptions(ioptions, moptions, internal_comparator,
+                          &int_tbl_prop_collector_factories,
+                          options.compression, options.sample_for_compression,
+                          CompressionOptions(), false /* skip_filters */,
+                          column_family_name, level_),
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      file_writer.get()));
+
+  builder->Add(ik1.Encode().ToString(), v1);
+  builder->Add(ik2.Encode().ToString(), v2);
+  EXPECT_TRUE(builder->status().ok());
+
+  Status s = builder->Finish();
+  file_writer->Flush();
+  EXPECT_TRUE(s.ok()) << s.ToString();
+
+  EXPECT_EQ(static_cast<test::StringSink*>(file_writer->writable_file())
+                ->contents()
+                .size(),
+            builder->FileSize());
+
+  // Open the table
+  file_reader.reset(test::GetRandomAccessFileReader(new test::StringSource(
+      static_cast<test::StringSink*>(file_writer->writable_file())->contents(),
+      0 /*uniq_id*/, ioptions.allow_mmap_reads)));
+  const bool kSkipFilters = true;
+  const bool kImmortal = true;
+  ioptions.table_factory->NewTableReader(
+      TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions,
+                         internal_comparator, !kSkipFilters, !kImmortal,
+                         level_),
+      std::move(file_reader),
+      static_cast<test::StringSink*>(file_writer->writable_file())
+          ->contents()
+          .size(),
+      &table_reader);
+  // Search using Get()
+  ReadOptions ro;
+
+  ASSERT_OK(table_reader->Get(ro, seek_ikey.Encode().ToString(), &get_context,
+                              moptions.prefix_extractor.get()));
+}
+
+TEST(DataBlockHashIndex, BlockBoundary) {
+  BlockBasedTableOptions table_options;
+  table_options.data_block_index_type =
+      BlockBasedTableOptions::kDataBlockBinaryAndHash;
+  table_options.block_restart_interval = 1;
+  table_options.block_size = 4096;
+
+  Options options;
+  options.comparator = BytewiseComparator();
+
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  // insert two large k/v pairs. Given that the block_size is 4096, one k/v
+  // pair will take up one block.
+ // [ k1/v1 ][ k2/v2 ] + // [ Block N ][ Block N+1 ] + + { + // [ "aab"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("aab"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@60 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 60, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v2); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@120 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 120, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, v1); + value.Reset(); + } + + { + // [ "axy"@100 ][ "axy"@10 ] + // | Block N ][ Block N+1 ] + // seek for "axy"@5 + std::string uk1("axy"); + InternalKey ik1(uk1, 100, kTypeValue); + std::string v1(4100, '1'); // large value + + std::string uk2("axy"); + InternalKey ik2(uk2, 10, kTypeValue); + std::string v2(4100, '2'); // large value + + PinnableSlice value; + std::string seek_ukey("axy"); + InternalKey seek_ikey(seek_ukey, 5, kTypeValue); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, seek_ukey, &value, nullptr, + nullptr, nullptr, nullptr); + + TestBoundary(ik1, v1, ik2, v2, seek_ikey, get_context, options); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/thirdparty/rocksdb/table/filter_block.h b/thirdparty/rocksdb/table/filter_block.h index 7bf3b31324..a930495479 100644 --- a/thirdparty/rocksdb/table/filter_block.h +++ b/thirdparty/rocksdb/table/filter_block.h @@ -51,6 +51,7 @@ class FilterBlockBuilder { virtual bool IsBlockBased() = 0; // If is blockbased filter virtual void StartBlock(uint64_t block_offset) = 0; // Start new block 
filter virtual void Add(const Slice& key) = 0; // Add a key to current filter + virtual size_t NumAdded() const = 0; // Number of keys added Slice Finish() { // Generate Filter const BlockHandle empty_handle; Status dont_care_status; @@ -92,16 +93,21 @@ class FilterBlockReader { * built upon InternalKey and must be provided via const_ikey_ptr when running * queries. */ - virtual bool KeyMayMatch(const Slice& key, uint64_t block_offset = kNotValid, + virtual bool KeyMayMatch(const Slice& key, + const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) = 0; + /** * no_io and const_ikey_ptr here means the same as in KeyMayMatch */ virtual bool PrefixMayMatch(const Slice& prefix, + const SliceTransform* prefix_extractor, uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) = 0; + virtual size_t ApproximateMemoryUsage() const = 0; virtual size_t size() const { return size_; } virtual Statistics* statistics() const { return statistics_; } @@ -114,7 +120,19 @@ class FilterBlockReader { return error_msg; } - virtual void CacheDependencies(bool pin) {} + virtual void CacheDependencies(bool /*pin*/, + const SliceTransform* /*prefix_extractor*/) {} + + virtual bool RangeMayExist( + const Slice* /*iterate_upper_bound*/, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* /*comparator*/, const Slice* const const_ikey_ptr, + bool* filter_checked, bool /*need_upper_bound_check*/) { + *filter_checked = true; + Slice prefix = prefix_extractor->Transform(user_key); + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } protected: bool whole_key_filtering_; diff --git a/thirdparty/rocksdb/table/flush_block_policy.cc b/thirdparty/rocksdb/table/flush_block_policy.cc index 9a8dea4cb0..1b1675828d 100644 --- a/thirdparty/rocksdb/table/flush_block_policy.cc +++ b/thirdparty/rocksdb/table/flush_block_policy.cc @@ -3,10 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "rocksdb/options.h" #include "rocksdb/flush_block_policy.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_builder.h" +#include "table/format.h" #include @@ -21,14 +22,15 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { // reaches the configured FlushBlockBySizePolicy(const uint64_t block_size, const uint64_t block_size_deviation, + const bool align, const BlockBuilder& data_block_builder) : block_size_(block_size), block_size_deviation_limit_( ((block_size * (100 - block_size_deviation)) + 99) / 100), + align_(align), data_block_builder_(data_block_builder) {} - virtual bool Update(const Slice& key, - const Slice& value) override { + bool Update(const Slice& key, const Slice& value) override { // it makes no sense to flush when the data block is empty if (data_block_builder_.empty()) { return false; @@ -51,8 +53,13 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { } const auto curr_size = data_block_builder_.CurrentSizeEstimate(); - const auto estimated_size_after = - data_block_builder_.EstimateSizeAfterKV(key, value); + auto estimated_size_after = + data_block_builder_.EstimateSizeAfterKV(key, value); + + if (align_) { + estimated_size_after += kBlockTrailerSize; + return estimated_size_after > block_size_; + } return estimated_size_after > block_size_ && curr_size > block_size_deviation_limit_; @@ -60,6 +67,7 @@ class FlushBlockBySizePolicy : public FlushBlockPolicy { const uint64_t block_size_; const uint64_t block_size_deviation_limit_; + const bool align_; const BlockBuilder& data_block_builder_; }; @@ -68,13 +76,13 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( const BlockBuilder& data_block_builder) const { return new FlushBlockBySizePolicy( table_options.block_size, table_options.block_size_deviation, - data_block_builder); + table_options.block_align, data_block_builder); } FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( const uint64_t size, const int deviation, const BlockBuilder& data_block_builder) { - return new FlushBlockBySizePolicy(size, deviation, data_block_builder); + return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/format.cc b/thirdparty/rocksdb/table/format.cc index 364766e9a8..476db85f73 100644 --- a/thirdparty/rocksdb/table/format.cc +++ b/thirdparty/rocksdb/table/format.cc @@ -9,20 +9,22 @@ #include "table/format.h" -#include #include +#include #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" #include "rocksdb/env.h" #include "table/block.h" #include "table/block_based_table_reader.h" +#include "table/block_fetcher.h" #include "table/persistent_cache_helper.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/logging.h" +#include "util/memory_allocator.h" #include "util/stop_watch.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -40,11 +42,10 @@ extern const uint64_t kPlainTableMagicNumber; const uint64_t kLegacyPlainTableMagicNumber = 0; const uint64_t kPlainTableMagicNumber = 0; #endif -const uint32_t DefaultStackBufferSize = 5000; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { return env != nullptr && stats != nullptr && - stats->stats_level_ > kExceptDetailedTimers; + stats->get_stats_level() > kExceptDetailedTimers; } void BlockHandle::EncodeTo(std::string* dst) const { @@ -55,8 +56,19 @@ void 
BlockHandle::EncodeTo(std::string* dst) const { } Status BlockHandle::DecodeFrom(Slice* input) { - if (GetVarint64(input, &offset_) && - GetVarint64(input, &size_)) { + if (GetVarint64(input, &offset_) && GetVarint64(input, &size_)) { + return Status::OK(); + } else { + // reset in case failure after partially decoding + offset_ = 0; + size_ = 0; + return Status::Corruption("bad block handle"); + } +} + +Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) { + if (GetVarint64(input, &size_)) { + offset_ = _offset; return Status::OK(); } else { // reset in case failure after partially decoding @@ -146,7 +158,7 @@ Status Footer::DecodeFrom(Slice* input) { assert(input != nullptr); assert(input->size() >= kMinEncodedLength); - const char *magic_ptr = + const char* magic_ptr = input->data() + input->size() - kMagicNumberLengthByte; const uint32_t magic_lo = DecodeFixed32(magic_ptr); const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4); @@ -196,7 +208,7 @@ Status Footer::DecodeFrom(Slice* input) { } std::string Footer::ToString() const { - std::string result, handle_; + std::string result; result.reserve(1024); bool legacy = IsLegacyFooterFormat(table_magic_number_); @@ -221,9 +233,10 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { - return Status::Corruption( - "file is too short (" + ToString(file_size) + " bytes) to be an " - "sstable: " + file->file_name()); + return Status::Corruption("file is too short (" + ToString(file_size) + + " bytes) to be an " + "sstable: " + + file->file_name()); } char footer_space[Footer::kMaxEncodedLength]; @@ -244,9 +257,10 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. 
if (footer_input.size() < Footer::kMinEncodedLength) { - return Status::Corruption( - "file is too short (" + ToString(file_size) + " bytes) to be an " - "sstable" + file->file_name()); + return Status::Corruption("file is too short (" + ToString(file_size) + + " bytes) to be an " + "sstable" + + file->file_name()); } s = footer->DecodeFrom(&footer_input); @@ -256,321 +270,122 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, if (enforce_table_magic_number != 0 && enforce_table_magic_number != footer->table_magic_number()) { return Status::Corruption( - "Bad table magic number: expected " - + ToString(enforce_table_magic_number) + ", found " - + ToString(footer->table_magic_number()) - + " in " + file->file_name()); + "Bad table magic number: expected " + + ToString(enforce_table_magic_number) + ", found " + + ToString(footer->table_magic_number()) + " in " + file->file_name()); } return Status::OK(); } -// Without anonymous namespace here, we fail the warning -Wmissing-prototypes -namespace { -Status CheckBlockChecksum(const ReadOptions& options, const Footer& footer, - const Slice& contents, size_t block_size, - RandomAccessFileReader* file, - const BlockHandle& handle) { - Status s; - // Check the crc of the type and the block contents - if (options.verify_checksums) { - const char* data = contents.data(); // Pointer to where Read put the data - PERF_TIMER_GUARD(block_checksum_time); - uint32_t value = DecodeFixed32(data + block_size + 1); - uint32_t actual = 0; - switch (footer.checksum()) { - case kNoChecksum: - break; - case kCRC32c: - value = crc32c::Unmask(value); - actual = crc32c::Value(data, block_size + 1); - break; - case kxxHash: - actual = XXH32(data, static_cast(block_size) + 1, 0); - break; - default: - s = Status::Corruption( - "unknown checksum type " + ToString(footer.checksum()) + " in " + - file->file_name() + " offset " + ToString(handle.offset()) + - " size " + ToString(block_size)); - } - if (s.ok() && actual != value) { - s = Status::Corruption( - "block checksum mismatch: expected " + ToString(actual) + ", got " + - ToString(value) + " in " + file->file_name() + " offset " + - ToString(handle.offset()) + " size " + ToString(block_size)); - } - if (!s.ok()) { - return s; - } - } - return s; -} - -// Read a block and check its CRC -// contents is the result of reading. 
-// According to the implementation of file->Read, contents may not point to buf -Status ReadBlock(RandomAccessFileReader* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - Slice* contents, /* result of reading */ char* buf) { - size_t n = static_cast(handle.size()); - Status s; - - { - PERF_TIMER_GUARD(block_read_time); - s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); - } - - PERF_COUNTER_ADD(block_read_count, 1); - PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); - - if (!s.ok()) { - return s; - } - if (contents->size() != n + kBlockTrailerSize) { - return Status::Corruption("truncated block read from " + file->file_name() + - " offset " + ToString(handle.offset()) + - ", expected " + ToString(n + kBlockTrailerSize) + - " bytes, got " + ToString(contents->size())); - } - return CheckBlockChecksum(options, footer, *contents, n, file, handle); -} - -} // namespace - -Status ReadBlockContents(RandomAccessFileReader* file, - FilePrefetchBuffer* prefetch_buffer, - const Footer& footer, const ReadOptions& read_options, - const BlockHandle& handle, BlockContents* contents, - const ImmutableCFOptions& ioptions, - bool decompression_requested, - const Slice& compression_dict, - const PersistentCacheOptions& cache_options) { - Status status; - Slice slice; - size_t n = static_cast(handle.size()); - std::unique_ptr heap_buf; - char stack_buf[DefaultStackBufferSize]; - char* used_buf = nullptr; - rocksdb::CompressionType compression_type; - - if (cache_options.persistent_cache && - !cache_options.persistent_cache->IsCompressed()) { - status = PersistentCacheHelper::LookupUncompressedPage(cache_options, - handle, contents); - if (status.ok()) { - // uncompressed page is found for the block handle - return status; - } else { - // uncompressed page is not found - if (ioptions.info_log && !status.IsNotFound()) { - assert(!status.ok()); - ROCKS_LOG_INFO(ioptions.info_log, - "Error reading from persistent cache. %s", - status.ToString().c_str()); - } - } - } - - bool got_from_prefetch_buffer = false; - if (prefetch_buffer != nullptr && - prefetch_buffer->TryReadFromCache( - handle.offset(), - static_cast(handle.size()) + kBlockTrailerSize, &slice)) { - status = - CheckBlockChecksum(read_options, footer, slice, - static_cast(handle.size()), file, handle); - if (!status.ok()) { - return status; - } - got_from_prefetch_buffer = true; - used_buf = const_cast(slice.data()); - } else if (cache_options.persistent_cache && - cache_options.persistent_cache->IsCompressed()) { - // lookup uncompressed cache mode p-cache - status = PersistentCacheHelper::LookupRawPage( - cache_options, handle, &heap_buf, n + kBlockTrailerSize); - } else { - status = Status::NotFound(); - } - - if (!got_from_prefetch_buffer) { - if (status.ok()) { - // cache hit - used_buf = heap_buf.get(); - slice = Slice(heap_buf.get(), n); - } else { - if (ioptions.info_log && !status.IsNotFound()) { - assert(!status.ok()); - ROCKS_LOG_INFO(ioptions.info_log, - "Error reading from persistent cache. 
%s", - status.ToString().c_str()); - } - // cache miss read from device - if (decompression_requested && - n + kBlockTrailerSize < DefaultStackBufferSize) { - // If we've got a small enough hunk of data, read it in to the - // trivially allocated stack buffer instead of needing a full malloc() - used_buf = &stack_buf[0]; - } else { - heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); - used_buf = heap_buf.get(); - } - - status = ReadBlock(file, footer, read_options, handle, &slice, used_buf); - if (status.ok() && read_options.fill_cache && - cache_options.persistent_cache && - cache_options.persistent_cache->IsCompressed()) { - // insert to raw cache - PersistentCacheHelper::InsertRawPage(cache_options, handle, used_buf, - n + kBlockTrailerSize); - } - } - - if (!status.ok()) { - return status; - } - } - - PERF_TIMER_GUARD(block_decompress_time); - - compression_type = static_cast(slice.data()[n]); - - if (decompression_requested && compression_type != kNoCompression) { - // compressed page, uncompress, update cache - status = UncompressBlockContents(slice.data(), n, contents, - footer.version(), compression_dict, - ioptions); - } else if (slice.data() != used_buf) { - // the slice content is not the buffer provided - *contents = BlockContents(Slice(slice.data(), n), false, compression_type); - } else { - // page is uncompressed, the buffer either stack or heap provided - if (got_from_prefetch_buffer || used_buf == &stack_buf[0]) { - heap_buf = std::unique_ptr(new char[n]); - memcpy(heap_buf.get(), used_buf, n); - } - *contents = BlockContents(std::move(heap_buf), n, true, compression_type); - } - - if (status.ok() && !got_from_prefetch_buffer && read_options.fill_cache && - cache_options.persistent_cache && - !cache_options.persistent_cache->IsCompressed()) { - // insert to uncompressed cache - PersistentCacheHelper::InsertUncompressedPage(cache_options, handle, - *contents); - } - - return status; -} - Status UncompressBlockContentsForCompressionType( - const char* data, size_t n, BlockContents* contents, - uint32_t format_version, const Slice& compression_dict, - CompressionType compression_type, const ImmutableCFOptions &ioptions) { - std::unique_ptr ubuf; + const UncompressionInfo& uncompression_info, const char* data, size_t n, + BlockContents* contents, uint32_t format_version, + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator) { + CacheAllocationPtr ubuf; - assert(compression_type != kNoCompression && "Invalid compression type"); + assert(uncompression_info.type() != kNoCompression && + "Invalid compression type"); - StopWatchNano timer(ioptions.env, - ShouldReportDetailedTime(ioptions.env, ioptions.statistics)); + StopWatchNano timer(ioptions.env, ShouldReportDetailedTime( + ioptions.env, ioptions.statistics)); int decompress_size = 0; - switch (compression_type) { + switch (uncompression_info.type()) { case kSnappyCompression: { size_t ulength = 0; static char snappy_corrupt_msg[] = - "Snappy not supported or corrupted Snappy compressed block contents"; + "Snappy not supported or corrupted Snappy compressed block contents"; if (!Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf.reset(new char[ulength]); + ubuf = AllocateBlock(ulength, allocator); if (!Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } - *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), ulength); break; } case 
kZlibCompression: - ubuf.reset(Zlib_Uncompress( - data, n, &decompress_size, + ubuf = Zlib_Uncompress( + uncompression_info, data, n, &decompress_size, GetCompressFormatForVersion(kZlibCompression, format_version), - compression_dict)); + allocator); if (!ubuf) { static char zlib_corrupt_msg[] = - "Zlib not supported or corrupted Zlib compressed block contents"; + "Zlib not supported or corrupted Zlib compressed block contents"; return Status::Corruption(zlib_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kBZip2Compression: - ubuf.reset(BZip2_Uncompress( + ubuf = BZip2_Uncompress( data, n, &decompress_size, - GetCompressFormatForVersion(kBZip2Compression, format_version))); + GetCompressFormatForVersion(kBZip2Compression, format_version), + allocator); if (!ubuf) { static char bzip2_corrupt_msg[] = - "Bzip2 not supported or corrupted Bzip2 compressed block contents"; + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; return Status::Corruption(bzip2_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kLZ4Compression: - ubuf.reset(LZ4_Uncompress( - data, n, &decompress_size, + ubuf = LZ4_Uncompress( + uncompression_info, data, n, &decompress_size, GetCompressFormatForVersion(kLZ4Compression, format_version), - compression_dict)); + allocator); if (!ubuf) { static char lz4_corrupt_msg[] = - "LZ4 not supported or corrupted LZ4 compressed block contents"; + "LZ4 not supported or corrupted LZ4 compressed block contents"; return Status::Corruption(lz4_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kLZ4HCCompression: - ubuf.reset(LZ4_Uncompress( - data, n, &decompress_size, + ubuf = LZ4_Uncompress( + uncompression_info, data, n, &decompress_size, GetCompressFormatForVersion(kLZ4HCCompression, format_version), - compression_dict)); + allocator); if (!ubuf) { static char lz4hc_corrupt_msg[] = - "LZ4HC not supported or corrupted LZ4HC compressed block contents"; + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; return Status::Corruption(lz4hc_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kXpressCompression: + // XPRESS allocates memory internally, thus no support for custom + // allocator. 
ubuf.reset(XPRESS_Uncompress(data, n, &decompress_size)); if (!ubuf) { static char xpress_corrupt_msg[] = - "XPRESS not supported or corrupted XPRESS compressed block contents"; + "XPRESS not supported or corrupted XPRESS compressed block " + "contents"; return Status::Corruption(xpress_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; case kZSTD: case kZSTDNotFinalCompression: - ubuf.reset(ZSTD_Uncompress(data, n, &decompress_size, compression_dict)); + ubuf = ZSTD_Uncompress(uncompression_info, data, n, &decompress_size, + allocator); if (!ubuf) { static char zstd_corrupt_msg[] = "ZSTD not supported or corrupted ZSTD compressed block contents"; return Status::Corruption(zstd_corrupt_msg); } - *contents = - BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); + *contents = BlockContents(std::move(ubuf), decompress_size); break; default: return Status::Corruption("bad block type"); } - if(ShouldReportDetailedTime(ioptions.env, ioptions.statistics)){ - MeasureTime(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, - timer.ElapsedNanos()); - MeasureTime(ioptions.statistics, BYTES_DECOMPRESSED, contents->data.size()); - RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); + if (ShouldReportDetailedTime(ioptions.env, ioptions.statistics)) { + RecordTimeToHistogram(ioptions.statistics, DECOMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); } + RecordTimeToHistogram(ioptions.statistics, BYTES_DECOMPRESSED, + contents->data.size()); + RecordTick(ioptions.statistics, NUMBER_BLOCK_DECOMPRESSED); return Status::OK(); } @@ -582,14 +397,16 @@ Status UncompressBlockContentsForCompressionType( // buffer is returned via 'result' and it is upto the caller to // free this buffer. // format_version is the block format as defined in include/rocksdb/table.h -Status UncompressBlockContents(const char* data, size_t n, +Status UncompressBlockContents(const UncompressionInfo& uncompression_info, + const char* data, size_t n, BlockContents* contents, uint32_t format_version, - const Slice& compression_dict, - const ImmutableCFOptions &ioptions) { + const ImmutableCFOptions& ioptions, + MemoryAllocator* allocator) { assert(data[n] != kNoCompression); - return UncompressBlockContentsForCompressionType( - data, n, contents, format_version, compression_dict, - (CompressionType)data[n], ioptions); + assert(data[n] == uncompression_info.type()); + return UncompressBlockContentsForCompressionType(uncompression_info, data, n, + contents, format_version, + ioptions, allocator); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/format.h b/thirdparty/rocksdb/table/format.h index 512b4a32bf..f585885055 100644 --- a/thirdparty/rocksdb/table/format.h +++ b/thirdparty/rocksdb/table/format.h @@ -8,21 +8,28 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#pragma once -#include #include +#include +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif +#endif +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "options/cf_options.h" #include "port/port.h" // noexcept #include "table/persistent_cache_options.h" #include "util/file_reader_writer.h" +#include "util/memory_allocator.h" namespace rocksdb { -class Block; class RandomAccessFile; struct ReadOptions; @@ -48,19 +55,16 @@ class BlockHandle { void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + Status DecodeSizeFrom(uint64_t offset, Slice* input); // Return a string that contains the copy of handle. std::string ToString(bool hex = true) const; // if the block handle's offset and size are both "0", we will view it // as a null block handle that points to no where. - bool IsNull() const { - return offset_ == 0 && size_ == 0; - } + bool IsNull() const { return offset_ == 0 && size_ == 0; } - static const BlockHandle& NullBlockHandle() { - return kNullBlockHandle; - } + static const BlockHandle& NullBlockHandle() { return kNullBlockHandle; } // Maximum encoding length of a BlockHandle enum { kMaxEncodedLength = 10 + 10 }; @@ -74,6 +78,9 @@ class BlockHandle { inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, uint32_t version) { +#ifdef NDEBUG + (void)compression_type; +#endif // snappy is not versioned assert(compression_type != kSnappyCompression && compression_type != kXpressCompression && @@ -85,7 +92,7 @@ inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, } inline bool BlockBasedTableSupportedVersion(uint32_t version) { - return version <= 2; + return version <= 4; } // Footer encapsulates the fixed information stored at the tail @@ -182,32 +189,74 @@ Status ReadFooterFromFile(RandomAccessFileReader* file, // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; +inline CompressionType get_block_compression_type(const char* block_data, + size_t block_size) { + return static_cast(block_data[block_size]); +} + struct BlockContents { - Slice data; // Actual contents of data - bool cachable; // True iff data can be cached - CompressionType compression_type; - std::unique_ptr allocation; + Slice data; // Actual contents of data + CacheAllocationPtr allocation; + +#ifndef NDEBUG + // Whether the block is a raw block, which contains compression type + // byte. It is only used for assertion. 
+ bool is_raw_block = false; +#endif // NDEBUG + + BlockContents() {} - BlockContents() : cachable(false), compression_type(kNoCompression) {} + BlockContents(const Slice& _data) : data(_data) {} - BlockContents(const Slice& _data, bool _cachable, - CompressionType _compression_type) - : data(_data), cachable(_cachable), compression_type(_compression_type) {} + BlockContents(CacheAllocationPtr&& _data, size_t _size) + : data(_data.get(), _size), allocation(std::move(_data)) {} - BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, - CompressionType _compression_type) - : data(_data.get(), _size), - cachable(_cachable), - compression_type(_compression_type), - allocation(std::move(_data)) {} + BlockContents(std::unique_ptr&& _data, size_t _size) + : data(_data.get(), _size) { + allocation.reset(_data.release()); + } + + bool own_bytes() const { return allocation.get() != nullptr; } + + // It's the caller's responsibility to make sure that this is + // for raw block contents, which contains the compression + // byte in the end. + CompressionType get_compression_type() const { + assert(is_raw_block); + return get_block_compression_type(data.data(), data.size()); + } + + // The additional memory space taken by the block data. + size_t usable_size() const { + if (allocation.get() != nullptr) { + auto allocator = allocation.get_deleter().allocator; + if (allocator) { + return allocator->UsableSize(allocation.get(), data.size()); + } +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + return malloc_usable_size(allocation.get()); +#else + return data.size(); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + } else { + return 0; // no extra memory is occupied by the data + } + } - BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT { *this = std::move(other); } + size_t ApproximateMemoryUsage() const { + return usable_size() + sizeof(*this); + } + + BlockContents(BlockContents&& other) ROCKSDB_NOEXCEPT { + *this = std::move(other); + } BlockContents& operator=(BlockContents&& other) { data = std::move(other.data); - cachable = other.cachable; - compression_type = other.compression_type; allocation = std::move(other.allocation); +#ifndef NDEBUG + is_raw_block = other.is_raw_block; +#endif // NDEBUG return *this; } }; @@ -228,19 +277,20 @@ extern Status ReadBlockContents( // free this buffer. // For description of compress_format_version and possible values, see // util/compression.h -extern Status UncompressBlockContents(const char* data, size_t n, +extern Status UncompressBlockContents(const UncompressionInfo& info, + const char* data, size_t n, BlockContents* contents, uint32_t compress_format_version, - const Slice& compression_dict, - const ImmutableCFOptions &ioptions); + const ImmutableCFOptions& ioptions, + MemoryAllocator* allocator = nullptr); // This is an extension to UncompressBlockContents that accepts // a specific compression type. This is used by un-wrapped blocks // with no compression header. extern Status UncompressBlockContentsForCompressionType( - const char* data, size_t n, BlockContents* contents, - uint32_t compress_format_version, const Slice& compression_dict, - CompressionType compression_type, const ImmutableCFOptions &ioptions); + const UncompressionInfo& info, const char* data, size_t n, + BlockContents* contents, uint32_t compress_format_version, + const ImmutableCFOptions& ioptions, MemoryAllocator* allocator = nullptr); // Implementation details follow. Clients should ignore, @@ -248,9 +298,7 @@ extern Status UncompressBlockContentsForCompressionType( // BlockHandle. 
Currently we use zeros for null and use negation-of-zeros for // uninitialized. inline BlockHandle::BlockHandle() - : BlockHandle(~static_cast(0), - ~static_cast(0)) { -} + : BlockHandle(~static_cast(0), ~static_cast(0)) {} inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) : offset_(_offset), size_(_size) {} diff --git a/thirdparty/rocksdb/table/full_filter_bits_builder.h b/thirdparty/rocksdb/table/full_filter_bits_builder.h index b3be7e897f..851ed1e2ab 100644 --- a/thirdparty/rocksdb/table/full_filter_bits_builder.h +++ b/thirdparty/rocksdb/table/full_filter_bits_builder.h @@ -51,6 +51,7 @@ class FullFilterBitsBuilder : public FilterBitsBuilder { uint32_t* num_lines); private: + friend class FullFilterBlockTest_DuplicateEntries_Test; size_t bits_per_key_; size_t num_probes_; std::vector hash_entries_; diff --git a/thirdparty/rocksdb/table/full_filter_block.cc b/thirdparty/rocksdb/table/full_filter_block.cc index 5739494e8d..a7491a7161 100644 --- a/thirdparty/rocksdb/table/full_filter_block.cc +++ b/thirdparty/rocksdb/table/full_filter_block.cc @@ -5,6 +5,14 @@ #include "table/full_filter_block.h" +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif +#endif + #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "rocksdb/filter_policy.h" @@ -17,16 +25,32 @@ FullFilterBlockBuilder::FullFilterBlockBuilder( FilterBitsBuilder* filter_bits_builder) : prefix_extractor_(prefix_extractor), whole_key_filtering_(whole_key_filtering), + last_whole_key_recorded_(false), + last_prefix_recorded_(false), num_added_(0) { assert(filter_bits_builder != nullptr); filter_bits_builder_.reset(filter_bits_builder); } void FullFilterBlockBuilder::Add(const Slice& key) { + const bool add_prefix = prefix_extractor_ && prefix_extractor_->InDomain(key); if (whole_key_filtering_) { - AddKey(key); + if (!add_prefix) { + AddKey(key); + } else { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the + // last item. + Slice last_whole_key = Slice(last_whole_key_str_); + if (!last_whole_key_recorded_ || last_whole_key.compare(key) != 0) { + AddKey(key); + last_whole_key_recorded_ = true; + last_whole_key_str_.assign(key.data(), key.size()); + } + } } - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + if (add_prefix) { AddPrefix(key); } } @@ -40,10 +64,30 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) { // Add prefix to filter if needed inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { Slice prefix = prefix_extractor_->Transform(key); - AddKey(prefix); + if (whole_key_filtering_) { + // if both whole_key and prefix are added to bloom then we will have whole + // key and prefix addition being interleaved and thus cannot rely on the + // bits builder to properly detect the duplicates by comparing with the last + // item. 
+ Slice last_prefix = Slice(last_prefix_str_); + if (!last_prefix_recorded_ || last_prefix.compare(prefix) != 0) { + AddKey(prefix); + last_prefix_recorded_ = true; + last_prefix_str_.assign(prefix.data(), prefix.size()); + } + } else { + AddKey(prefix); + } } -Slice FullFilterBlockBuilder::Finish(const BlockHandle& tmp, Status* status) { +void FullFilterBlockBuilder::Reset() { + last_whole_key_recorded_ = false; + last_prefix_recorded_ = false; +} + +Slice FullFilterBlockBuilder::Finish(const BlockHandle& /*tmp*/, + Status* status) { + Reset(); // In this impl we ignore BlockHandle *status = Status::OK(); if (num_added_ != 0) { @@ -62,6 +106,10 @@ FullFilterBlockReader::FullFilterBlockReader( contents_(contents) { assert(filter_bits_reader != nullptr); filter_bits_reader_.reset(filter_bits_reader); + if (prefix_extractor_ != nullptr) { + full_length_enabled_ = + prefix_extractor_->FullLengthEnabled(&prefix_extractor_full_length_); + } } FullFilterBlockReader::FullFilterBlockReader( @@ -73,9 +121,13 @@ FullFilterBlockReader::FullFilterBlockReader( block_contents_ = std::move(contents); } -bool FullFilterBlockReader::KeyMayMatch(const Slice& key, uint64_t block_offset, - const bool no_io, - const Slice* const const_ikey_ptr) { +bool FullFilterBlockReader::KeyMayMatch( + const Slice& key, const SliceTransform* /*prefix_extractor*/, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { +#ifdef NDEBUG + (void)block_offset; +#endif assert(block_offset == kNotValid); if (!whole_key_filtering_) { return true; @@ -83,14 +135,14 @@ bool FullFilterBlockReader::KeyMayMatch(const Slice& key, uint64_t block_offset, return MayMatch(key); } -bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, - uint64_t block_offset, - const bool no_io, - const Slice* const const_ikey_ptr) { +bool FullFilterBlockReader::PrefixMayMatch( + const Slice& prefix, const SliceTransform* /* prefix_extractor */, + uint64_t block_offset, const bool /*no_io*/, + const Slice* const /*const_ikey_ptr*/) { +#ifdef NDEBUG + (void)block_offset; +#endif assert(block_offset == kNotValid); - if (!prefix_extractor_) { - return true; - } return MayMatch(prefix); } @@ -108,6 +160,67 @@ bool FullFilterBlockReader::MayMatch(const Slice& entry) { } size_t FullFilterBlockReader::ApproximateMemoryUsage() const { - return contents_.size(); + size_t usage = block_contents_.usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); + usage += malloc_usable_size(filter_bits_reader_.get()); +#else + usage += sizeof(*this); + usage += sizeof(*filter_bits_reader_.get()); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; } + +bool FullFilterBlockReader::RangeMayExist(const Slice* iterate_upper_bound, + const Slice& user_key, const SliceTransform* prefix_extractor, + const Comparator* comparator, const Slice* const const_ikey_ptr, + bool* filter_checked, bool need_upper_bound_check) { + if (!prefix_extractor || !prefix_extractor->InDomain(user_key)) { + *filter_checked = false; + return true; + } + Slice prefix = prefix_extractor->Transform(user_key); + if (need_upper_bound_check && + !IsFilterCompatible(iterate_upper_bound, prefix, comparator)) { + *filter_checked = false; + return true; + } else { + *filter_checked = true; + return PrefixMayMatch(prefix, prefix_extractor, kNotValid, false, + const_ikey_ptr); + } +} + +bool FullFilterBlockReader::IsFilterCompatible( + const Slice* iterate_upper_bound, const Slice& prefix, + const Comparator* comparator) { + // Try to 
reuse the bloom filter in the SST table if prefix_extractor in + // mutable_cf_options has changed. If range [user_key, upper_bound) all + // share the same prefix then we may still be able to use the bloom filter. + if (iterate_upper_bound != nullptr && prefix_extractor_) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound)) { + return false; + } + Slice upper_bound_xform = + prefix_extractor_->Transform(*iterate_upper_bound); + // first check if user_key and upper_bound all share the same prefix + if (!comparator->Equal(prefix, upper_bound_xform)) { + // second check if user_key's prefix is the immediate predecessor of + // upper_bound and have the same length. If so, we know for sure all + // keys in the range [user_key, upper_bound) share the same prefix. + // Also need to make sure upper_bound are full length to ensure + // correctness + if (!full_length_enabled_ || + iterate_upper_bound->size() != prefix_extractor_full_length_ || + !comparator->IsSameLengthImmediateSuccessor(prefix, + *iterate_upper_bound)) { + return false; + } + } + return true; + } else { + return false; + } +} + } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/full_filter_block.h b/thirdparty/rocksdb/table/full_filter_block.h index be27c58b61..e4384c91a4 100644 --- a/thirdparty/rocksdb/table/full_filter_block.h +++ b/thirdparty/rocksdb/table/full_filter_block.h @@ -43,14 +43,16 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { ~FullFilterBlockBuilder() {} virtual bool IsBlockBased() override { return false; } - virtual void StartBlock(uint64_t block_offset) override {} + virtual void StartBlock(uint64_t /*block_offset*/) override {} virtual void Add(const Slice& key) override; + virtual size_t NumAdded() const override { return num_added_; } virtual Slice Finish(const BlockHandle& tmp, Status* status) override; using FilterBlockBuilder::Finish; protected: virtual void AddKey(const Slice& key); std::unique_ptr filter_bits_builder_; + virtual void Reset(); private: // important: all of these might point to invalid addresses @@ -58,6 +60,10 @@ class FullFilterBlockBuilder : public FilterBlockBuilder { // should NOT dereference them. 
const SliceTransform* prefix_extractor_; bool whole_key_filtering_; + bool last_whole_key_recorded_; + std::string last_whole_key_str_; + bool last_prefix_recorded_; + std::string last_prefix_str_; uint32_t num_added_; std::unique_ptr<const char[]> filter_data_; @@ -91,27 +97,37 @@ class FullFilterBlockReader : public FilterBlockReader { ~FullFilterBlockReader() {} virtual bool IsBlockBased() override { return false; } + virtual bool KeyMayMatch( - const Slice& key, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; + virtual bool PrefixMayMatch( - const Slice& prefix, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual size_t ApproximateMemoryUsage() const override; - + virtual bool RangeMayExist(const Slice* iterate_upper_bound, const Slice& user_key, + const SliceTransform* prefix_extractor, + const Comparator* comparator, + const Slice* const const_ikey_ptr, bool* filter_checked, + bool need_upper_bound_check) override; private: const SliceTransform* prefix_extractor_; Slice contents_; std::unique_ptr<FilterBitsReader> filter_bits_reader_; BlockContents block_contents_; - std::unique_ptr<const char[]> filter_data_; + bool full_length_enabled_; + size_t prefix_extractor_full_length_; // No copying allowed FullFilterBlockReader(const FullFilterBlockReader&); bool MayMatch(const Slice& entry); void operator=(const FullFilterBlockReader&); + bool IsFilterCompatible(const Slice* iterate_upper_bound, + const Slice& prefix, const Comparator* comparator); + }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/full_filter_block_test.cc b/thirdparty/rocksdb/table/full_filter_block_test.cc index 5fbda4c6f0..f01ae52bf7 100644 --- a/thirdparty/rocksdb/table/full_filter_block_test.cc +++ b/thirdparty/rocksdb/table/full_filter_block_test.cc @@ -6,6 +6,7 @@ #include "table/full_filter_block.h" #include "rocksdb/filter_policy.h" +#include "table/full_filter_bits_builder.h" #include "util/coding.h" #include "util/hash.h" #include "util/string_util.h" @@ -19,12 +20,12 @@ class TestFilterBitsBuilder : public FilterBitsBuilder { explicit TestFilterBitsBuilder() {} // Add Key to filter - virtual void AddKey(const Slice& key) override { + void AddKey(const Slice& key) override { hash_entries_.push_back(Hash(key.data(), key.size(), 1)); } // Generate the filter using the keys that are added - virtual Slice Finish(std::unique_ptr<const char[]>* buf) override { + Slice Finish(std::unique_ptr<const char[]>* buf) override { uint32_t len = static_cast<uint32_t>(hash_entries_.size()) * 4; char* data = new char[len]; for (size_t i = 0; i < hash_entries_.size(); i++) { @@ -44,7 +45,7 @@ class TestFilterBitsReader : public FilterBitsReader { explicit TestFilterBitsReader(const Slice& contents) : data_(contents.data()), len_(static_cast<uint32_t>(contents.size())) {} - virtual bool MayMatch(const Slice& entry) override { + bool MayMatch(const Slice& entry) override { uint32_t h = Hash(entry.data(), entry.size(), 1); for (size_t i = 0; i + 4 <= len_; i += 4) { if (h == DecodeFixed32(data_ + i)) { @@ -62,18 +63,16 @@ class TestHashFilter : public FilterPolicy { public: - virtual const char* Name() const override { return "TestHashFilter"; } + const char* Name()
const override { return "TestHashFilter"; } - virtual void CreateFilter(const Slice* keys, int n, - std::string* dst) const override { + void CreateFilter(const Slice* keys, int n, std::string* dst) const override { for (int i = 0; i < n; i++) { uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); PutFixed32(dst, h); } } - virtual bool KeyMayMatch(const Slice& key, - const Slice& filter) const override { + bool KeyMayMatch(const Slice& key, const Slice& filter) const override { uint32_t h = Hash(key.data(), key.size(), 1); for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { if (h == DecodeFixed32(filter.data() + i)) { @@ -83,12 +82,11 @@ class TestHashFilter : public FilterPolicy { return false; } - virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + FilterBitsBuilder* GetFilterBitsBuilder() const override { return new TestFilterBitsBuilder(); } - virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) - const override { + FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override { return new TestFilterBitsReader(contents); } }; @@ -112,7 +110,7 @@ TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Retain the same semantics as the block-based filter - ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -127,13 +125,13 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(reader.KeyMayMatch("bar")); - ASSERT_TRUE(reader.KeyMayMatch("box")); - ASSERT_TRUE(reader.KeyMayMatch("hello")); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(!reader.KeyMayMatch("missing")); - ASSERT_TRUE(!reader.KeyMayMatch("other")); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); } class FullFilterBlockTest : public testing::Test { @@ -144,7 +142,7 @@ class FullFilterBlockTest : public testing::Test { table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); } - ~FullFilterBlockTest() {} + ~FullFilterBlockTest() override {} }; TEST_F(FullFilterBlockTest, EmptyBuilder) { @@ -157,28 +155,63 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); // Retain the same semantics as the block-based filter - ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); +} + +TEST_F(FullFilterBlockTest, DuplicateEntries) { + { // empty prefixes + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(0)); + auto bits_builder = dynamic_cast<FullFilterBitsBuilder*>( + table_options_.filter_policy->GetFilterBitsBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add("key"); // test with empty prefix + ASSERT_EQ(2, bits_builder->hash_entries_.size()); + } + + // mix of empty and non-empty + std::unique_ptr<const SliceTransform> prefix_extractor( + NewFixedPrefixTransform(7)); + auto bits_builder = 
dynamic_cast<FullFilterBitsBuilder*>( + table_options_.filter_policy->GetFilterBitsBuilder()); + const bool WHOLE_KEY = true; + FullFilterBlockBuilder builder(prefix_extractor.get(), WHOLE_KEY, + bits_builder); + ASSERT_EQ(0, builder.NumAdded()); + builder.Add(""); // test with empty key too + builder.Add("prefix1key1"); + builder.Add("prefix1key1"); + builder.Add("prefix1key2"); + builder.Add("prefix1key3"); + builder.Add("prefix2key4"); + // two prefixes and four keys + ASSERT_EQ(1 + 2 + 4, bits_builder->hash_entries_.size()); } TEST_F(FullFilterBlockTest, SingleChunk) { FullFilterBlockBuilder builder( nullptr, true, table_options_.filter_policy->GetFilterBitsBuilder()); + ASSERT_EQ(0, builder.NumAdded()); builder.Add("foo"); builder.Add("bar"); builder.Add("box"); builder.Add("box"); builder.Add("hello"); + ASSERT_EQ(5, builder.NumAdded()); Slice block = builder.Finish(); FullFilterBlockReader reader( nullptr, true, block, table_options_.filter_policy->GetFilterBitsReader(block), nullptr); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(reader.KeyMayMatch("bar")); - ASSERT_TRUE(reader.KeyMayMatch("box")); - ASSERT_TRUE(reader.KeyMayMatch("hello")); - ASSERT_TRUE(reader.KeyMayMatch("foo")); - ASSERT_TRUE(!reader.KeyMayMatch("missing")); - ASSERT_TRUE(!reader.KeyMayMatch("other")); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("bar", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("box", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("hello", nullptr)); + ASSERT_TRUE(reader.KeyMayMatch("foo", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", nullptr)); + ASSERT_TRUE(!reader.KeyMayMatch("other", nullptr)); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/get_context.cc b/thirdparty/rocksdb/table/get_context.cc index 258891ec4c..24c9ba7d5b 100644 --- a/thirdparty/rocksdb/table/get_context.cc +++ b/thirdparty/rocksdb/table/get_context.cc @@ -6,6 +6,7 @@ #include "table/get_context.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" +#include "db/read_callback.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" #include "monitoring/statistics.h" @@ -28,17 +29,24 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { replay_log->push_back(type); PutLengthPrefixedSlice(replay_log, value); } +#else + (void)replay_log; + (void)type; + (void)value; #endif // ROCKSDB_LITE } } // namespace -GetContext::GetContext( - const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, - Statistics* statistics, GetState init_state, const Slice& user_key, - PinnableSlice* pinnable_val, bool* value_found, MergeContext* merge_context, - RangeDelAggregator* _range_del_agg, Env* env, SequenceNumber* seq, - PinnedIteratorsManager* _pinned_iters_mgr, bool* is_blob_index) +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, Logger* logger, + Statistics* statistics, GetState init_state, + const Slice& user_key, PinnableSlice* pinnable_val, + bool* value_found, MergeContext* merge_context, + SequenceNumber* _max_covering_tombstone_seq, Env* env, + SequenceNumber* seq, + PinnedIteratorsManager* _pinned_iters_mgr, + ReadCallback* callback, bool* is_blob_index) : ucmp_(ucmp), merge_operator_(merge_operator), logger_(logger), @@ -48,11 +56,12 @@ GetContext::GetContext( pinnable_val_(pinnable_val), value_found_(value_found), merge_context_(merge_context), - range_del_agg_(_range_del_agg), + max_covering_tombstone_seq_(_max_covering_tombstone_seq), env_(env), 
seq_(seq), replay_log_(nullptr), pinned_iters_mgr_(_pinned_iters_mgr), + callback_(callback), is_blob_index_(is_blob_index) { if (seq_) { *seq_ = kMaxSequenceNumber; @@ -72,7 +81,7 @@ void GetContext::MarkKeyMayExist() { } } -void GetContext::SaveValue(const Slice& value, SequenceNumber seq) { +void GetContext::SaveValue(const Slice& value, SequenceNumber /*seq*/) { assert(state_ == kNotFound); appendToReplayLog(replay_log_, kTypeValue, value); @@ -82,11 +91,104 @@ void GetContext::SaveValue(const Slice& value, SequenceNumber seq) { } } +void GetContext::ReportCounters() { + if (get_context_stats_.num_cache_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_HIT, get_context_stats_.num_cache_hit); + } + if (get_context_stats_.num_cache_index_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_HIT, + get_context_stats_.num_cache_index_hit); + } + if (get_context_stats_.num_cache_data_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_HIT, + get_context_stats_.num_cache_data_hit); + } + if (get_context_stats_.num_cache_filter_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_HIT, + get_context_stats_.num_cache_filter_hit); + } + if (get_context_stats_.num_cache_compression_dict_hit > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_HIT, + get_context_stats_.num_cache_compression_dict_hit); + } + if (get_context_stats_.num_cache_index_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_MISS, + get_context_stats_.num_cache_index_miss); + } + if (get_context_stats_.num_cache_filter_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_MISS, + get_context_stats_.num_cache_filter_miss); + } + if (get_context_stats_.num_cache_data_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_MISS, + get_context_stats_.num_cache_data_miss); + } + if (get_context_stats_.num_cache_compression_dict_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_MISS, + get_context_stats_.num_cache_compression_dict_miss); + } + if (get_context_stats_.num_cache_bytes_read > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_READ, + get_context_stats_.num_cache_bytes_read); + } + if (get_context_stats_.num_cache_miss > 0) { + RecordTick(statistics_, BLOCK_CACHE_MISS, + get_context_stats_.num_cache_miss); + } + if (get_context_stats_.num_cache_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_ADD, get_context_stats_.num_cache_add); + } + if (get_context_stats_.num_cache_bytes_write > 0) { + RecordTick(statistics_, BLOCK_CACHE_BYTES_WRITE, + get_context_stats_.num_cache_bytes_write); + } + if (get_context_stats_.num_cache_index_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_ADD, + get_context_stats_.num_cache_index_add); + } + if (get_context_stats_.num_cache_index_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_INDEX_BYTES_INSERT, + get_context_stats_.num_cache_index_bytes_insert); + } + if (get_context_stats_.num_cache_data_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_ADD, + get_context_stats_.num_cache_data_add); + } + if (get_context_stats_.num_cache_data_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_DATA_BYTES_INSERT, + get_context_stats_.num_cache_data_bytes_insert); + } + if (get_context_stats_.num_cache_filter_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_ADD, + get_context_stats_.num_cache_filter_add); + } + if (get_context_stats_.num_cache_filter_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_FILTER_BYTES_INSERT, + get_context_stats_.num_cache_filter_bytes_insert); + } + if 
(get_context_stats_.num_cache_compression_dict_add > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_ADD, + get_context_stats_.num_cache_compression_dict_add); + } + if (get_context_stats_.num_cache_compression_dict_bytes_insert > 0) { + RecordTick(statistics_, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, + get_context_stats_.num_cache_compression_dict_bytes_insert); + } +} + bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, - const Slice& value, Cleanable* value_pinner) { + const Slice& value, bool* matched, + Cleanable* value_pinner) { + assert(matched); assert((state_ != kMerge && parsed_key.type != kTypeMerge) || merge_context_ != nullptr); if (ucmp_->Equal(parsed_key.user_key, user_key_)) { + *matched = true; + // If the value is not in the snapshot, skip it + if (!CheckCallback(parsed_key.sequence)) { + return true; // to continue to the next seq + } + appendToReplayLog(replay_log_, parsed_key.type, value); if (seq_ != nullptr) { @@ -99,7 +201,8 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, auto type = parsed_key.type; // Key matches. Process it if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex) && - range_del_agg_ != nullptr && range_del_agg_->ShouldDelete(parsed_key)) { + max_covering_tombstone_seq_ != nullptr && + *max_covering_tombstone_seq_ > parsed_key.sequence) { type = kTypeRangeDeletion; } switch (type) { @@ -118,6 +221,8 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // If the backing resources for the value are provided, pin them pinnable_val_->PinSlice(value, value_pinner); } else { + TEST_SYNC_POINT_CALLBACK("GetContext::SaveValue::PinSelf", this); + // Otherwise copy the value pinnable_val_->PinSelf(value); } @@ -175,6 +280,21 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else { merge_context_->PushOperand(value, false); } + if (merge_operator_ != nullptr && + merge_operator_->ShouldMerge(merge_context_->GetOperandsDirectionBackward())) { + state_ = kFound; + if (LIKELY(pinnable_val_ != nullptr)) { + Status merge_status = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, nullptr, + merge_context_->GetOperands(), pinnable_val_->GetSelf(), + logger_, statistics_, env_); + pinnable_val_->PinSelf(); + if (!merge_status.ok()) { + state_ = kCorrupt; + } + } + return false; + } return true; default: @@ -199,13 +319,18 @@ void replayGetContextLog(const Slice& replay_log, const Slice& user_key, assert(ret); (void)ret; + bool dont_care __attribute__((__unused__)); // Since SequenceNumber is not stored and unknown, we will use // kMaxSequenceNumber. 
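For context on the replay-log format used by appendToReplayLog()/replayGetContextLog() above: each entry is a one-byte value type followed by a length-prefixed slice. A self-contained sketch of that framing, with a LEB128-style varint standing in for rocksdb's PutLengthPrefixedSlice/GetLengthPrefixedSlice:

#include <cassert>
#include <cstdint>
#include <string>

void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 0x80) { dst->push_back(static_cast<char>(v | 0x80)); v >>= 7; }
  dst->push_back(static_cast<char>(v));
}

bool GetVarint32(const char** p, const char* limit, uint32_t* v) {
  uint32_t result = 0;
  for (int shift = 0; shift <= 28 && *p < limit; shift += 7) {
    uint32_t byte = static_cast<unsigned char>(*(*p)++);
    result |= (byte & 0x7F) << shift;
    if (!(byte & 0x80)) { *v = result; return true; }
  }
  return false;
}

// One replay-log record: type byte + varint length + payload.
void Append(std::string* log, char type, const std::string& value) {
  log->push_back(type);
  PutVarint32(log, static_cast<uint32_t>(value.size()));
  log->append(value);
}

int main() {
  std::string log;
  Append(&log, 0x1 /* stand-in for kTypeValue */, "hello");
  const char* p = log.data();
  const char* limit = p + log.size();
  char type = *p++;
  uint32_t len = 0;
  assert(GetVarint32(&p, limit, &len) && len == 5);
  assert(type == 0x1 && std::string(p, len) == "hello");
}

Because the sequence number is not recorded in this log, replay has to substitute kMaxSequenceNumber, which is exactly what the call below does.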
get_context->SaveValue( ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, - value_pinner); + &dont_care, value_pinner); } #else // ROCKSDB_LITE + (void)replay_log; + (void)user_key; + (void)get_context; + (void)value_pinner; assert(false); #endif // ROCKSDB_LITE } diff --git a/thirdparty/rocksdb/table/get_context.h b/thirdparty/rocksdb/table/get_context.h index a708f6be74..d7d0e9808b 100644 --- a/thirdparty/rocksdb/table/get_context.h +++ b/thirdparty/rocksdb/table/get_context.h @@ -6,8 +6,9 @@ #pragma once #include <string> #include "db/merge_context.h" -#include "db/range_del_aggregator.h" +#include "db/read_callback.h" #include "rocksdb/env.h" +#include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "table/block.h" @@ -15,6 +16,30 @@ namespace rocksdb { class MergeContext; class PinnedIteratorsManager; +struct GetContextStats { + uint64_t num_cache_hit = 0; + uint64_t num_cache_index_hit = 0; + uint64_t num_cache_data_hit = 0; + uint64_t num_cache_filter_hit = 0; + uint64_t num_cache_compression_dict_hit = 0; + uint64_t num_cache_index_miss = 0; + uint64_t num_cache_filter_miss = 0; + uint64_t num_cache_data_miss = 0; + uint64_t num_cache_compression_dict_miss = 0; + uint64_t num_cache_bytes_read = 0; + uint64_t num_cache_miss = 0; + uint64_t num_cache_add = 0; + uint64_t num_cache_bytes_write = 0; + uint64_t num_cache_index_add = 0; + uint64_t num_cache_index_bytes_insert = 0; + uint64_t num_cache_data_add = 0; + uint64_t num_cache_data_bytes_insert = 0; + uint64_t num_cache_filter_add = 0; + uint64_t num_cache_filter_bytes_insert = 0; + uint64_t num_cache_compression_dict_add = 0; + uint64_t num_cache_compression_dict_bytes_insert = 0; +}; + class GetContext { public: enum GetState { @@ -25,24 +50,29 @@ class GetContext { kMerge, // saver contains the current merge result (the operands) kBlobIndex, }; + GetContextStats get_context_stats_; GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, Logger* logger, Statistics* statistics, GetState init_state, const Slice& user_key, PinnableSlice* value, bool* value_found, - MergeContext* merge_context, RangeDelAggregator* range_del_agg, - Env* env, SequenceNumber* seq = nullptr, + MergeContext* merge_context, + SequenceNumber* max_covering_tombstone_seq, Env* env, + SequenceNumber* seq = nullptr, PinnedIteratorsManager* _pinned_iters_mgr = nullptr, - bool* is_blob_index = nullptr); + ReadCallback* callback = nullptr, bool* is_blob_index = nullptr); void MarkKeyMayExist(); // Records this key, value, and any meta-data (such as sequence number and // state) into this GetContext. // + // If the parsed_key matches the user key that we are looking for, sets + // matched to true. + // // Returns True if more keys need to be read (due to merges) or // False if the complete value has been found. bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value, - Cleanable* value_pinner = nullptr); + bool* matched, Cleanable* value_pinner = nullptr); // Simplified version of the previous function. Should only be used when we // know that the operation is a Put.
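The new ReadCallback parameter gives point lookups a visibility hook: GetContext::CheckCallback() (below) simply defers to callback_->IsVisible(seq) and skips versions that fail the check. A minimal sketch of such a callback, assuming plain snapshot-sequence visibility (real callers, e.g. write-prepared transactions, apply more involved rules):

#include <cassert>
#include <cstdint>

using SequenceNumber = uint64_t;

// Shape of the hook: a read passes every candidate version's sequence
// number through IsVisible() before accepting it.
class ReadCallback {
 public:
  virtual ~ReadCallback() {}
  virtual bool IsVisible(SequenceNumber seq) = 0;
};

class SnapshotCallback : public ReadCallback {
 public:
  explicit SnapshotCallback(SequenceNumber snapshot) : snapshot_(snapshot) {}
  bool IsVisible(SequenceNumber seq) override { return seq <= snapshot_; }

 private:
  SequenceNumber snapshot_;
};

int main() {
  SnapshotCallback cb(100);
  assert(cb.IsVisible(42));    // written at or before the snapshot: visible
  assert(!cb.IsVisible(150));  // written after: skipped, keep scanning older versions
}

When a version is rejected, SaveValue returns true so the scan continues to the next, older sequence number of the same user key.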
@@ -50,7 +80,9 @@ class GetContext { GetState State() const { return state_; } - RangeDelAggregator* range_del_agg() { return range_del_agg_; } + SequenceNumber* max_covering_tombstone_seq() { + return max_covering_tombstone_seq_; + } PinnedIteratorsManager* pinned_iters_mgr() { return pinned_iters_mgr_; } @@ -64,6 +96,15 @@ class GetContext { bool sample() const { return sample_; } + bool CheckCallback(SequenceNumber seq) { + if (callback_) { + return callback_->IsVisible(seq); + } + return true; + } + + void ReportCounters(); + private: const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -76,7 +117,7 @@ class GetContext { PinnableSlice* pinnable_val_; bool* value_found_; // Is value set correctly? Used by KeyMayExist MergeContext* merge_context_; - RangeDelAggregator* range_del_agg_; + SequenceNumber* max_covering_tombstone_seq_; Env* env_; // If a key is found, seq_ will be set to the SequenceNumber of most recent // write to the key or kMaxSequenceNumber if unknown @@ -84,6 +125,7 @@ class GetContext { std::string* replay_log_; // Used to temporarily pin blocks when state_ == GetContext::kMerge PinnedIteratorsManager* pinned_iters_mgr_; + ReadCallback* callback_; bool sample_; bool* is_blob_index_; }; diff --git a/thirdparty/rocksdb/table/index_builder.cc b/thirdparty/rocksdb/table/index_builder.cc index cdf20aee92..cd28c42a8b 100644 --- a/thirdparty/rocksdb/table/index_builder.cc +++ b/thirdparty/rocksdb/table/index_builder.cc @@ -27,42 +27,65 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { + IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { - return new ShortenedIndexBuilder(comparator, - table_opt.index_block_restart_interval); + result = new ShortenedIndexBuilder( + comparator, table_opt.index_block_restart_interval, + table_opt.format_version, use_value_delta_encoding); } + break; case BlockBasedTableOptions::kHashSearch: { - return new HashIndexBuilder(comparator, int_key_slice_transform, - table_opt.index_block_restart_interval); + result = new HashIndexBuilder(comparator, int_key_slice_transform, + table_opt.index_block_restart_interval, + table_opt.format_version, + use_value_delta_encoding); } + break; case BlockBasedTableOptions::kTwoLevelIndexSearch: { - return PartitionedIndexBuilder::CreateIndexBuilder(comparator, table_opt); + result = PartitionedIndexBuilder::CreateIndexBuilder( + comparator, use_value_delta_encoding, table_opt); } + break; default: { assert(!"Do not recognize the index type "); - return nullptr; } + break; } - // impossible. 
- assert(false); - return nullptr; + return result; } PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt) { - return new PartitionedIndexBuilder(comparator, table_opt); + return new PartitionedIndexBuilder(comparator, table_opt, + use_value_delta_encoding); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt) + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding) : IndexBuilder(comparator), - index_block_builder_(table_opt.index_block_restart_interval), + index_block_builder_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(table_opt.index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), sub_index_builder_(nullptr), - table_opt_(table_opt) {} + table_opt_(table_opt), + // We start with false. After each partition we revise the value based on + // what the sub_index_builder has decided. If the feature is disabled + // entirely, this will be set to true after switching the first + // sub_index_builder. Otherwise, it could be set to true even if only one + // of the sub_index_builders cannot safely exclude seq from the keys; it + // will then be enforced on all sub_index_builders on ::Finish. + seperator_is_key_plus_seq_(false), + use_value_delta_encoding_(use_value_delta_encoding) {} PartitionedIndexBuilder::~PartitionedIndexBuilder() { delete sub_index_builder_; @@ -71,10 +94,15 @@ PartitionedIndexBuilder::~PartitionedIndexBuilder() { void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { assert(sub_index_builder_ == nullptr); sub_index_builder_ = new ShortenedIndexBuilder( - comparator_, table_opt_.index_block_restart_interval); + comparator_, table_opt_.index_block_restart_interval, + table_opt_.format_version, use_value_delta_encoding_); flush_policy_.reset(FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( table_opt_.metadata_block_size, table_opt_.block_size_deviation, - sub_index_builder_->index_block_builder_)); + // Note: this is sub-optimal since sub_index_builder_ could later reset + // seperator_is_key_plus_seq_ but the probability of that is low. + sub_index_builder_->seperator_is_key_plus_seq_ + ? 
sub_index_builder_->index_block_builder_ + : sub_index_builder_->index_block_builder_without_seq_)); partition_cut_requested_ = false; } @@ -93,6 +121,10 @@ void PartitionedIndexBuilder::AddIndexEntry( } sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } sub_index_last_key_ = std::string(*last_key_in_current_block); entries_.push_back( {sub_index_last_key_, @@ -121,67 +153,62 @@ void PartitionedIndexBuilder::AddIndexEntry( sub_index_builder_->AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); sub_index_last_key_ = std::string(*last_key_in_current_block); + if (sub_index_builder_->seperator_is_key_plus_seq_) { + // then we need to apply it to all sub-index builders + seperator_is_key_plus_seq_ = true; + } } } Status PartitionedIndexBuilder::Finish( IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) { - assert(!entries_.empty()); + if (partition_cnt_ == 0) { + partition_cnt_ = entries_.size(); + } // It must be set to null after last key is added assert(sub_index_builder_ == nullptr); if (finishing_indexes == true) { Entry& last_entry = entries_.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(last_entry.key, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(last_entry.key), + handle_encoding, + &handle_delta_encoding_slice); + } entries_.pop_front(); } // If there is no sub_index left, then return the 2nd level index. if (UNLIKELY(entries_.empty())) { - index_blocks->index_block_contents = index_block_builder_.Finish(); + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + top_level_index_size_ = index_blocks->index_block_contents.size(); + index_size_ += top_level_index_size_; return Status::OK(); } else { // Finish the next partition index in line and Incomplete() to indicate we // expect more calls to Finish Entry& entry = entries_.front(); + // Apply the policy to all sub-indexes + entry.value->seperator_is_key_plus_seq_ = seperator_is_key_plus_seq_; auto s = entry.value->Finish(index_blocks); + index_size_ += index_blocks->index_block_contents.size(); finishing_indexes = true; return s.ok() ? Status::Incomplete() : s; } } -// Estimate size excluding the top-level index -// It is assumed that this method is called before writing index partition -// starts -size_t PartitionedIndexBuilder::EstimatedSize() const { - size_t total = 0; - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - total += it->value->EstimatedSize(); - } - total += - sub_index_builder_ == nullptr ? 0 : sub_index_builder_->EstimatedSize(); - return total; -} - -// Since when this method is called we do not know the index block offsets yet, -// the top-level index does not exist. 
Hence we estimate the block offsets and -// create a temporary top-level index. -size_t PartitionedIndexBuilder::EstimateTopLevelIndexSize( - uint64_t offset) const { - BlockBuilder tmp_builder( - table_opt_.index_block_restart_interval); // tmp top-level index builder - for (auto it = entries_.begin(); it != entries_.end(); ++it) { - std::string tmp_handle_encoding; - uint64_t size = it->value->EstimatedSize(); - BlockHandle tmp_block_handle(offset, size); - tmp_block_handle.EncodeTo(&tmp_handle_encoding); - tmp_builder.Add(it->key, tmp_handle_encoding); - offset += size; - } - return tmp_builder.CurrentSizeEstimate(); -} - -size_t PartitionedIndexBuilder::NumPartitions() const { - return entries_.size(); -} +size_t PartitionedIndexBuilder::NumPartitions() const { return partition_cnt_; } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/index_builder.h b/thirdparty/rocksdb/table/index_builder.h index d591e0e533..87d7b7a71b 100644 --- a/thirdparty/rocksdb/table/index_builder.h +++ b/thirdparty/rocksdb/table/index_builder.h @@ -38,6 +38,7 @@ class IndexBuilder { BlockBasedTableOptions::IndexType index_type, const rocksdb::InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); // Index builder will construct a set of blocks which contain: @@ -69,7 +70,7 @@ class IndexBuilder { // This method will be called whenever a key is added. The subclasses may // override OnKeyAdded() if they need to collect additional information. - virtual void OnKeyAdded(const Slice& key) {} + virtual void OnKeyAdded(const Slice& /*key*/) {} // Inform the index builder that all entries has been written. Block builder // may therefore perform any operation required for block finalization. @@ -96,11 +97,15 @@ class IndexBuilder { virtual Status Finish(IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) = 0; - // Get the estimated size for index block. - virtual size_t EstimatedSize() const = 0; + // Get the size for index block. Must be called after ::Finish. + virtual size_t IndexSize() const = 0; + + virtual bool seperator_is_key_plus_seq() { return true; } protected: const InternalKeyComparator* comparator_; + // Set after ::Finish is called + size_t index_size_ = 0; }; // This index builder builds space-efficient index block. 
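For orientation, the partitioned index that PartitionedIndexBuilder::Finish() emits above is a two-level structure: a top-level block maps each partition's last key to that partition's handle, and a lookup binary-searches first the top level, then the selected partition. A compact sketch of that two-level search using std::map, with block handles reduced to integer ids:

#include <cassert>
#include <map>
#include <string>

// Top level: last key of each partition -> partition id.
// Partition:  last key of each data block -> data block id.
using Index = std::map<std::string, int>;

int Lookup(const Index& top, const std::map<int, Index>& partitions,
           const std::string& user_key) {
  // On both levels, the first entry whose last key is >= user_key covers it.
  auto t = top.lower_bound(user_key);
  if (t == top.end()) return -1;  // past the last partition
  const Index& part = partitions.at(t->second);
  auto b = part.lower_bound(user_key);
  return b == part.end() ? -1 : b->second;
}

int main() {
  Index top = {{"g", 0}, {"p", 1}};
  std::map<int, Index> parts = {
      {0, {{"c", 10}, {"g", 11}}},   // partition 0 covers keys <= "g"
      {1, {{"k", 12}, {"p", 13}}}};  // partition 1 covers keys <= "p"
  assert(Lookup(top, parts, "b") == 10);
  assert(Lookup(top, parts, "h") == 12);
  assert(Lookup(top, parts, "q") == -1);  // beyond every partition
}

This is also why the deleted EstimateTopLevelIndexSize() could be replaced by plain bookkeeping: once Finish() has written the real top-level block, its exact size is known and recorded in top_level_index_size_.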
@@ -115,9 +120,19 @@ class IndexBuilder { class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, - int index_block_restart_interval) + const int index_block_restart_interval, + const uint32_t format_version, + const bool use_value_delta_encoding) : IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval) {} + index_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding) { + // Making the default true will disable the feature for old versions + seperator_is_key_plus_seq_ = (format_version <= 2); + } virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -125,31 +140,60 @@ class ShortenedIndexBuilder : public IndexBuilder { if (first_key_in_next_block != nullptr) { comparator_->FindShortestSeparator(last_key_in_current_block, *first_key_in_next_block); + if (!seperator_is_key_plus_seq_ && + comparator_->user_comparator()->Compare( + ExtractUserKey(*last_key_in_current_block), + ExtractUserKey(*first_key_in_next_block)) == 0) { + seperator_is_key_plus_seq_ = true; + } } else { comparator_->FindShortSuccessor(last_key_in_current_block); } + auto sep = Slice(*last_key_in_current_block); std::string handle_encoding; block_handle.EncodeTo(&handle_encoding); - index_block_builder_.Add(*last_key_in_current_block, handle_encoding); + std::string handle_delta_encoding; + PutVarsignedint64(&handle_delta_encoding, + block_handle.size() - last_encoded_handle_.size()); + assert(handle_delta_encoding.size() != 0); + last_encoded_handle_ = block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_block_builder_.Add(sep, handle_encoding, + &handle_delta_encoding_slice); + if (!seperator_is_key_plus_seq_) { + index_block_builder_without_seq_.Add(ExtractUserKey(sep), handle_encoding, + &handle_delta_encoding_slice); + } } using IndexBuilder::Finish; virtual Status Finish( IndexBlocks* index_blocks, - const BlockHandle& last_partition_block_handle) override { - index_blocks->index_block_contents = index_block_builder_.Finish(); + const BlockHandle& /*last_partition_block_handle*/) override { + if (seperator_is_key_plus_seq_) { + index_blocks->index_block_contents = index_block_builder_.Finish(); + } else { + index_blocks->index_block_contents = + index_block_builder_without_seq_.Finish(); + } + index_size_ = index_blocks->index_block_contents.size(); return Status::OK(); } - virtual size_t EstimatedSize() const override { - return index_block_builder_.CurrentSizeEstimate(); + virtual size_t IndexSize() const override { return index_size_; } + + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; } friend class PartitionedIndexBuilder; private: BlockBuilder index_block_builder_; + BlockBuilder index_block_builder_without_seq_; + bool seperator_is_key_plus_seq_; + BlockHandle last_encoded_handle_; }; // HashIndexBuilder contains a binary-searchable primary index and the @@ -183,9 +227,11 @@ class HashIndexBuilder : public IndexBuilder { public: explicit HashIndexBuilder(const InternalKeyComparator* comparator, const SliceTransform* hash_key_extractor, - int index_block_restart_interval) + int index_block_restart_interval, + int format_version, bool use_value_delta_encoding) : IndexBuilder(comparator), - 
primary_index_builder_(comparator, index_block_restart_interval), + primary_index_builder_(comparator, index_block_restart_interval, + format_version, use_value_delta_encoding), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -226,7 +272,9 @@ class HashIndexBuilder : public IndexBuilder { virtual Status Finish( IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override { - FlushPendingPrefix(); + if (pending_block_num_ != 0) { + FlushPendingPrefix(); + } primary_index_builder_.Finish(index_blocks, last_partition_block_handle); index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); @@ -235,11 +283,15 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const override { - return primary_index_builder_.EstimatedSize() + prefix_block_.size() + + virtual size_t IndexSize() const override { + return primary_index_builder_.IndexSize() + prefix_block_.size() + prefix_meta_block_.size(); } + virtual bool seperator_is_key_plus_seq() override { + return primary_index_builder_.seperator_is_key_plus_seq(); + } + private: void FlushPendingPrefix() { prefix_block_.append(pending_entry_prefix_.data(), @@ -282,10 +334,12 @@ class PartitionedIndexBuilder : public IndexBuilder { public: static PartitionedIndexBuilder* CreateIndexBuilder( const rocksdb::InternalKeyComparator* comparator, + const bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt); explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt); + const BlockBasedTableOptions& table_opt, + const bool use_value_delta_encoding); virtual ~PartitionedIndexBuilder(); @@ -297,8 +351,8 @@ class PartitionedIndexBuilder : public IndexBuilder { IndexBlocks* index_blocks, const BlockHandle& last_partition_block_handle) override; - virtual size_t EstimatedSize() const override; - size_t EstimateTopLevelIndexSize(uint64_t) const; + virtual size_t IndexSize() const override { return index_size_; } + size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } size_t NumPartitions() const; inline bool ShouldCutFilterBlock() { @@ -316,7 +370,18 @@ class PartitionedIndexBuilder : public IndexBuilder { // cutting the next partition void RequestPartitionCut(); + virtual bool seperator_is_key_plus_seq() override { + return seperator_is_key_plus_seq_; + } + + bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + private: + // Set after ::Finish is called + size_t top_level_index_size_ = 0; + // Set after ::Finish is called + size_t partition_cnt_ = 0; + void MakeNewSubIndexBuilder(); struct Entry { @@ -325,6 +390,7 @@ class PartitionedIndexBuilder : public IndexBuilder { }; std::list entries_; // list of partitioned indexes and their keys BlockBuilder index_block_builder_; // top-level index builder + BlockBuilder index_block_builder_without_seq_; // same for user keys // the active partition index builder ShortenedIndexBuilder* sub_index_builder_; // the last key in the active partition index builder @@ -333,10 +399,13 @@ class PartitionedIndexBuilder : public IndexBuilder { // true if Finish is called once but not complete yet. 
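The use_value_delta_encoding path threaded through these builders stores, per index entry, only the signed difference between consecutive block-handle sizes (the PutVarsignedint64 call above): because data blocks are written back to back, each offset is recoverable as the previous offset plus the previous size. A sketch of that reconstruction, with zigzag encoding standing in for the signed varint (a simplified stand-in, not the exact RocksDB wire format):

#include <cassert>
#include <cstdint>
#include <vector>

// Zigzag maps signed deltas onto unsigned ints suitable for varint storage.
uint64_t ZigzagEncode(int64_t v) {
  return (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);
}
int64_t ZigzagDecode(uint64_t v) {
  return static_cast<int64_t>(v >> 1) ^ -static_cast<int64_t>(v & 1);
}

struct BlockHandle { uint64_t offset, size; };

int main() {
  std::vector<BlockHandle> handles = {{0, 400}, {400, 380}, {780, 450}};
  // Encode: the first handle in full, then one zigzag size-delta per block.
  std::vector<uint64_t> deltas;
  for (size_t i = 1; i < handles.size(); ++i) {
    deltas.push_back(ZigzagEncode(static_cast<int64_t>(handles[i].size) -
                                  static_cast<int64_t>(handles[i - 1].size)));
  }
  // Decode: offsets follow from contiguity, sizes from the deltas.
  BlockHandle cur = handles[0];
  for (size_t i = 0; i < deltas.size(); ++i) {
    cur.offset += cur.size;  // blocks are laid out back to back
    cur.size = static_cast<uint64_t>(static_cast<int64_t>(cur.size) +
                                     ZigzagDecode(deltas[i]));
    assert(cur.offset == handles[i + 1].offset && cur.size == handles[i + 1].size);
  }
}

The delta is signed because a later block can be smaller than its predecessor, which is why the builder uses PutVarsignedint64 rather than a plain varint.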
bool finishing_indexes = false; const BlockBasedTableOptions& table_opt_; + bool seperator_is_key_plus_seq_; + bool use_value_delta_encoding_; // true if an external entity (such as filter partition builder) request // cutting the next partition bool partition_cut_requested_ = true; // true if it should cut the next filter partition block bool cut_filter_block = false; + BlockHandle last_encoded_handle_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/internal_iterator.h b/thirdparty/rocksdb/table/internal_iterator.h index 2bfdb7d952..a173d60690 100644 --- a/thirdparty/rocksdb/table/internal_iterator.h +++ b/thirdparty/rocksdb/table/internal_iterator.h @@ -10,18 +10,21 @@ #include "rocksdb/comparator.h" #include "rocksdb/iterator.h" #include "rocksdb/status.h" +#include "table/format.h" namespace rocksdb { class PinnedIteratorsManager; -class InternalIterator : public Cleanable { +template +class InternalIteratorBase : public Cleanable { public: - InternalIterator() {} - virtual ~InternalIterator() {} + InternalIteratorBase() {} + virtual ~InternalIteratorBase() {} // An iterator is either positioned at a key/value pair, or // not valid. This method returns true iff the iterator is valid. + // Always returns false if !status().ok(). virtual bool Valid() const = 0; // Position at the first key in the source. The iterator is Valid() @@ -35,6 +38,9 @@ class InternalIterator : public Cleanable { // Position at the first key in the source that at or past target // The iterator is Valid() after this call iff the source contains // an entry that comes at or past target. + // All Seek*() methods clear any error status() that the iterator had prior to + // the call; after the seek, status() indicates only the error (if any) that + // happened during the seek, not any past errors. virtual void Seek(const Slice& target) = 0; // Position at the first key in the source that at or before target @@ -61,20 +67,25 @@ class InternalIterator : public Cleanable { // Return the value for the current entry. The underlying storage for // the returned slice is valid only until the next modification of // the iterator. - // REQUIRES: !AtEnd() && !AtStart() - virtual Slice value() const = 0; + // REQUIRES: Valid() + virtual TValue value() const = 0; // If an error has occurred, return it. Else return an ok status. // If non-blocking IO is requested and this operation cannot be // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; + // True if the iterator is invalidated because it is out of the iterator + // upper bound + virtual bool IsOutOfBound() { return false; } + // Pass the PinnedIteratorsManager to the Iterator, most Iterators dont // communicate with PinnedIteratorsManager so default implementation is no-op // but for Iterators that need to communicate with PinnedIteratorsManager // they will implement this function and use the passed pointer to communicate // with PinnedIteratorsManager. - virtual void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) {} + virtual void SetPinnedItersMgr(PinnedIteratorsManager* /*pinned_iters_mgr*/) { + } // If true, this means that the Slice returned by key() is valid as long as // PinnedIteratorsManager::ReleasePinnedData is not called and the @@ -91,7 +102,7 @@ class InternalIterator : public Cleanable { // Iterator is not deleted. 
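The sharpened contract documented in internal_iterator.h above, that Valid() is false whenever status() is not ok and that every Seek*() clears any earlier error, is easy to state as a toy iterator (illustrative only, far simpler than any real table iterator):

#include <cassert>
#include <string>
#include <vector>

class Status {
 public:
  static Status OK() { return Status(true); }
  static Status IOError() { return Status(false); }
  bool ok() const { return ok_; }
 private:
  explicit Status(bool ok) : ok_(ok) {}
  bool ok_;
};

class ToyIterator {
 public:
  explicit ToyIterator(std::vector<std::string> keys)
      : keys_(std::move(keys)), pos_(keys_.size()), status_(Status::OK()) {}

  // Valid() must be false whenever status() is not ok.
  bool Valid() const { return status_.ok() && pos_ < keys_.size(); }
  Status status() const { return status_; }

  void Seek(const std::string& target) {
    status_ = Status::OK();  // Seek clears any prior error
    pos_ = 0;
    while (pos_ < keys_.size() && keys_[pos_] < target) ++pos_;
  }

  void FailForTest() { status_ = Status::IOError(); }  // simulate a read error

 private:
  std::vector<std::string> keys_;
  size_t pos_;
  Status status_;
};

int main() {
  ToyIterator it({"a", "c", "e"});
  it.Seek("b");
  assert(it.Valid());
  it.FailForTest();
  assert(!it.Valid());  // an error implies !Valid()
  it.Seek("a");
  assert(it.status().ok() && it.Valid());  // the seek wiped the error
}

Callers can therefore check Valid() alone inside a loop and consult status() once at the end, which is exactly the pattern the merging iterator below relies on.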
virtual bool IsValuePinned() const { return false; } - virtual Status GetProperty(std::string prop_name, std::string* prop) { + virtual Status GetProperty(std::string /*prop_name*/, std::string* /*prop*/) { return Status::NotSupported(""); } @@ -108,14 +119,24 @@ private: // No copying allowed - InternalIterator(const InternalIterator&) = delete; - InternalIterator& operator=(const InternalIterator&) = delete; + InternalIteratorBase(const InternalIteratorBase&) = delete; + InternalIteratorBase& operator=(const InternalIteratorBase&) = delete; }; +using InternalIterator = InternalIteratorBase<Slice>; + // Return an empty iterator (yields nothing). -extern InternalIterator* NewEmptyInternalIterator(); +template <class TValue> +extern InternalIteratorBase<TValue>* NewEmptyInternalIterator(); // Return an empty iterator with the specified status. -extern InternalIterator* NewErrorInternalIterator(const Status& status); +template <class TValue> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status); + +// Return an empty iterator with the specified status, allocated arena. +template <class TValue> +extern InternalIteratorBase<TValue>* NewErrorInternalIterator( + const Status& status, Arena* arena); } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/iter_heap.h b/thirdparty/rocksdb/table/iter_heap.h index 74c06caeaf..f30c122722 100644 --- a/thirdparty/rocksdb/table/iter_heap.h +++ b/thirdparty/rocksdb/table/iter_heap.h @@ -6,7 +6,7 @@ #pragma once -#include "rocksdb/comparator.h" +#include "db/dbformat.h" #include "table/iterator_wrapper.h" namespace rocksdb { @@ -15,28 +15,28 @@ namespace rocksdb { // iterator with the max/largest key on top. class MaxIteratorComparator { public: - MaxIteratorComparator(const Comparator* comparator) : - comparator_(comparator) {} + MaxIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { return comparator_->Compare(a->key(), b->key()) < 0; } private: - const Comparator* comparator_; + const InternalKeyComparator* comparator_; }; // When used with std::priority_queue, this comparison functor puts the // iterator with the min/smallest key on top.
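Both heap comparators now take an InternalKeyComparator, which orders internal keys by ascending user key and then by descending sequence number, so the newest version of a user key sorts first. A sketch of that ordering on plain (user_key, seq) pairs (a simplification of the packed internal-key layout):

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

using InternalKey = std::pair<std::string, uint64_t>;  // (user key, sequence)

// < 0 if a orders before b: ascending user key, then descending sequence.
int CompareInternal(const InternalKey& a, const InternalKey& b) {
  int r = a.first.compare(b.first);
  if (r != 0) return r;
  if (a.second > b.second) return -1;  // newer version first
  if (a.second < b.second) return 1;
  return 0;
}

int main() {
  assert(CompareInternal({"apple", 7}, {"banana", 3}) < 0);  // user key dominates
  assert(CompareInternal({"apple", 9}, {"apple", 7}) < 0);   // seq 9 is newer, sorts first
  assert(CompareInternal({"apple", 7}, {"apple", 7}) == 0);
}

Comparing with a plain user-key Comparator would collapse distinct versions of the same key, which is why the merger_test changes below wrap every random key in an InternalKey before use.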
class MinIteratorComparator { public: - MinIteratorComparator(const Comparator* comparator) : - comparator_(comparator) {} + MinIteratorComparator(const InternalKeyComparator* comparator) + : comparator_(comparator) {} bool operator()(IteratorWrapper* a, IteratorWrapper* b) const { return comparator_->Compare(a->key(), b->key()) > 0; } private: - const Comparator* comparator_; + const InternalKeyComparator* comparator_; }; } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/iterator.cc b/thirdparty/rocksdb/table/iterator.cc index ed6a2cdea4..0475b9d134 100644 --- a/thirdparty/rocksdb/table/iterator.cc +++ b/thirdparty/rocksdb/table/iterator.cc @@ -103,20 +103,20 @@ Status Iterator::GetProperty(std::string prop_name, std::string* prop) { *prop = "0"; return Status::OK(); } - return Status::InvalidArgument("Undentified property."); + return Status::InvalidArgument("Unidentified property."); } namespace { class EmptyIterator : public Iterator { public: explicit EmptyIterator(const Status& s) : status_(s) { } - virtual bool Valid() const override { return false; } - virtual void Seek(const Slice& target) override {} - virtual void SeekForPrev(const Slice& target) override {} - virtual void SeekToFirst() override {} - virtual void SeekToLast() override {} - virtual void Next() override { assert(false); } - virtual void Prev() override { assert(false); } + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } Slice key() const override { assert(false); return Slice(); @@ -125,69 +125,86 @@ class EmptyIterator : public Iterator { assert(false); return Slice(); } - virtual Status status() const override { return status_; } + Status status() const override { return status_; } private: Status status_; }; -class EmptyInternalIterator : public InternalIterator { +template +class EmptyInternalIterator : public InternalIteratorBase { public: explicit EmptyInternalIterator(const Status& s) : status_(s) {} - virtual bool Valid() const override { return false; } - virtual void Seek(const Slice& target) override {} - virtual void SeekForPrev(const Slice& target) override {} - virtual void SeekToFirst() override {} - virtual void SeekToLast() override {} - virtual void Next() override { assert(false); } - virtual void Prev() override { assert(false); } + bool Valid() const override { return false; } + void Seek(const Slice& /*target*/) override {} + void SeekForPrev(const Slice& /*target*/) override {} + void SeekToFirst() override {} + void SeekToLast() override {} + void Next() override { assert(false); } + void Prev() override { assert(false); } Slice key() const override { assert(false); return Slice(); } - Slice value() const override { + TValue value() const override { assert(false); - return Slice(); + return TValue(); } - virtual Status status() const override { return status_; } + Status status() const override { return status_; } private: Status status_; }; } // namespace -Iterator* NewEmptyIterator() { - return new EmptyIterator(Status::OK()); -} +Iterator* NewEmptyIterator() { return new EmptyIterator(Status::OK()); } Iterator* NewErrorIterator(const Status& status) { return new EmptyIterator(status); } -InternalIterator* NewEmptyInternalIterator() { - return new EmptyInternalIterator(Status::OK()); +template +InternalIteratorBase* 
NewErrorInternalIterator(const Status& status) { + return new EmptyInternalIterator(status); } - -InternalIterator* NewEmptyInternalIterator(Arena* arena) { +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status); + +template +InternalIteratorBase* NewErrorInternalIterator(const Status& status, + Arena* arena) { if (arena == nullptr) { - return NewEmptyInternalIterator(); + return NewErrorInternalIterator(status); } else { - auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(Status::OK()); + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator)); + return new (mem) EmptyInternalIterator(status); } } - -InternalIterator* NewErrorInternalIterator(const Status& status) { - return new EmptyInternalIterator(status); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); +template InternalIteratorBase* NewErrorInternalIterator( + const Status& status, Arena* arena); + +template +InternalIteratorBase* NewEmptyInternalIterator() { + return new EmptyInternalIterator(Status::OK()); } +template InternalIteratorBase* NewEmptyInternalIterator(); +template InternalIteratorBase* NewEmptyInternalIterator(); -InternalIterator* NewErrorInternalIterator(const Status& status, Arena* arena) { +template +InternalIteratorBase* NewEmptyInternalIterator(Arena* arena) { if (arena == nullptr) { - return NewErrorInternalIterator(status); + return NewEmptyInternalIterator(); } else { - auto mem = arena->AllocateAligned(sizeof(EmptyIterator)); - return new (mem) EmptyInternalIterator(status); + auto mem = arena->AllocateAligned(sizeof(EmptyInternalIterator)); + return new (mem) EmptyInternalIterator(Status::OK()); } } +template InternalIteratorBase* NewEmptyInternalIterator( + Arena* arena); +template InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/iterator_wrapper.h b/thirdparty/rocksdb/table/iterator_wrapper.h index f14acdb9bf..5941b846a1 100644 --- a/thirdparty/rocksdb/table/iterator_wrapper.h +++ b/thirdparty/rocksdb/table/iterator_wrapper.h @@ -19,19 +19,21 @@ namespace rocksdb { // the valid() and key() results for an underlying iterator. // This can help avoid virtual function calls and also gives better // cache locality. -class IteratorWrapper { +template +class IteratorWrapperBase { public: - IteratorWrapper() : iter_(nullptr), valid_(false) {} - explicit IteratorWrapper(InternalIterator* _iter) : iter_(nullptr) { + IteratorWrapperBase() : iter_(nullptr), valid_(false) {} + explicit IteratorWrapperBase(InternalIteratorBase* _iter) + : iter_(nullptr) { Set(_iter); } - ~IteratorWrapper() {} - InternalIterator* iter() const { return iter_; } + ~IteratorWrapperBase() {} + InternalIteratorBase* iter() const { return iter_; } // Set the underlying Iterator to _iter and return // previous underlying Iterator. 
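IteratorWrapperBase caches the child's Valid() and key() results after every move, so the hot comparisons inside the merging heap read a plain bool and a cached key instead of making two virtual calls per probe. A stripped-down sketch of the caching pattern:

#include <cassert>
#include <string>
#include <vector>

struct Iter {  // stand-in for a virtual InternalIterator
  std::vector<std::string> keys;
  size_t pos = 0;
  virtual ~Iter() {}
  virtual bool Valid() const { return pos < keys.size(); }
  virtual std::string key() const { return keys[pos]; }
  virtual void Next() { ++pos; }
};

class Wrapper {
 public:
  explicit Wrapper(Iter* it) : iter_(it) { Update(); }
  // Hot path: no virtual dispatch, just the cached copies.
  bool Valid() const { return valid_; }
  const std::string& key() const { assert(valid_); return key_; }
  void Next() { iter_->Next(); Update(); }

 private:
  void Update() {  // refresh the cache after every move
    valid_ = iter_->Valid();
    if (valid_) key_ = iter_->key();
  }
  Iter* iter_;
  bool valid_ = false;
  std::string key_;
};

int main() {
  Iter it;
  it.keys = {"a", "b"};
  Wrapper w(&it);
  assert(w.Valid() && w.key() == "a");
  w.Next();
  assert(w.Valid() && w.key() == "b");
  w.Next();
  assert(!w.Valid());
}

The assert added to Update() in the diff below encodes the same invariant as the header contract: a wrapper only caches a key from a child whose status is ok.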
- InternalIterator* Set(InternalIterator* _iter) { - InternalIterator* old_iter = iter_; + InternalIteratorBase* Set(InternalIteratorBase* _iter) { + InternalIteratorBase* old_iter = iter_; iter_ = _iter; if (iter_ == nullptr) { @@ -47,7 +49,7 @@ class IteratorWrapper { if (!is_arena_mode) { delete iter_; } else { - iter_->~InternalIterator(); + iter_->~InternalIteratorBase(); } } } @@ -55,7 +57,10 @@ class IteratorWrapper { // Iterator interface methods bool Valid() const { return valid_; } Slice key() const { assert(Valid()); return key_; } - Slice value() const { assert(Valid()); return iter_->value(); } + TValue value() const { + assert(Valid()); + return iter_->value(); + } // Methods below require iter() != nullptr Status status() const { assert(iter_); return iter_->status(); } void Next() { assert(iter_); iter_->Next(); Update(); } @@ -87,20 +92,20 @@ class IteratorWrapper { valid_ = iter_->Valid(); if (valid_) { key_ = iter_->key(); + assert(iter_->status().ok()); } } - InternalIterator* iter_; + InternalIteratorBase* iter_; bool valid_; Slice key_; }; +using IteratorWrapper = IteratorWrapperBase; + class Arena; // Return an empty iterator (yields nothing) allocated from arena. -extern InternalIterator* NewEmptyInternalIterator(Arena* arena); - -// Return an empty iterator with the specified status, allocated arena. -extern InternalIterator* NewErrorInternalIterator(const Status& status, - Arena* arena); +template +extern InternalIteratorBase* NewEmptyInternalIterator(Arena* arena); } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/merger_test.cc b/thirdparty/rocksdb/table/merger_test.cc index 379a6f412d..1b04d06572 100644 --- a/thirdparty/rocksdb/table/merger_test.cc +++ b/thirdparty/rocksdb/table/merger_test.cc @@ -15,12 +15,18 @@ namespace rocksdb { class MergerTest : public testing::Test { public: MergerTest() - : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {} - ~MergerTest() = default; + : icomp_(BytewiseComparator()), + rnd_(3), + merging_iterator_(nullptr), + single_iterator_(nullptr) {} + ~MergerTest() override = default; std::vector GenerateStrings(size_t len, int string_len) { std::vector ret; + for (size_t i = 0; i < len; ++i) { - ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); + InternalKey ik(test::RandomHumanReadableString(&rnd_, string_len), 0, + ValueType::kTypeValue); + ret.push_back(ik.Encode().ToString(false)); } return ret; } @@ -37,7 +43,11 @@ class MergerTest : public testing::Test { } } - void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); } + void SeekToRandom() { + InternalKey ik(test::RandomHumanReadableString(&rnd_, 5), 0, + ValueType::kTypeValue); + Seek(ik.Encode().ToString(false)); + } void Seek(std::string target) { merging_iterator_->Seek(target); @@ -96,11 +106,12 @@ class MergerTest : public testing::Test { } merging_iterator_.reset( - NewMergingIterator(BytewiseComparator(), &small_iterators[0], + NewMergingIterator(&icomp_, &small_iterators[0], static_cast(small_iterators.size()))); single_iterator_.reset(new test::VectorIterator(all_keys_)); } + InternalKeyComparator icomp_; Random rnd_; std::unique_ptr merging_iterator_; std::unique_ptr single_iterator_; diff --git a/thirdparty/rocksdb/table/merging_iterator.cc b/thirdparty/rocksdb/table/merging_iterator.cc index da30e1e635..bd4a186b3c 100644 --- a/thirdparty/rocksdb/table/merging_iterator.cc +++ b/thirdparty/rocksdb/table/merging_iterator.cc @@ -10,6 +10,7 @@ #include "table/merging_iterator.h" #include #include +#include 
"db/dbformat.h" #include "db/pinned_iterators_manager.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/comparator.h" @@ -35,8 +36,9 @@ const size_t kNumIterReserve = 4; class MergingIterator : public InternalIterator { public: - MergingIterator(const Comparator* comparator, InternalIterator** children, - int n, bool is_arena_mode, bool prefix_seek_mode) + MergingIterator(const InternalKeyComparator* comparator, + InternalIterator** children, int n, bool is_arena_mode, + bool prefix_seek_mode) : is_arena_mode_(is_arena_mode), comparator_(comparator), current_(nullptr), @@ -50,12 +52,21 @@ class MergingIterator : public InternalIterator { } for (auto& child : children_) { if (child.Valid()) { + assert(child.status().ok()); minHeap_.push(&child); + } else { + considerStatus(child.status()); } } current_ = CurrentForward(); } + void considerStatus(Status s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + virtual void AddIterator(InternalIterator* iter) { assert(direction_ == kForward); children_.emplace_back(iter); @@ -64,46 +75,60 @@ class MergingIterator : public InternalIterator { } auto new_wrapper = children_.back(); if (new_wrapper.Valid()) { + assert(new_wrapper.status().ok()); minHeap_.push(&new_wrapper); current_ = CurrentForward(); + } else { + considerStatus(new_wrapper.status()); } } - virtual ~MergingIterator() { + ~MergingIterator() override { for (auto& child : children_) { child.DeleteIter(is_arena_mode_); } } - virtual bool Valid() const override { return (current_ != nullptr); } + bool Valid() const override { return current_ != nullptr && status_.ok(); } - virtual void SeekToFirst() override { + Status status() const override { return status_; } + + void SeekToFirst() override { ClearHeaps(); + status_ = Status::OK(); for (auto& child : children_) { child.SeekToFirst(); if (child.Valid()) { + assert(child.status().ok()); minHeap_.push(&child); + } else { + considerStatus(child.status()); } } direction_ = kForward; current_ = CurrentForward(); } - virtual void SeekToLast() override { + void SeekToLast() override { ClearHeaps(); InitMaxHeap(); + status_ = Status::OK(); for (auto& child : children_) { child.SeekToLast(); if (child.Valid()) { + assert(child.status().ok()); maxHeap_->push(&child); + } else { + considerStatus(child.status()); } } direction_ = kReverse; current_ = CurrentReverse(); } - virtual void Seek(const Slice& target) override { + void Seek(const Slice& target) override { ClearHeaps(); + status_ = Status::OK(); for (auto& child : children_) { { PERF_TIMER_GUARD(seek_child_seek_time); @@ -112,8 +137,11 @@ class MergingIterator : public InternalIterator { PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { + assert(child.status().ok()); PERF_TIMER_GUARD(seek_min_heap_time); minHeap_.push(&child); + } else { + considerStatus(child.status()); } } direction_ = kForward; @@ -123,9 +151,10 @@ class MergingIterator : public InternalIterator { } } - virtual void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& target) override { ClearHeaps(); InitMaxHeap(); + status_ = Status::OK(); for (auto& child : children_) { { @@ -135,8 +164,11 @@ class MergingIterator : public InternalIterator { PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { + assert(child.status().ok()); PERF_TIMER_GUARD(seek_max_heap_time); maxHeap_->push(&child); + } else { + considerStatus(child.status()); } } direction_ = kReverse; @@ -146,7 +178,7 @@ class MergingIterator : public InternalIterator { } } - virtual void 
Next() override { + void Next() override { assert(Valid()); // Ensure that all children are positioned after key(). @@ -154,21 +186,7 @@ class MergingIterator : public InternalIterator { // true for all of the non-current children since current_ is // the smallest child and key() == current_->key(). if (direction_ != kForward) { - // Otherwise, advance the non-current children. We advance current_ - // just after the if-block. - ClearHeaps(); - for (auto& child : children_) { - if (&child != current_) { - child.Seek(key()); - if (child.Valid() && comparator_->Equal(key(), child.key())) { - child.Next(); - } - } - if (child.Valid()) { - minHeap_.push(&child); - } - } - direction_ = kForward; + SwitchToForward(); // The loop advanced all non-current children to be > key() so current_ // should still be strictly the smallest key. assert(current_ == CurrentForward()); @@ -184,15 +202,17 @@ class MergingIterator : public InternalIterator { // current is still valid after the Next() call above. Call // replace_top() to restore the heap property. When the same child // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); minHeap_.replace_top(current_); } else { // current stopped being valid, remove it from the heap. + considerStatus(current_->status()); minHeap_.pop(); } current_ = CurrentForward(); } - virtual void Prev() override { + void Prev() override { assert(Valid()); // Ensure that all children are positioned before key(). // If we are moving in the reverse direction, it is already @@ -203,28 +223,19 @@ class MergingIterator : public InternalIterator { // just after the if-block. ClearHeaps(); InitMaxHeap(); + Slice target = key(); for (auto& child : children_) { if (&child != current_) { - if (!prefix_seek_mode_) { - child.Seek(key()); - if (child.Valid()) { - // Child is at first entry >= key(). Step back one to be < key() - TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", - &child); - child.Prev(); - } else { - // Child has no entries >= key(). Position at last entry. - TEST_SYNC_POINT("MergeIterator::Prev:BeforeSeekToLast"); - child.SeekToLast(); - } - } else { - child.SeekForPrev(key()); - if (child.Valid() && comparator_->Equal(key(), child.key())) { - child.Prev(); - } + child.SeekForPrev(target); + TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); + considerStatus(child.status()); + if (child.Valid() && comparator_->Equal(target, child.key())) { + child.Prev(); + considerStatus(child.status()); } } if (child.Valid()) { + assert(child.status().ok()); maxHeap_->push(&child); } } @@ -250,50 +261,40 @@ class MergingIterator : public InternalIterator { // current is still valid after the Prev() call above. Call // replace_top() to restore the heap property. When the same child // iterator yields a sequence of keys, this is cheap. + assert(current_->status().ok()); maxHeap_->replace_top(current_); } else { // current stopped being valid, remove it from the heap. 
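In Next() above, when the child that produced the current key is still valid after advancing, the merging iterator calls minHeap_.replace_top(), a single sift-down, rather than a pop followed by a push; when one child yields a run of consecutive keys this roughly halves the heap work. A sketch of a k-way merge built on that operation (a simplified heap, not rocksdb's BinaryHeap):

#include <algorithm>
#include <cassert>
#include <vector>

struct Source {
  std::vector<int> keys;
  size_t pos = 0;
  bool Valid() const { return pos < keys.size(); }
  int Key() const { return keys[pos]; }
};

// Min-heap of sources keyed by their current element, with a replace_top
// that sifts the (possibly advanced) top back down in one pass.
struct MinHeap {
  std::vector<Source*> v;
  static bool Greater(Source* a, Source* b) { return a->Key() > b->Key(); }
  void Push(Source* s) { v.push_back(s); std::push_heap(v.begin(), v.end(), Greater); }
  Source* Top() { return v.front(); }
  void Pop() { std::pop_heap(v.begin(), v.end(), Greater); v.pop_back(); }
  void ReplaceTop() {  // the top's key changed: one sift-down restores order
    size_t i = 0, n = v.size();
    while (true) {
      size_t small = i, l = 2 * i + 1, r = 2 * i + 2;
      if (l < n && v[l]->Key() < v[small]->Key()) small = l;
      if (r < n && v[r]->Key() < v[small]->Key()) small = r;
      if (small == i) break;
      std::swap(v[i], v[small]);
      i = small;
    }
  }
};

int main() {
  Source a{{1, 4, 7}}, b{{2, 5}}, c{{3, 6}};
  MinHeap heap;
  for (Source* s : {&a, &b, &c}) heap.Push(s);
  std::vector<int> merged;
  while (!heap.v.empty()) {
    Source* top = heap.Top();
    merged.push_back(top->Key());
    ++top->pos;                           // advance the winning child
    if (top->Valid()) heap.ReplaceTop();  // cheap: one sift-down
    else heap.Pop();                      // child exhausted
  }
  assert((merged == std::vector<int>{1, 2, 3, 4, 5, 6, 7}));
}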
+ considerStatus(current_->status()); maxHeap_->pop(); } current_ = CurrentReverse(); } - virtual Slice key() const override { + Slice key() const override { assert(Valid()); return current_->key(); } - virtual Slice value() const override { + Slice value() const override { assert(Valid()); return current_->value(); } - virtual Status status() const override { - Status s; - for (auto& child : children_) { - s = child.status(); - if (!s.ok()) { - break; - } - } - return s; - } - - virtual void SetPinnedItersMgr( - PinnedIteratorsManager* pinned_iters_mgr) override { + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; for (auto& child : children_) { child.SetPinnedItersMgr(pinned_iters_mgr); } } - virtual bool IsKeyPinned() const override { + bool IsKeyPinned() const override { assert(Valid()); return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && current_->IsKeyPinned(); } - virtual bool IsValuePinned() const override { + bool IsValuePinned() const override { assert(Valid()); return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && current_->IsValuePinned(); @@ -307,13 +308,15 @@ class MergingIterator : public InternalIterator { void InitMaxHeap(); bool is_arena_mode_; - const Comparator* comparator_; + const InternalKeyComparator* comparator_; autovector children_; // Cached pointer to child iterator with the current key, or nullptr if no // child iterators are valid. This is the top of minHeap_ or maxHeap_ // depending on the direction. IteratorWrapper* current_; + // If any of the children have non-ok status, this is one of them. + Status status_; // Which direction is the iterator moving? enum Direction { kForward, @@ -328,6 +331,8 @@ class MergingIterator : public InternalIterator { std::unique_ptr maxHeap_; PinnedIteratorsManager* pinned_iters_mgr_; + void SwitchToForward(); + IteratorWrapper* CurrentForward() const { assert(direction_ == kForward); return !minHeap_.empty() ? minHeap_.top() : nullptr; @@ -340,6 +345,27 @@ class MergingIterator : public InternalIterator { } }; +void MergingIterator::SwitchToForward() { + // Otherwise, advance the non-current children. We advance current_ + // just after the if-block. 
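considerStatus() gives the merging iterator a first-error-wins policy: it latches the first non-ok child status, and Valid() then reports false, replacing the old status() implementation that re-polled every child on each call. The accumulation pattern in isolation (Status reduced to a two-field sketch):

#include <cassert>
#include <string>

struct Status {
  bool ok_ = true;
  std::string msg;
  bool ok() const { return ok_; }
  static Status OK() { return {}; }
  static Status IOError(std::string m) { return {false, std::move(m)}; }
};

struct StatusAccumulator {
  Status status_ = Status::OK();
  // Keep only the first error; later errors and OKs are ignored.
  void Consider(const Status& s) {
    if (!s.ok() && status_.ok()) status_ = s;
  }
};

int main() {
  StatusAccumulator acc;
  acc.Consider(Status::OK());
  acc.Consider(Status::IOError("first"));
  acc.Consider(Status::IOError("second"));
  assert(!acc.status_.ok() && acc.status_.msg == "first");
}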
+ ClearHeaps(); + Slice target = key(); + for (auto& child : children_) { + if (&child != current_) { + child.Seek(target); + considerStatus(child.status()); + if (child.Valid() && comparator_->Equal(target, child.key())) { + child.Next(); + considerStatus(child.status()); + } + } + if (child.Valid()) { + minHeap_.push(&child); + } + } + direction_ = kForward; +} + void MergingIterator::ClearHeaps() { minHeap_.clear(); if (maxHeap_) { @@ -353,12 +379,12 @@ void MergingIterator::InitMaxHeap() { } } -InternalIterator* NewMergingIterator(const Comparator* cmp, +InternalIterator* NewMergingIterator(const InternalKeyComparator* cmp, InternalIterator** list, int n, Arena* arena, bool prefix_seek_mode) { assert(n >= 0); if (n == 0) { - return NewEmptyInternalIterator(arena); + return NewEmptyInternalIterator<Slice>(arena); } else if (n == 1) { return list[0]; } else { @@ -371,18 +397,28 @@ InternalIterator* NewMergingIterator(const Comparator* cmp, } } -MergeIteratorBuilder::MergeIteratorBuilder(const Comparator* comparator, - Arena* a, bool prefix_seek_mode) +MergeIteratorBuilder::MergeIteratorBuilder( + const InternalKeyComparator* comparator, Arena* a, bool prefix_seek_mode) : first_iter(nullptr), use_merging_iter(false), arena(a) { auto mem = arena->AllocateAligned(sizeof(MergingIterator)); merge_iter = new (mem) MergingIterator(comparator, nullptr, 0, true, prefix_seek_mode); } +MergeIteratorBuilder::~MergeIteratorBuilder() { + if (first_iter != nullptr) { + first_iter->~InternalIterator(); + } + if (merge_iter != nullptr) { + merge_iter->~MergingIterator(); + } +} + void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { if (!use_merging_iter && first_iter != nullptr) { merge_iter->AddIterator(first_iter); use_merging_iter = true; + first_iter = nullptr; } if (use_merging_iter) { merge_iter->AddIterator(iter); @@ -392,13 +428,15 @@ void MergeIteratorBuilder::AddIterator(InternalIterator* iter) { } InternalIterator* MergeIteratorBuilder::Finish() { + InternalIterator* ret = nullptr; if (!use_merging_iter) { - return first_iter; + ret = first_iter; + first_iter = nullptr; } else { - auto ret = merge_iter; + ret = merge_iter; merge_iter = nullptr; - return ret; } + return ret; } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/merging_iterator.h b/thirdparty/rocksdb/table/merging_iterator.h index 48a28d86fd..21ff79bf6b 100644 --- a/thirdparty/rocksdb/table/merging_iterator.h +++ b/thirdparty/rocksdb/table/merging_iterator.h @@ -9,14 +9,17 @@ #pragma once +#include "db/dbformat.h" #include "rocksdb/types.h" namespace rocksdb { class Comparator; -class InternalIterator; class Env; class Arena; +template <class TValue> +class InternalIteratorBase; +using InternalIterator = InternalIteratorBase<Slice>; // Return an iterator that provides the union of the data in // children[0,n-1]. Takes ownership of the child iterators and @@ -26,10 +29,9 @@ class Arena; // key is present in K child iterators, it will be yielded K times. // // REQUIRES: n >= 0 -extern InternalIterator* NewMergingIterator(const Comparator* comparator, - InternalIterator** children, int n, - Arena* arena = nullptr, - bool prefix_seek_mode = false); +extern InternalIterator* NewMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, int n, + Arena* arena = nullptr, bool prefix_seek_mode = false); class MergingIterator; @@ -38,9 +40,9 @@ class MergeIteratorBuilder { public: // comparator: the comparator used in merging // arena: where the merging iterator needs to be allocated from.
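For orientation on what MergingIterator does above: it is a k-way merge in which each child iterator sits in a heap keyed by its current entry, and Next() advances the top child and sifts it back into place (replace_top) instead of rebuilding the heap. A self-contained sketch of that core loop over plain int vectors; std::priority_queue has no replace_top, so pop-then-push stands in for the cheaper sift-down:

    #include <cstddef>
    #include <iostream>
    #include <queue>
    #include <vector>

    struct Cursor {
      const std::vector<int>* src;
      std::size_t pos;
      bool valid() const { return pos < src->size(); }
      int key() const { return (*src)[pos]; }
    };

    struct ByKey {
      bool operator()(const Cursor* a, const Cursor* b) const {
        return a->key() > b->key();  // inverted so the heap top is the minimum
      }
    };

    int main() {
      std::vector<int> a{1, 4, 7}, b{2, 5, 8}, c{3, 6, 9};
      std::vector<Cursor> children{{&a, 0}, {&b, 0}, {&c, 0}};
      std::priority_queue<Cursor*, std::vector<Cursor*>, ByKey> min_heap;
      for (Cursor& child : children) {
        if (child.valid()) min_heap.push(&child);
      }
      while (!min_heap.empty()) {
        Cursor* current = min_heap.top();   // child with the smallest key
        std::cout << current->key() << ' ';
        min_heap.pop();
        ++current->pos;                     // advance the winner...
        if (current->valid()) min_heap.push(current);  // ...and re-heapify
      }
      std::cout << '\n';                    // prints: 1 2 3 4 5 6 7 8 9
    }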
- explicit MergeIteratorBuilder(const Comparator* comparator, Arena* arena, - bool prefix_seek_mode = false); - ~MergeIteratorBuilder() {} + explicit MergeIteratorBuilder(const InternalKeyComparator* comparator, + Arena* arena, bool prefix_seek_mode = false); + ~MergeIteratorBuilder(); // Add iter to the merging iterator. void AddIterator(InternalIterator* iter); diff --git a/thirdparty/rocksdb/table/meta_blocks.cc b/thirdparty/rocksdb/table/meta_blocks.cc index 19925d7889..57111cfebf 100644 --- a/thirdparty/rocksdb/table/meta_blocks.cc +++ b/thirdparty/rocksdb/table/meta_blocks.cc @@ -11,6 +11,7 @@ #include "rocksdb/table.h" #include "rocksdb/table_properties.h" #include "table/block.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/internal_iterator.h" #include "table/persistent_cache_helper.h" @@ -37,8 +38,12 @@ Slice MetaIndexBuilder::Finish() { return meta_index_block_->Finish(); } +// Property block will be read sequentially and cached in a heap located +// object, so there's no need for restart points. Thus we set the restart +// interval to infinity to save space. PropertyBlockBuilder::PropertyBlockBuilder() - : properties_block_(new BlockBuilder(1 /* restart interval */)) {} + : properties_block_( + new BlockBuilder(port::kMaxInt32 /* restart interval */)) {} void PropertyBlockBuilder::Add(const std::string& name, const std::string& val) { @@ -70,7 +75,13 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kIndexPartitions, props.index_partitions); Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size); } + Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key); + Add(TablePropertiesNames::kIndexValueIsDeltaEncoded, + props.index_value_is_delta_encoded); Add(TablePropertiesNames::kNumEntries, props.num_entries); + Add(TablePropertiesNames::kDeletedKeys, props.num_deletions); + Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands); + Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions); Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks); Add(TablePropertiesNames::kFilterSize, props.filter_size); Add(TablePropertiesNames::kFormatVersion, props.format_version); @@ -104,6 +115,9 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { if (!props.compression_name.empty()) { Add(TablePropertiesNames::kCompression, props.compression_name); } + if (!props.compression_options.empty()) { + Add(TablePropertiesNames::kCompressionOptions, props.compression_options); + } } Slice PropertyBlockBuilder::Finish() { @@ -140,6 +154,16 @@ bool NotifyCollectTableCollectorsOnAdd( return all_succeeded; } +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector>& collectors, + const uint64_t blockRawBytes, const uint64_t blockCompressedBytesFast, + const uint64_t blockCompressedBytesSlow) { + for (auto& collector : collectors) { + collector->BlockAdd(blockRawBytes, blockCompressedBytesFast, + blockCompressedBytesSlow); + } +} + bool NotifyCollectTableCollectorsOnFinish( const std::vector>& collectors, Logger* info_log, PropertyBlockBuilder* builder) { @@ -163,7 +187,11 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ImmutableCFOptions& ioptions, - TableProperties** table_properties) { + TableProperties** table_properties, bool verify_checksum, + BlockHandle* 
ret_block_handle, + CacheAllocationPtr* verification_buf, + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { assert(table_properties); Slice v = handle_value; @@ -174,10 +202,17 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, BlockContents block_contents; ReadOptions read_options; - read_options.verify_checksums = false; + read_options.verify_checksums = verify_checksum; Status s; - s = ReadBlockContents(file, prefetch_buffer, footer, read_options, handle, - &block_contents, ioptions, false /* decompress */); + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, prefetch_buffer, footer, read_options, handle, &block_contents, + ioptions, false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = block_fetcher.ReadBlockContents(); + // property block is never compressed. Need to add uncompress logic if we are + // to compress it. if (!s.ok()) { return s; @@ -185,8 +220,9 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Block properties_block(std::move(block_contents), kDisableGlobalSequenceNumber); - BlockIter iter; - properties_block.NewIterator(BytewiseComparator(), &iter); + DataBlockIter iter; + properties_block.NewIterator<DataBlockIter>(BytewiseComparator(), + BytewiseComparator(), &iter); auto new_table_properties = new TableProperties(); // All pre-defined properties of type uint64_t @@ -197,6 +233,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, &new_table_properties->index_partitions}, {TablePropertiesNames::kTopLevelIndexSize, &new_table_properties->top_level_index_size}, + {TablePropertiesNames::kIndexKeyIsUserKey, + &new_table_properties->index_key_is_user_key}, + {TablePropertiesNames::kIndexValueIsDeltaEncoded, + &new_table_properties->index_value_is_delta_encoded}, {TablePropertiesNames::kFilterSize, &new_table_properties->filter_size}, {TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size}, {TablePropertiesNames::kRawValueSize, @@ -204,6 +244,12 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {TablePropertiesNames::kNumDataBlocks, &new_table_properties->num_data_blocks}, {TablePropertiesNames::kNumEntries, &new_table_properties->num_entries}, + {TablePropertiesNames::kDeletedKeys, + &new_table_properties->num_deletions}, + {TablePropertiesNames::kMergeOperands, + &new_table_properties->num_merge_operands}, + {TablePropertiesNames::kNumRangeDeletions, + &new_table_properties->num_range_deletions}, {TablePropertiesNames::kFormatVersion, &new_table_properties->format_version}, {TablePropertiesNames::kFixedKeyLen, @@ -217,16 +263,19 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, }; std::string last_key; - for (iter.SeekToFirst(); iter.Valid(); iter.Next()) { + for (iter.SeekToFirstOrReport(); iter.Valid(); iter.NextOrReport()) { s = iter.status(); if (!s.ok()) { break; } auto key = iter.key().ToString(); - // properties block is strictly sorted with no duplicate key. - assert(last_key.empty() || - BytewiseComparator()->Compare(key, last_key) > 0); + // properties block should be strictly sorted with no duplicate key.
+ if (!last_key.empty() && + BytewiseComparator()->Compare(key, last_key) <= 0) { + s = Status::Corruption("properties unsorted"); + break; + } last_key = key; auto raw_val = iter.value(); @@ -236,6 +285,12 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, {key, handle.offset() + iter.ValueOffset()}); if (pos != predefined_uint64_properties.end()) { + if (key == TablePropertiesNames::kDeletedKeys || + key == TablePropertiesNames::kMergeOperands) { + // Insert in user-collected properties for API backwards compatibility + new_table_properties->user_collected_properties.insert( + {key, raw_val.ToString()}); + } // handle predefined rocksdb properties uint64_t val; if (!GetVarint64(&raw_val, &val)) { @@ -261,6 +316,8 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, new_table_properties->property_collectors_names = raw_val.ToString(); } else if (key == TablePropertiesNames::kCompression) { new_table_properties->compression_name = raw_val.ToString(); + } else if (key == TablePropertiesNames::kCompressionOptions) { + new_table_properties->compression_options = raw_val.ToString(); } else { // handle user-collected properties new_table_properties->user_collected_properties.insert( @@ -269,6 +326,16 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, } if (s.ok()) { *table_properties = new_table_properties; + if (ret_block_handle != nullptr) { + *ret_block_handle = handle; + } + if (verification_buf != nullptr) { + size_t len = handle.size() + kBlockTrailerSize; + *verification_buf = rocksdb::AllocateBlock(len, memory_allocator); + if (verification_buf->get() != nullptr) { + memcpy(verification_buf->get(), block_contents.data.data(), len); + } + } } else { delete new_table_properties; } @@ -278,8 +345,10 @@ Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, - TableProperties** properties) { + const ImmutableCFOptions& ioptions, + TableProperties** properties, + bool compression_type_missing, + MemoryAllocator* memory_allocator) { // -- Read metaindex block Footer footer; auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, @@ -292,16 +361,24 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, nullptr /* prefetch_buffer */, footer, - read_options, metaindex_handle, &metaindex_contents, - ioptions, false /* decompress */); + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); + s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } + // property blocks are never compressed. Need to add uncompress logic if we + // are to compress it. 
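The corruption check that opens this hunk is the substantive fix here: the assert it replaces compiled away in release builds, letting an unsorted (i.e. corrupt) properties block parse silently. A sketch of the same validate-don't-assert pattern, with plain std::string comparison standing in for BytewiseComparator():

    #include <string>
    #include <vector>

    // Returns an error message, or an empty string when the keys are strictly
    // increasing. On-disk input is untrusted, so this must not be an assert.
    std::string CheckStrictlySorted(const std::vector<std::string>& keys) {
      std::string last_key;
      for (const std::string& key : keys) {
        // Strictly increasing: duplicates are rejected as well.
        if (!last_key.empty() && key.compare(last_key) <= 0) {
          return "properties unsorted";  // the diff returns Status::Corruption
        }
        last_key = key;
      }
      return std::string();
    }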
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); std::unique_ptr<InternalIterator> meta_iter( - metaindex_block.NewIterator(BytewiseComparator())); + metaindex_block.NewIterator<DataBlockIter>(BytewiseComparator(), + BytewiseComparator())); // -- Read property block bool found_properties_block = true; @@ -312,8 +389,11 @@ Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, TableProperties table_properties; if (found_properties_block == true) { - s = ReadProperties(meta_iter->value(), file, nullptr /* prefetch_buffer */, - footer, ioptions, properties); + s = ReadProperties( + meta_iter->value(), file, nullptr /* prefetch_buffer */, footer, + ioptions, properties, false /* verify_checksum */, + nullptr /* ret_block_handle */, nullptr /* ret_block_contents */, + compression_type_missing, memory_allocator); } else { s = Status::NotFound(); } @@ -336,9 +416,11 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, - BlockHandle* block_handle) { + BlockHandle* block_handle, + bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { Footer footer; auto s = ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, &footer, table_magic_number); @@ -350,17 +432,24 @@ Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, nullptr /* prefetch_buffer */, footer, - read_options, metaindex_handle, &metaindex_contents, - ioptions, false /* do decompression */); + PersistentCacheOptions cache_options; + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, + metaindex_handle, &metaindex_contents, ioptions, + false /* do decompression */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + s = block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } + // meta blocks are never compressed. Need to add uncompress logic if we are to + // compress it.
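Each converted call site above follows the same refactor: the ReadBlockContents() free function with its long positional argument list is replaced by a BlockFetcher object that is configured once and then run. A rough sketch of that parameter-object shape, using hypothetical stand-in types rather than the real BlockFetcher interface:

    #include <cstddef>
    #include <cstdint>
    #include <string>

    // Hypothetical stand-ins; the real fetcher is configured with a footer,
    // block handle, read options, uncompression dict, and memory allocator.
    struct ReadRequest {
      uint64_t offset = 0;
      uint64_t size = 0;
      bool verify_checksums = false;
    };

    class Fetcher {
     public:
      explicit Fetcher(const ReadRequest& req) : req_(req) {}
      // One entry point instead of an 8+-argument free function; new knobs
      // can be added to ReadRequest without touching every caller.
      std::string ReadBlockContents() const {
        // A real implementation would read size bytes at offset and verify
        // the block trailer checksum when requested.
        return std::string(static_cast<std::size_t>(req_.size), '\0');
      }
     private:
      ReadRequest req_;
    };

Bundling the inputs this way is what lets later hunks thread new options such as memory_allocator through without rewriting every caller.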
Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); std::unique_ptr<InternalIterator> meta_iter; - meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); + meta_iter.reset(metaindex_block.NewIterator<DataBlockIter>( + BytewiseComparator(), BytewiseComparator())); return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } @@ -370,7 +459,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, const std::string& meta_block_name, - BlockContents* contents) { + BlockContents* contents, bool /*compression_type_missing*/, + MemoryAllocator* memory_allocator) { Status status; Footer footer; status = ReadFooterFromFile(file, prefetch_buffer, file_size, &footer, @@ -384,19 +474,27 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - status = ReadBlockContents(file, prefetch_buffer, footer, read_options, + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, &metaindex_contents, ioptions, - false /* decompress */); + false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, + memory_allocator); + status = block_fetcher.ReadBlockContents(); if (!status.ok()) { return status; } + // meta block is never compressed. Need to add uncompress logic if we are to + // compress it. // Finding metablock Block metaindex_block(std::move(metaindex_contents), kDisableGlobalSequenceNumber); std::unique_ptr<InternalIterator> meta_iter; - meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); + meta_iter.reset(metaindex_block.NewIterator<DataBlockIter>( + BytewiseComparator(), BytewiseComparator())); BlockHandle block_handle; status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); @@ -406,9 +504,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file, } // Reading metablock - return ReadBlockContents(file, prefetch_buffer, footer, read_options, - block_handle, contents, ioptions, - false /* decompress */); + BlockFetcher block_fetcher2( + file, prefetch_buffer, footer, read_options, block_handle, contents, + ioptions, false /* decompress */, false /*maybe_compressed*/, + UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + return block_fetcher2.ReadBlockContents(); } } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/meta_blocks.h b/thirdparty/rocksdb/table/meta_blocks.h index 220985d9e1..6efd1225e1 100644 --- a/thirdparty/rocksdb/table/meta_blocks.h +++ b/thirdparty/rocksdb/table/meta_blocks.h @@ -11,12 +11,13 @@ #include "db/builder.h" #include "db/table_properties_collector.h" -#include "util/kv_map.h" #include "rocksdb/comparator.h" +#include "rocksdb/memory_allocator.h" #include "rocksdb/options.h" #include "rocksdb/slice.h" #include "table/block_builder.h" #include "table/format.h" +#include "util/kv_map.h" namespace rocksdb { @@ -27,7 +28,6 @@ class Footer; class Logger; class RandomAccessFile; struct TableProperties; -class InternalIterator; class MetaIndexBuilder { public: @@ -83,6 +83,11 @@ bool NotifyCollectTableCollectorsOnAdd( const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, Logger* info_log); +void NotifyCollectTableCollectorsOnBlockAdd( + const std::vector<std::unique_ptr<IntTblPropCollector>>& collectors, + uint64_t blockRawBytes, uint64_t blockCompressedBytesFast, + uint64_t blockCompressedBytesSlow); + // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors.
The collected properties will be added to `builder`. bool NotifyCollectTableCollectorsOnFinish( @@ -96,16 +101,26 @@ bool NotifyCollectTableCollectorsOnFinish( Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, const Footer& footer, const ImmutableCFOptions& ioptions, - TableProperties** table_properties); + TableProperties** table_properties, bool verify_checksum, + BlockHandle* block_handle, + CacheAllocationPtr* verification_buf, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Directly read the properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. +// certain tables do not have compression_type byte setup properly for +// uncompressed blocks, caller can request to reset compression type by +// passing compression_type_missing = true, the same applies to +// `ReadProperties`, `FindMetaBlock`, and `ReadMetaBlock` Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, - TableProperties** properties); + const ImmutableCFOptions& ioptions, + TableProperties** properties, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Find the meta block from the meta index block. Status FindMetaBlock(InternalIterator* meta_index_iter, @@ -115,9 +130,11 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, // Find the meta block Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, - const ImmutableCFOptions &ioptions, + const ImmutableCFOptions& ioptions, const std::string& meta_block_name, - BlockHandle* block_handle); + BlockHandle* block_handle, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. 
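Note how the new `compression_type_missing` and `memory_allocator` parameters are appended with defaults throughout these declarations, so existing call sites keep compiling with their old behavior. The technique in miniature, on a hypothetical function:

    #include <cstddef>

    struct Allocator;  // opaque; nullptr means "use the global allocator"

    // Version 1 took (data, len). New knobs are appended with defaults, so
    // old call sites Parse(p, n) still compile and behave as before.
    bool Parse(const char* data, std::size_t len,
               bool tolerate_missing_type = false,
               Allocator* allocator = nullptr) {
      (void)tolerate_missing_type;
      (void)allocator;
      return data != nullptr && len > 0;
    }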
@@ -127,6 +144,8 @@ Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t table_magic_number, const ImmutableCFOptions& ioptions, const std::string& meta_block_name, - BlockContents* contents); + BlockContents* contents, + bool compression_type_missing = false, + MemoryAllocator* memory_allocator = nullptr); } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/mock_table.cc b/thirdparty/rocksdb/table/mock_table.cc index 86c380865c..65a4361696 100644 --- a/thirdparty/rocksdb/table/mock_table.cc +++ b/thirdparty/rocksdb/table/mock_table.cc @@ -26,14 +26,16 @@ stl_wrappers::KVMap MakeMockFile( return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); } -InternalIterator* MockTableReader::NewIterator(const ReadOptions&, - Arena* arena, - bool skip_filters) { +InternalIterator* MockTableReader::NewIterator( + const ReadOptions&, const SliceTransform* /* prefix_extractor */, + Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) { return new MockTableIterator(table_); } Status MockTableReader::Get(const ReadOptions&, const Slice& key, - GetContext* get_context, bool skip_filters) { + GetContext* get_context, + const SliceTransform* /*prefix_extractor*/, + bool /*skip_filters*/) { std::unique_ptr iter(new MockTableIterator(table_)); for (iter->Seek(key); iter->Valid(); iter->Next()) { ParsedInternalKey parsed_key; @@ -41,7 +43,8 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key, return Status::Corruption(Slice()); } - if (!get_context->SaveValue(parsed_key, iter->value())) { + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) { break; } } @@ -56,10 +59,10 @@ std::shared_ptr MockTableReader::GetTableProperties() MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( - const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, - bool prefetch_index_and_filter_in_cache) const { + const TableReaderOptions& /*table_reader_options*/, + std::unique_ptr&& file, uint64_t /*file_size*/, + std::unique_ptr* table_reader, + bool /*prefetch_index_and_filter_in_cache*/) const { uint32_t id = GetIDFromFile(file.get()); MutexLock lock_guard(&file_system_.mutex); @@ -75,8 +78,8 @@ Status MockTableFactory::NewTableReader( } TableBuilder* MockTableFactory::NewTableBuilder( - const TableBuilderOptions& table_builder_options, uint32_t column_family_id, - WritableFileWriter* file) const { + const TableBuilderOptions& /*table_builder_options*/, + uint32_t /*column_family_id*/, WritableFileWriter* file) const { uint32_t id = GetAndWriteNextID(file); return new MockTableBuilder(id, &file_system_); @@ -90,7 +93,7 @@ Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, return s; } - WritableFileWriter file_writer(std::move(file), EnvOptions()); + WritableFileWriter file_writer(std::move(file), fname, EnvOptions()); uint32_t id = GetAndWriteNextID(&file_writer); file_system_.files.insert({id, std::move(file_contents)}); diff --git a/thirdparty/rocksdb/table/mock_table.h b/thirdparty/rocksdb/table/mock_table.h index 71609a173f..2f123a963c 100644 --- a/thirdparty/rocksdb/table/mock_table.h +++ b/thirdparty/rocksdb/table/mock_table.h @@ -39,13 +39,16 @@ class MockTableReader : public TableReader { explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} InternalIterator* NewIterator(const ReadOptions&, - Arena* arena, - bool skip_filters = false) override; + const 
SliceTransform* prefix_extractor, + Arena* arena = nullptr, + bool skip_filters = false, + bool for_compaction = false) override; - Status Get(const ReadOptions&, const Slice& key, GetContext* get_context, + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + uint64_t ApproximateOffsetOf(const Slice& /*key*/) override { return 0; } virtual size_t ApproximateMemoryUsage() const override { return 0; } @@ -154,8 +157,8 @@ class MockTableFactory : public TableFactory { const char* Name() const override { return "MockTable"; } Status NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache = true) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, @@ -168,8 +171,8 @@ class MockTableFactory : public TableFactory { stl_wrappers::KVMap file_contents); virtual Status SanitizeOptions( - const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } diff --git a/thirdparty/rocksdb/table/partitioned_filter_block.cc b/thirdparty/rocksdb/table/partitioned_filter_block.cc index 202245939f..aab0f5509b 100644 --- a/thirdparty/rocksdb/table/partitioned_filter_block.cc +++ b/thirdparty/rocksdb/table/partitioned_filter_block.cc @@ -5,6 +5,13 @@ #include "table/partitioned_filter_block.h" +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else +#include +#endif +#endif #include #include "monitoring/perf_context_imp.h" @@ -19,13 +26,20 @@ namespace rocksdb { PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size) : FullFilterBlockBuilder(prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval), + index_on_filter_block_builder_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), + index_on_filter_block_builder_without_seq_(index_block_restart_interval, + true /*use_delta_encoding*/, + use_value_delta_encoding), p_index_builder_(p_index_builder), - filters_in_partition_(0) { + filters_in_partition_(0), + num_added_(0) { filters_per_partition_ = filter_bits_builder_->CalculateNumEntry(partition_size); } @@ -47,12 +61,14 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock() { std::string& index_key = p_index_builder_->GetPartitionKey(); filters.push_back({index_key, filter}); filters_in_partition_ = 0; + Reset(); } void PartitionedFilterBlockBuilder::AddKey(const Slice& key) { MaybeCutAFilterBlock(); filter_bits_builder_->AddKey(key); filters_in_partition_++; + num_added_++; } Slice PartitionedFilterBlockBuilder::Finish( @@ -62,7 +78,19 @@ Slice PartitionedFilterBlockBuilder::Finish( FilterEntry& last_entry = filters.front(); std::string handle_encoding; last_partition_block_handle.EncodeTo(&handle_encoding); - index_on_filter_block_builder_.Add(last_entry.key, handle_encoding); + 
std::string handle_delta_encoding; + PutVarsignedint64( + &handle_delta_encoding, + last_partition_block_handle.size() - last_encoded_handle_.size()); + last_encoded_handle_ = last_partition_block_handle; + const Slice handle_delta_encoding_slice(handle_delta_encoding); + index_on_filter_block_builder_.Add(last_entry.key, handle_encoding, + &handle_delta_encoding_slice); + if (!p_index_builder_->seperator_is_key_plus_seq()) { + index_on_filter_block_builder_without_seq_.Add( + ExtractUserKey(last_entry.key), handle_encoding, + &handle_delta_encoding_slice); + } filters.pop_front(); } else { MaybeCutAFilterBlock(); @@ -72,7 +100,11 @@ Slice PartitionedFilterBlockBuilder::Finish( if (UNLIKELY(filters.empty())) { *status = Status::OK(); if (finishing_filters) { - return index_on_filter_block_builder_.Finish(); + if (p_index_builder_->seperator_is_key_plus_seq()) { + return index_on_filter_block_builder_.Finish(); + } else { + return index_on_filter_block_builder_without_seq_.Finish(); + } } else { // This is the rare case where no key was added to the filter return Slice(); @@ -88,13 +120,16 @@ Slice PartitionedFilterBlockBuilder::Finish( PartitionedFilterBlockReader::PartitionedFilterBlockReader( const SliceTransform* prefix_extractor, bool _whole_key_filtering, - BlockContents&& contents, FilterBitsReader* filter_bits_reader, - Statistics* stats, const Comparator& comparator, - const BlockBasedTable* table) + BlockContents&& contents, FilterBitsReader* /*filter_bits_reader*/, + Statistics* stats, const InternalKeyComparator comparator, + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full) : FilterBlockReader(contents.data.size(), stats, _whole_key_filtering), prefix_extractor_(prefix_extractor), comparator_(comparator), - table_(table) { + table_(table), + index_key_includes_seq_(index_key_includes_seq), + index_value_is_full_(index_value_is_full) { idx_on_fltr_blk_.reset(new Block(std::move(contents), kDisableGlobalSequenceNumber, 0 /* read_amp_bytes_per_bit */, stats)); @@ -109,17 +144,15 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { return; } char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - BlockIter biter; + IndexBlockIter biter; BlockHandle handle; - idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator( + &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); biter.SeekToFirst(); for (; biter.Valid(); biter.Next()) { - auto input = biter.value(); - auto s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - continue; - } + handle = biter.value(); auto key = BlockBasedTable::GetCacheKey(table_->rep_->cache_key_prefix, table_->rep_->cache_key_prefix_size, handle, cache_key); @@ -128,7 +161,8 @@ PartitionedFilterBlockReader::~PartitionedFilterBlockReader() { } bool PartitionedFilterBlockReader::KeyMayMatch( - const Slice& key, uint64_t block_offset, const bool no_io, + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr) { assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); @@ -143,12 +177,14 @@ bool PartitionedFilterBlockReader::KeyMayMatch( return false; } bool cached = false; - auto filter_partition = GetFilterPartition(nullptr /* prefetch_buffer */, - &filter_handle, no_io, &cached); + auto filter_partition = + GetFilterPartition(nullptr 
/* prefetch_buffer */, filter_handle, no_io, + &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; } - auto res = filter_partition.value->KeyMayMatch(key, block_offset, no_io); + auto res = filter_partition.value->KeyMayMatch(key, prefix_extractor, + block_offset, no_io); if (cached) { return res; } @@ -161,11 +197,15 @@ bool PartitionedFilterBlockReader::KeyMayMatch( } bool PartitionedFilterBlockReader::PrefixMayMatch( - const Slice& prefix, uint64_t block_offset, const bool no_io, + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset, const bool no_io, const Slice* const const_ikey_ptr) { +#ifdef NDEBUG + (void)block_offset; +#endif assert(const_ikey_ptr != nullptr); assert(block_offset == kNotValid); - if (!prefix_extractor_) { + if (!prefix_extractor_ && !prefix_extractor) { return true; } if (UNLIKELY(idx_on_fltr_blk_->size() == 0)) { @@ -176,12 +216,14 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return false; } bool cached = false; - auto filter_partition = GetFilterPartition(nullptr /* prefetch_buffer */, - &filter_handle, no_io, &cached); + auto filter_partition = + GetFilterPartition(nullptr /* prefetch_buffer */, filter_handle, no_io, + &cached, prefix_extractor); if (UNLIKELY(!filter_partition.value)) { return true; } - auto res = filter_partition.value->PrefixMayMatch(prefix, kNotValid, no_io); + auto res = filter_partition.value->PrefixMayMatch(prefix, prefix_extractor, + kNotValid, no_io); if (cached) { return res; } @@ -193,26 +235,26 @@ bool PartitionedFilterBlockReader::PrefixMayMatch( return res; } -Slice PartitionedFilterBlockReader::GetFilterPartitionHandle( +BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( const Slice& entry) { - BlockIter iter; - idx_on_fltr_blk_->NewIterator(&comparator_, &iter, true); + IndexBlockIter iter; + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator( + &comparator_, comparator_.user_comparator(), &iter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { - return Slice(); + return BlockHandle(0, 0); } assert(iter.Valid()); - Slice handle_value = iter.value(); - return handle_value; + BlockHandle fltr_blk_handle = iter.value(); + return fltr_blk_handle; } BlockBasedTable::CachableEntry PartitionedFilterBlockReader::GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle_value, const bool no_io, - bool* cached) { - BlockHandle fltr_blk_handle; - auto s = fltr_blk_handle.DecodeFrom(handle_value); - assert(s.ok()); + FilePrefetchBuffer* prefetch_buffer, BlockHandle& fltr_blk_handle, + const bool no_io, bool* cached, const SliceTransform* prefix_extractor) { const bool is_a_filter_partition = true; auto block_cache = table_->rep_->table_options.block_cache.get(); if (LIKELY(block_cache != nullptr)) { @@ -231,74 +273,76 @@ PartitionedFilterBlockReader::GetFilterPartition( } } return table_->GetFilter(/*prefetch_buffer*/ nullptr, fltr_blk_handle, - is_a_filter_partition, no_io); + is_a_filter_partition, no_io, + /* get_context */ nullptr, prefix_extractor); } else { auto filter = table_->ReadFilter(prefetch_buffer, fltr_blk_handle, - is_a_filter_partition); + is_a_filter_partition, prefix_extractor); return {filter, nullptr}; } } size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { - return idx_on_fltr_blk_->size(); + size_t usage = idx_on_fltr_blk_->usable_size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + usage += malloc_usable_size((void*)this); 
+#else + usage += sizeof(*this); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + return usage; + // TODO(myabandeh): better estimation for filter_map_ size +} + +// Release the cached entry and decrement its ref count. +void ReleaseFilterCachedEntry(void* arg, void* h) { + Cache* cache = reinterpret_cast(arg); + Cache::Handle* handle = reinterpret_cast(h); + cache->Release(handle); } // TODO(myabandeh): merge this with the same function in IndexReader -void PartitionedFilterBlockReader::CacheDependencies(bool pin) { +void PartitionedFilterBlockReader::CacheDependencies( + bool pin, const SliceTransform* prefix_extractor) { // Before read partitions, prefetch them to avoid lots of IOs auto rep = table_->rep_; - BlockIter biter; - BlockHandle handle; - idx_on_fltr_blk_->NewIterator(&comparator_, &biter, true); + IndexBlockIter biter; + Statistics* kNullStats = nullptr; + idx_on_fltr_blk_->NewIterator( + &comparator_, comparator_.user_comparator(), &biter, kNullStats, true, + index_key_includes_seq_, index_value_is_full_); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); - Slice input = biter.value(); - Status s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read first index partition"); - return; - } + BlockHandle handle = biter.value(); uint64_t prefetch_off = handle.offset(); // Read the last block's offset biter.SeekToLast(); - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, - "Could not read last index partition"); - return; - } + handle = biter.value(); uint64_t last_off = handle.offset() + handle.size() + kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; auto& file = table_->rep_->file; prefetch_buffer.reset(new FilePrefetchBuffer()); - s = prefetch_buffer->Prefetch(file.get(), prefetch_off, prefetch_len); + Status s; + s = prefetch_buffer->Prefetch(file.get(), prefetch_off, + static_cast(prefetch_len)); // After prefetch, read the partitions one by one biter.SeekToFirst(); Cache* block_cache = rep->table_options.block_cache.get(); for (; biter.Valid(); biter.Next()) { - input = biter.value(); - s = handle.DecodeFrom(&input); - assert(s.ok()); - if (!s.ok()) { - ROCKS_LOG_WARN(rep->ioptions.info_log, "Could not read index partition"); - continue; - } - + handle = biter.value(); const bool no_io = true; const bool is_a_filter_partition = true; - auto filter = table_->GetFilter(prefetch_buffer.get(), handle, - is_a_filter_partition, !no_io); + auto filter = table_->GetFilter( + prefetch_buffer.get(), handle, is_a_filter_partition, !no_io, + /* get_context */ nullptr, prefix_extractor); if (LIKELY(filter.IsSet())) { if (pin) { filter_map_[handle.offset()] = std::move(filter); + RegisterCleanup(&ReleaseFilterCachedEntry, block_cache, + filter.cache_handle); } else { block_cache->Release(filter.cache_handle); } diff --git a/thirdparty/rocksdb/table/partitioned_filter_block.h b/thirdparty/rocksdb/table/partitioned_filter_block.h index 1a00a86e6c..5d55da5449 100644 --- a/thirdparty/rocksdb/table/partitioned_filter_block.h +++ b/thirdparty/rocksdb/table/partitioned_filter_block.h @@ -26,6 +26,7 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { explicit PartitionedFilterBlockBuilder( const SliceTransform* prefix_extractor, bool whole_key_filtering, FilterBitsBuilder* filter_bits_builder, int 
index_block_restart_interval, + const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, const uint32_t partition_size); @@ -33,12 +34,16 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { void AddKey(const Slice& key) override; + size_t NumAdded() const override { return num_added_; } + virtual Slice Finish(const BlockHandle& last_partition_block_handle, Status* status) override; private: // Filter data BlockBuilder index_on_filter_block_builder_; // top-level index builder + BlockBuilder + index_on_filter_block_builder_without_seq_; // same for user keys struct FilterEntry { std::string key; Slice filter; @@ -59,41 +64,48 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { uint32_t filters_per_partition_; // The current number of filters in the last partition uint32_t filters_in_partition_; + // Number of keys added + size_t num_added_; + BlockHandle last_encoded_handle_; }; -class PartitionedFilterBlockReader : public FilterBlockReader { +class PartitionedFilterBlockReader : public FilterBlockReader, + public Cleanable { public: - explicit PartitionedFilterBlockReader(const SliceTransform* prefix_extractor, - bool whole_key_filtering, - BlockContents&& contents, - FilterBitsReader* filter_bits_reader, - Statistics* stats, - const Comparator& comparator, - const BlockBasedTable* table); + explicit PartitionedFilterBlockReader( + const SliceTransform* prefix_extractor, bool whole_key_filtering, + BlockContents&& contents, FilterBitsReader* filter_bits_reader, + Statistics* stats, const InternalKeyComparator comparator, + const BlockBasedTable* table, const bool index_key_includes_seq, + const bool index_value_is_full); virtual ~PartitionedFilterBlockReader(); virtual bool IsBlockBased() override { return false; } virtual bool KeyMayMatch( - const Slice& key, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& key, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual bool PrefixMayMatch( - const Slice& prefix, uint64_t block_offset = kNotValid, - const bool no_io = false, + const Slice& prefix, const SliceTransform* prefix_extractor, + uint64_t block_offset = kNotValid, const bool no_io = false, const Slice* const const_ikey_ptr = nullptr) override; virtual size_t ApproximateMemoryUsage() const override; private: - Slice GetFilterPartitionHandle(const Slice& entry); + BlockHandle GetFilterPartitionHandle(const Slice& entry); BlockBasedTable::CachableEntry GetFilterPartition( - FilePrefetchBuffer* prefetch_buffer, Slice* handle, const bool no_io, - bool* cached); - virtual void CacheDependencies(bool pin) override; + FilePrefetchBuffer* prefetch_buffer, BlockHandle& handle, + const bool no_io, bool* cached, + const SliceTransform* prefix_extractor = nullptr); + virtual void CacheDependencies( + bool bin, const SliceTransform* prefix_extractor) override; const SliceTransform* prefix_extractor_; std::unique_ptr idx_on_fltr_blk_; - const Comparator& comparator_; + const InternalKeyComparator comparator_; const BlockBasedTable* table_; + const bool index_key_includes_seq_; + const bool index_value_is_full_; std::unordered_map> filter_map_; diff --git a/thirdparty/rocksdb/table/partitioned_filter_block_test.cc b/thirdparty/rocksdb/table/partitioned_filter_block_test.cc index 1bc529ed97..8068f14d81 100644 --- a/thirdparty/rocksdb/table/partitioned_filter_block_test.cc +++ 
b/thirdparty/rocksdb/table/partitioned_filter_block_test.cc @@ -27,18 +27,32 @@ class MockedBlockBasedTable : public BlockBasedTable { rep->cache_key_prefix_size = 10; } - virtual CachableEntry GetFilter( + CachableEntry GetFilter( FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, - const bool /* unused */, bool /* unused */) const override { + const bool /* unused */, bool /* unused */, GetContext* /* unused */, + const SliceTransform* prefix_extractor) const override { Slice slice = slices[filter_blk_handle.offset()]; auto obj = new FullFilterBlockReader( - nullptr, true, BlockContents(slice, false, kNoCompression), + prefix_extractor, true, BlockContents(slice), rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); return {obj, nullptr}; } + + FilterBlockReader* ReadFilter( + FilePrefetchBuffer*, const BlockHandle& filter_blk_handle, + const bool /* unused */, + const SliceTransform* prefix_extractor) const override { + Slice slice = slices[filter_blk_handle.offset()]; + auto obj = new FullFilterBlockReader( + prefix_extractor, true, BlockContents(slice), + rep_->table_options.filter_policy->GetFilterBitsReader(slice), nullptr); + return obj; + } }; -class PartitionedFilterBlockTest : public testing::Test { +class PartitionedFilterBlockTest + : public testing::Test, + virtual public ::testing::WithParamInterface { public: BlockBasedTableOptions table_options_; InternalKeyComparator icomp = InternalKeyComparator(BytewiseComparator()); @@ -48,10 +62,12 @@ class PartitionedFilterBlockTest : public testing::Test { table_options_.no_block_cache = true; // Otherwise BlockBasedTable::Close // will access variable that are not // initialized in our mocked version + table_options_.format_version = GetParam(); + table_options_.index_block_restart_interval = 3; } std::shared_ptr cache_; - ~PartitionedFilterBlockTest() {} + ~PartitionedFilterBlockTest() override {} const std::string keys[4] = {"afoo", "bar", "box", "hello"}; const std::string missing_keys[2] = {"missing", "other"}; @@ -75,7 +91,8 @@ class PartitionedFilterBlockTest : public testing::Test { auto partition_size = filter_bits_reader->CalculateSpace(num_keys, &dont_care1, &dont_care2); delete filter_bits_reader; - return partition_size + table_options_.block_size_deviation; + return partition_size + + partition_size * table_options_.block_size_deviation / 100; } int last_offset = 10; @@ -87,27 +104,34 @@ class PartitionedFilterBlockTest : public testing::Test { } PartitionedIndexBuilder* NewIndexBuilder() { - return PartitionedIndexBuilder::CreateIndexBuilder(&icomp, table_options_); + const bool kValueDeltaEncoded = true; + return PartitionedIndexBuilder::CreateIndexBuilder( + &icomp, !kValueDeltaEncoded, table_options_); } PartitionedFilterBlockBuilder* NewBuilder( - PartitionedIndexBuilder* const p_index_builder) { + PartitionedIndexBuilder* const p_index_builder, + const SliceTransform* prefix_extractor = nullptr) { assert(table_options_.block_size_deviation <= 100); auto partition_size = static_cast( - table_options_.metadata_block_size * - ( 100 - table_options_.block_size_deviation)); + ((table_options_.metadata_block_size * + (100 - table_options_.block_size_deviation)) + + 99) / + 100); partition_size = std::max(partition_size, static_cast(1)); + const bool kValueDeltaEncoded = true; return new PartitionedFilterBlockBuilder( - nullptr, table_options_.whole_key_filtering, + prefix_extractor, table_options_.whole_key_filtering, table_options_.filter_policy->GetFilterBitsBuilder(), - 
table_options_.index_block_restart_interval, p_index_builder, - partition_size); + table_options_.index_block_restart_interval, !kValueDeltaEncoded, + p_index_builder, partition_size); } std::unique_ptr table; PartitionedFilterBlockReader* NewReader( - PartitionedFilterBlockBuilder* builder) { + PartitionedFilterBlockBuilder* builder, PartitionedIndexBuilder* pib, + const SliceTransform* prefix_extractor) { BlockHandle bh; Status status; Slice slice; @@ -117,40 +141,51 @@ class PartitionedFilterBlockTest : public testing::Test { } while (status.IsIncomplete()); const Options options; const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); const EnvOptions env_options; - table.reset(new MockedBlockBasedTable(new BlockBasedTable::Rep( - ioptions, env_options, table_options_, icomp, false))); + const bool kSkipFilters = true; + const bool kImmortal = true; + table.reset(new MockedBlockBasedTable( + new BlockBasedTable::Rep(ioptions, env_options, table_options_, icomp, + !kSkipFilters, 0, !kImmortal))); auto reader = new PartitionedFilterBlockReader( - nullptr, true, BlockContents(slice, false, kNoCompression), nullptr, - nullptr, *icomp.user_comparator(), table.get()); + prefix_extractor, true, BlockContents(slice), nullptr, nullptr, icomp, + table.get(), pib->seperator_is_key_plus_seq(), + !pib->get_use_value_delta_encoding()); return reader; } void VerifyReader(PartitionedFilterBlockBuilder* builder, - bool empty = false) { - std::unique_ptr reader(NewReader(builder)); + PartitionedIndexBuilder* pib, bool empty = false, + const SliceTransform* prefix_extractor = nullptr) { + std::unique_ptr reader( + NewReader(builder, pib, prefix_extractor)); // Querying added keys const bool no_io = true; for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(key, kNotValid, !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, !no_io, + &ikey_slice)); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], kNotValid, !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(keys[0], prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } // querying missing keys for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, kNotValid, !no_io, &ikey_slice)); + ASSERT_TRUE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, kNotValid, !no_io, &ikey_slice)); + ASSERT_FALSE(reader->KeyMayMatch(key, prefix_extractor, kNotValid, + !no_io, &ikey_slice)); } } } @@ -173,14 +208,14 @@ class PartitionedFilterBlockTest : public testing::Test { builder->Add(keys[i]); CutABlock(pib.get(), keys[i]); - VerifyReader(builder.get()); + VerifyReader(builder.get(), pib.get()); return CountNumOfIndexPartitions(pib.get()); } - void TestBlockPerTwoKeys() { + void TestBlockPerTwoKeys(const SliceTransform* prefix_extractor = nullptr) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( - NewBuilder(pib.get())); + NewBuilder(pib.get(), prefix_extractor)); int i = 0; builder->Add(keys[i]); i++; @@ -193,7 +228,7 @@ class PartitionedFilterBlockTest : public testing::Test { builder->Add(keys[i]); 
CutABlock(pib.get(), keys[i]); - VerifyReader(builder.get()); + VerifyReader(builder.get(), pib.get(), prefix_extractor); } void TestBlockPerAllKeys() { @@ -211,7 +246,7 @@ class PartitionedFilterBlockTest : public testing::Test { builder->Add(keys[i]); CutABlock(pib.get(), keys[i]); - VerifyReader(builder.get()); + VerifyReader(builder.get(), pib.get()); } void CutABlock(PartitionedIndexBuilder* builder, @@ -248,14 +283,19 @@ class PartitionedFilterBlockTest : public testing::Test { } }; -TEST_F(PartitionedFilterBlockTest, EmptyBuilder) { +INSTANTIATE_TEST_CASE_P(FormatDef, PartitionedFilterBlockTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, PartitionedFilterBlockTest, + testing::Values(test::kLatestFormatVersion)); + +TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder(NewBuilder(pib.get())); const bool empty = true; - VerifyReader(builder.get(), empty); + VerifyReader(builder.get(), pib.get(), empty); } -TEST_F(PartitionedFilterBlockTest, OneBlock) { +TEST_P(PartitionedFilterBlockTest, OneBlock) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -263,7 +303,7 @@ TEST_F(PartitionedFilterBlockTest, OneBlock) { } } -TEST_F(PartitionedFilterBlockTest, TwoBlocksPerKey) { +TEST_P(PartitionedFilterBlockTest, TwoBlocksPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -271,7 +311,35 @@ TEST_F(PartitionedFilterBlockTest, TwoBlocksPerKey) { } } -TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { +// This reproduces the bug that a prefix is the same among multiple consecutive +// blocks but the bug would add it only to the first block. 
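+// With NewFixedPrefixTransform(1) below, the keys p-key1..p-key3 all share
+// the prefix "p" but are cut into three filter partitions; a PrefixMayMatch()
+// probe only consults the partition the lookup key maps to, so the prefix
+// must be present in every partition that covers it, not just the first.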
+TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { + // some small number to cause partition cuts + table_options_.metadata_block_size = 1; + std::unique_ptr prefix_extractor + (rocksdb::NewFixedPrefixTransform(1)); + std::unique_ptr pib(NewIndexBuilder()); + std::unique_ptr builder( + NewBuilder(pib.get(), prefix_extractor.get())); + const std::string pkeys[3] = {"p-key1", "p-key2", "p-key3"}; + builder->Add(pkeys[0]); + CutABlock(pib.get(), pkeys[0], pkeys[1]); + builder->Add(pkeys[1]); + CutABlock(pib.get(), pkeys[1], pkeys[2]); + builder->Add(pkeys[2]); + CutABlock(pib.get(), pkeys[2]); + std::unique_ptr reader( + NewReader(builder.get(), pib.get(), prefix_extractor.get())); + for (auto key : pkeys) { + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); + const Slice ikey_slice = Slice(*ikey.rep()); + ASSERT_TRUE(reader->PrefixMayMatch(prefix_extractor->Transform(key), + prefix_extractor.get(), kNotValid, + false /*no_io*/, &ikey_slice)); + } +} + +TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { uint64_t max_index_size = MaxIndexSize(); for (uint64_t i = 1; i < max_index_size + 1; i++) { table_options_.metadata_block_size = i; @@ -279,7 +347,7 @@ TEST_F(PartitionedFilterBlockTest, OneBlockPerKey) { } } -TEST_F(PartitionedFilterBlockTest, PartitionCount) { +TEST_P(PartitionedFilterBlockTest, PartitionCount) { int num_keys = sizeof(keys) / sizeof(*keys); table_options_.metadata_block_size = std::max(MaxIndexSize(), MaxFilterSize()); diff --git a/thirdparty/rocksdb/table/persistent_cache_helper.cc b/thirdparty/rocksdb/table/persistent_cache_helper.cc index ec1cac0b9d..4e90697a6e 100644 --- a/thirdparty/rocksdb/table/persistent_cache_helper.cc +++ b/thirdparty/rocksdb/table/persistent_cache_helper.cc @@ -29,12 +29,9 @@ void PersistentCacheHelper::InsertUncompressedPage( const BlockContents& contents) { assert(cache_options.persistent_cache); assert(!cache_options.persistent_cache->IsCompressed()); - if (!contents.cachable || contents.compression_type != kNoCompression) { - // We shouldn't cache this. 
Either - // (1) content is not cacheable - // (2) content is compressed - return; - } + // Precondition: + // (1) content is cacheable + // (2) content is not compressed // construct the page key char cache_key[BlockBasedTable::kMaxCacheKeyPrefixSize + kMaxVarint64Length]; @@ -49,6 +46,9 @@ void PersistentCacheHelper::InsertUncompressedPage( Status PersistentCacheHelper::LookupRawPage( const PersistentCacheOptions& cache_options, const BlockHandle& handle, std::unique_ptr* raw_data, const size_t raw_data_size) { +#ifdef NDEBUG + (void)raw_data_size; +#endif assert(cache_options.persistent_cache); assert(cache_options.persistent_cache->IsCompressed()); @@ -106,8 +106,7 @@ Status PersistentCacheHelper::LookupUncompressedPage( // update stats RecordTick(cache_options.statistics, PERSISTENT_CACHE_HIT); // construct result and return - *contents = - BlockContents(std::move(data), size, false /*cacheable*/, kNoCompression); + *contents = BlockContents(std::move(data), size); return Status::OK(); } diff --git a/thirdparty/rocksdb/table/plain_table_builder.cc b/thirdparty/rocksdb/table/plain_table_builder.cc index 964804358a..453b6c768b 100644 --- a/thirdparty/rocksdb/table/plain_table_builder.cc +++ b/thirdparty/rocksdb/table/plain_table_builder.cc @@ -57,7 +57,7 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const ImmutableCFOptions& ioptions, + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len, @@ -66,20 +66,21 @@ PlainTableBuilder::PlainTableBuilder( uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) : ioptions_(ioptions), + moptions_(moptions), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, + encoder_(encoding_type, user_key_len, moptions.prefix_extractor.get(), index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(ioptions.prefix_extractor) { + prefix_extractor_(moptions.prefix_extractor.get()) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); - index_builder_.reset( - new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, - hash_table_ratio, huge_page_tlb_size_)); + index_builder_.reset(new PlainTableIndexBuilder( + &arena_, ioptions, moptions.prefix_extractor.get(), index_sparseness, + hash_table_ratio, huge_page_tlb_size_)); properties_.user_collected_properties [PlainTablePropertyNames::kBloomVersion] = "1"; // For future use } @@ -96,8 +97,8 @@ PlainTableBuilder::PlainTableBuilder( properties_.format_version = (encoding_type == kPlain) ? 0 : 1; properties_.column_family_id = column_family_id; properties_.column_family_name = column_family_name; - properties_.prefix_extractor_name = ioptions_.prefix_extractor != nullptr - ? ioptions_.prefix_extractor->Name() + properties_.prefix_extractor_name = moptions_.prefix_extractor != nullptr + ? 
moptions_.prefix_extractor->Name() : "nullptr"; std::string val; @@ -131,11 +132,11 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (ioptions_.prefix_extractor == nullptr) { + if (moptions_.prefix_extractor == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - ioptions_.prefix_extractor->Transform(internal_key.user_key); + moptions_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } @@ -165,6 +166,12 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { properties_.num_entries++; properties_.raw_key_size += key.size(); properties_.raw_value_size += value.size(); + if (internal_key.type == kTypeDeletion || + internal_key.type == kTypeSingleDeletion) { + properties_.num_deletions++; + } else if (internal_key.type == kTypeMerge) { + properties_.num_merge_operands++; + } // notify property collectors NotifyCollectTableCollectorsOnAdd( diff --git a/thirdparty/rocksdb/table/plain_table_builder.h b/thirdparty/rocksdb/table/plain_table_builder.h index 1d1f6c7586..ca0879a4e1 100644 --- a/thirdparty/rocksdb/table/plain_table_builder.h +++ b/thirdparty/rocksdb/table/plain_table_builder.h @@ -32,7 +32,7 @@ class PlainTableBuilder: public TableBuilder { // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. PlainTableBuilder( - const ImmutableCFOptions& ioptions, + const ImmutableCFOptions& ioptions, const MutableCFOptions& moptions, const std::vector>* int_tbl_prop_collector_factories, uint32_t column_family_id, WritableFileWriter* file, @@ -79,6 +79,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; const ImmutableCFOptions& ioptions_; + const MutableCFOptions& moptions_; std::vector> table_properties_collectors_; diff --git a/thirdparty/rocksdb/table/plain_table_factory.cc b/thirdparty/rocksdb/table/plain_table_factory.cc index 5f7809b967..a6e59c142f 100644 --- a/thirdparty/rocksdb/table/plain_table_factory.cc +++ b/thirdparty/rocksdb/table/plain_table_factory.cc @@ -19,15 +19,16 @@ namespace rocksdb { Status PlainTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, - bool prefetch_index_and_filter_in_cache) const { + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table, + bool /*prefetch_index_and_filter_in_cache*/) const { return PlainTableReader::Open( table_reader_options.ioptions, table_reader_options.env_options, table_reader_options.internal_comparator, std::move(file), file_size, table, table_options_.bloom_bits_per_key, table_options_.hash_table_ratio, table_options_.index_sparseness, table_options_.huge_page_tlb_size, - table_options_.full_scan_mode); + table_options_.full_scan_mode, table_reader_options.immortal, + table_reader_options.prefix_extractor); } TableBuilder* PlainTableFactory::NewTableBuilder( @@ -38,7 +39,7 @@ TableBuilder* PlainTableFactory::NewTableBuilder( // tables // return new PlainTableBuilder( - table_builder_options.ioptions, + table_builder_options.ioptions, table_builder_options.moptions, table_builder_options.int_tbl_prop_collector_factories, column_family_id, file, table_options_.user_key_len, table_options_.encoding_type, table_options_.index_sparseness, table_options_.bloom_bits_per_key, @@ -102,7 +103,7 @@ Status 
GetMemTableRepFactoryFromString( std::vector<std::string> opts_list = StringSplit(opts_str, ':'); size_t len = opts_list.size(); - if (opts_list.size() <= 0 || opts_list.size() > 2) { + if (opts_list.empty() || opts_list.size() > 2) { return Status::InvalidArgument("Can't parse memtable_factory option ", opts_str); } @@ -146,15 +147,8 @@ Status GetMemTableRepFactoryFromString( mem_factory = new VectorRepFactory(); } } else if (opts_list[0] == "cuckoo") { - // Expecting format - // cuckoo:<write_buffer_size> - if (2 == len) { - size_t write_buffer_size = ParseSizeT(opts_list[1]); - mem_factory = NewHashCuckooRepFactory(write_buffer_size); - } else if (1 == len) { - return Status::InvalidArgument("Can't parse memtable_factory option ", - opts_str); - } + return Status::NotSupported( + "cuckoo hash memtable is not supported anymore."); } else { return Status::InvalidArgument("Unrecognized memtable_factory option ", opts_str); @@ -195,7 +189,7 @@ Status GetPlainTableOptionsFromMap( const PlainTableOptions& table_options, const std::unordered_map<std::string, std::string>& opts_map, PlainTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { + bool /*ignore_unknown_options*/) { assert(new_table_options); *new_table_options = table_options; for (const auto& o : opts_map) { @@ -210,6 +204,8 @@ Status GetPlainTableOptionsFromMap( (iter->second.verification != OptionVerificationType::kByName && iter->second.verification != OptionVerificationType::kByNameAllowNull && + iter->second.verification != + OptionVerificationType::kByNameAllowFromNull && iter->second.verification != OptionVerificationType::kDeprecated)) { // Restore "new_options" to the default "base_options". *new_table_options = table_options; diff --git a/thirdparty/rocksdb/table/plain_table_factory.h b/thirdparty/rocksdb/table/plain_table_factory.h index 6c9ca44f30..990df482ed 100644 --- a/thirdparty/rocksdb/table/plain_table_factory.h +++ b/thirdparty/rocksdb/table/plain_table_factory.h @@ -17,7 +17,6 @@ namespace rocksdb { struct EnvOptions; -using std::unique_ptr; class Status; class RandomAccessFile; class WritableFile; @@ -149,8 +148,8 @@ class PlainTableFactory : public TableFactory { const char* Name() const override { return "PlainTable"; } Status NewTableReader(const TableReaderOptions& table_reader_options, - unique_ptr<RandomAccessFileReader>&& file, - uint64_t file_size, unique_ptr<TableReader>* table, + std::unique_ptr<RandomAccessFileReader>&& file, + uint64_t file_size, std::unique_ptr<TableReader>* table, bool prefetch_index_and_filter_in_cache) const override; TableBuilder* NewTableBuilder( @@ -161,18 +160,19 @@ class PlainTableFactory : public TableFactory { const PlainTableOptions& table_options() const; - static const char kValueTypeSeqId0 = char(0xFF); + static const char kValueTypeSeqId0 = char(~0); // Sanitizes the specified DB Options.
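The memtable_factory option string handled above follows a simple name[:arg] shape: split on ':', validate the piece count, then dispatch on the factory name (with "cuckoo" now rejected outright). Below is a minimal sketch of that split-and-dispatch pattern using only the standard library; SplitOpts and ParseMemtableOption are hypothetical stand-ins, not the RocksDB entry points:

#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for rocksdb::StringSplit: break "name:arg" on a delimiter.
static std::vector<std::string> SplitOpts(const std::string& s, char delim) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) parts.push_back(item);
  return parts;
}

// Mirrors the validation order above: reject empty or over-long lists first,
// then refuse removed factories, then dispatch on the name.
static bool ParseMemtableOption(const std::string& opts_str, std::string* err) {
  std::vector<std::string> opts_list = SplitOpts(opts_str, ':');
  if (opts_list.empty() || opts_list.size() > 2) {
    *err = "Can't parse memtable_factory option " + opts_str;
    return false;
  }
  if (opts_list[0] == "cuckoo") {
    *err = "cuckoo hash memtable is not supported anymore.";
    return false;
  }
  // ... dispatch on opts_list[0]; opts_list[1], if present, is a size argument.
  return true;
}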
- Status SanitizeOptions(const DBOptions& db_opts, - const ColumnFamilyOptions& cf_opts) const override { + Status SanitizeOptions( + const DBOptions& /*db_opts*/, + const ColumnFamilyOptions& /*cf_opts*/) const override { return Status::OK(); } void* GetOptions() override { return &table_options_; } - Status GetOptionString(std::string* opt_string, - const std::string& delimiter) const override { + Status GetOptionString(std::string* /*opt_string*/, + const std::string& /*delimiter*/) const override { return Status::OK(); } diff --git a/thirdparty/rocksdb/table/plain_table_index.cc b/thirdparty/rocksdb/table/plain_table_index.cc index 39a6b53d60..4374092397 100644 --- a/thirdparty/rocksdb/table/plain_table_index.cc +++ b/thirdparty/rocksdb/table/plain_table_index.cc @@ -203,7 +203,7 @@ Slice PlainTableIndexBuilder::FillIndexes( assert(sub_index_offset == sub_index_size_); ROCKS_LOG_DEBUG(ioptions_.info_log, - "hash table size: %d, suffix_map length %" ROCKSDB_PRIszt, + "hash table size: %" PRIu32 ", suffix_map length %" PRIu32, index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } diff --git a/thirdparty/rocksdb/table/plain_table_index.h b/thirdparty/rocksdb/table/plain_table_index.h index 2916be4192..360d998279 100644 --- a/thirdparty/rocksdb/table/plain_table_index.h +++ b/thirdparty/rocksdb/table/plain_table_index.h @@ -112,6 +112,7 @@ class PlainTableIndex { class PlainTableIndexBuilder { public: PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + const SliceTransform* prefix_extractor, size_t index_sparseness, double hash_table_ratio, size_t huge_page_tlb_size) : arena_(arena), @@ -123,7 +124,9 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(ioptions.prefix_extractor), + index_size_(0), + sub_index_size_(0), + prefix_extractor_(prefix_extractor), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} diff --git a/thirdparty/rocksdb/table/plain_table_key_coding.cc b/thirdparty/rocksdb/table/plain_table_key_coding.cc index 3e87c03d13..6f5ee9b4ad 100644 --- a/thirdparty/rocksdb/table/plain_table_key_coding.cc +++ b/thirdparty/rocksdb/table/plain_table_key_coding.cc @@ -288,7 +288,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, uint32_t* bytes_read, - bool* seekable) { + bool* /*seekable*/) { uint32_t user_key_size = 0; Status s; if (fixed_user_key_len_ != kPlainTableVariableLength) { diff --git a/thirdparty/rocksdb/table/plain_table_key_coding.h b/thirdparty/rocksdb/table/plain_table_key_coding.h index 321e0aed59..9a27ad06b7 100644 --- a/thirdparty/rocksdb/table/plain_table_key_coding.h +++ b/thirdparty/rocksdb/table/plain_table_key_coding.h @@ -114,7 +114,7 @@ class PlainTableFileReader { }; // Keep buffers for two recent reads. 
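For context on the buffers_ member in the next hunk: PlainTableFileReader keeps the two most recent read buffers alive, so a caller can still hold a slice into the previous read while issuing the next one. A much-simplified sketch of that two-slot rotation follows; Buffer here is a bare stand-in, and the real class also tracks offsets, sizes, and mmap mode:

#include <array>
#include <cstdint>
#include <memory>
#include <string>

// Simplified stand-in for PlainTableFileReader's internal Buffer.
struct Buffer {
  std::string data;
  uint32_t offset = 0;
};

// Keep the two most recent read buffers; a new read takes the slot not used
// by the previous read, so the previous result stays valid.
class TwoSlotReadCache {
 public:
  Buffer* GetSlotForRead() {
    const uint32_t idx = num_buf_ % 2;  // alternate between the two slots
    if (!buffers_[idx]) buffers_[idx] = std::make_unique<Buffer>();
    ++num_buf_;
    return buffers_[idx].get();
  }

 private:
  std::array<std::unique_ptr<Buffer>, 2> buffers_;
  uint32_t num_buf_ = 0;
};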
- std::array, 2> buffers_; + std::array, 2> buffers_; uint32_t num_buf_; Status status_; diff --git a/thirdparty/rocksdb/table/plain_table_reader.cc b/thirdparty/rocksdb/table/plain_table_reader.cc index d4d9edb741..b0c6dcf07e 100644 --- a/thirdparty/rocksdb/table/plain_table_reader.cc +++ b/thirdparty/rocksdb/table/plain_table_reader.cc @@ -54,7 +54,7 @@ inline uint32_t GetFixed32Element(const char* base, size_t offset) { class PlainTableIterator : public InternalIterator { public: explicit PlainTableIterator(PlainTableReader* table, bool use_prefix_seek); - ~PlainTableIterator(); + ~PlainTableIterator() override; bool Valid() const override; @@ -91,20 +91,20 @@ class PlainTableIterator : public InternalIterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, - const EnvOptions& storage_options, - const InternalKeyComparator& icomparator, - EncodingType encoding_type, - uint64_t file_size, - const TableProperties* table_properties) +PlainTableReader::PlainTableReader( + const ImmutableCFOptions& ioptions, + std::unique_ptr&& file, + const EnvOptions& storage_options, const InternalKeyComparator& icomparator, + EncodingType encoding_type, uint64_t file_size, + const TableProperties* table_properties, + const SliceTransform* prefix_extractor) : internal_comparator_(icomparator), encoding_type_(encoding_type), full_scan_mode_(false), user_key_len_(static_cast(table_properties->fixed_key_len)), - prefix_extractor_(ioptions.prefix_extractor), + prefix_extractor_(prefix_extractor), enable_bloom_(false), - bloom_(6, nullptr), + bloom_(6), file_info_(std::move(file), storage_options, static_cast(table_properties->data_size)), ioptions_(ioptions), @@ -114,22 +114,22 @@ PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table_reader, - const int bloom_bits_per_key, - double hash_table_ratio, size_t index_sparseness, - size_t huge_page_tlb_size, bool full_scan_mode) { +Status PlainTableReader::Open( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + std::unique_ptr&& file, uint64_t file_size, + std::unique_ptr* table_reader, const int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, + bool full_scan_mode, const bool immortal_table, + const SliceTransform* prefix_extractor) { if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props); + ioptions, &props, + true /* compression_type_missing */); if (!s.ok()) { return s; } @@ -141,12 +141,12 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, if (!full_scan_mode && !prefix_extractor_in_file.empty() /* old version sst file*/ && prefix_extractor_in_file != "nullptr") { - if (!ioptions.prefix_extractor) { + if (!prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); - } else if (prefix_extractor_in_file.compare( - ioptions.prefix_extractor->Name()) != 0) { + } else if 
(prefix_extractor_in_file.compare(prefix_extractor->Name()) != + 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -163,7 +163,7 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, std::unique_ptr new_reader(new PlainTableReader( ioptions, std::move(file), env_options, internal_comparator, - encoding_type, file_size, props)); + encoding_type, file_size, props, prefix_extractor)); s = new_reader->MmapDataIfNeeded(); if (!s.ok()) { @@ -182,6 +182,10 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, new_reader->full_scan_mode_ = true; } + if (immortal_table && new_reader->file_info_.is_mmap_mode) { + new_reader->dummy_cleanable_.reset(new Cleanable()); + } + *table_reader = std::move(new_reader); return s; } @@ -189,9 +193,9 @@ Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, void PlainTableReader::SetupForCompaction() { } -InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, - Arena* arena, - bool skip_filters) { +InternalIterator* PlainTableReader::NewIterator( + const ReadOptions& options, const SliceTransform* /* prefix_extractor */, + Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) { bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek; if (arena == nullptr) { return new PlainTableIterator(this, use_prefix_seek); @@ -202,7 +206,8 @@ InternalIterator* PlainTableReader::NewIterator(const ReadOptions& options, } Status PlainTableReader::PopulateIndexRecordList( - PlainTableIndexBuilder* index_builder, vector* prefix_hashes) { + PlainTableIndexBuilder* index_builder, + std::vector* prefix_hashes) { Slice prev_key_prefix_slice; std::string prev_key_prefix_buf; uint32_t pos = data_start_offset_; @@ -210,7 +215,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice key_prefix_slice; PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, - ioptions_.prefix_extractor); + prefix_extractor_); while (pos < file_info_.data_end_offset) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -252,10 +257,9 @@ Status PlainTableReader::PopulateIndexRecordList( return s; } -void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, - int num_prefixes, - size_t huge_page_tlb_size, - vector* prefix_hashes) { +void PlainTableReader::AllocateAndFillBloom( + int bloom_bits_per_key, int num_prefixes, size_t huge_page_tlb_size, + std::vector* prefix_hashes) { if (!IsTotalOrderMode()) { uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { @@ -267,7 +271,7 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, } } -void PlainTableReader::FillBloom(vector* prefix_hashes) { +void PlainTableReader::FillBloom(std::vector* prefix_hashes) { assert(bloom_.IsInitialized()); for (auto prefix_hash : *prefix_hashes) { bloom_.AddHash(prefix_hash); @@ -277,7 +281,7 @@ void PlainTableReader::FillBloom(vector* prefix_hashes) { Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. 
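Open() above refuses a PlainTable whose stored prefix_extractor_name disagrees with the configured extractor, since the index and bloom entries are keyed by prefix. As a sketch of what a fixed-length extractor such as rocksdb::NewFixedPrefixTransform(1) amounts to (simplified: the real SliceTransform operates on Slices and also exposes InRange and related hooks):

#include <algorithm>
#include <cstddef>
#include <string>

// Simplified fixed-prefix transform: the "prefix" of a key is its first
// `len` bytes; shorter keys are out of domain.
struct FixedPrefixTransform {
  size_t len;
  // The real transform reports a name like "rocksdb.FixedPrefix.<len>",
  // which is what gets persisted and compared in Open().
  std::string Name() const { return "rocksdb.FixedPrefix." + std::to_string(len); }
  std::string Transform(const std::string& key) const {
    return key.substr(0, std::min(len, key.size()));
  }
  bool InDomain(const std::string& key) const { return key.size() >= len; }
};

Because the name encodes the prefix length, comparing names is enough to detect that a file indexed with one prefix length is being opened with another.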
- return file_info_.file->Read(0, file_size_, &file_info_.file_data, nullptr); + return file_info_.file->Read(0, static_cast<size_t>(file_size_), &file_info_.file_data, nullptr); } return Status::OK(); } @@ -294,7 +298,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + &index_block_contents, + true /* compression_type_missing */); bool index_in_file = s.ok(); @@ -304,7 +309,8 @@ if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, &bloom_block_contents); + BloomBlockBuilder::kBloomBlock, &bloom_block_contents, + true /* compression_type_missing */); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -330,9 +336,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if ((ioptions_.prefix_extractor == nullptr) && - (hash_table_ratio != 0)) { - // ioptions.prefix_extractor is requried for a hash-based look-up. + if ((prefix_extractor_ == nullptr) && (hash_table_ratio != 0)) { + // moptions.prefix_extractor is required for a hash-based look-up. return Status::NotSupported( "PlainTable requires a prefix extractor to enable prefix hash mode."); } @@ -377,8 +382,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_bits_per_key = 0; } - PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, - hash_table_ratio, huge_page_tlb_size); + PlainTableIndexBuilder index_builder(&arena_, ioptions_, prefix_extractor_, + index_sparseness, hash_table_ratio, + huge_page_tlb_size); std::vector<uint32_t> prefix_hashes; if (!index_in_file) { @@ -537,8 +543,10 @@ void PlainTableReader::Prepare(const Slice& target) { } } -Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - GetContext* get_context, bool skip_filters) { +Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, + GetContext* get_context, + const SliceTransform* /* prefix_extractor */, + bool /*skip_filters*/) { // Check bloom filter first. Slice prefix_slice; uint32_t prefix_hash; @@ -565,7 +573,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, uint32_t offset; bool prefix_match; PlainTableKeyDecoder decoder(&file_info_, encoding_type_, user_key_len_, - ioptions_.prefix_extractor); + prefix_extractor_); Status s = GetOffset(&decoder, target, prefix_slice, prefix_hash, prefix_match, &offset); @@ -594,7 +602,9 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, // TODO(ljin): since we know the key comparison result here, // can we enable the fast path?
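Get() above hashes the target's prefix and consults the bloom filter before doing any index or data I/O; only a positive (possibly false-positive) answer proceeds to GetOffset(). A minimal sketch of that filter-then-probe gate, with a trivial single-hash bit array standing in for rocksdb::DynamicBloom:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

// Trivial single-hash bloom standing in for DynamicBloom.
class TinyBloom {
 public:
  explicit TinyBloom(uint32_t bits) : bits_(bits ? bits : 1), v_(bits_, false) {}
  void AddHash(uint32_t h) { v_[h % bits_] = true; }
  bool MayContainHash(uint32_t h) const { return v_[h % bits_]; }

 private:
  uint32_t bits_;
  std::vector<bool> v_;
};

static uint32_t PrefixHash(const std::string& prefix) {
  return static_cast<uint32_t>(std::hash<std::string>{}(prefix));
}

// Filter-then-probe: a negative bloom answer lets a lookup return without
// touching the index or data at all; a positive answer may be a false hit.
static bool MayMatch(const TinyBloom& bloom, const std::string& prefix) {
  return bloom.MayContainHash(PrefixHash(prefix));
}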
if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!get_context->SaveValue(found_key, found_value)) { + bool dont_care __attribute__((__unused__)); + if (!get_context->SaveValue(found_key, found_value, &dont_care, + dummy_cleanable_.get())) { break; } } @@ -602,7 +612,7 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& key) { +uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/) { return 0; } @@ -624,6 +634,7 @@ bool PlainTableIterator::Valid() const { } void PlainTableIterator::SeekToFirst() { + status_ = Status::OK(); next_offset_ = table_->data_start_offset_; if (next_offset_ >= table_->file_info_.data_end_offset) { next_offset_ = offset_ = table_->file_info_.data_end_offset; @@ -635,6 +646,7 @@ void PlainTableIterator::SeekToFirst() { void PlainTableIterator::SeekToLast() { assert(false); status_ = Status::NotSupported("SeekToLast() is not supported in PlainTable"); + next_offset_ = offset_ = table_->file_info_.data_end_offset; } void PlainTableIterator::Seek(const Slice& target) { @@ -675,6 +687,7 @@ void PlainTableIterator::Seek(const Slice& target) { if (!table_->IsTotalOrderMode()) { prefix_hash = GetSliceHash(prefix_slice); if (!table_->MatchBloom(prefix_hash)) { + status_ = Status::OK(); offset_ = next_offset_ = table_->file_info_.data_end_offset; return; } @@ -706,10 +719,11 @@ void PlainTableIterator::Seek(const Slice& target) { } } -void PlainTableIterator::SeekForPrev(const Slice& target) { +void PlainTableIterator::SeekForPrev(const Slice& /*target*/) { assert(false); status_ = Status::NotSupported("SeekForPrev() is not supported in PlainTable"); + offset_ = next_offset_ = table_->file_info_.data_end_offset; } void PlainTableIterator::Next() { diff --git a/thirdparty/rocksdb/table/plain_table_reader.h b/thirdparty/rocksdb/table/plain_table_reader.h index 6bf8da2f98..022886b729 100644 --- a/thirdparty/rocksdb/table/plain_table_reader.h +++ b/thirdparty/rocksdb/table/plain_table_reader.h @@ -38,20 +38,16 @@ class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; class GetContext; -class InternalIterator; -using std::unique_ptr; -using std::unordered_map; -using std::vector; extern const uint32_t kPlainTableVariableLength; struct PlainTableReaderFileInfo { bool is_mmap_mode; Slice file_data; uint32_t data_end_offset; - unique_ptr file; + std::unique_ptr file; - PlainTableReaderFileInfo(unique_ptr&& _file, + PlainTableReaderFileInfo(std::unique_ptr&& _file, const EnvOptions& storage_options, uint32_t _data_size_offset) : is_mmap_mode(storage_options.use_mmap_reads), @@ -72,19 +68,23 @@ class PlainTableReader: public TableReader { static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, - uint64_t file_size, unique_ptr* table, + std::unique_ptr&& file, + uint64_t file_size, std::unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, - bool full_scan_mode); + bool full_scan_mode, const bool immortal_table = false, + const SliceTransform* prefix_extractor = nullptr); InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, Arena* arena = nullptr, - bool skip_filters = false) override; + bool skip_filters = false, + bool for_compaction = false) override; void Prepare(const Slice& target) override; - Status Get(const 
ReadOptions&, const Slice& key, GetContext* get_context, + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; uint64_t ApproximateOffsetOf(const Slice& key) override; @@ -101,11 +101,12 @@ } PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr<RandomAccessFileReader>&& file, + std::unique_ptr<RandomAccessFileReader>&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, - const TableProperties* table_properties); + const TableProperties* table_properties, + const SliceTransform* prefix_extractor); virtual ~PlainTableReader(); protected: @@ -149,10 +150,11 @@ DynamicBloom bloom_; PlainTableReaderFileInfo file_info_; Arena arena_; - std::unique_ptr<char[]> index_block_alloc_; - std::unique_ptr<char[]> bloom_block_alloc_; + CacheAllocationPtr index_block_alloc_; + CacheAllocationPtr bloom_block_alloc_; const ImmutableCFOptions& ioptions_; + std::unique_ptr<Cleanable> dummy_cleanable_; uint64_t file_size_; std::shared_ptr<const TableProperties> table_properties_; @@ -197,14 +199,14 @@ // If bloom_ is not null, all the keys' full-key hash will be added to the // bloom filter. Status PopulateIndexRecordList(PlainTableIndexBuilder* index_builder, - vector<uint32_t>* prefix_hashes); + std::vector<uint32_t>* prefix_hashes); // Internal helper function to allocate memory for bloom filter and fill it void AllocateAndFillBloom(int bloom_bits_per_key, int num_prefixes, size_t huge_page_tlb_size, - vector<uint32_t>* prefix_hashes); + std::vector<uint32_t>* prefix_hashes); - void FillBloom(vector<uint32_t>* prefix_hashes); + void FillBloom(std::vector<uint32_t>* prefix_hashes); // Read the key and value at `offset` into the output parameters, and // set `seekable`. diff --git a/thirdparty/rocksdb/table/sst_file_reader.cc b/thirdparty/rocksdb/table/sst_file_reader.cc new file mode 100644 index 0000000000..54408bb50e --- /dev/null +++ b/thirdparty/rocksdb/table/sst_file_reader.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/sst_file_reader.h" + +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "options/cf_options.h" +#include "table/get_context.h" +#include "table/table_builder.h" +#include "table/table_reader.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { + +struct SstFileReader::Rep { + Options options; + EnvOptions soptions; + ImmutableCFOptions ioptions; + MutableCFOptions moptions; + + std::unique_ptr table_reader; + + Rep(const Options& opts) + : options(opts), + soptions(options), + ioptions(options), + moptions(ColumnFamilyOptions(options)) {} +}; + +SstFileReader::SstFileReader(const Options& options) : rep_(new Rep(options)) {} + +SstFileReader::~SstFileReader() {} + +Status SstFileReader::Open(const std::string& file_path) { + auto r = rep_.get(); + Status s; + uint64_t file_size = 0; + std::unique_ptr file; + std::unique_ptr file_reader; + s = r->options.env->GetFileSize(file_path, &file_size); + if (s.ok()) { + s = r->options.env->NewRandomAccessFile(file_path, &file, r->soptions); + } + if (s.ok()) { + file_reader.reset(new RandomAccessFileReader(std::move(file), file_path)); + } + if (s.ok()) { + TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor.get(), + r->soptions, r->ioptions.internal_comparator); + // Allow open file with global sequence number for backward compatibility. + t_opt.largest_seqno = kMaxSequenceNumber; + s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader), + file_size, &r->table_reader); + } + return s; +} + +Iterator* SstFileReader::NewIterator(const ReadOptions& options) { + auto r = rep_.get(); + auto sequence = options.snapshot != nullptr + ? options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber; + auto internal_iter = + r->table_reader->NewIterator(options, r->moptions.prefix_extractor.get()); + return NewDBIterator(r->options.env, options, r->ioptions, r->moptions, + r->ioptions.user_comparator, internal_iter, sequence, + r->moptions.max_sequential_skip_in_iterations, + nullptr /* read_callback */); +} + +std::shared_ptr SstFileReader::GetTableProperties() + const { + return rep_->table_reader->GetTableProperties(); +} + +Status SstFileReader::VerifyChecksum() { + return rep_->table_reader->VerifyChecksum(); +} + +} // namespace rocksdb + +#endif // !ROCKSDB_LITE diff --git a/thirdparty/rocksdb/table/sst_file_reader_test.cc b/thirdparty/rocksdb/table/sst_file_reader_test.cc new file mode 100644 index 0000000000..51bc975af0 --- /dev/null +++ b/thirdparty/rocksdb/table/sst_file_reader_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
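A minimal standalone usage sketch of the SstFileReader API defined above, before the unit tests that exercise it; "/tmp/example.sst" is a placeholder path for an existing SST file:

#include <iostream>
#include <memory>

#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"

int main() {
  rocksdb::Options options;
  rocksdb::SstFileReader reader(options);
  rocksdb::Status s = reader.Open("/tmp/example.sst");  // placeholder path
  if (!s.ok()) {
    std::cerr << s.ToString() << std::endl;
    return 1;
  }
  // Iterate every live key/value the file exposes.
  std::unique_ptr<rocksdb::Iterator> it(
      reader.NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << " => " << it->value().ToString() << "\n";
  }
  return it->status().ok() ? 0 : 1;
}

With no snapshot set in ReadOptions, NewIterator reads at kMaxSequenceNumber, so every live key in the file is visible.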
+ +#ifndef ROCKSDB_LITE + +#include <inttypes.h> + +#include "rocksdb/db.h" +#include "rocksdb/sst_file_reader.h" +#include "rocksdb/sst_file_writer.h" +#include "table/sst_file_writer_collectors.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +namespace rocksdb { + +std::string EncodeAsString(uint64_t v) { + char buf[16]; + snprintf(buf, sizeof(buf), "%08" PRIu64, v); + return std::string(buf); +} + +std::string EncodeAsUint64(uint64_t v) { + std::string dst; + PutFixed64(&dst, v); + return dst; +} + +class SstFileReaderTest : public testing::Test { + public: + SstFileReaderTest() { + options_.merge_operator = MergeOperators::CreateUInt64AddOperator(); + sst_name_ = test::PerThreadDBPath("sst_file"); + } + + ~SstFileReaderTest() { + Status s = Env::Default()->DeleteFile(sst_name_); + assert(s.ok()); + } + + void CreateFile(const std::string& file_name, + const std::vector<std::string>& keys) { + SstFileWriter writer(soptions_, options_); + ASSERT_OK(writer.Open(file_name)); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_OK(writer.Put(keys[i], keys[i])); + ASSERT_OK(writer.Merge(keys[i + 1], EncodeAsUint64(i + 1))); + ASSERT_OK(writer.Delete(keys[i + 2])); + } + ASSERT_OK(writer.Finish()); + } + + void CheckFile(const std::string& file_name, + const std::vector<std::string>& keys, + bool check_global_seqno = false) { + ReadOptions ropts; + SstFileReader reader(options_); + ASSERT_OK(reader.Open(file_name)); + ASSERT_OK(reader.VerifyChecksum()); + std::unique_ptr<Iterator> iter(reader.NewIterator(ropts)); + iter->SeekToFirst(); + for (size_t i = 0; i + 2 < keys.size(); i += 3) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i]), 0); + ASSERT_EQ(iter->value().compare(keys[i]), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(keys[i + 1]), 0); + ASSERT_EQ(iter->value().compare(EncodeAsUint64(i + 1)), 0); + iter->Next(); + } + ASSERT_FALSE(iter->Valid()); + if (check_global_seqno) { + auto properties = reader.GetTableProperties(); + ASSERT_TRUE(properties); + auto& user_properties = properties->user_collected_properties; + ASSERT_TRUE( + user_properties.count(ExternalSstFilePropertyNames::kGlobalSeqno)); + } + } + + void CreateFileAndCheck(const std::vector<std::string>& keys) { + CreateFile(sst_name_, keys); + CheckFile(sst_name_, keys); + } + + protected: + Options options_; + EnvOptions soptions_; + std::string sst_name_; +}; + +const uint64_t kNumKeys = 100; + +TEST_F(SstFileReaderTest, Basic) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, Uint64Comparator) { + options_.comparator = test::Uint64Comparator(); + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsUint64(i)); + } + CreateFileAndCheck(keys); +} + +TEST_F(SstFileReaderTest, ReadFileWithGlobalSeqno) { + std::vector<std::string> keys; + for (uint64_t i = 0; i < kNumKeys; i++) { + keys.emplace_back(EncodeAsString(i)); + } + // Generate an SST file. + CreateFile(sst_name_, keys); + + // Ingest the file into a db, to assign it a global sequence number. + Options options; + options.create_if_missing = true; + std::string db_name = test::PerThreadDBPath("test_db"); + DB* db; + ASSERT_OK(DB::Open(options, db_name, &db)); + // Bump sequence number. + ASSERT_OK(db->Put(WriteOptions(), keys[0], "foo")); + ASSERT_OK(db->Flush(FlushOptions())); + // Ingest the file.
+ IngestExternalFileOptions ingest_options; + ingest_options.write_global_seqno = true; + ASSERT_OK(db->IngestExternalFile({sst_name_}, ingest_options)); + std::vector<std::string> live_files; + uint64_t manifest_file_size = 0; + ASSERT_OK(db->GetLiveFiles(live_files, &manifest_file_size)); + // Get the ingested file. + std::string ingested_file; + for (auto& live_file : live_files) { + if (live_file.substr(live_file.size() - 4, std::string::npos) == ".sst") { + if (ingested_file.empty() || ingested_file < live_file) { + ingested_file = live_file; + } + } + } + ASSERT_FALSE(ingested_file.empty()); + delete db; + + // Verify the file can be opened and read by SstFileReader. + CheckFile(db_name + ingested_file, keys, true /* check_global_seqno */); + + // Cleanup. + ASSERT_OK(DestroyDB(db_name, options)); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + +#else +#include <stdio.h> + +int main(int /*argc*/, char** /*argv*/) { + fprintf(stderr, + "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n"); + return 0; +} + +#endif // ROCKSDB_LITE diff --git a/thirdparty/rocksdb/table/sst_file_writer.cc b/thirdparty/rocksdb/table/sst_file_writer.cc index adcd91f92e..b9a7273e07 100644 --- a/thirdparty/rocksdb/table/sst_file_writer.cc +++ b/thirdparty/rocksdb/table/sst_file_writer.cc @@ -27,7 +27,7 @@ const size_t kFadviseTrigger = 1024 * 1024; // 1MB struct SstFileWriter::Rep { Rep(const EnvOptions& _env_options, const Options& options, Env::IOPriority _io_priority, const Comparator* _user_comparator, - ColumnFamilyHandle* _cfh, bool _invalidate_page_cache) + ColumnFamilyHandle* _cfh, bool _invalidate_page_cache, bool _skip_filters) : env_options(_env_options), ioptions(options), mutable_cf_options(options), @@ -35,7 +35,8 @@ struct SstFileWriter::Rep { internal_comparator(_user_comparator), cfh(_cfh), invalidate_page_cache(_invalidate_page_cache), - last_fadvise_size(0) {} + last_fadvise_size(0), + skip_filters(_skip_filters) {} std::unique_ptr<WritableFileWriter> file_writer; std::unique_ptr<TableBuilder> builder; @@ -49,11 +50,12 @@ std::string column_family_name; ColumnFamilyHandle* cfh; // If true, we will give the OS a hint that these file pages are not needed - // everytime we write 1MB to the file. + // every time we write 1MB to the file. bool invalidate_page_cache; // The size of the file during the last time we called Fadvise to remove // cached pages from page cache.
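The invalidate_page_cache / last_fadvise_size pair above implements a simple high-water-mark throttle: page-cache hints are issued only once roughly kFadviseTrigger new bytes have been written, or unconditionally when the file is closing. A sketch of that trigger arithmetic, with the actual fadvise call abstracted into a callback (the callback and struct names here are illustrative, not RocksDB's):

#include <cstdint>
#include <functional>

constexpr uint64_t kTrigger = 1024 * 1024;  // 1MB, matching kFadviseTrigger

// Call `dont_need` roughly once per kTrigger bytes written; on close, flush
// whatever hint is still pending regardless of size.
struct FadviseTracker {
  uint64_t last_hinted_size = 0;

  void MaybeInvalidate(uint64_t file_size, bool closing,
                       const std::function<void(uint64_t /*offset*/,
                                                uint64_t /*length*/)>& dont_need) {
    const uint64_t pending = file_size - last_hinted_size;
    if (pending > kTrigger || (closing && pending > 0)) {
      dont_need(last_hinted_size, pending);
      last_hinted_size = file_size;  // advance the high-water mark
    }
  }
};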
uint64_t last_fadvise_size; + bool skip_filters; Status Add(const Slice& user_key, const Slice& value, const ValueType value_type) { if (!builder) { @@ -99,6 +101,42 @@ struct SstFileWriter::Rep { return Status::OK(); } + Status DeleteRange(const Slice& begin_key, const Slice& end_key) { + if (!builder) { + return Status::InvalidArgument("File is not opened"); + } + + RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */); + if (file_info.num_range_del_entries == 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } else { + if (internal_comparator.user_comparator()->Compare( + tombstone.start_key_, file_info.smallest_range_del_key) < 0) { + file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), + tombstone.start_key_.size()); + } + if (internal_comparator.user_comparator()->Compare( + tombstone.end_key_, file_info.largest_range_del_key) > 0) { + file_info.largest_range_del_key.assign(tombstone.end_key_.data(), + tombstone.end_key_.size()); + } + } + + auto ikey_and_end_key = tombstone.Serialize(); + builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); + + // update file info + file_info.num_range_del_entries++; + file_info.file_size = builder->FileSize(); + + InvalidatePageCache(false /* closing */); + + return Status::OK(); + } + void InvalidatePageCache(bool closing) { if (invalidate_page_cache == false) { // Fadvise disabled @@ -122,9 +160,9 @@ SstFileWriter::SstFileWriter(const EnvOptions& env_options, const Comparator* user_comparator, ColumnFamilyHandle* column_family, bool invalidate_page_cache, - Env::IOPriority io_priority) + Env::IOPriority io_priority, bool skip_filters) : rep_(new Rep(env_options, options, io_priority, user_comparator, - column_family, invalidate_page_cache)) { + column_family, invalidate_page_cache, skip_filters)) { rep_->file_info.file_size = 0; } @@ -148,14 +186,24 @@ Status SstFileWriter::Open(const std::string& file_path) { sst_file->SetIOPriority(r->io_priority); CompressionType compression_type; + CompressionOptions compression_opts; if (r->ioptions.bottommost_compression != kDisableCompressionOption) { compression_type = r->ioptions.bottommost_compression; + if (r->ioptions.bottommost_compression_opts.enabled) { + compression_opts = r->ioptions.bottommost_compression_opts; + } else { + compression_opts = r->ioptions.compression_opts; + } } else if (!r->ioptions.compression_per_level.empty()) { // Use the compression of the last level if we have per level compression compression_type = *(r->ioptions.compression_per_level.rbegin()); + compression_opts = r->ioptions.compression_opts; } else { compression_type = r->mutable_cf_options.compression; + compression_opts = r->ioptions.compression_opts; } + uint64_t sample_for_compression = + r->mutable_cf_options.sample_for_compression; std::vector> int_tbl_prop_collector_factories; @@ -187,22 +235,21 @@ Status SstFileWriter::Open(const std::string& file_path) { } TableBuilderOptions table_builder_options( - r->ioptions, r->internal_comparator, &int_tbl_prop_collector_factories, - compression_type, r->ioptions.compression_opts, - nullptr /* compression_dict */, false /* skip_filters */, + r->ioptions, r->mutable_cf_options, r->internal_comparator, + &int_tbl_prop_collector_factories, compression_type, + sample_for_compression, compression_opts, r->skip_filters, r->column_family_name, unknown_level); - 
r->file_writer.reset( - new WritableFileWriter(std::move(sst_file), r->env_options)); + r->file_writer.reset(new WritableFileWriter( + std::move(sst_file), file_path, r->env_options, r->ioptions.env, + nullptr /* stats */, r->ioptions.listeners)); // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. r->builder.reset(r->ioptions.table_factory->NewTableBuilder( table_builder_options, cf_id, r->file_writer.get())); + r->file_info = ExternalSstFileInfo(); r->file_info.file_path = file_path; - r->file_info.file_size = 0; - r->file_info.num_entries = 0; - r->file_info.sequence_number = 0; r->file_info.version = 2; return s; } @@ -223,12 +270,18 @@ Status SstFileWriter::Delete(const Slice& user_key) { return rep_->Add(user_key, Slice(), ValueType::kTypeDeletion); } +Status SstFileWriter::DeleteRange(const Slice& begin_key, + const Slice& end_key) { + return rep_->DeleteRange(begin_key, end_key); +} + Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { Rep* r = rep_.get(); if (!r->builder) { return Status::InvalidArgument("File is not opened"); } - if (r->file_info.num_entries == 0) { + if (r->file_info.num_entries == 0 && + r->file_info.num_range_del_entries == 0) { return Status::InvalidArgument("Cannot create sst file with no entries"); } diff --git a/thirdparty/rocksdb/table/sst_file_writer_collectors.h b/thirdparty/rocksdb/table/sst_file_writer_collectors.h index ce3a45f5a7..e1827939f2 100644 --- a/thirdparty/rocksdb/table/sst_file_writer_collectors.h +++ b/thirdparty/rocksdb/table/sst_file_writer_collectors.h @@ -5,6 +5,8 @@ #pragma once #include +#include "db/dbformat.h" +#include "db/table_properties_collector.h" #include "rocksdb/types.h" #include "util/string_util.h" @@ -26,13 +28,21 @@ class SstFileWriterPropertiesCollector : public IntTblPropCollector { SequenceNumber global_seqno) : version_(version), global_seqno_(global_seqno) {} - virtual Status InternalAdd(const Slice& key, const Slice& value, - uint64_t file_size) override { + virtual Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/, + uint64_t /*file_size*/) override { // Intentionally left blank. Have no interest in collecting stats for // individual key/value pairs. return Status::OK(); } + virtual void BlockAdd(uint64_t /* blockRawBytes */, + uint64_t /* blockCompressedBytesFast */, + uint64_t /* blockCompressedBytesSlow */) override { + // Intentionally left blank. No interest in collecting stats for + // blocks. 
+ return; + } + virtual Status Finish(UserCollectedProperties* properties) override { // File version std::string version_val; @@ -68,7 +78,7 @@ class SstFileWriterPropertiesCollectorFactory : version_(version), global_seqno_(global_seqno) {} virtual IntTblPropCollector* CreateIntTblPropCollector( - uint32_t column_family_id) override { + uint32_t /*column_family_id*/) override { return new SstFileWriterPropertiesCollector(version_, global_seqno_); } diff --git a/thirdparty/rocksdb/table/table_builder.h b/thirdparty/rocksdb/table/table_builder.h index e5e7d6e22f..20d9a55f2f 100644 --- a/thirdparty/rocksdb/table/table_builder.h +++ b/thirdparty/rocksdb/table/table_builder.h @@ -13,6 +13,7 @@ #include #include #include +#include "db/dbformat.h" #include "db/table_properties_collector.h" #include "options/cf_options.h" #include "rocksdb/options.h" @@ -27,59 +28,83 @@ class Status; struct TableReaderOptions { // @param skip_filters Disables loading/accessing the filter block TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, - bool _skip_filters = false, int _level = -1) + bool _skip_filters = false, bool _immortal = false, + int _level = -1) + : TableReaderOptions(_ioptions, _prefix_extractor, _env_options, + _internal_comparator, _skip_filters, _immortal, + _level, 0 /* _largest_seqno */) {} + + // @param skip_filters Disables loading/accessing the filter block + TableReaderOptions(const ImmutableCFOptions& _ioptions, + const SliceTransform* _prefix_extractor, + const EnvOptions& _env_options, + const InternalKeyComparator& _internal_comparator, + bool _skip_filters, bool _immortal, int _level, + SequenceNumber _largest_seqno) : ioptions(_ioptions), + prefix_extractor(_prefix_extractor), env_options(_env_options), internal_comparator(_internal_comparator), skip_filters(_skip_filters), - level(_level) {} + immortal(_immortal), + level(_level), + largest_seqno(_largest_seqno) {} const ImmutableCFOptions& ioptions; + const SliceTransform* prefix_extractor; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; // This is only used for BlockBasedTable (reader) bool skip_filters; + // Whether the table will be valid as long as the DB is open + bool immortal; // what level this table/file is on, -1 for "not set, don't know" int level; + // largest seqno in the table + SequenceNumber largest_seqno; }; struct TableBuilderOptions { TableBuilderOptions( - const ImmutableCFOptions& _ioptions, + const ImmutableCFOptions& _ioptions, const MutableCFOptions& _moptions, const InternalKeyComparator& _internal_comparator, const std::vector>* _int_tbl_prop_collector_factories, - CompressionType _compression_type, - const CompressionOptions& _compression_opts, - const std::string* _compression_dict, bool _skip_filters, + CompressionType _compression_type, uint64_t _sample_for_compression, + const CompressionOptions& _compression_opts, bool _skip_filters, const std::string& _column_family_name, int _level, - const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0) + const uint64_t _creation_time = 0, const int64_t _oldest_key_time = 0, + const uint64_t _target_file_size = 0) : ioptions(_ioptions), + moptions(_moptions), internal_comparator(_internal_comparator), int_tbl_prop_collector_factories(_int_tbl_prop_collector_factories), compression_type(_compression_type), + sample_for_compression(_sample_for_compression), 
compression_opts(_compression_opts), - compression_dict(_compression_dict), skip_filters(_skip_filters), column_family_name(_column_family_name), level(_level), creation_time(_creation_time), - oldest_key_time(_oldest_key_time) {} + oldest_key_time(_oldest_key_time), + target_file_size(_target_file_size) {} const ImmutableCFOptions& ioptions; + const MutableCFOptions& moptions; const InternalKeyComparator& internal_comparator; const std::vector>* int_tbl_prop_collector_factories; CompressionType compression_type; + uint64_t sample_for_compression; const CompressionOptions& compression_opts; - // Data for presetting the compression library's dictionary, or nullptr. - const std::string* compression_dict; bool skip_filters; // only used by BlockBasedTableBuilder const std::string& column_family_name; int level; // what level this table/file is on, -1 for "not set, don't know" const uint64_t creation_time; const int64_t oldest_key_time; + const uint64_t target_file_size; }; // TableBuilder provides the interface used to build a Table diff --git a/thirdparty/rocksdb/table/table_properties.cc b/thirdparty/rocksdb/table/table_properties.cc index 24453f6f9c..b7aaea4816 100644 --- a/thirdparty/rocksdb/table/table_properties.cc +++ b/thirdparty/rocksdb/table/table_properties.cc @@ -78,6 +78,11 @@ std::string TableProperties::ToString( AppendProperty(result, "# data blocks", num_data_blocks, prop_delim, kv_delim); AppendProperty(result, "# entries", num_entries, prop_delim, kv_delim); + AppendProperty(result, "# deletions", num_deletions, prop_delim, kv_delim); + AppendProperty(result, "# merge operands", num_merge_operands, prop_delim, + kv_delim); + AppendProperty(result, "# range deletions", num_range_deletions, prop_delim, + kv_delim); AppendProperty(result, "raw key size", raw_key_size, prop_delim, kv_delim); AppendProperty(result, "raw average key size", @@ -90,7 +95,13 @@ std::string TableProperties::ToString( prop_delim, kv_delim); AppendProperty(result, "data block size", data_size, prop_delim, kv_delim); - AppendProperty(result, "index block size", index_size, prop_delim, kv_delim); + char index_block_size_str[80]; + snprintf(index_block_size_str, sizeof(index_block_size_str), + "index block size (user-key? %d, delta-value? %d)", + static_cast(index_key_is_user_key), + static_cast(index_value_is_delta_encoded)); + AppendProperty(result, index_block_size_str, index_size, prop_delim, + kv_delim); if (index_partitions != 0) { AppendProperty(result, "# index partitions", index_partitions, prop_delim, kv_delim); @@ -107,6 +118,11 @@ std::string TableProperties::ToString( filter_policy_name.empty() ? std::string("N/A") : filter_policy_name, prop_delim, kv_delim); + AppendProperty(result, "prefix extractor name", + prefix_extractor_name.empty() ? std::string("N/A") + : prefix_extractor_name, + prop_delim, kv_delim); + AppendProperty(result, "column family ID", column_family_id == rocksdb::TablePropertiesCollectorFactory:: Context::kUnknownColumnFamily @@ -137,6 +153,11 @@ std::string TableProperties::ToString( compression_name.empty() ? std::string("N/A") : compression_name, prop_delim, kv_delim); + AppendProperty( + result, "SST file compression options", + compression_options.empty() ? 
std::string("N/A") : compression_options, + prop_delim, kv_delim); + AppendProperty(result, "creation time", creation_time, prop_delim, kv_delim); AppendProperty(result, "time stamp of earliest key", oldest_key_time, @@ -150,11 +171,16 @@ void TableProperties::Add(const TableProperties& tp) { index_size += tp.index_size; index_partitions += tp.index_partitions; top_level_index_size += tp.top_level_index_size; + index_key_is_user_key += tp.index_key_is_user_key; + index_value_is_delta_encoded += tp.index_value_is_delta_encoded; filter_size += tp.filter_size; raw_key_size += tp.raw_key_size; raw_value_size += tp.raw_value_size; num_data_blocks += tp.num_data_blocks; num_entries += tp.num_entries; + num_deletions += tp.num_deletions; + num_merge_operands += tp.num_merge_operands; + num_range_deletions += tp.num_range_deletions; } const std::string TablePropertiesNames::kDataSize = @@ -165,6 +191,10 @@ const std::string TablePropertiesNames::kIndexPartitions = "rocksdb.index.partitions"; const std::string TablePropertiesNames::kTopLevelIndexSize = "rocksdb.top-level.index.size"; +const std::string TablePropertiesNames::kIndexKeyIsUserKey = + "rocksdb.index.key.is.user.key"; +const std::string TablePropertiesNames::kIndexValueIsDeltaEncoded = + "rocksdb.index.value.is.delta.encoded"; const std::string TablePropertiesNames::kFilterSize = "rocksdb.filter.size"; const std::string TablePropertiesNames::kRawKeySize = @@ -175,6 +205,11 @@ const std::string TablePropertiesNames::kNumDataBlocks = "rocksdb.num.data.blocks"; const std::string TablePropertiesNames::kNumEntries = "rocksdb.num.entries"; +const std::string TablePropertiesNames::kDeletedKeys = "rocksdb.deleted.keys"; +const std::string TablePropertiesNames::kMergeOperands = + "rocksdb.merge.operands"; +const std::string TablePropertiesNames::kNumRangeDeletions = + "rocksdb.num.range-deletions"; const std::string TablePropertiesNames::kFilterPolicy = "rocksdb.filter.policy"; const std::string TablePropertiesNames::kFormatVersion = @@ -193,6 +228,8 @@ const std::string TablePropertiesNames::kPrefixExtractorName = const std::string TablePropertiesNames::kPropertyCollectors = "rocksdb.property.collectors"; const std::string TablePropertiesNames::kCompression = "rocksdb.compression"; +const std::string TablePropertiesNames::kCompressionOptions = + "rocksdb.compression_options"; const std::string TablePropertiesNames::kCreationTime = "rocksdb.creation.time"; const std::string TablePropertiesNames::kOldestKeyTime = "rocksdb.oldest.key.time"; @@ -215,8 +252,9 @@ Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found) { // Seek to the compression dictionary block. // Return true if it successfully seeks to that block. -Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found) { - return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found); +Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, + BlockHandle* block_handle) { + return SeekToMetaBlock(meta_iter, kCompressionDictBlock, is_found, block_handle); } Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, diff --git a/thirdparty/rocksdb/table/table_properties_internal.h b/thirdparty/rocksdb/table/table_properties_internal.h index 2a89427341..888b43d245 100644 --- a/thirdparty/rocksdb/table/table_properties_internal.h +++ b/thirdparty/rocksdb/table/table_properties_internal.h @@ -10,7 +10,6 @@ namespace rocksdb { -class InternalIterator; class BlockHandle; // Seek to the properties block. 
@@ -21,7 +20,8 @@ Status SeekToPropertiesBlock(InternalIterator* meta_iter, bool* is_found); // Seek to the compression dictionary block. // If it successfully seeks to the properties block, "is_found" will be // set to true. -Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found); +Status SeekToCompressionDictBlock(InternalIterator* meta_iter, bool* is_found, + BlockHandle* block_handle); // TODO(andrewkr) should not put all meta block in table_properties.h/cc Status SeekToRangeDelBlock(InternalIterator* meta_iter, bool* is_found, diff --git a/thirdparty/rocksdb/table/table_reader.h b/thirdparty/rocksdb/table/table_reader.h index 18fcda2737..a5f15e1304 100644 --- a/thirdparty/rocksdb/table/table_reader.h +++ b/thirdparty/rocksdb/table/table_reader.h @@ -9,6 +9,8 @@ #pragma once #include +#include "db/range_tombstone_fragmenter.h" +#include "rocksdb/slice_transform.h" #include "table/internal_iterator.h" namespace rocksdb { @@ -20,7 +22,6 @@ class Arena; struct ReadOptions; struct TableProperties; class GetContext; -class InternalIterator; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -39,11 +40,13 @@ class TableReader { // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. virtual InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, Arena* arena = nullptr, - bool skip_filters = false) = 0; + bool skip_filters = false, + bool for_compaction = false) = 0; - virtual InternalIterator* NewRangeTombstoneIterator( - const ReadOptions& read_options) { + virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + const ReadOptions& /*read_options*/) { return nullptr; } @@ -62,7 +65,7 @@ class TableReader { virtual std::shared_ptr GetTableProperties() const = 0; // Prepare work that can be done before the real Get() - virtual void Prepare(const Slice& target) {} + virtual void Prepare(const Slice& /*target*/) {} // Report an approximation of how much memory has been used. virtual size_t ApproximateMemoryUsage() const = 0; @@ -79,7 +82,9 @@ class TableReader { // skip_filters: disables checking the bloom filters even if they exist. This // option is effective only for block-based table format. 
virtual Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, bool skip_filters = false) = 0; + GetContext* get_context, + const SliceTransform* prefix_extractor, + bool skip_filters = false) = 0; // Prefetch data corresponding to a give range of keys // Typically this functionality is required for table implementations that @@ -94,7 +99,8 @@ class TableReader { } // convert db file to a human readable form - virtual Status DumpTable(WritableFile* out_file) { + virtual Status DumpTable(WritableFile* /*out_file*/, + const SliceTransform* /*prefix_extractor*/) { return Status::NotSupported("DumpTable() not supported"); } diff --git a/thirdparty/rocksdb/table/table_reader_bench.cc b/thirdparty/rocksdb/table/table_reader_bench.cc index 85e48c1fea..a9b75715b5 100644 --- a/thirdparty/rocksdb/table/table_reader_bench.cc +++ b/thirdparty/rocksdb/table/table_reader_bench.cc @@ -11,8 +11,6 @@ int main() { } #else -#include - #include "db/db_impl.h" #include "db/dbformat.h" #include "monitoring/histogram.h" @@ -25,11 +23,12 @@ int main() { #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "util/file_reader_writer.h" +#include "util/gflags_compat.h" #include "util/testharness.h" #include "util/testutil.h" -using GFLAGS::ParseCommandLineFlags; -using GFLAGS::SetUsageMessage; +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; namespace rocksdb { @@ -71,37 +70,39 @@ uint64_t Now(Env* env, bool measured_by_nanosecond) { namespace { void TableReaderBenchmark(Options& opts, EnvOptions& env_options, ReadOptions& read_options, int num_keys1, - int num_keys2, int num_iter, int prefix_len, + int num_keys2, int num_iter, int /*prefix_len*/, bool if_query_empty_keys, bool for_iterator, bool through_db, bool measured_by_nanosecond) { rocksdb::InternalKeyComparator ikc(opts.comparator); - std::string file_name = test::TmpDir() - + "/rocksdb_table_reader_benchmark"; - std::string dbname = test::TmpDir() + "/rocksdb_table_reader_bench_db"; + std::string file_name = + test::PerThreadDBPath("rocksdb_table_reader_benchmark"); + std::string dbname = test::PerThreadDBPath("rocksdb_table_reader_bench_db"); WriteOptions wo; Env* env = Env::Default(); TableBuilder* tb = nullptr; DB* db = nullptr; Status s; const ImmutableCFOptions ioptions(opts); - unique_ptr file_writer; + const ColumnFamilyOptions cfo(opts); + const MutableCFOptions moptions(cfo); + std::unique_ptr file_writer; if (!through_db) { - unique_ptr file; + std::unique_ptr file; env->NewWritableFile(file_name, &file, env_options); std::vector > int_tbl_prop_collector_factories; - file_writer.reset(new WritableFileWriter(std::move(file), env_options)); + file_writer.reset( + new WritableFileWriter(std::move(file), file_name, env_options)); int unknown_level = -1; tb = opts.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, - CompressionType::kNoCompression, - CompressionOptions(), - nullptr /* compression_dict */, - false /* skip_filters */, kDefaultColumnFamilyName, - unknown_level), + TableBuilderOptions( + ioptions, moptions, ikc, &int_tbl_prop_collector_factories, + CompressionType::kNoCompression, 0 /* sample_for_compression */, + CompressionOptions(), false /* skip_filters */, + kDefaultColumnFamilyName, unknown_level), 0 /* column_family_id */, file_writer.get()); } else { s = DB::Open(opts, dbname, &db); @@ -126,9 +127,9 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, 
db->Flush(FlushOptions()); } - unique_ptr table_reader; + std::unique_ptr table_reader; if (!through_db) { - unique_ptr raf; + std::unique_ptr raf; s = env->NewRandomAccessFile(file_name, &raf, env_options); if (!s.ok()) { fprintf(stderr, "Create File Error: %s\n", s.ToString().c_str()); @@ -136,11 +137,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } uint64_t file_size; env->GetFileSize(file_name, &file_size); - unique_ptr file_reader( + std::unique_ptr file_reader( new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( - TableReaderOptions(ioptions, env_options, ikc), std::move(file_reader), - file_size, &table_reader); + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), + env_options, ikc), + std::move(file_reader), file_size, &table_reader); if (!s.ok()) { fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); exit(1); @@ -168,13 +170,13 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, if (!through_db) { PinnableSlice value; MergeContext merge_context; - RangeDelAggregator range_del_agg(ikc, {} /* snapshots */); + SequenceNumber max_covering_tombstone_seq = 0; GetContext get_context(ioptions.user_comparator, ioptions.merge_operator, ioptions.info_log, ioptions.statistics, GetContext::kNotFound, Slice(key), &value, nullptr, &merge_context, - &range_del_agg, env); - s = table_reader->Get(read_options, key, &get_context); + &max_covering_tombstone_seq, env); + s = table_reader->Get(read_options, key, &get_context, nullptr); } else { s = db->Get(read_options, key, &result); } @@ -196,7 +198,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, Iterator* iter = nullptr; InternalIterator* iiter = nullptr; if (!through_db) { - iiter = table_reader->NewIterator(read_options); + iiter = table_reader->NewIterator(read_options, nullptr); } else { iter = db->NewIterator(read_options); } diff --git a/thirdparty/rocksdb/table/table_test.cc b/thirdparty/rocksdb/table/table_test.cc index 178cf4243d..f217fe50aa 100644 --- a/thirdparty/rocksdb/table/table_test.cc +++ b/thirdparty/rocksdb/table/table_test.cc @@ -37,6 +37,7 @@ #include "table/block_based_table_factory.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" +#include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" #include "table/internal_iterator.h" @@ -64,13 +65,17 @@ namespace { // DummyPropertiesCollector used to test BlockBasedTableProperties class DummyPropertiesCollector : public TablePropertiesCollector { public: - const char* Name() const { return ""; } + const char* Name() const override { return ""; } - Status Finish(UserCollectedProperties* properties) { return Status::OK(); } + Status Finish(UserCollectedProperties* /*properties*/) override { + return Status::OK(); + } - Status Add(const Slice& user_key, const Slice& value) { return Status::OK(); } + Status Add(const Slice& /*user_key*/, const Slice& /*value*/) override { + return Status::OK(); + } - virtual UserCollectedProperties GetReadableProperties() const { + UserCollectedProperties GetReadableProperties() const override { return UserCollectedProperties{}; } }; @@ -78,21 +83,21 @@ class DummyPropertiesCollector : public TablePropertiesCollector { class DummyPropertiesCollectorFactory1 : public TablePropertiesCollectorFactory { public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) { + TablePropertiesCollector* 
CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const { return "DummyPropertiesCollector1"; } + const char* Name() const override { return "DummyPropertiesCollector1"; } }; class DummyPropertiesCollectorFactory2 : public TablePropertiesCollectorFactory { public: - virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) { + TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context /*context*/) override { return new DummyPropertiesCollector(); } - const char* Name() const { return "DummyPropertiesCollector2"; } + const char* Name() const override { return "DummyPropertiesCollector2"; } }; // Return reverse of "key". @@ -105,23 +110,23 @@ std::string Reverse(const Slice& key) { class ReverseKeyComparator : public Comparator { public: - virtual const char* Name() const override { + const char* Name() const override { return "rocksdb.ReverseBytewiseComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const override { + int Compare(const Slice& a, const Slice& b) const override { return BytewiseComparator()->Compare(Reverse(a), Reverse(b)); } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const override { + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { std::string s = Reverse(*start); std::string l = Reverse(limit); BytewiseComparator()->FindShortestSeparator(&s, l); *start = Reverse(s); } - virtual void FindShortSuccessor(std::string* key) const override { + void FindShortSuccessor(std::string* key) const override { std::string s = Reverse(*key); BytewiseComparator()->FindShortSuccessor(&s); *key = Reverse(s); @@ -159,6 +164,7 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector<std::string>* keys, stl_wrappers::KVMap* kvmap) { @@ -169,7 +175,7 @@ class Constructor { keys->push_back(kv.first); } data_.clear(); - Status s = FinishImpl(options, ioptions, table_options, + Status s = FinishImpl(options, ioptions, moptions, table_options, internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } @@ -177,11 +183,13 @@ class Constructor { // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const stl_wrappers::KVMap& data) = 0; - virtual InternalIterator* NewIterator() const = 0; + virtual InternalIterator* NewIterator( + const SliceTransform* prefix_extractor = nullptr) const = 0; virtual const stl_wrappers::KVMap& data() { return data_; } @@ -204,14 +212,13 @@ class BlockConstructor: public Constructor { : Constructor(cmp), comparator_(cmp), block_(nullptr) { } - ~BlockConstructor() { - delete block_; - } - virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + ~BlockConstructor() override { delete block_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& table_options, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete block_; block_ = nullptr; BlockBuilder builder(table_options.block_restart_interval); @@ -223,12 +230,12 @@ class BlockConstructor: public Constructor { data_ = builder.Finish().ToString(); BlockContents contents; contents.data = data_; - contents.cachable = false; block_ = new Block(std::move(contents), kDisableGlobalSequenceNumber); return Status::OK(); } - virtual InternalIterator* NewIterator() const override { - return block_->NewIterator(comparator_); + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { + return block_->NewIterator(comparator_, comparator_); } private: @@ -245,32 +252,32 @@ class KeyConvertingIterator : public InternalIterator { explicit KeyConvertingIterator(InternalIterator* iter, bool arena_mode = false) : iter_(iter), arena_mode_(arena_mode) {} - virtual ~KeyConvertingIterator() { + ~KeyConvertingIterator() override { if (arena_mode_) { iter_->~InternalIterator(); } else { delete iter_; } } - virtual bool Valid() const override { return iter_->Valid(); } - virtual void Seek(const Slice& target) override { + bool Valid() const override { return iter_->Valid() && status_.ok(); } + void Seek(const Slice& target) override { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); std::string encoded; AppendInternalKey(&encoded, ikey); iter_->Seek(encoded); } - virtual void SeekForPrev(const Slice& target) override { + void SeekForPrev(const Slice& target) override { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); std::string encoded; AppendInternalKey(&encoded, ikey); iter_->SeekForPrev(encoded); } -
virtual void SeekToFirst() override { iter_->SeekToFirst(); } - virtual void SeekToLast() override { iter_->SeekToLast(); } - virtual void Next() override { iter_->Next(); } - virtual void Prev() override { iter_->Prev(); } + void SeekToFirst() override { iter_->SeekToFirst(); } + void SeekToLast() override { iter_->SeekToLast(); } + void Next() override { iter_->Next(); } + void Prev() override { iter_->Prev(); } - virtual Slice key() const override { + Slice key() const override { assert(Valid()); ParsedInternalKey parsed_key; if (!ParseInternalKey(iter_->key(), &parsed_key)) { @@ -280,8 +287,8 @@ class KeyConvertingIterator : public InternalIterator { return parsed_key.user_key; } - virtual Slice value() const override { return iter_->value(); } - virtual Status status() const override { + Slice value() const override { return iter_->value(); } + Status status() const override { return status_.ok() ? iter_->status() : status_; } @@ -298,31 +305,32 @@ class TableConstructor: public Constructor { public: explicit TableConstructor(const Comparator* cmp, - bool convert_to_internal_key = false) + bool convert_to_internal_key = false, + int level = -1) : Constructor(cmp), - convert_to_internal_key_(convert_to_internal_key) {} - ~TableConstructor() { Reset(); } - - virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + convert_to_internal_key_(convert_to_internal_key), + level_(level) {} + ~TableConstructor() override { Reset(); } + + Status FinishImpl(const Options& options, const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& internal_comparator, + const stl_wrappers::KVMap& kv_map) override { Reset(); soptions.use_mmap_reads = ioptions.allow_mmap_reads; - file_writer_.reset(test::GetWritableFileWriter(new test::StringSink())); - unique_ptr<TableBuilder> builder; + file_writer_.reset(test::GetWritableFileWriter(new test::StringSink(), + "" /* don't care */)); + std::unique_ptr<TableBuilder> builder; std::vector<std::unique_ptr<IntTblPropCollectorFactory>> int_tbl_prop_collector_factories; std::string column_family_name; - int unknown_level = -1; builder.reset(ioptions.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, internal_comparator, + TableBuilderOptions(ioptions, moptions, internal_comparator, &int_tbl_prop_collector_factories, - options.compression, CompressionOptions(), - nullptr /* compression_dict */, - false /* skip_filters */, column_family_name, - unknown_level), + options.compression, options.sample_for_compression, + options.compression_opts, false /* skip_filters */, + column_family_name, level_), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, file_writer_.get())); @@ -341,20 +349,26 @@ class TableConstructor: public Constructor { file_writer_->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ(GetSink()->contents().size(), builder->FileSize()); + EXPECT_EQ(TEST_GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + const bool kSkipFilters = true; + const bool kImmortal = true; return ioptions.table_factory->NewTableReader( -
TableReaderOptions(ioptions, soptions, internal_comparator), - std::move(file_reader_), GetSink()->contents().size(), &table_reader_); + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + internal_comparator, !kSkipFilters, !kImmortal, + level_), + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } - virtual InternalIterator* NewIterator() const override { + InternalIterator* NewIterator( + const SliceTransform* prefix_extractor) const override { ReadOptions ro; - InternalIterator* iter = table_reader_->NewIterator(ro); + InternalIterator* iter = table_reader_->NewIterator(ro, prefix_extractor); if (convert_to_internal_key_) { return new KeyConvertingIterator(iter); } else { @@ -371,19 +385,20 @@ return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const ImmutableCFOptions& ioptions) { + virtual Status Reopen(const ImmutableCFOptions& ioptions, + const MutableCFOptions& moptions) { file_reader_.reset(test::GetRandomAccessFileReader(new test::StringSource( - GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); + TEST_GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); return ioptions.table_factory->NewTableReader( - TableReaderOptions(ioptions, soptions, *last_internal_key_), - std::move(file_reader_), GetSink()->contents().size(), &table_reader_); + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), soptions, + *last_internal_key_), + std::move(file_reader_), TEST_GetSink()->contents().size(), + &table_reader_); } - virtual TableReader* GetTableReader() { - return table_reader_.get(); - } + virtual TableReader* GetTableReader() { return table_reader_.get(); } - virtual bool AnywayDeleteIterator() const override { + bool AnywayDeleteIterator() const override { return convert_to_internal_key_; } @@ -391,6 +406,10 @@ bool ConvertToInternalKey() { return convert_to_internal_key_; } + test::StringSink* TEST_GetSink() { + return static_cast<test::StringSink*>(file_writer_->writable_file()); + } + private: void Reset() { uniq_id_ = 0; table_reader_.reset(); file_writer_.reset(); file_reader_.reset(); } - test::StringSink* GetSink() { - return static_cast<test::StringSink*>(file_writer_->writable_file()); - } - uint64_t uniq_id_; - unique_ptr<WritableFileWriter> file_writer_; - unique_ptr<RandomAccessFileReader> file_reader_; - unique_ptr<TableReader> table_reader_; + std::unique_ptr<WritableFileWriter> file_writer_; + std::unique_ptr<RandomAccessFileReader> file_reader_; + std::unique_ptr<TableReader> table_reader_; bool convert_to_internal_key_; + int level_; TableConstructor(); @@ -430,13 +446,12 @@ class MemTableConstructor: public Constructor { wb, kMaxSequenceNumber, 0 /* column_family_id */); memtable_->Ref(); } - ~MemTableConstructor() { - delete memtable_->Unref(); - } - virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + ~MemTableConstructor() override { delete memtable_->Unref(); } + Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete memtable_->Unref(); ImmutableCFOptions mem_ioptions(ioptions); memtable_ = new MemTable(internal_comparator_, mem_ioptions, @@ -450,14 +465,15 @@ } return
Status::OK(); } - virtual InternalIterator* NewIterator() const override { + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { return new KeyConvertingIterator( memtable_->NewIterator(ReadOptions(), &arena_), true); } - virtual bool AnywayDeleteIterator() const override { return true; } + bool AnywayDeleteIterator() const override { return true; } - virtual bool IsArenaMode() const override { return true; } + bool IsArenaMode() const override { return true; } private: mutable Arena arena_; @@ -471,21 +487,19 @@ class InternalIteratorFromIterator : public InternalIterator { public: explicit InternalIteratorFromIterator(Iterator* it) : it_(it) {} - virtual bool Valid() const override { return it_->Valid(); } - virtual void Seek(const Slice& target) override { it_->Seek(target); } - virtual void SeekForPrev(const Slice& target) override { - it_->SeekForPrev(target); - } - virtual void SeekToFirst() override { it_->SeekToFirst(); } - virtual void SeekToLast() override { it_->SeekToLast(); } - virtual void Next() override { it_->Next(); } - virtual void Prev() override { it_->Prev(); } + bool Valid() const override { return it_->Valid(); } + void Seek(const Slice& target) override { it_->Seek(target); } + void SeekForPrev(const Slice& target) override { it_->SeekForPrev(target); } + void SeekToFirst() override { it_->SeekToFirst(); } + void SeekToLast() override { it_->SeekToLast(); } + void Next() override { it_->Next(); } + void Prev() override { it_->Prev(); } Slice key() const override { return it_->key(); } Slice value() const override { return it_->value(); } - virtual Status status() const override { return it_->status(); } + Status status() const override { return it_->status(); } private: - unique_ptr<Iterator> it_; + std::unique_ptr<Iterator> it_; }; class DBConstructor: public Constructor { @@ -496,14 +510,13 @@ db_ = nullptr; NewDB(); } - ~DBConstructor() { - delete db_; - } - virtual Status FinishImpl(const Options& options, - const ImmutableCFOptions& ioptions, - const BlockBasedTableOptions& table_options, - const InternalKeyComparator& internal_comparator, - const stl_wrappers::KVMap& kv_map) override { + ~DBConstructor() override { delete db_; } + Status FinishImpl(const Options& /*options*/, + const ImmutableCFOptions& /*ioptions*/, + const MutableCFOptions& /*moptions*/, + const BlockBasedTableOptions& /*table_options*/, + const InternalKeyComparator& /*internal_comparator*/, + const stl_wrappers::KVMap& kv_map) override { delete db_; db_ = nullptr; NewDB(); @@ -515,15 +528,16 @@ return Status::OK(); } - virtual InternalIterator* NewIterator() const override { + InternalIterator* NewIterator( + const SliceTransform* /*prefix_extractor*/) const override { return new InternalIteratorFromIterator(db_->NewIterator(ReadOptions())); } - virtual DB* db() const override { return db_; } + DB* db() const override { return db_; } private: void NewDB() { - std::string name = test::TmpDir() + "/table_testdb"; + std::string name = test::PerThreadDBPath("table_testdb"); Options options; options.comparator = comparator_; @@ -655,9 +669,9 @@ class FixedOrLessPrefixTransform : public SliceTransform { prefix_len_(prefix_len) { } - virtual const char* Name() const override { return "rocksdb.FixedPrefix"; } + const char* Name() const override { return "rocksdb.FixedPrefix"; } - virtual Slice Transform(const Slice& src) const override { + Slice
Transform(const Slice& src) const override { assert(InDomain(src)); if (src.size() < prefix_len_) { return src; @@ -665,17 +679,19 @@ class FixedOrLessPrefixTransform : public SliceTransform { return Slice(src.data(), prefix_len_); } - virtual bool InDomain(const Slice& src) const override { return true; } + bool InDomain(const Slice& /*src*/) const override { return true; } - virtual bool InRange(const Slice& dst) const override { + bool InRange(const Slice& dst) const override { return (dst.size() <= prefix_len_); } + bool FullLengthEnabled(size_t* /*len*/) const override { return false; } }; class HarnessTest : public testing::Test { public: HarnessTest() : ioptions_(options_), + moptions_(options_), constructor_(nullptr), write_buffer_(options_.db_write_buffer_size) {} @@ -774,9 +790,10 @@ break; } ioptions_ = ImmutableCFOptions(options_); + moptions_ = MutableCFOptions(options_); } - ~HarnessTest() { delete constructor_; } + ~HarnessTest() override { delete constructor_; } void Add(const std::string& key, const std::string& value) { constructor_->Add(key, value); @@ -785,7 +802,7 @@ void Test(Random* rnd) { std::vector<std::string> keys; stl_wrappers::KVMap data; - constructor_->Finish(options_, ioptions_, table_options_, + constructor_->Finish(options_, ioptions_, moptions_, table_options_, *internal_comparator_, &keys, &data); TestForwardScan(keys, data); @@ -795,7 +812,7 @@ TestRandomAccess(rnd, keys, data); } - void TestForwardScan(const std::vector<std::string>& keys, + void TestForwardScan(const std::vector<std::string>& /*keys*/, const stl_wrappers::KVMap& data) { InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); @@ -813,7 +830,7 @@ } } - void TestBackwardScan(const std::vector<std::string>& keys, + void TestBackwardScan(const std::vector<std::string>& /*keys*/, const stl_wrappers::KVMap& data) { InternalIterator* iter = constructor_->NewIterator(); ASSERT_TRUE(!iter->Valid()); @@ -963,15 +980,38 @@ // Returns nullptr if not running against a DB DB* db() const { return constructor_->db(); } + void RandomizedHarnessTest(size_t part, size_t total) { + std::vector<TestArgs> args = GenerateArgList(); + assert(part); + assert(part <= total); + for (size_t i = 0; i < args.size(); i++) { + if ((i % total) + 1 != part) { + continue; + } + Init(args[i]); + Random rnd(test::RandomSeed() + 5); + for (int num_entries = 0; num_entries < 2000; + num_entries += (num_entries < 50 ?
1 : 200)) { + for (int e = 0; e < num_entries; e++) { + std::string v; + Add(test::RandomKey(&rnd, rnd.Skewed(4)), + test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); + } + Test(&rnd); + } + } + } + private: Options options_ = Options(); ImmutableCFOptions ioptions_; + MutableCFOptions moptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; WriteBufferManager write_buffer_; bool support_prev_; bool only_support_prefix_seek_; - shared_ptr<InternalKeyComparator> internal_comparator_; + std::shared_ptr<InternalKeyComparator> internal_comparator_; }; static bool Between(uint64_t val, uint64_t low, uint64_t high) { @@ -1003,9 +1043,32 @@ class TableTest : public testing::Test { }; class GeneralTableTest : public TableTest {}; -class BlockBasedTableTest : public TableTest {}; +class BlockBasedTableTest + : public TableTest, + virtual public ::testing::WithParamInterface<uint32_t> { + public: + BlockBasedTableTest() : format_(GetParam()) {} + + BlockBasedTableOptions GetBlockBasedTableOptions() { + BlockBasedTableOptions options; + options.format_version = format_; + return options; + } + + protected: + uint64_t IndexUncompressedHelper(bool indexCompress); + + private: + uint32_t format_; +}; class PlainTableTest : public TableTest {}; class TablePropertyTest : public testing::Test {}; +class BBTTailPrefetchTest : public TableTest {}; + +INSTANTIATE_TEST_CASE_P(FormatDef, BlockBasedTableTest, + testing::Values(test::kDefaultFormatVersion)); +INSTANTIATE_TEST_CASE_P(FormatLatest, BlockBasedTableTest, + testing::Values(test::kLatestFormatVersion)); // This test serves as the living tutorial for the prefix scan of user collected // properties. @@ -1046,7 +1109,7 @@ TEST_F(TablePropertyTest, PrefixScanTest) { // This test includes all the basic checks except those for index size and block // size, which will be conducted in separate unit tests.
-TEST_F(BlockBasedTableTest, BasicBlockBasedTableProperties) { +TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("a1", "val1"); @@ -1064,13 +1127,18 @@ stl_wrappers::KVMap kvmap; Options options; options.compression = kNoCompression; - BlockBasedTableOptions table_options; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + ioptions.statistics = options.statistics.get(); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); + ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); @@ -1094,7 +1162,43 @@ c.ResetTableReader(); } -TEST_F(BlockBasedTableTest, BlockBasedTableProperties2) { +#ifdef SNAPPY +uint64_t BlockBasedTableTest::IndexUncompressedHelper(bool compressed) { + TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + constexpr size_t kNumKeys = 10000; + + for (size_t k = 0; k < kNumKeys; ++k) { + c.Add("key" + ToString(k), "val" + ToString(k)); + } + + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + Options options; + options.compression = kSnappyCompression; + options.statistics = CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kAll); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_restart_interval = 1; + table_options.enable_index_compression = compressed; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + ioptions.statistics = options.statistics.get(); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + c.ResetTableReader(); + return options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); +} +TEST_P(BlockBasedTableTest, IndexUncompressed) { + uint64_t tbl1_compressed_cnt = IndexUncompressedHelper(true); + uint64_t tbl2_compressed_cnt = IndexUncompressedHelper(false); + // tbl1_compressed_cnt should include 1 index block + EXPECT_EQ(tbl2_compressed_cnt + 1, tbl1_compressed_cnt); +} +#endif // SNAPPY + +TEST_P(BlockBasedTableTest, BlockBasedTableProperties2) { TableConstructor c(&reverse_key_comparator); std::vector<std::string> keys; stl_wrappers::KVMap kvmap; @@ -1102,11 +1206,12 @@ { Options options; options.compression = CompressionType::kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options,
GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1128,7 +1233,7 @@ { Options options; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.comparator = &reverse_key_comparator; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); @@ -1139,7 +1244,8 @@ new DummyPropertiesCollectorFactory2()); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1154,11 +1260,18 @@ } } -TEST_F(BlockBasedTableTest, RangeDelBlock) { +TEST_P(BlockBasedTableTest, RangeDelBlock) { TableConstructor c(BytewiseComparator()); std::vector<std::string> keys = {"1pika", "2chu"}; std::vector<std::string> vals = {"p", "c"}; + std::vector<RangeTombstone> expected_tombstones = { + {"1pika", "2chu", 0}, + {"2chu", "c", 1}, + {"2chu", "c", 0}, + {"c", "p", 0}, + }; + for (int i = 0; i < 2; i++) { RangeTombstone t(keys[i], vals[i], i); std::pair<InternalKey, Slice> p = t.Serialize(); @@ -1169,15 +1282,16 @@ stl_wrappers::KVMap kvmap; Options options; options.compression = kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); std::unique_ptr<InternalKeyComparator> internal_cmp( new InternalKeyComparator(options.comparator)); - c.Finish(options, ioptions, table_options, *internal_cmp, &sorted_keys, - &kvmap); + c.Finish(options, ioptions, moptions, table_options, *internal_cmp, + &sorted_keys, &kvmap); for (int j = 0; j < 2; ++j) { std::unique_ptr<InternalIterator> iter( @@ -1190,32 +1304,34 @@ ASSERT_FALSE(iter->Valid()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); - for (int i = 0; i < 2; i++) { + for (size_t i = 0; i < expected_tombstones.size(); i++) { ASSERT_TRUE(iter->Valid()); ParsedInternalKey parsed_key; ASSERT_TRUE(ParseInternalKey(iter->key(), &parsed_key)); RangeTombstone t(parsed_key, iter->value()); - ASSERT_EQ(t.start_key_, keys[i]); - ASSERT_EQ(t.end_key_, vals[i]); - ASSERT_EQ(t.seq_, i); + const auto& expected_t = expected_tombstones[i]; + ASSERT_EQ(t.start_key_, expected_t.start_key_); + ASSERT_EQ(t.end_key_, expected_t.end_key_); + ASSERT_EQ(t.seq_, expected_t.seq_); iter->Next(); } ASSERT_TRUE(!iter->Valid()); } } -TEST_F(BlockBasedTableTest, FilterPolicyNameProperties) { +TEST_P(BlockBasedTableTest, FilterPolicyNameProperties) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("a1", "val1"); std::vector<std::string> keys; stl_wrappers::KVMap kvmap; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions,
table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); @@ -1258,13 +1374,14 @@ void PrefetchRange(TableConstructor* c, Options* opt, table_options->block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt->table_factory.reset(NewBlockBasedTableFactory(*table_options)); const ImmutableCFOptions ioptions2(*opt); - ASSERT_OK(c->Reopen(ioptions2)); + const MutableCFOptions moptions(*opt); + ASSERT_OK(c->Reopen(ioptions2, moptions)); // prefetch auto* table_reader = dynamic_cast<BlockBasedTable*>(c->GetTableReader()); Status s; - unique_ptr<Slice> begin, end; - unique_ptr<InternalKey> i_begin, i_end; + std::unique_ptr<Slice> begin, end; + std::unique_ptr<InternalKey> i_begin, i_end; if (key_begin != nullptr) { if (c->ConvertToInternalKey()) { i_begin.reset(new InternalKey(key_begin, kMaxSequenceNumber, kTypeValue)); @@ -1291,14 +1408,14 @@ c->ResetTableReader(); } -TEST_F(BlockBasedTableTest, PrefetchTest) { +TEST_P(BlockBasedTableTest, PrefetchTest) { // The purpose of this test is to test the prefetching operation built into // BlockBasedTable. Options opt; - unique_ptr<InternalKeyComparator> ikc; + std::unique_ptr<InternalKeyComparator> ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.compression = kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_size = 1024; // big enough so we don't ever lose cached values. table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); @@ -1315,7 +1432,8 @@ std::vector<std::string> keys; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(opt); - c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); c.ResetTableReader(); // We get the following data spread : @@ -1358,8 +1476,8 @@ c.ResetTableReader(); } -TEST_F(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { - BlockBasedTableOptions table_options; +TEST_P(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); for (int i = 0; i < 4; ++i) { Options options; // Make each key/value an individual block @@ -1410,14 +1528,16 @@ std::vector<std::string> keys; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto props = c.GetTableReader()->GetTableProperties(); ASSERT_EQ(7u, props->num_data_blocks); auto* reader = c.GetTableReader(); ReadOptions ro; ro.total_order_seek = true; - std::unique_ptr<InternalIterator> iter(reader->NewIterator(ro)); + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); iter->Seek(InternalKey("b", 0, kTypeValue).Encode()); ASSERT_OK(iter->status()); @@ -1448,8 +1568,8 @@ } } -TEST_F(BlockBasedTableTest, NoopTransformSeek) { - BlockBasedTableOptions table_options; +TEST_P(BlockBasedTableTest, NoopTransformSeek) { + BlockBasedTableOptions table_options =
GetBlockBasedTableOptions(); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); Options options; @@ -1466,15 +1586,17 @@ std::vector<std::string> keys; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); - c.Finish(options, ioptions, table_options, internal_comparator, &keys, - &kvmap); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); auto* reader = c.GetTableReader(); for (int i = 0; i < 2; ++i) { ReadOptions ro; ro.total_order_seek = (i == 0); - std::unique_ptr<InternalIterator> iter(reader->NewIterator(ro)); + std::unique_ptr<InternalIterator> iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); iter->Seek(key.Encode()); ASSERT_OK(iter->status()); @@ -1483,10 +1605,10 @@ } } -TEST_F(BlockBasedTableTest, SkipPrefixBloomFilter) { +TEST_P(BlockBasedTableTest, SkipPrefixBloomFilter) { // if DB is opened with a prefix extractor of a different name, // prefix bloom is skipped when reading the file - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.filter_policy.reset(NewBloomFilterPolicy(2)); table_options.whole_key_filtering = false; @@ -1501,14 +1623,18 @@ std::vector<std::string> keys; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); const InternalKeyComparator internal_comparator(options.comparator); - c.Finish(options, ioptions, table_options, internal_comparator, &keys, - &kvmap); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + // TODO(Zhongyi): update test to use MutableCFOptions options.prefix_extractor.reset(NewFixedPrefixTransform(9)); const ImmutableCFOptions new_ioptions(options); - c.Reopen(new_ioptions); + const MutableCFOptions new_moptions(options); + c.Reopen(new_ioptions, new_moptions); auto reader = c.GetTableReader(); - std::unique_ptr<InternalIterator> db_iter(reader->NewIterator(ReadOptions())); + std::unique_ptr<InternalIterator> db_iter( + reader->NewIterator(ReadOptions(), new_moptions.prefix_extractor.get())); // Test point lookup // only one kv @@ -1528,7 +1654,7 @@ static std::string RandomString(Random* rnd, int len) { } void AddInternalKey(TableConstructor* c, const std::string& prefix, - int suffix_len = 800) { + int /*suffix_len*/ = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); c->Add(k.Encode().ToString(), "v"); @@ -1565,14 +1691,17 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { std::unique_ptr<InternalKeyComparator> comparator( new InternalKeyComparator(BytewiseComparator())); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); ASSERT_EQ(5u, props->num_data_blocks); + // TODO(Zhongyi): update test to use MutableCFOptions std::unique_ptr<InternalIterator> index_iter( - reader->NewIterator(ReadOptions())); + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); // -- Find keys that do not exist, but have a common prefix.
std::vector<std::string> prefixes = {"001", "003", "005", "007", "009"}; @@ -1643,24 +1772,24 @@ void TableTest::IndexTest(BlockBasedTableOptions table_options) { c.ResetTableReader(); } -TEST_F(TableTest, BinaryIndexTest) { - BlockBasedTableOptions table_options; +TEST_P(BlockBasedTableTest, BinaryIndexTest) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.index_type = BlockBasedTableOptions::kBinarySearch; IndexTest(table_options); } -TEST_F(TableTest, HashIndexTest) { - BlockBasedTableOptions table_options; +TEST_P(BlockBasedTableTest, HashIndexTest) { + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.index_type = BlockBasedTableOptions::kHashSearch; IndexTest(table_options); } -TEST_F(TableTest, PartitionIndexTest) { +TEST_P(BlockBasedTableTest, PartitionIndexTest) { const int max_index_keys = 5; const int est_max_index_key_value_size = 32; const int est_max_index_size = max_index_keys * est_max_index_key_value_size; for (int i = 1; i <= est_max_index_size + 1; i++) { - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; table_options.metadata_block_size = i; IndexTest(table_options); @@ -1670,7 +1799,7 @@ // It's very hard to figure out the index block size of a block accurately. // To make sure we get the index size, we just make sure as key number // grows, the index block size also grows. -TEST_F(BlockBasedTableTest, IndexSizeStat) { +TEST_P(BlockBasedTableTest, IndexSizeStat) { uint64_t last_index_size = 0; // we need to use random keys since the pure human readable texts @@ -1696,12 +1825,13 @@ stl_wrappers::KVMap kvmap; Options options; options.compression = kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); @@ -1710,12 +1840,12 @@ } } -TEST_F(BlockBasedTableTest, NumBlockStat) { +TEST_P(BlockBasedTableTest, NumBlockStat) { Random rnd(test::RandomSeed()); TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); Options options; options.compression = kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_restart_interval = 1; table_options.block_size = 1000; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1729,7 +1859,8 @@ std::vector<std::string> ks; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); @@ -1801,11 +1932,11 @@ class
BlockCachePropertiesSnapshot { // Make sure, by default, index/filter blocks were pre-loaded (meaning we won't // use block cache to store them). -TEST_F(BlockBasedTableTest, BlockCacheDisabledTest) { +TEST_P(BlockBasedTableTest, BlockCacheDisabledTest) { Options options; options.create_if_missing = true; options.statistics = CreateDBStatistics(); - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_cache = NewLRUCache(1024, 4); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -1815,7 +1946,8 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. @@ -1835,7 +1967,8 @@ GetContext::kNotFound, Slice(), nullptr, nullptr, nullptr, nullptr, nullptr); // a hack just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", &get_context); + reader->Get(ReadOptions(), "non-exist-key", &get_context, + moptions.prefix_extractor.get()); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); @@ -1844,15 +1977,15 @@ // Due to the difficulties of the interaction between statistics, this test // only tests the case when "index block is put to block cache" -TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { +TEST_P(BlockBasedTableTest, FilterBlockInBlockCache) { // -- Table construction Options options; options.create_if_missing = true; options.statistics = CreateDBStatistics(); // Enable the cache for index/filter blocks - BlockBasedTableOptions table_options; - table_options.block_cache = NewLRUCache(1024, 4); + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.block_cache = NewLRUCache(2048, 2); table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(new BlockBasedTableFactory(table_options)); std::vector<std::string> keys; @@ -1861,7 +1994,8 @@ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); c.Add("key", "value"); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. auto* reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); @@ -1870,7 +2004,7 @@ // -- PART 1: Open with regular block cache. // Since block_cache is disabled, no cache activities will be involved. - unique_ptr<InternalIterator> iter; + std::unique_ptr<InternalIterator> iter; int64_t last_cache_bytes_read = 0; // At first, no block will be accessed.
@@ -1887,7 +2021,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { // Only index block will be accessed { - iter.reset(c.NewIterator()); + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); BlockCachePropertiesSnapshot props(options.statistics.get()); // NOTE: to help better highlight the "delta" of each ticker, I use // + to indicate the increment of changed @@ -1916,7 +2050,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { // Data block will be in cache { - iter.reset(c.NewIterator()); + iter.reset(c.NewIterator(moptions.prefix_extractor.get())); iter->SeekToFirst(); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, 1 + 1, /* index block hit */ @@ -1938,7 +2072,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(options); - c.Reopen(ioptions2); + const MutableCFOptions moptions2(options); + c.Reopen(ioptions2, moptions2); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss 0, 0, 0); last_cache_bytes_read = 0; } { // Both index and data block get accessed. // It first caches the index block, then the data block. But since the cache size // is only 1, index block will be purged after data block is inserted. - iter.reset(c.NewIterator()); + iter.reset(c.NewIterator(moptions2.prefix_extractor.get())); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1 + 1, // index block miss 0, 0, // data block miss 0); @@ -1983,8 +2118,9 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { InternalKey internal_key(user_key, 0, kTypeValue); c3.Add(internal_key.Encode().ToString(), "hello"); ImmutableCFOptions ioptions3(options); + MutableCFOptions moptions3(options); // Generate table without filter policy - c3.Finish(options, ioptions3, table_options, + c3.Finish(options, ioptions3, moptions3, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); c3.ResetTableReader(); @@ -1993,14 +2129,16 @@ options.table_factory.reset(new BlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); ImmutableCFOptions ioptions4(options); - ASSERT_OK(c3.Reopen(ioptions4)); + MutableCFOptions moptions4(options); + ASSERT_OK(c3.Reopen(ioptions4, moptions4)); reader = dynamic_cast<BlockBasedTable*>(c3.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); PinnableSlice value; GetContext get_context(options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); - ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context)); + ASSERT_OK(reader->Get(ReadOptions(), internal_key.Encode(), &get_context, + moptions4.prefix_extractor.get())); ASSERT_STREQ(value.data(), "hello"); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertFilterBlockStat(0, 0); @@ -2031,7 +2169,7 @@ void ValidateBlockRestartInterval(int value, int expected) { delete factory; } -TEST_F(BlockBasedTableTest, InvalidOptions) { +TEST_P(BlockBasedTableTest, InvalidOptions) { // invalid values for block_size_deviation (<0 or >100) are silently set to 0 ValidateBlockSizeDeviation(-10, 0); ValidateBlockSizeDeviation(-1, 0); @@ -2051,7 +2189,7 @@ ValidateBlockRestartInterval(1000, 1000); }
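(Editorial aside, not part of the patch: the many TEST_F-to-TEST_P conversions in this file all follow the standard googletest value-parameterized pattern that the BlockBasedTableTest fixture above sets up. A minimal self-contained sketch of that pattern, with illustrative fixture and test names only:)

#include <gtest/gtest.h>

// Fixture parameterized over a table format version, mirroring how
// BlockBasedTableTest derives from ::testing::WithParamInterface<uint32_t>.
class FormatVersionTest : public ::testing::TestWithParam<uint32_t> {
 protected:
  uint32_t format_version() const { return GetParam(); }
};

// Each TEST_P body runs once per value supplied by INSTANTIATE_TEST_CASE_P.
TEST_P(FormatVersionTest, VersionIsPositive) {
  EXPECT_GT(format_version(), 0u);
}

// Older googletest, as vendored in this tree, spells the macro
// INSTANTIATE_TEST_CASE_P; newer releases renamed it INSTANTIATE_TEST_SUITE_P.
INSTANTIATE_TEST_CASE_P(DefaultAndLatest, FormatVersionTest,
                        ::testing::Values(2u, 4u));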
-TEST_F(BlockBasedTableTest, BlockReadCountTest) { +TEST_P(BlockBasedTableTest, BlockReadCountTest) { // bloom_filter_type = 0 -- block-based filter // bloom_filter_type = 1 -- full filter for (int bloom_filter_type = 0; bloom_filter_type < 2; ++bloom_filter_type) { @@ -2060,7 +2198,7 @@ Options options; options.create_if_missing = true; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_cache = NewLRUCache(1, 0); table_options.cache_index_and_filter_blocks = index_and_filter_in_cache; table_options.filter_policy.reset( @@ -2075,8 +2213,9 @@ std::string encoded_key = internal_key.Encode().ToString(); c.Add(encoded_key, "hello"); ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); // Generate table with filter policy - c.Finish(options, ioptions, table_options, + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto reader = c.GetTableReader(); PinnableSlice value; @@ -2084,7 +2223,8 @@ GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context)); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); if (index_and_filter_in_cache) { // data, index and filter block ASSERT_EQ(get_perf_context()->block_read_count, 3); @@ -2105,7 +2245,8 @@ GetContext::kNotFound, user_key, &value, nullptr, nullptr, nullptr, nullptr); get_perf_context()->Reset(); - ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context)); + ASSERT_OK(reader->Get(ReadOptions(), encoded_key, &get_context, + moptions.prefix_extractor.get())); ASSERT_EQ(get_context.State(), GetContext::kNotFound); if (index_and_filter_in_cache) { @@ -2134,10 +2275,10 @@ class MockCache : public LRUCache { double high_pri_pool_ratio) : LRUCache(capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio) {} - virtual Status Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value), - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr, + Priority priority = Priority::LOW) override { // Replace the deleter with our own so that we keep track of data blocks // erased from the cache deleters_[key.ToString()] = deleter; @@ -2145,8 +2286,7 @@ class MockCache : public LRUCache { priority); } // This is called by the application right after inserting a data block - virtual void TEST_mark_as_data_block(const Slice& key, - size_t charge) override { + void TEST_mark_as_data_block(const Slice& key, size_t charge) override { marked_data_in_cache_[key.ToString()] = charge; marked_size_ += charge; } @@ -2176,96 +2316,136 @@ std::map<std::string, size_t> MockCache::marked_data_in_cache_; // object depends on the table to be live, it then must be destructed before the // table is closed. This test makes sure that the only items remaining in the // cache after the table is closed are raw data blocks.
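(Editorial aside before the test body below: the cache-accounting invariant described in the comment above can be seen in isolation with the public Cache interface. A hedged sketch, using an illustrative no-op deleter and key; it assumes only Cache::Insert/Release/GetUsage/GetPinnedUsage as declared in rocksdb/cache.h:)

#include <cassert>
#include <memory>
#include "rocksdb/cache.h"

namespace {
void NoopDeleter(const rocksdb::Slice& /*key*/, void* /*value*/) {}
}  // namespace

int main() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1 << 20);
  rocksdb::Cache::Handle* handle = nullptr;
  // The entry's charge counts toward GetPinnedUsage() while `handle` is
  // outstanding, and toward GetUsage() for as long as it stays cached.
  cache->Insert("block-key", nullptr, 128 /* charge */, &NoopDeleter, &handle);
  assert(cache->GetPinnedUsage() >= 128);
  cache->Release(handle);  // unpin; the entry may remain in the cache
  assert(cache->GetPinnedUsage() == 0);
  assert(cache->GetUsage() >= 128);
  return 0;
}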
-TEST_F(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { - for (auto index_type : - {BlockBasedTableOptions::IndexType::kBinarySearch, +TEST_P(BlockBasedTableTest, NoObjectInCacheAfterTableClose) { + std::vector<CompressionType> compression_types{kNoCompression}; + + // The following are the compression library versions supporting compression + // dictionaries. See the test case CacheCompressionDict in the + // DBBlockCacheTest suite. +#ifdef ZLIB + compression_types.push_back(kZlibCompression); +#endif // ZLIB +#if LZ4_VERSION_NUMBER >= 10400 + compression_types.push_back(kLZ4Compression); + compression_types.push_back(kLZ4HCCompression); +#endif // LZ4_VERSION_NUMBER >= 10400 +#if ZSTD_VERSION_NUMBER >= 500 + compression_types.push_back(kZSTD); +#endif // ZSTD_VERSION_NUMBER >= 500 + + for (int level: {-1, 0, 1, 10}) { + for (auto index_type : + {BlockBasedTableOptions::IndexType::kBinarySearch, BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch}) { - for (bool block_based_filter : {true, false}) { - for (bool partition_filter : {true, false}) { - if (partition_filter && - (block_based_filter || - index_type != - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { - continue; - } - for (bool index_and_filter_in_cache : {true, false}) { - for (bool pin_l0 : {true, false}) { - if (pin_l0 && !index_and_filter_in_cache) { - continue; + for (bool block_based_filter : {true, false}) { + for (bool partition_filter : {true, false}) { + if (partition_filter && + (block_based_filter || + index_type != + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch)) { + continue; + } + for (bool index_and_filter_in_cache : {true, false}) { + for (bool pin_l0 : {true, false}) { + for (bool pin_top_level : {true, false}) { + if (pin_l0 && !index_and_filter_in_cache) { + continue; + } + + for (auto compression_type : compression_types) { + for (uint32_t max_dict_bytes : {0, 1 << 14}) { + if (compression_type == kNoCompression && max_dict_bytes) + continue; + + // Create a table + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator( + opt.comparator)); + opt.compression = compression_type; + opt.compression_opts.max_dict_bytes = max_dict_bytes; + BlockBasedTableOptions table_options = + GetBlockBasedTableOptions(); + table_options.block_size = 1024; + table_options.index_type = index_type; + table_options.pin_l0_filter_and_index_blocks_in_cache = + pin_l0; + table_options.pin_top_level_index_and_filter = + pin_top_level; + table_options.partition_filters = partition_filter; + table_options.cache_index_and_filter_blocks = + index_and_filter_in_cache; + // big enough so we don't ever lose cached values.
+ table_options.block_cache = std::make_shared<MockCache>( + 16 * 1024 * 1024, 4, false, 0.0); + table_options.filter_policy.reset( + rocksdb::NewBloomFilterPolicy(10, block_based_filter)); + opt.table_factory.reset(NewBlockBasedTableFactory( + table_options)); + + bool convert_to_internal_key = false; + TableConstructor c(BytewiseComparator(), + convert_to_internal_key, level); + std::string user_key = "k01"; + std::string key = + InternalKey(user_key, 0, kTypeValue).Encode().ToString(); + c.Add(key, "hello"); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, + &keys, &kvmap); + + // Doing a read to make index/filter loaded into the cache + auto table_reader = + dynamic_cast<BlockBasedTable*>(c.GetTableReader()); + PinnableSlice value; + GetContext get_context(opt.comparator, nullptr, nullptr, + nullptr, GetContext::kNotFound, user_key, &value, + nullptr, nullptr, nullptr, nullptr); + InternalKey ikey(user_key, 0, kTypeValue); + auto s = table_reader->Get(ReadOptions(), key, &get_context, + moptions.prefix_extractor.get()); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_STREQ(value.data(), "hello"); + + // Close the table + c.ResetTableReader(); + + auto usage = table_options.block_cache->GetUsage(); + auto pinned_usage = + table_options.block_cache->GetPinnedUsage(); + // The only usage must be for marked data blocks + ASSERT_EQ(usage, MockCache::marked_size_); + // There must be some pinned data since PinnableSlice has + // not released them yet + ASSERT_GT(pinned_usage, 0); + // Release pinnable slice resources + value.Reset(); + pinned_usage = table_options.block_cache->GetPinnedUsage(); + ASSERT_EQ(pinned_usage, 0); + } + } + } } - // Create a table - Options opt; - unique_ptr<InternalKeyComparator> ikc; - ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); - opt.compression = kNoCompression; - BlockBasedTableOptions table_options; - table_options.block_size = 1024; - table_options.index_type = - BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; - table_options.pin_l0_filter_and_index_blocks_in_cache = pin_l0; - table_options.partition_filters = partition_filter; - table_options.cache_index_and_filter_blocks = - index_and_filter_in_cache; - // big enough so we don't ever lose cached values.
- table_options.block_cache = std::shared_ptr<Cache>( - new MockCache(16 * 1024 * 1024, 4, false, 0.0)); - table_options.filter_policy.reset( - rocksdb::NewBloomFilterPolicy(10, block_based_filter)); - opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - - TableConstructor c(BytewiseComparator()); - std::string user_key = "k01"; - std::string key = - InternalKey(user_key, 0, kTypeValue).Encode().ToString(); - c.Add(key, "hello"); - std::vector<std::string> keys; - stl_wrappers::KVMap kvmap; - const ImmutableCFOptions ioptions(opt); - c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); - - // Doing a read to make index/filter loaded into the cache - auto table_reader = - dynamic_cast<BlockBasedTable*>(c.GetTableReader()); - PinnableSlice value; - GetContext get_context(opt.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, - nullptr, nullptr, nullptr, nullptr); - InternalKey ikey(user_key, 0, kTypeValue); - auto s = table_reader->Get(ReadOptions(), key, &get_context); - ASSERT_EQ(get_context.State(), GetContext::kFound); - ASSERT_STREQ(value.data(), "hello"); - - // Close the table - c.ResetTableReader(); - - auto usage = table_options.block_cache->GetUsage(); - auto pinned_usage = table_options.block_cache->GetPinnedUsage(); - // The only usage must be for marked data blocks - ASSERT_EQ(usage, MockCache::marked_size_); - // There must be some pinned data since PinnableSlice has not - // released them yet - ASSERT_GT(pinned_usage, 0); - // Release pinnable slice resources - value.Reset(); - pinned_usage = table_options.block_cache->GetPinnedUsage(); - ASSERT_EQ(pinned_usage, 0); - } - } - } } - } + } // level } -TEST_F(BlockBasedTableTest, BlockCacheLeak) { +TEST_P(BlockBasedTableTest, BlockCacheLeak) { // Check that when we reopen a table we don't lose access to blocks already // in the cache. This test checks whether the Table actually makes use of the // unique ID from the file. Options opt; - unique_ptr<InternalKeyComparator> ikc; + std::unique_ptr<InternalKeyComparator> ikc; ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); opt.compression = kNoCompression; - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.block_size = 1024; // big enough so we don't ever lose cached values.
table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); @@ -2282,9 +2462,11 @@ TEST_F(BlockBasedTableTest, BlockCacheLeak) { std::vector<std::string> keys; stl_wrappers::KVMap kvmap; const ImmutableCFOptions ioptions(opt); - c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); + const MutableCFOptions moptions(opt); + c.Finish(opt, ioptions, moptions, table_options, *ikc, &keys, &kvmap); - unique_ptr<InternalIterator> iter(c.NewIterator()); + std::unique_ptr<InternalIterator> iter( + c.NewIterator(moptions.prefix_extractor.get())); iter->SeekToFirst(); while (iter->Valid()) { iter->key(); @@ -2292,12 +2474,15 @@ iter->Next(); } ASSERT_OK(iter->status()); + iter.reset(); const ImmutableCFOptions ioptions1(opt); - ASSERT_OK(c.Reopen(ioptions1)); + const MutableCFOptions moptions1(opt); + ASSERT_OK(c.Reopen(ioptions1, moptions1)); auto table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); for (const std::string& key : keys) { - ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } c.ResetTableReader(); @@ -2305,15 +2490,89 @@ table_options.block_cache = NewLRUCache(16 * 1024 * 1024, 4); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions2(opt); - ASSERT_OK(c.Reopen(ioptions2)); + const MutableCFOptions moptions2(opt); + ASSERT_OK(c.Reopen(ioptions2, moptions2)); table_reader = dynamic_cast<BlockBasedTable*>(c.GetTableReader()); for (const std::string& key : keys) { - ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); + InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); + ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), ikey.Encode())); } c.ResetTableReader(); } -TEST_F(BlockBasedTableTest, NewIndexIteratorLeak) { +namespace { +class CustomMemoryAllocator : public MemoryAllocator { + public: + const char* Name() const override { return "CustomMemoryAllocator"; } + + void* Allocate(size_t size) override { + ++numAllocations; + auto ptr = new char[size + 16]; + memcpy(ptr, "memory_allocator_", 16); // mangle first 16 bytes + return reinterpret_cast<void*>(ptr + 16); + } + void Deallocate(void* p) override { + ++numDeallocations; + char* ptr = reinterpret_cast<char*>(p) - 16; + delete[] ptr; + } + + std::atomic<int> numAllocations; + std::atomic<int> numDeallocations; +}; +} // namespace +
+TEST_P(BlockBasedTableTest, MemoryAllocator) { + auto custom_memory_allocator = std::make_shared<CustomMemoryAllocator>(); + { + Options opt; + std::unique_ptr<InternalKeyComparator> ikc; + ikc.reset(new test::PlainInternalKeyComparator(opt.comparator)); + opt.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 1024; + LRUCacheOptions lruOptions; + lruOptions.memory_allocator = custom_memory_allocator; + lruOptions.capacity = 16 * 1024 * 1024; + lruOptions.num_shard_bits = 4; + table_options.block_cache = NewLRUCache(std::move(lruOptions)); + opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + c.Add("k01", "hello"); + c.Add("k02", "hello2"); + c.Add("k03", std::string(10000, 'x')); + c.Add("k04", std::string(200000, 'x')); + c.Add("k05", std::string(300000, 'x')); + c.Add("k06", "hello3"); + c.Add("k07", std::string(100000, 'x')); + std::vector<std::string> keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(opt); + const MutableCFOptions moptions(opt); + c.Finish(opt,
ioptions, moptions, table_options, *ikc, &keys, &kvmap); + + std::unique_ptr iter( + c.NewIterator(moptions.prefix_extractor.get())); + iter->SeekToFirst(); + while (iter->Valid()) { + iter->key(); + iter->value(); + iter->Next(); + } + ASSERT_OK(iter->status()); + } + + // out of scope, block cache should have been deleted, all allocations + // deallocated + EXPECT_EQ(custom_memory_allocator->numAllocations.load(), + custom_memory_allocator->numDeallocations.load()); + // make sure that allocations actually happened through the cache allocator + EXPECT_GT(custom_memory_allocator->numAllocations.load(), 0); +} + +TEST_P(BlockBasedTableTest, NewIndexIteratorLeak) { // A regression test to avoid data race described in // https://github.com/facebook/rocksdb/issues/1267 TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); @@ -2322,13 +2581,14 @@ TEST_F(BlockBasedTableTest, NewIndexIteratorLeak) { c.Add("a1", "val1"); Options options; options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - BlockBasedTableOptions table_options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); table_options.index_type = BlockBasedTableOptions::kHashSearch; table_options.cache_index_and_filter_blocks = true; table_options.block_cache = NewLRUCache(0); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); rocksdb::SyncPoint::GetInstance()->LoadDependencyAndMarkers( @@ -2355,13 +2615,16 @@ TEST_F(BlockBasedTableTest, NewIndexIteratorLeak) { std::function func1 = [&]() { TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread1Marker"); - std::unique_ptr iter(reader->NewIterator(ro)); + // TODO(Zhongyi): update test to use MutableCFOptions + std::unique_ptr iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); iter->Seek(InternalKey("a1", 0, kTypeValue).Encode()); }; std::function func2 = [&]() { TEST_SYNC_POINT("BlockBasedTableTest::NewIndexIteratorLeak:Thread2Marker"); - std::unique_ptr iter(reader->NewIterator(ro)); + std::unique_ptr iter( + reader->NewIterator(ro, moptions.prefix_extractor.get())); }; auto thread1 = port::Thread(func1); @@ -2382,21 +2645,21 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); test::StringSink sink; - unique_ptr file_writer( - test::GetWritableFileWriter(new test::StringSink())); + std::unique_ptr file_writer( + test::GetWritableFileWriter(new test::StringSink(), "" /* don't care */)); Options options; const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); std::vector> int_tbl_prop_collector_factories; std::string column_family_name; int unknown_level = -1; std::unique_ptr builder(factory.NewTableBuilder( - TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, CompressionOptions(), - nullptr /* compression_dict */, - false /* skip_filters */, column_family_name, - unknown_level), + TableBuilderOptions( + ioptions, moptions, ikc, &int_tbl_prop_collector_factories, + kNoCompression, 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, unknown_level), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, file_writer.get())); @@ 
-2411,14 +2674,14 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { test::StringSink* ss = static_cast(file_writer->writable_file()); - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss->contents(), 72242, true))); TableProperties* props = nullptr; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), kPlainTableMagicNumber, ioptions, - &props); + &props, true /* compression_type_missing */); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -2448,7 +2711,8 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { BlockBasedTableOptions table_options; table_options.block_size = 1024; const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, internal_comparator, + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); @@ -2483,14 +2747,15 @@ static void DoCompressionTest(CompressionType comp) { BlockBasedTableOptions table_options; table_options.block_size = 1024; const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6100)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3500)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6500)); c.ResetTableReader(); } @@ -2536,25 +2801,63 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfCompressed) { } } -TEST_F(HarnessTest, Randomized) { - std::vector args = GenerateArgList(); - for (unsigned int i = 0; i < args.size(); i++) { - Init(args[i]); - Random rnd(test::RandomSeed() + 5); - for (int num_entries = 0; num_entries < 2000; - num_entries += (num_entries < 50 ? 1 : 200)) { - if ((num_entries % 10) == 0) { - fprintf(stderr, "case %d of %d: num_entries = %d\n", (i + 1), - static_cast(args.size()), num_entries); - } - for (int e = 0; e < num_entries; e++) { - std::string v; - Add(test::RandomKey(&rnd, rnd.Skewed(4)), - test::RandomString(&rnd, rnd.Skewed(5), &v).ToString()); - } - Test(&rnd); - } - } +#ifndef ROCKSDB_VALGRIND_RUN +// RandomizedHarnessTest is very slow for certain combination of arguments +// Split into 8 pieces to reduce the time individual tests take. 
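// The sharding helper invoked by the eight tests below is defined earlier in
// table_test.cc, outside this hunk; a plausible sketch, assuming round-robin
// assignment so that shard `part` runs every `total`-th configuration of the
// old monolithic Randomized test:
void HarnessTest::RandomizedHarnessTest(size_t part, size_t total) {
  std::vector<TestArgs> args = GenerateArgList();
  assert(part > 0 && part <= total);
  for (size_t i = 0; i < args.size(); i++) {
    if (i % total != part - 1) {
      continue;  // this combination belongs to another shard
    }
    Init(args[i]);
    Random rnd(test::RandomSeed() + 5);
    for (int num_entries = 0; num_entries < 2000;
         num_entries += (num_entries < 50 ? 1 : 200)) {
      for (int e = 0; e < num_entries; e++) {
        std::string v;
        Add(test::RandomKey(&rnd, rnd.Skewed(4)),
            test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
      }
      Test(&rnd);
    }
  }
}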
+TEST_F(HarnessTest, Randomized1) { + // part 1 out of 8 + const size_t part = 1; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized2) { + // part 2 out of 8 + const size_t part = 2; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized3) { + // part 3 out of 8 + const size_t part = 3; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized4) { + // part 4 out of 8 + const size_t part = 4; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized5) { + // part 5 out of 8 + const size_t part = 5; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized6) { + // part 6 out of 8 + const size_t part = 6; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized7) { + // part 7 out of 8 + const size_t part = 7; + const size_t total = 8; + RandomizedHarnessTest(part, total); +} + +TEST_F(HarnessTest, Randomized8) { + // part 8 out of 8 + const size_t part = 8; + const size_t total = 8; + RandomizedHarnessTest(part, total); } #ifndef ROCKSDB_LITE @@ -2582,6 +2885,7 @@ TEST_F(HarnessTest, RandomizedLongDB) { ASSERT_GT(files, 0); } #endif // ROCKSDB_LITE +#endif // ROCKSDB_VALGRIND_RUN class MemTableTest : public testing::Test {}; @@ -2617,7 +2921,8 @@ TEST_F(MemTableTest, Simple) { iter = memtable->NewIterator(ReadOptions(), &arena); arena_iter_guard.set(iter); } else { - iter = memtable->NewRangeTombstoneIterator(ReadOptions()); + iter = memtable->NewRangeTombstoneIterator( + ReadOptions(), kMaxSequenceNumber /* read_seq */); iter_guard.reset(iter); } if (iter == nullptr) { @@ -2717,6 +3022,26 @@ TEST_F(HarnessTest, FooterTests) { ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); ASSERT_EQ(decoded_footer.version(), 1U); } + { + // xxhash64 block based + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 1); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + footer.set_index_handle(index); + footer.set_checksum(kxxHash64); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kxxHash64); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } // Plain table is not supported in ROCKSDB_LITE #ifndef ROCKSDB_LITE { @@ -2781,10 +3106,14 @@ TEST_F(HarnessTest, FooterTests) { } class IndexBlockRestartIntervalTest - : public BlockBasedTableTest, - public ::testing::WithParamInterface { + : public TableTest, + public ::testing::WithParamInterface> { public: - static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } + static std::vector> GetRestartValues() { + return {{-1, false}, {0, false}, {1, false}, {8, false}, + {16, false}, {32, false}, {-1, true}, {0, true}, + {1, true}, {8, true}, {16, true}, {32, true}}; + } }; INSTANTIATE_TEST_CASE_P( @@ -2796,12 +3125,16 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { const int kKeySize = 100; const int kValSize = 500; - int 
index_block_restart_interval = GetParam(); + const int index_block_restart_interval = std::get<0>(GetParam()); + const bool value_delta_encoding = std::get<1>(GetParam()); Options options; BlockBasedTableOptions table_options; table_options.block_size = 64; // small block size to get big index block table_options.index_block_restart_interval = index_block_restart_interval; + if (value_delta_encoding) { + table_options.format_version = 4; + } options.table_factory.reset(new BlockBasedTableFactory(table_options)); TableConstructor c(BytewiseComparator()); @@ -2816,10 +3149,13 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); const ImmutableCFOptions ioptions(options); - c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + const MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, *comparator, &keys, + &kvmap); auto reader = c.GetTableReader(); - std::unique_ptr db_iter(reader->NewIterator(ReadOptions())); + std::unique_ptr db_iter( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); // Test point lookup for (auto& kv : kvmap) { @@ -2845,7 +3181,7 @@ TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { class PrefixTest : public testing::Test { public: PrefixTest() : testing::Test() {} - ~PrefixTest() {} + ~PrefixTest() override {} }; namespace { @@ -2865,7 +3201,7 @@ class TestPrefixExtractor : public rocksdb::SliceTransform { return true; } - bool InRange(const rocksdb::Slice& dst) const override { return true; } + bool InRange(const rocksdb::Slice& /*dst*/) const override { return true; } bool IsValid(const rocksdb::Slice& src) const { if (src.size() != 4) { @@ -2901,7 +3237,7 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { bbto.block_size = 262144; bbto.whole_key_filtering = true; - const std::string kDBPath = test::TmpDir() + "/table_prefix_test"; + const std::string kDBPath = test::PerThreadDBPath("table_prefix_test"); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyDB(kDBPath, options); rocksdb::DB* db; @@ -2923,13 +3259,22 @@ TEST_F(PrefixTest, PrefixAndWholeKeyTest) { // rocksdb still works. } -TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { - BlockBasedTableOptions bbto; +/* + * Disable TableWithGlobalSeqno since RocksDB does not store global_seqno in + * the SST file any more. Instead, RocksDB deduces global_seqno from the + * MANIFEST while reading from an SST. Therefore, it's not possible to test the + * functionality of global_seqno in a single, isolated unit test without the + * involvement of Version, VersionSet, etc. 
+ */ +TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); test::StringSink* sink = new test::StringSink(); - unique_ptr file_writer(test::GetWritableFileWriter(sink)); + std::unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); Options options; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); InternalKeyComparator ikc(options.comparator); std::vector> int_tbl_prop_collector_factories; @@ -2938,9 +3283,9 @@ TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { 0 /* global_seqno*/)); std::string column_family_name; std::unique_ptr builder(options.table_factory->NewTableBuilder( - TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, - kNoCompression, CompressionOptions(), - nullptr /* compression_dict */, + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), false /* skip_filters */, column_family_name, -1), TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, file_writer.get())); @@ -2962,14 +3307,14 @@ TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { // Helper function to get version, global_seqno, global_seqno_offset std::function GetVersionAndGlobalSeqno = [&]() { - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + &props, true /* compression_type_missing */)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( @@ -2991,17 +3336,19 @@ TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { }; // Helper function to get the contents of the table InternalIterator - unique_ptr table_reader; + std::unique_ptr table_reader; std::function GetTableInternalIter = [&]() { - unique_ptr file_reader( + std::unique_ptr file_reader( test::GetRandomAccessFileReader( new test::StringSource(ss_rw.contents(), 73342, true))); options.table_factory->NewTableReader( - TableReaderOptions(ioptions, EnvOptions(), ikc), std::move(file_reader), - ss_rw.contents().size(), &table_reader); + TableReaderOptions(ioptions, moptions.prefix_extractor.get(), + EnvOptions(), ikc), + std::move(file_reader), ss_rw.contents().size(), &table_reader); - return table_reader->NewIterator(ReadOptions()); + return table_reader->NewIterator(ReadOptions(), + moptions.prefix_extractor.get()); }; GetVersionAndGlobalSeqno(); @@ -3100,6 +3447,430 @@ TEST_F(BlockBasedTableTest, TableWithGlobalSeqno) { delete iter; } +TEST_P(BlockBasedTableTest, BlockAlignTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector> + int_tbl_prop_collector_factories; + std::string column_family_name; + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, 
moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + std::unique_ptr file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + // Helper function to get version, global_seqno, global_seqno_offset + std::function VerifyBlockAlignment = [&]() { + TableProperties* props = nullptr; + ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), + kBlockBasedTableMagicNumber, ioptions, + &props, true /* compression_type_missing */)); + + uint64_t data_block_size = props->data_size / props->num_data_blocks; + ASSERT_EQ(data_block_size, 4096); + ASSERT_EQ(props->data_size, data_block_size * props->num_data_blocks); + delete props; + }; + + VerifyBlockAlignment(); + + // The below block of code verifies that we can read back the keys. Set + // block_align to false when creating the reader to ensure we can flip between + // the two modes without any issues + std::unique_ptr table_reader; + bbto.block_align = false; + Options options2; + options2.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ImmutableCFOptions ioptions2(options2); + const MutableCFOptions moptions2(options2); + + ASSERT_OK(ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions2, moptions2.prefix_extractor.get(), + EnvOptions(), + GetPlainInternalComparator(options2.comparator)), + std::move(file_reader), ss_rw.contents().size(), &table_reader)); + + std::unique_ptr db_iter(table_reader->NewIterator( + ReadOptions(), moptions2.prefix_extractor.get())); + + int expected_key = 1; + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << expected_key++; + std::string key = ostr.str(); + std::string value = "val"; + + ASSERT_OK(db_iter->status()); + ASSERT_EQ(ExtractUserKey(db_iter->key()).ToString(), key); + ASSERT_EQ(db_iter->value().ToString(), value); + } + expected_key--; + ASSERT_EQ(expected_key, 10000); + table_reader.reset(); +} + +TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_align = true; + test::StringSink* sink = new test::StringSink(); + std::unique_ptr file_writer( + test::GetWritableFileWriter(sink, "" /* don't care */)); + + Options options; + options.compression = kNoCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + InternalKeyComparator ikc(options.comparator); + std::vector> + int_tbl_prop_collector_factories; + std::string column_family_name; + + std::unique_ptr builder(options.table_factory->NewTableBuilder( + TableBuilderOptions(ioptions, moptions, ikc, + &int_tbl_prop_collector_factories, kNoCompression, + 0 /* sample_for_compression */, CompressionOptions(), + false /* skip_filters */, column_family_name, -1), + TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, + 
file_writer.get())); + + for (int i = 1; i <= 10000; ++i) { + std::ostringstream ostr; + ostr << std::setfill('0') << std::setw(5) << i; + std::string key = ostr.str(); + std::string value = "val"; + InternalKey ik(key, 0, kTypeValue); + + builder->Add(ik.Encode(), value); + } + ASSERT_OK(builder->Finish()); + file_writer->Flush(); + + test::RandomRWStringSink ss_rw(sink); + std::unique_ptr file_reader( + test::GetRandomAccessFileReader( + new test::StringSource(ss_rw.contents(), 73342, true))); + + { + RandomAccessFileReader* file = file_reader.get(); + uint64_t file_size = ss_rw.contents().size(); + + Footer footer; + ASSERT_OK(ReadFooterFromFile(file, nullptr /* prefetch_buffer */, file_size, + &footer, kBlockBasedTableMagicNumber)); + + auto BlockFetchHelper = [&](const BlockHandle& handle, + BlockContents* contents) { + ReadOptions read_options; + read_options.verify_checksums = false; + PersistentCacheOptions cache_options; + + BlockFetcher block_fetcher( + file, nullptr /* prefetch_buffer */, footer, read_options, handle, + contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + cache_options); + + ASSERT_OK(block_fetcher.ReadBlockContents()); + }; + + // -- Read metaindex block + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + + BlockFetchHelper(metaindex_handle, &metaindex_contents); + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + std::unique_ptr meta_iter( + metaindex_block.NewIterator(BytewiseComparator(), + BytewiseComparator())); + bool found_properties_block = true; + ASSERT_OK(SeekToPropertiesBlock(meta_iter.get(), &found_properties_block)); + ASSERT_TRUE(found_properties_block); + + // -- Read properties block + Slice v = meta_iter->value(); + BlockHandle properties_handle; + ASSERT_OK(properties_handle.DecodeFrom(&v)); + BlockContents properties_contents; + + BlockFetchHelper(properties_handle, &properties_contents); + Block properties_block(std::move(properties_contents), + kDisableGlobalSequenceNumber); + + ASSERT_EQ(properties_block.NumRestarts(), 1); + } +} + +TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { + // The properties meta-block should come at the end since we always need to + // read it when opening a file, unlike index/filter/other meta-blocks, which + // are sometimes read depending on the user's configuration. This ordering + // allows us to do a small readahead on the end of the file to read properties + // and meta-index blocks with one I/O. 
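// A sketch of the tail read this layout enables, assuming a
// RandomAccessFileReader* `file` of length `file_size` (the 4 KB window is
// illustrative, not the engine's actual sizing policy): one prefetch near EOF
// covers the footer, the metaindex block and the properties block together,
// instead of three scattered reads.
//
//   const size_t kTailLen = 4096;  // assumed readahead window
//   uint64_t tail_off = file_size > kTailLen ? file_size - kTailLen : 0;
//   FilePrefetchBuffer prefetch;
//   prefetch.Prefetch(file, tail_off,
//                     static_cast<size_t>(file_size - tail_off));
//   Footer footer;
//   ASSERT_OK(ReadFooterFromFile(file, &prefetch, file_size, &footer,
//                                kBlockBasedTableMagicNumber));
//   // the metaindex and properties handles now resolve inside the buffer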
+ TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); + c.Add("a1", "val1"); + c.Add("b2", "val2"); + c.Add("c3", "val3"); + c.Add("d4", "val4"); + c.Add("e5", "val5"); + c.Add("f6", "val6"); + c.Add("g7", "val7"); + c.Add("h8", "val8"); + c.Add("j9", "val9"); + + // write an SST file + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.filter_policy.reset(NewBloomFilterPolicy( + 8 /* bits_per_key */, false /* use_block_based_filter */)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ImmutableCFOptions ioptions(options); + MutableCFOptions moptions(options); + std::vector keys; + stl_wrappers::KVMap kvmap; + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + // get file reader + test::StringSink* table_sink = c.TEST_GetSink(); + std::unique_ptr table_reader{ + test::GetRandomAccessFileReader( + new test::StringSource(table_sink->contents(), 0 /* unique_id */, + false /* allow_mmap_reads */))}; + size_t table_size = table_sink->contents().size(); + + // read footer + Footer footer; + ASSERT_OK(ReadFooterFromFile(table_reader.get(), + nullptr /* prefetch_buffer */, table_size, + &footer, kBlockBasedTableMagicNumber)); + + // read metaindex + auto metaindex_handle = footer.metaindex_handle(); + BlockContents metaindex_contents; + PersistentCacheOptions pcache_opts; + BlockFetcher block_fetcher( + table_reader.get(), nullptr /* prefetch_buffer */, footer, ReadOptions(), + metaindex_handle, &metaindex_contents, ioptions, false /* decompress */, + false /*maybe_compressed*/, UncompressionDict::GetEmptyDict(), + pcache_opts, nullptr /*memory_allocator*/); + ASSERT_OK(block_fetcher.ReadBlockContents()); + Block metaindex_block(std::move(metaindex_contents), + kDisableGlobalSequenceNumber); + + // verify properties block comes last + std::unique_ptr metaindex_iter{ + metaindex_block.NewIterator(options.comparator, + options.comparator)}; + uint64_t max_offset = 0; + std::string key_at_max_offset; + for (metaindex_iter->SeekToFirst(); metaindex_iter->Valid(); + metaindex_iter->Next()) { + BlockHandle handle; + Slice value = metaindex_iter->value(); + ASSERT_OK(handle.DecodeFrom(&value)); + if (handle.offset() > max_offset) { + max_offset = handle.offset(); + key_at_max_offset = metaindex_iter->key().ToString(); + } + } + ASSERT_EQ(kPropertiesBlock, key_at_max_offset); + // index handle is stored in footer rather than metaindex block, so need + // separate logic to verify it comes before properties block. 
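// (Context for the ASSERT_GT below: the index handle never shows up in the
// metaindex loop above because it lives in the footer, so it is checked
// against max_offset directly. Every handle decoded in that loop is simply
// two varint64s, offset then size; handle.DecodeFrom(&value) is equivalent
// to:
//
//   uint64_t offset = 0, size = 0;
//   GetVarint64(&value, &offset);  // see util/coding.h
//   GetVarint64(&value, &size);
// )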
+ ASSERT_GT(max_offset, footer.index_handle().offset()); + c.ResetTableReader(); +} + +TEST_P(BlockBasedTableTest, BadOptions) { + rocksdb::Options options; + options.compression = kNoCompression; + BlockBasedTableOptions bbto = GetBlockBasedTableOptions(); + bbto.block_size = 4000; + bbto.block_align = true; + + const std::string kDBPath = + test::PerThreadDBPath("block_based_table_bad_options_test"); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); + + bbto.block_size = 4096; + options.compression = kSnappyCompression; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_NOK(rocksdb::DB::Open(options, kDBPath, &db)); +} + +TEST_F(BBTTailPrefetchTest, TestTailPrefetchStats) { + TailPrefetchStats tpstats; + ASSERT_EQ(0, tpstats.GetSuggestedPrefetchSize()); + tpstats.RecordEffectiveSize(size_t{1000}); + tpstats.RecordEffectiveSize(size_t{1005}); + tpstats.RecordEffectiveSize(size_t{1002}); + ASSERT_EQ(1005, tpstats.GetSuggestedPrefetchSize()); + + // One single super large value shouldn't influence much + tpstats.RecordEffectiveSize(size_t{1002000}); + tpstats.RecordEffectiveSize(size_t{999}); + ASSERT_LE(1005, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(1200, tpstats.GetSuggestedPrefetchSize()); + + // Only history of 32 is kept + for (int i = 0; i < 32; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_EQ(100, tpstats.GetSuggestedPrefetchSize()); + + // 16 large values and 16 small values. The result should be closer + // to the small value as the algorithm. + for (int i = 0; i < 16; i++) { + tpstats.RecordEffectiveSize(size_t{1000}); + } + tpstats.RecordEffectiveSize(size_t{10}); + tpstats.RecordEffectiveSize(size_t{20}); + for (int i = 0; i < 6; i++) { + tpstats.RecordEffectiveSize(size_t{100}); + } + ASSERT_LE(80, tpstats.GetSuggestedPrefetchSize()); + ASSERT_GT(200, tpstats.GetSuggestedPrefetchSize()); +} + +TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { + TailPrefetchStats tpstats; + FilePrefetchBuffer buffer(nullptr, 0, 0, false, true); + buffer.TryReadFromCache(500, 10, nullptr); + buffer.TryReadFromCache(480, 10, nullptr); + buffer.TryReadFromCache(490, 10, nullptr); + ASSERT_EQ(480, buffer.min_offset_read()); +} + +TEST_P(BlockBasedTableTest, DataBlockHashIndex) { + const int kNumKeys = 500; + const int kKeySize = 8; + const int kValSize = 40; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + table_options.data_block_index_type = + BlockBasedTableOptions::kDataBlockBinaryAndHash; + + Options options; + options.comparator = BytewiseComparator(); + + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(options.comparator); + + static Random rnd(1048); + for (int i = 0; i < kNumKeys; i++) { + // padding one "0" to mark existent keys. 
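// (More precisely: every stored key below ends in "1"; the negative-lookup
// pass later rewrites the last byte to '0', which guarantees those probe
// keys were never inserted.)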
+ std::string random_key(RandomString(&rnd, kKeySize - 1) + "1"); + InternalKey k(random_key, 0, kTypeValue); + c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector keys; + stl_wrappers::KVMap kvmap; + const ImmutableCFOptions ioptions(options); + const MutableCFOptions moptions(options); + const InternalKeyComparator internal_comparator(options.comparator); + c.Finish(options, ioptions, moptions, table_options, internal_comparator, + &keys, &kvmap); + + auto reader = c.GetTableReader(); + + std::unique_ptr seek_iter; + seek_iter.reset( + reader->NewIterator(ReadOptions(), moptions.prefix_extractor.get())); + for (int i = 0; i < 2; ++i) { + ReadOptions ro; + // for every kv, we seek using two method: Get() and Seek() + // Get() will use the SuffixIndexHash in Block. For non-existent key it + // will invalidate the iterator + // Seek() will use the default BinarySeek() in Block. So for non-existent + // key it will land at the closest key that is large than target. + + // Search for existent keys + for (auto& kv : kvmap) { + if (i == 0) { + // Search using Seek() + seek_iter->Seek(kv.first); + ASSERT_OK(seek_iter->status()); + ASSERT_TRUE(seek_iter->Valid()); + ASSERT_EQ(seek_iter->key(), kv.first); + ASSERT_EQ(seek_iter->value(), kv.second); + } else { + // Search using Get() + PinnableSlice value; + std::string user_key = ExtractUserKey(kv.first).ToString(); + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, kv.first, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kFound); + ASSERT_EQ(value, Slice(kv.second)); + value.Reset(); + } + } + + // Search for non-existent keys + for (auto& kv : kvmap) { + std::string user_key = ExtractUserKey(kv.first).ToString(); + user_key.back() = '0'; // make it non-existent key + InternalKey internal_key(user_key, 0, kTypeValue); + std::string encoded_key = internal_key.Encode().ToString(); + if (i == 0) { // Search using Seek() + seek_iter->Seek(encoded_key); + ASSERT_OK(seek_iter->status()); + if (seek_iter->Valid()) { + ASSERT_TRUE(BytewiseComparator()->Compare( + user_key, ExtractUserKey(seek_iter->key())) < 0); + } + } else { // Search using Get() + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(reader->Get(ro, encoded_key, &get_context, + moptions.prefix_extractor.get())); + ASSERT_EQ(get_context.State(), GetContext::kNotFound); + value.Reset(); + } + } + } +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/thirdparty/rocksdb/table/two_level_iterator.cc b/thirdparty/rocksdb/table/two_level_iterator.cc index 2236a2a726..a8f617dee2 100644 --- a/thirdparty/rocksdb/table/two_level_iterator.cc +++ b/thirdparty/rocksdb/table/two_level_iterator.cc @@ -19,45 +19,37 @@ namespace rocksdb { namespace { -class TwoLevelIterator : public InternalIterator { +class TwoLevelIndexIterator : public InternalIteratorBase { public: - explicit TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter, - bool need_free_iter_and_state); + explicit TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); - virtual ~TwoLevelIterator() { - // Assert that the TwoLevelIterator is never deleted while Pinning is - // Enabled. 
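// Aside: pinning mattered in this destructor because the old TwoLevelIterator
// handed out Slices pointing into block memory owned elsewhere. The
// replacement below is only used for index blocks and returns BlockHandle by
// value (see `BlockHandle value() const override`), so there is nothing for a
// PinnedIteratorsManager to keep alive and the pinning hooks become no-ops.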
- assert(!pinned_iters_mgr_ || - (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); - first_level_iter_.DeleteIter(!need_free_iter_and_state_); - second_level_iter_.DeleteIter(false); - if (need_free_iter_and_state_) { - delete state_; - } else { - state_->~TwoLevelIteratorState(); - } + ~TwoLevelIndexIterator() override { + first_level_iter_.DeleteIter(false /* is_arena_mode */); + second_level_iter_.DeleteIter(false /* is_arena_mode */); + delete state_; } - virtual void Seek(const Slice& target) override; - virtual void SeekForPrev(const Slice& target) override; - virtual void SeekToFirst() override; - virtual void SeekToLast() override; - virtual void Next() override; - virtual void Prev() override; + void Seek(const Slice& target) override; + void SeekForPrev(const Slice& target) override; + void SeekToFirst() override; + void SeekToLast() override; + void Next() override; + void Prev() override; - virtual bool Valid() const override { return second_level_iter_.Valid(); } - virtual Slice key() const override { + bool Valid() const override { return second_level_iter_.Valid(); } + Slice key() const override { assert(Valid()); return second_level_iter_.key(); } - virtual Slice value() const override { + BlockHandle value() const override { assert(Valid()); return second_level_iter_.value(); } - virtual Status status() const override { - // It'd be nice if status() returned a const Status& instead of a Status + Status status() const override { if (!first_level_iter_.status().ok()) { + assert(second_level_iter_.iter() == nullptr); return first_level_iter_.status(); } else if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().ok()) { @@ -66,22 +58,10 @@ class TwoLevelIterator : public InternalIterator { return status_; } } - virtual void SetPinnedItersMgr( - PinnedIteratorsManager* pinned_iters_mgr) override { - pinned_iters_mgr_ = pinned_iters_mgr; - first_level_iter_.SetPinnedItersMgr(pinned_iters_mgr); - if (second_level_iter_.iter()) { - second_level_iter_.SetPinnedItersMgr(pinned_iters_mgr); - } - } - virtual bool IsKeyPinned() const override { - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - second_level_iter_.iter() && second_level_iter_.IsKeyPinned(); - } - virtual bool IsValuePinned() const override { - return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() && - second_level_iter_.iter() && second_level_iter_.IsValuePinned(); - } + void SetPinnedItersMgr( + PinnedIteratorsManager* /*pinned_iters_mgr*/) override {} + bool IsKeyPinned() const override { return false; } + bool IsValuePinned() const override { return false; } private: void SaveError(const Status& s) { @@ -89,34 +69,24 @@ class TwoLevelIterator : public InternalIterator { } void SkipEmptyDataBlocksForward(); void SkipEmptyDataBlocksBackward(); - void SetSecondLevelIterator(InternalIterator* iter); + void SetSecondLevelIterator(InternalIteratorBase* iter); void InitDataBlock(); TwoLevelIteratorState* state_; - IteratorWrapper first_level_iter_; - IteratorWrapper second_level_iter_; // May be nullptr - bool need_free_iter_and_state_; - PinnedIteratorsManager* pinned_iters_mgr_; + IteratorWrapperBase first_level_iter_; + IteratorWrapperBase second_level_iter_; // May be nullptr Status status_; // If second_level_iter is non-nullptr, then "data_block_handle_" holds the // "index_value" passed to block_function_ to create the second_level_iter. 
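// (The member declared just below keeps the raw handle bytes in a string; the
// refactor replaces it with a decoded BlockHandle so InitDataBlock can
// compare offsets instead of byte ranges. A commented sketch of the minimal
// state a caller now implements; MyIndexState and OpenPartitionIndex are
// hypothetical names:
//
//   class MyIndexState : public TwoLevelIteratorState {
//    public:
//     InternalIteratorBase<BlockHandle>* NewSecondaryIterator(
//         const BlockHandle& handle) override {
//       // read the index partition at handle.offset()/handle.size() and
//       // return an iterator over its entries
//       return OpenPartitionIndex(handle);
//     }
//   };
// )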
- std::string data_block_handle_; + BlockHandle data_block_handle_; }; -TwoLevelIterator::TwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter, - bool need_free_iter_and_state) - : state_(state), - first_level_iter_(first_level_iter), - need_free_iter_and_state_(need_free_iter_and_state), - pinned_iters_mgr_(nullptr) {} +TwoLevelIndexIterator::TwoLevelIndexIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) + : state_(state), first_level_iter_(first_level_iter) {} -void TwoLevelIterator::Seek(const Slice& target) { - if (state_->check_prefix_may_match && - !state_->PrefixMayMatch(target)) { - SetSecondLevelIterator(nullptr); - return; - } +void TwoLevelIndexIterator::Seek(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); @@ -126,18 +96,14 @@ void TwoLevelIterator::Seek(const Slice& target) { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekForPrev(const Slice& target) { - if (state_->check_prefix_may_match && !state_->PrefixMayMatch(target)) { - SetSecondLevelIterator(nullptr); - return; - } +void TwoLevelIndexIterator::SeekForPrev(const Slice& target) { first_level_iter_.Seek(target); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { second_level_iter_.SeekForPrev(target); } if (!Valid()) { - if (!first_level_iter_.Valid()) { + if (!first_level_iter_.Valid() && first_level_iter_.status().ok()) { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -148,7 +114,7 @@ void TwoLevelIterator::SeekForPrev(const Slice& target) { } } -void TwoLevelIterator::SeekToFirst() { +void TwoLevelIndexIterator::SeekToFirst() { first_level_iter_.SeekToFirst(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -157,7 +123,7 @@ void TwoLevelIterator::SeekToFirst() { SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::SeekToLast() { +void TwoLevelIndexIterator::SeekToLast() { first_level_iter_.SeekToLast(); InitDataBlock(); if (second_level_iter_.iter() != nullptr) { @@ -166,25 +132,23 @@ void TwoLevelIterator::SeekToLast() { SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::Next() { +void TwoLevelIndexIterator::Next() { assert(Valid()); second_level_iter_.Next(); SkipEmptyDataBlocksForward(); } -void TwoLevelIterator::Prev() { +void TwoLevelIndexIterator::Prev() { assert(Valid()); second_level_iter_.Prev(); SkipEmptyDataBlocksBackward(); } -void TwoLevelIterator::SkipEmptyDataBlocksForward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksForward() { while (second_level_iter_.iter() == nullptr || - (!second_level_iter_.Valid() && - !second_level_iter_.status().IsIncomplete())) { + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block - if (!first_level_iter_.Valid() || - state_->KeyReachedUpperBound(first_level_iter_.key())) { + if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); return; } @@ -196,10 +160,9 @@ void TwoLevelIterator::SkipEmptyDataBlocksForward() { } } -void TwoLevelIterator::SkipEmptyDataBlocksBackward() { +void TwoLevelIndexIterator::SkipEmptyDataBlocksBackward() { while (second_level_iter_.iter() == nullptr || - (!second_level_iter_.Valid() && - !second_level_iter_.status().IsIncomplete())) { + (!second_level_iter_.Valid() && second_level_iter_.status().ok())) { // Move to next block if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); @@ -213,36 +176,26 @@ void TwoLevelIterator::SkipEmptyDataBlocksBackward() { } } -void 
TwoLevelIterator::SetSecondLevelIterator(InternalIterator* iter) { - if (second_level_iter_.iter() != nullptr) { - SaveError(second_level_iter_.status()); - } - - if (pinned_iters_mgr_ && iter) { - iter->SetPinnedItersMgr(pinned_iters_mgr_); - } - - InternalIterator* old_iter = second_level_iter_.Set(iter); - if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { - pinned_iters_mgr_->PinIterator(old_iter); - } else { - delete old_iter; - } +void TwoLevelIndexIterator::SetSecondLevelIterator( + InternalIteratorBase* iter) { + InternalIteratorBase* old_iter = second_level_iter_.Set(iter); + delete old_iter; } -void TwoLevelIterator::InitDataBlock() { +void TwoLevelIndexIterator::InitDataBlock() { if (!first_level_iter_.Valid()) { SetSecondLevelIterator(nullptr); } else { - Slice handle = first_level_iter_.value(); + BlockHandle handle = first_level_iter_.value(); if (second_level_iter_.iter() != nullptr && !second_level_iter_.status().IsIncomplete() && - handle.compare(data_block_handle_) == 0) { + handle.offset() == data_block_handle_.offset()) { // second_level_iter is already constructed with this iterator, so // no need to change anything } else { - InternalIterator* iter = state_->NewSecondaryIterator(handle); - data_block_handle_.assign(handle.data(), handle.size()); + InternalIteratorBase* iter = + state_->NewSecondaryIterator(handle); + data_block_handle_ = handle; SetSecondLevelIterator(iter); } } @@ -250,18 +203,9 @@ void TwoLevelIterator::InitDataBlock() { } // namespace -InternalIterator* NewTwoLevelIterator(TwoLevelIteratorState* state, - InternalIterator* first_level_iter, - Arena* arena, - bool need_free_iter_and_state) { - if (arena == nullptr) { - return new TwoLevelIterator(state, first_level_iter, - need_free_iter_and_state); - } else { - auto mem = arena->AllocateAligned(sizeof(TwoLevelIterator)); - return new (mem) - TwoLevelIterator(state, first_level_iter, need_free_iter_and_state); - } +InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter) { + return new TwoLevelIndexIterator(state, first_level_iter); } - } // namespace rocksdb diff --git a/thirdparty/rocksdb/table/two_level_iterator.h b/thirdparty/rocksdb/table/two_level_iterator.h index 34b33c83f6..55d5c01a4a 100644 --- a/thirdparty/rocksdb/table/two_level_iterator.h +++ b/thirdparty/rocksdb/table/two_level_iterator.h @@ -16,19 +16,14 @@ namespace rocksdb { struct ReadOptions; class InternalKeyComparator; -class Arena; +// TwoLevelIteratorState expects iterators are not created using the arena struct TwoLevelIteratorState { - explicit TwoLevelIteratorState(bool _check_prefix_may_match) - : check_prefix_may_match(_check_prefix_may_match) {} + TwoLevelIteratorState() {} virtual ~TwoLevelIteratorState() {} - virtual InternalIterator* NewSecondaryIterator(const Slice& handle) = 0; - virtual bool PrefixMayMatch(const Slice& internal_key) = 0; - virtual bool KeyReachedUpperBound(const Slice& internal_key) = 0; - - // If call PrefixMayMatch() - bool check_prefix_may_match; + virtual InternalIteratorBase* NewSecondaryIterator( + const BlockHandle& handle) = 0; }; @@ -41,13 +36,9 @@ struct TwoLevelIteratorState { // // Uses a supplied function to convert an index_iter value into // an iterator over the contents of the corresponding block. -// arena: If not null, the arena is used to allocate the Iterator. -// When destroying the iterator, the destructor will destroy -// all the states but those allocated in arena. 
-// need_free_iter_and_state: free `state` and `first_level_iter` if -// true. Otherwise, just call destructor. -extern InternalIterator* NewTwoLevelIterator( - TwoLevelIteratorState* state, InternalIterator* first_level_iter, - Arena* arena = nullptr, bool need_free_iter_and_state = true); +// Note: this function expects first_level_iter was not created using the arena +extern InternalIteratorBase* NewTwoLevelIterator( + TwoLevelIteratorState* state, + InternalIteratorBase* first_level_iter); } // namespace rocksdb diff --git a/thirdparty/rocksdb/third-party/fbson/COMMIT.md b/thirdparty/rocksdb/third-party/fbson/COMMIT.md deleted file mode 100644 index b38b5424d3..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/COMMIT.md +++ /dev/null @@ -1,5 +0,0 @@ -fbson commit: -https://github.com/facebook/mysql-5.6/commit/55ef9ff25c934659a70b4094e9b406c48e9dd43d - -# TODO. -* Had to convert zero sized array to [1] sized arrays due to the fact that MS Compiler complains about it not being standard. At some point need to contribute this change back to MySql where this code was taken from. diff --git a/thirdparty/rocksdb/third-party/fbson/FbsonDocument.h b/thirdparty/rocksdb/third-party/fbson/FbsonDocument.h deleted file mode 100644 index 6fb8a93f17..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/FbsonDocument.h +++ /dev/null @@ -1,893 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -/* - * This header defines FbsonDocument, FbsonKeyValue, and various value classes - * which are derived from FbsonValue, and a forward iterator for container - * values - essentially everything that is related to FBSON binary data - * structures. - * - * Implementation notes: - * - * None of the classes in this header file can be instantiated directly (i.e. - * you cannot create a FbsonKeyValue or FbsonValue object - all constructors - * are declared non-public). We use the classes as wrappers on the packed FBSON - * bytes (serialized), and cast the classes (types) to the underlying packed - * byte array. - * - * For the same reason, we cannot define any FBSON value class to be virtual, - * since we never call constructors, and will not instantiate vtbl and vptrs. - * - * Therefore, the classes are defined as packed structures (i.e. no data - * alignment and padding), and the private member variables of the classes are - * defined precisely in the same order as the FBSON spec. This ensures we - * access the packed FBSON bytes correctly. - * - * The packed structures are highly optimized for in-place operations with low - * overhead. The reads (and in-place writes) are performed directly on packed - * bytes. There is no memory allocation at all at runtime. - * - * For updates/writes of values that will expand the original FBSON size, the - * write will fail, and the caller needs to handle buffer increase. - * - * ** Iterator ** - * Both ObjectVal class and ArrayVal class have iterator type that you can use - * to declare an iterator on a container object to go through the key-value - * pairs or value list. The iterator has both non-const and const types. - * - * Note: iterators are forward direction only. - * - * ** Query ** - * Querying into containers is through the member functions find (for key/value - * pairs) and get (for array elements), and is in streaming style. 
We don't - * need to read/scan the whole FBSON packed bytes in order to return results. - * Once the key/index is found, we will stop search. You can use text to query - * both objects and array (for array, text will be converted to integer index), - * and use index to retrieve from array. Array index is 0-based. - * - * ** External dictionary ** - * During query processing, you can also pass a call-back function, so the - * search will first try to check if the key string exists in the dictionary. - * If so, search will be based on the id instead of the key string. - * - * @author Tian Xia - */ - -#ifndef FBSON_FBSONDOCUMENT_H -#define FBSON_FBSONDOCUMENT_H - -#include -#include -#include - -namespace fbson { - -#pragma pack(push, 1) - -#define FBSON_VER 1 - -// forward declaration -class FbsonValue; -class ObjectVal; - -/* - * FbsonDocument is the main object that accesses and queries FBSON packed - * bytes. NOTE: FbsonDocument only allows object container as the top level - * FBSON value. However, you can use the static method "createValue" to get any - * FbsonValue object from the packed bytes. - * - * FbsonDocument object also dereferences to an object container value - * (ObjectVal) once FBSON is loaded. - * - * ** Load ** - * FbsonDocument is usable after loading packed bytes (memory location) into - * the object. We only need the header and first few bytes of the payload after - * header to verify the FBSON. - * - * Note: creating an FbsonDocument (through createDocument) does not allocate - * any memory. The document object is an efficient wrapper on the packed bytes - * which is accessed directly. - * - * ** Query ** - * Query is through dereferencing into ObjectVal. - */ -class FbsonDocument { - public: - // create an FbsonDocument object from FBSON packed bytes - static FbsonDocument* createDocument(const char* pb, uint32_t size); - - // create an FbsonValue from FBSON packed bytes - static FbsonValue* createValue(const char* pb, uint32_t size); - - uint8_t version() { return header_.ver_; } - - FbsonValue* getValue() { return ((FbsonValue*)payload_); } - - ObjectVal* operator->() { return ((ObjectVal*)payload_); } - - const ObjectVal* operator->() const { return ((const ObjectVal*)payload_); } - - private: - /* - * FbsonHeader class defines FBSON header (internal to FbsonDocument). - * - * Currently it only contains version information (1-byte). We may expand the - * header to include checksum of the FBSON binary for more security. - */ - struct FbsonHeader { - uint8_t ver_; - } header_; - - char payload_[1]; - - FbsonDocument(); - - FbsonDocument(const FbsonDocument&) = delete; - FbsonDocument& operator=(const FbsonDocument&) = delete; -}; - -/* - * FbsonFwdIteratorT implements FBSON's iterator template. - * - * Note: it is an FORWARD iterator only due to the design of FBSON format. 
- */ -template -class FbsonFwdIteratorT { - typedef Iter_Type iterator; - typedef typename std::iterator_traits::pointer pointer; - typedef typename std::iterator_traits::reference reference; - - public: - explicit FbsonFwdIteratorT(const iterator& i) : current_(i) {} - - // allow non-const to const iterator conversion (same container type) - template - FbsonFwdIteratorT(const FbsonFwdIteratorT& rhs) - : current_(rhs.base()) {} - - bool operator==(const FbsonFwdIteratorT& rhs) const { - return (current_ == rhs.current_); - } - - bool operator!=(const FbsonFwdIteratorT& rhs) const { - return !operator==(rhs); - } - - bool operator<(const FbsonFwdIteratorT& rhs) const { - return (current_ < rhs.current_); - } - - bool operator>(const FbsonFwdIteratorT& rhs) const { return !operator<(rhs); } - - FbsonFwdIteratorT& operator++() { - current_ = (iterator)(((char*)current_) + current_->numPackedBytes()); - return *this; - } - - FbsonFwdIteratorT operator++(int) { - auto tmp = *this; - current_ = (iterator)(((char*)current_) + current_->numPackedBytes()); - return tmp; - } - - explicit operator pointer() { return current_; } - - reference operator*() const { return *current_; } - - pointer operator->() const { return current_; } - - iterator base() const { return current_; } - - private: - iterator current_; -}; - -typedef int (*hDictInsert)(const char* key, unsigned len); -typedef int (*hDictFind)(const char* key, unsigned len); - -/* - * FbsonType defines 10 primitive types and 2 container types, as described - * below. - * - * primitive_value ::= - * 0x00 //null value (0 byte) - * | 0x01 //boolean true (0 byte) - * | 0x02 //boolean false (0 byte) - * | 0x03 int8 //char/int8 (1 byte) - * | 0x04 int16 //int16 (2 bytes) - * | 0x05 int32 //int32 (4 bytes) - * | 0x06 int64 //int64 (8 bytes) - * | 0x07 double //floating point (8 bytes) - * | 0x08 string //variable length string - * | 0x09 binary //variable length binary - * - * container ::= - * 0x0A int32 key_value_list //object, int32 is the total bytes of the object - * | 0x0B int32 value_list //array, int32 is the total bytes of the array - */ -enum class FbsonType : char { - T_Null = 0x00, - T_True = 0x01, - T_False = 0x02, - T_Int8 = 0x03, - T_Int16 = 0x04, - T_Int32 = 0x05, - T_Int64 = 0x06, - T_Double = 0x07, - T_String = 0x08, - T_Binary = 0x09, - T_Object = 0x0A, - T_Array = 0x0B, - NUM_TYPES, -}; - -typedef std::underlying_type::type FbsonTypeUnder; - -/* - * FbsonKeyValue class defines FBSON key type, as described below. - * - * key ::= - * 0x00 int8 //1-byte dictionary id - * | int8 (byte*) //int8 (>0) is the size of the key string - * - * value ::= primitive_value | container - * - * FbsonKeyValue can be either an id mapping to the key string in an external - * dictionary, or it is the original key string. Whether to read an id or a - * string is decided by the first byte (size_). - * - * Note: a key object must be followed by a value object. Therefore, a key - * object implicitly refers to a key-value pair, and you can get the value - * object right after the key object. The function numPackedBytes hence - * indicates the total size of the key-value pair, so that we will be able go - * to next pair from the key. - * - * ** Dictionary size ** - * By default, the dictionary size is 255 (1-byte). Users can define - * "USE_LARGE_DICT" to increase the dictionary size to 655535 (2-byte). 
- */ -class FbsonKeyValue { - public: -#ifdef USE_LARGE_DICT - static const int sMaxKeyId = 65535; - typedef uint16_t keyid_type; -#else - static const int sMaxKeyId = 255; - typedef uint8_t keyid_type; -#endif // #ifdef USE_LARGE_DICT - - static const uint8_t sMaxKeyLen = 64; - - // size of the key. 0 indicates it is stored as id - uint8_t klen() const { return size_; } - - // get the key string. Note the string may not be null terminated. - const char* getKeyStr() const { return key_.str_; } - - keyid_type getKeyId() const { return key_.id_; } - - unsigned int keyPackedBytes() const { - return size_ ? (sizeof(size_) + size_) - : (sizeof(size_) + sizeof(keyid_type)); - } - - FbsonValue* value() const { - return (FbsonValue*)(((char*)this) + keyPackedBytes()); - } - - // size of the total packed bytes (key+value) - unsigned int numPackedBytes() const; - - private: - uint8_t size_; - - union key_ { - keyid_type id_; - char str_[1]; - } key_; - - FbsonKeyValue(); -}; - -/* - * FbsonValue is the base class of all FBSON types. It contains only one member - * variable - type info, which can be retrieved by member functions is[Type]() - * or type(). - */ -class FbsonValue { - public: - static const uint32_t sMaxValueLen = 1 << 24; // 16M - - bool isNull() const { return (type_ == FbsonType::T_Null); } - bool isTrue() const { return (type_ == FbsonType::T_True); } - bool isFalse() const { return (type_ == FbsonType::T_False); } - bool isInt8() const { return (type_ == FbsonType::T_Int8); } - bool isInt16() const { return (type_ == FbsonType::T_Int16); } - bool isInt32() const { return (type_ == FbsonType::T_Int32); } - bool isInt64() const { return (type_ == FbsonType::T_Int64); } - bool isDouble() const { return (type_ == FbsonType::T_Double); } - bool isString() const { return (type_ == FbsonType::T_String); } - bool isBinary() const { return (type_ == FbsonType::T_Binary); } - bool isObject() const { return (type_ == FbsonType::T_Object); } - bool isArray() const { return (type_ == FbsonType::T_Array); } - - FbsonType type() const { return type_; } - - // size of the total packed bytes - unsigned int numPackedBytes() const; - - // size of the value in bytes - unsigned int size() const; - - // get the raw byte array of the value - const char* getValuePtr() const; - - // find the FBSON value by a key path string (null terminated) - FbsonValue* findPath(const char* key_path, - const char* delim = ".", - hDictFind handler = nullptr) { - return findPath(key_path, (unsigned int)strlen(key_path), delim, handler); - } - - // find the FBSON value by a key path string (with length) - FbsonValue* findPath(const char* key_path, - unsigned int len, - const char* delim, - hDictFind handler); - - protected: - FbsonType type_; // type info - - FbsonValue(); -}; - -/* - * NumerValT is the template class (derived from FbsonValue) of all number - * types (integers and double). 
- */ -template -class NumberValT : public FbsonValue { - public: - T val() const { return num_; } - - unsigned int numPackedBytes() const { return sizeof(FbsonValue) + sizeof(T); } - - // catch all unknow specialization of the template class - bool setVal(T value) { return false; } - - private: - T num_; - - NumberValT(); -}; - -typedef NumberValT Int8Val; - -// override setVal for Int8Val -template <> -inline bool Int8Val::setVal(int8_t value) { - if (!isInt8()) { - return false; - } - - num_ = value; - return true; -} - -typedef NumberValT Int16Val; - -// override setVal for Int16Val -template <> -inline bool Int16Val::setVal(int16_t value) { - if (!isInt16()) { - return false; - } - - num_ = value; - return true; -} - -typedef NumberValT Int32Val; - -// override setVal for Int32Val -template <> -inline bool Int32Val::setVal(int32_t value) { - if (!isInt32()) { - return false; - } - - num_ = value; - return true; -} - -typedef NumberValT Int64Val; - -// override setVal for Int64Val -template <> -inline bool Int64Val::setVal(int64_t value) { - if (!isInt64()) { - return false; - } - - num_ = value; - return true; -} - -typedef NumberValT DoubleVal; - -// override setVal for DoubleVal -template <> -inline bool DoubleVal::setVal(double value) { - if (!isDouble()) { - return false; - } - - num_ = value; - return true; -} - -/* - * BlobVal is the base class (derived from FbsonValue) for string and binary - * types. The size_ indicates the total bytes of the payload_. - */ -class BlobVal : public FbsonValue { - public: - // size of the blob payload only - unsigned int getBlobLen() const { return size_; } - - // return the blob as byte array - const char* getBlob() const { return payload_; } - - // size of the total packed bytes - unsigned int numPackedBytes() const { - return sizeof(FbsonValue) + sizeof(size_) + size_; - } - - protected: - uint32_t size_; - char payload_[1]; - - // set new blob bytes - bool internalSetVal(const char* blob, uint32_t blobSize) { - // if we cannot fit the new blob, fail the operation - if (blobSize > size_) { - return false; - } - - memcpy(payload_, blob, blobSize); - - // Set the reset of the bytes to 0. Note we cannot change the size_ of the - // current payload, as all values are packed. - memset(payload_ + blobSize, 0, size_ - blobSize); - - return true; - } - - BlobVal(); - - private: - // Disable as this class can only be allocated dynamically - BlobVal(const BlobVal&) = delete; - BlobVal& operator=(const BlobVal&) = delete; -}; - -/* - * Binary type - */ -class BinaryVal : public BlobVal { - public: - bool setVal(const char* blob, uint32_t blobSize) { - if (!isBinary()) { - return false; - } - - return internalSetVal(blob, blobSize); - } - - private: - BinaryVal(); -}; - -/* - * String type - * Note: FBSON string may not be a c-string (NULL-terminated) - */ -class StringVal : public BlobVal { - public: - bool setVal(const char* str, uint32_t blobSize) { - if (!isString()) { - return false; - } - - return internalSetVal(str, blobSize); - } - - private: - StringVal(); -}; - -/* - * ContainerVal is the base class (derived from FbsonValue) for object and - * array types. The size_ indicates the total bytes of the payload_. 
- */ -class ContainerVal : public FbsonValue { - public: - // size of the container payload only - unsigned int getContainerSize() const { return size_; } - - // return the container payload as byte array - const char* getPayload() const { return payload_; } - - // size of the total packed bytes - unsigned int numPackedBytes() const { - return sizeof(FbsonValue) + sizeof(size_) + size_; - } - - protected: - uint32_t size_; - char payload_[1]; - - ContainerVal(); - - ContainerVal(const ContainerVal&) = delete; - ContainerVal& operator=(const ContainerVal&) = delete; -}; - -/* - * Object type - */ -class ObjectVal : public ContainerVal { - public: - // find the FBSON value by a key string (null terminated) - FbsonValue* find(const char* key, hDictFind handler = nullptr) const { - if (!key) - return nullptr; - - return find(key, (unsigned int)strlen(key), handler); - } - - // find the FBSON value by a key string (with length) - FbsonValue* find(const char* key, - unsigned int klen, - hDictFind handler = nullptr) const { - if (!key || !klen) - return nullptr; - - int key_id = -1; - if (handler && (key_id = handler(key, klen)) >= 0) { - return find(key_id); - } - - return internalFind(key, klen); - } - - // find the FBSON value by a key dictionary ID - FbsonValue* find(int key_id) const { - if (key_id < 0 || key_id > FbsonKeyValue::sMaxKeyId) - return nullptr; - - const char* pch = payload_; - const char* fence = payload_ + size_; - - while (pch < fence) { - FbsonKeyValue* pkey = (FbsonKeyValue*)(pch); - if (!pkey->klen() && key_id == pkey->getKeyId()) { - return pkey->value(); - } - pch += pkey->numPackedBytes(); - } - - assert(pch == fence); - - return nullptr; - } - - typedef FbsonKeyValue value_type; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef FbsonFwdIteratorT iterator; - typedef FbsonFwdIteratorT const_iterator; - - iterator begin() { return iterator((pointer)payload_); } - - const_iterator begin() const { return const_iterator((pointer)payload_); } - - iterator end() { return iterator((pointer)(payload_ + size_)); } - - const_iterator end() const { - return const_iterator((pointer)(payload_ + size_)); - } - - private: - FbsonValue* internalFind(const char* key, unsigned int klen) const { - const char* pch = payload_; - const char* fence = payload_ + size_; - - while (pch < fence) { - FbsonKeyValue* pkey = (FbsonKeyValue*)(pch); - if (klen == pkey->klen() && strncmp(key, pkey->getKeyStr(), klen) == 0) { - return pkey->value(); - } - pch += pkey->numPackedBytes(); - } - - assert(pch == fence); - - return nullptr; - } - - private: - ObjectVal(); -}; - -/* - * Array type - */ -class ArrayVal : public ContainerVal { - public: - // get the FBSON value at index - FbsonValue* get(int idx) const { - if (idx < 0) - return nullptr; - - const char* pch = payload_; - const char* fence = payload_ + size_; - - while (pch < fence && idx-- > 0) - pch += ((FbsonValue*)pch)->numPackedBytes(); - - if (idx == -1) - return (FbsonValue*)pch; - else { - assert(pch == fence); - return nullptr; - } - } - - // Get number of elements in array - unsigned int numElem() const { - const char* pch = payload_; - const char* fence = payload_ + size_; - - unsigned int num = 0; - while (pch < fence) { - ++num; - pch += ((FbsonValue*)pch)->numPackedBytes(); - } - - assert(pch == fence); - - return num; - } - - typedef FbsonValue value_type; - typedef value_type* pointer; - typedef const value_type* const_pointer; - typedef FbsonFwdIteratorT iterator; - typedef FbsonFwdIteratorT 
const_iterator; - - iterator begin() { return iterator((pointer)payload_); } - - const_iterator begin() const { return const_iterator((pointer)payload_); } - - iterator end() { return iterator((pointer)(payload_ + size_)); } - - const_iterator end() const { - return const_iterator((pointer)(payload_ + size_)); - } - - private: - ArrayVal(); -}; - -inline FbsonDocument* FbsonDocument::createDocument(const char* pb, - uint32_t size) { - if (!pb || size < sizeof(FbsonHeader) + sizeof(FbsonValue)) { - return nullptr; - } - - FbsonDocument* doc = (FbsonDocument*)pb; - if (doc->header_.ver_ != FBSON_VER) { - return nullptr; - } - - FbsonValue* val = (FbsonValue*)doc->payload_; - if (!val->isObject() || size != sizeof(FbsonHeader) + val->numPackedBytes()) { - return nullptr; - } - - return doc; -} - -inline FbsonValue* FbsonDocument::createValue(const char* pb, uint32_t size) { - if (!pb || size < sizeof(FbsonHeader) + sizeof(FbsonValue)) { - return nullptr; - } - - FbsonDocument* doc = (FbsonDocument*)pb; - if (doc->header_.ver_ != FBSON_VER) { - return nullptr; - } - - FbsonValue* val = (FbsonValue*)doc->payload_; - if (size != sizeof(FbsonHeader) + val->numPackedBytes()) { - return nullptr; - } - - return val; -} - -inline unsigned int FbsonKeyValue::numPackedBytes() const { - unsigned int ks = keyPackedBytes(); - FbsonValue* val = (FbsonValue*)(((char*)this) + ks); - return ks + val->numPackedBytes(); -} - -// Poor man's "virtual" function FbsonValue::numPackedBytes -inline unsigned int FbsonValue::numPackedBytes() const { - switch (type_) { - case FbsonType::T_Null: - case FbsonType::T_True: - case FbsonType::T_False: { - return sizeof(type_); - } - - case FbsonType::T_Int8: { - return sizeof(type_) + sizeof(int8_t); - } - case FbsonType::T_Int16: { - return sizeof(type_) + sizeof(int16_t); - } - case FbsonType::T_Int32: { - return sizeof(type_) + sizeof(int32_t); - } - case FbsonType::T_Int64: { - return sizeof(type_) + sizeof(int64_t); - } - case FbsonType::T_Double: { - return sizeof(type_) + sizeof(double); - } - case FbsonType::T_String: - case FbsonType::T_Binary: { - return ((BlobVal*)(this))->numPackedBytes(); - } - - case FbsonType::T_Object: - case FbsonType::T_Array: { - return ((ContainerVal*)(this))->numPackedBytes(); - } - default: - return 0; - } -} - -inline unsigned int FbsonValue::size() const { - switch (type_) { - case FbsonType::T_Int8: { - return sizeof(int8_t); - } - case FbsonType::T_Int16: { - return sizeof(int16_t); - } - case FbsonType::T_Int32: { - return sizeof(int32_t); - } - case FbsonType::T_Int64: { - return sizeof(int64_t); - } - case FbsonType::T_Double: { - return sizeof(double); - } - case FbsonType::T_String: - case FbsonType::T_Binary: { - return ((BlobVal*)(this))->getBlobLen(); - } - - case FbsonType::T_Object: - case FbsonType::T_Array: { - return ((ContainerVal*)(this))->getContainerSize(); - } - case FbsonType::T_Null: - case FbsonType::T_True: - case FbsonType::T_False: - default: - return 0; - } -} - -inline const char* FbsonValue::getValuePtr() const { - switch (type_) { - case FbsonType::T_Int8: - case FbsonType::T_Int16: - case FbsonType::T_Int32: - case FbsonType::T_Int64: - case FbsonType::T_Double: - return ((char*)this) + sizeof(FbsonType); - - case FbsonType::T_String: - case FbsonType::T_Binary: - return ((BlobVal*)(this))->getBlob(); - - case FbsonType::T_Object: - case FbsonType::T_Array: - return ((ContainerVal*)(this))->getPayload(); - - case FbsonType::T_Null: - case FbsonType::T_True: - case FbsonType::T_False: - default: - return 
nullptr; - } -} - -inline FbsonValue* FbsonValue::findPath(const char* key_path, - unsigned int kp_len, - const char* delim = ".", - hDictFind handler = nullptr) { - if (!key_path || !kp_len) - return nullptr; - - if (!delim) - delim = "."; // default delimiter - - FbsonValue* pval = this; - const char* fence = key_path + kp_len; - char idx_buf[21]; // buffer to parse array index (integer value) - - while (pval && key_path < fence) { - const char* key = key_path; - unsigned int klen = 0; - // find the current key - for (; key_path != fence && *key_path != *delim; ++key_path, ++klen) - ; - - if (!klen) - return nullptr; - - switch (pval->type_) { - case FbsonType::T_Object: { - pval = ((ObjectVal*)pval)->find(key, klen, handler); - break; - } - - case FbsonType::T_Array: { - // parse string into an integer (array index) - if (klen >= sizeof(idx_buf)) - return nullptr; - - memcpy(idx_buf, key, klen); - idx_buf[klen] = 0; - - char* end = nullptr; - int index = (int)strtol(idx_buf, &end, 10); - if (end && !*end) - pval = ((fbson::ArrayVal*)pval)->get(index); - else - // incorrect index string - return nullptr; - break; - } - - default: - return nullptr; - } - - // skip the delimiter - if (key_path < fence) { - ++key_path; - if (key_path == fence) - // we have a trailing delimiter at the end - return nullptr; - } - } - - return pval; -} - -#pragma pack(pop) - -} // namespace fbson - -#endif // FBSON_FBSONDOCUMENT_H diff --git a/thirdparty/rocksdb/third-party/fbson/FbsonJsonParser.h b/thirdparty/rocksdb/third-party/fbson/FbsonJsonParser.h deleted file mode 100644 index 63b03e2b90..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/FbsonJsonParser.h +++ /dev/null @@ -1,741 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -/* - * This file defines FbsonJsonParserT (template) and FbsonJsonParser. - * - * FbsonJsonParserT is a template class which implements a JSON parser. - * FbsonJsonParserT parses JSON text, and serialize it to FBSON binary format - * by using FbsonWriterT object. By default, FbsonJsonParserT creates a new - * FbsonWriterT object with an output stream object. However, you can also - * pass in your FbsonWriterT or any stream object that implements some basic - * interface of std::ostream (see FbsonStream.h). - * - * FbsonJsonParser specializes FbsonJsonParserT with FbsonOutStream type (see - * FbsonStream.h). So unless you want to provide own a different output stream - * type, use FbsonJsonParser object. - * - * ** Parsing JSON ** - * FbsonJsonParserT parses JSON string, and directly serializes into FBSON - * packed bytes. There are three ways to parse a JSON string: (1) using - * c-string, (2) using string with len, (3) using std::istream object. You can - * use custome streambuf to redirect output. FbsonOutBuffer is a streambuf used - * internally if the input is raw character buffer. - * - * You can reuse an FbsonJsonParserT object to parse/serialize multiple JSON - * strings, and the previous FBSON will be overwritten. - * - * If parsing fails (returned false), the error code will be set to one of - * FbsonErrType, and can be retrieved by calling getErrorCode(). - * - * ** External dictionary ** - * During parsing a JSON string, you can pass a call-back function to map a key - * string to an id, and store the dictionary id in FBSON to save space. 
The - * purpose of using an external dictionary is more towards a collection of - * documents (which has common keys) rather than a single document, so that - * space saving will be significant. - * - * ** Endianness ** - * Note: FBSON serialization doesn't assume endianness of the server. However - * you will need to ensure that the endianness at the reader side is the same - * as that at the writer side (if they are on different machines). Otherwise, - * proper conversion is needed when a number value is returned to the - * caller/writer. - * - * @author Tian Xia - */ - -#ifndef FBSON_FBSONPARSER_H -#define FBSON_FBSONPARSER_H - -#include <cmath> -#include <limits> -#include "FbsonDocument.h" -#include "FbsonWriter.h" - -namespace fbson { - -const char* const kJsonDelim = " ,]}\t\r\n"; -const char* const kWhiteSpace = " \t\n\r"; - -/* - * Error codes - */ -enum class FbsonErrType { - E_NONE = 0, - E_INVALID_VER, - E_EMPTY_STR, - E_OUTPUT_FAIL, - E_INVALID_DOCU, - E_INVALID_VALUE, - E_INVALID_KEY, - E_INVALID_STR, - E_INVALID_OBJ, - E_INVALID_ARR, - E_INVALID_HEX, - E_INVALID_OCTAL, - E_INVALID_DECIMAL, - E_INVALID_EXPONENT, - E_HEX_OVERFLOW, - E_OCTAL_OVERFLOW, - E_DECIMAL_OVERFLOW, - E_DOUBLE_OVERFLOW, - E_EXPONENT_OVERFLOW, -}; - -/* - * Template FbsonJsonParserT - */ -template <class OS_TYPE> -class FbsonJsonParserT { - public: - FbsonJsonParserT() : err_(FbsonErrType::E_NONE) {} - - explicit FbsonJsonParserT(OS_TYPE& os) - : writer_(os), err_(FbsonErrType::E_NONE) {} - - // parse a UTF-8 JSON string - bool parse(const std::string& str, hDictInsert handler = nullptr) { - return parse(str.c_str(), (unsigned int)str.size(), handler); - } - - // parse a UTF-8 JSON c-style string (NULL terminated) - bool parse(const char* c_str, hDictInsert handler = nullptr) { - return parse(c_str, (unsigned int)strlen(c_str), handler); - } - - // parse a UTF-8 JSON string with length - bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) { - if (!pch || len == 0) { - err_ = FbsonErrType::E_EMPTY_STR; - return false; - } - - FbsonInBuffer sb(pch, len); - std::istream in(&sb); - return parse(in, handler); - } - - // parse UTF-8 JSON text from an input stream - bool parse(std::istream& in, hDictInsert handler = nullptr) { - bool res = false; - - // reset output stream - writer_.reset(); - - trim(in); - - if (in.peek() == '{') { - in.ignore(); - res = parseObject(in, handler); - } else if (in.peek() == '[') { - in.ignore(); - res = parseArray(in, handler); - } else { - err_ = FbsonErrType::E_INVALID_DOCU; - } - - trim(in); - if (res && !in.eof()) { - err_ = FbsonErrType::E_INVALID_DOCU; - return false; - } - - return res; - } - - FbsonWriterT<OS_TYPE>& getWriter() { return writer_; } - - FbsonErrType getErrorCode() { return err_; } - - // clear error code - void clearErr() { err_ = FbsonErrType::E_NONE; } - - private: - // parse a JSON object (comma-separated list of key-value pairs) - bool parseObject(std::istream& in, hDictInsert handler) { - if (!writer_.writeStartObject()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - trim(in); - - if (in.peek() == '}') { - in.ignore(); - // empty object - if (!writer_.writeEndObject()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - return true; - } - - while (in.good()) { - if (in.get() != '"') { - err_ = FbsonErrType::E_INVALID_KEY; - return false; - } - - if (!parseKVPair(in, handler)) { - return false; - } - - trim(in); - - char ch = in.get(); - if (ch == '}') { - // end of the object - if (!writer_.writeEndObject()) { - err_ = 
FbsonErrType::E_OUTPUT_FAIL; - return false; - } - return true; - } else if (ch != ',') { - err_ = FbsonErrType::E_INVALID_OBJ; - return false; - } - - trim(in); - } - - err_ = FbsonErrType::E_INVALID_OBJ; - return false; - } - - // parse a JSON array (comma-separated list of values) - bool parseArray(std::istream& in, hDictInsert handler) { - if (!writer_.writeStartArray()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - trim(in); - - if (in.peek() == ']') { - in.ignore(); - // empty array - if (!writer_.writeEndArray()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - return true; - } - - while (in.good()) { - if (!parseValue(in, handler)) { - return false; - } - - trim(in); - - char ch = in.get(); - if (ch == ']') { - // end of the array - if (!writer_.writeEndArray()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - return true; - } else if (ch != ',') { - err_ = FbsonErrType::E_INVALID_ARR; - return false; - } - - trim(in); - } - - err_ = FbsonErrType::E_INVALID_ARR; - return false; - } - - // parse a key-value pair, separated by ":" - bool parseKVPair(std::istream& in, hDictInsert handler) { - if (parseKey(in, handler) && parseValue(in, handler)) { - return true; - } - - return false; - } - - // parse a key (must be string) - bool parseKey(std::istream& in, hDictInsert handler) { - char key[FbsonKeyValue::sMaxKeyLen]; - int i = 0; - while (in.good() && in.peek() != '"' && i < FbsonKeyValue::sMaxKeyLen) { - key[i++] = in.get(); - } - - if (!in.good() || in.peek() != '"' || i == 0) { - err_ = FbsonErrType::E_INVALID_KEY; - return false; - } - - in.ignore(); // discard '"' - - int key_id = -1; - if (handler) { - key_id = handler(key, i); - } - - if (key_id < 0) { - writer_.writeKey(key, i); - } else { - writer_.writeKey(key_id); - } - - trim(in); - - if (in.get() != ':') { - err_ = FbsonErrType::E_INVALID_OBJ; - return false; - } - - return true; - } - - // parse a value - bool parseValue(std::istream& in, hDictInsert handler) { - bool res = false; - - trim(in); - - switch (in.peek()) { - case 'N': - case 'n': { - in.ignore(); - res = parseNull(in); - break; - } - case 'T': - case 't': { - in.ignore(); - res = parseTrue(in); - break; - } - case 'F': - case 'f': { - in.ignore(); - res = parseFalse(in); - break; - } - case '"': { - in.ignore(); - res = parseString(in); - break; - } - case '{': { - in.ignore(); - res = parseObject(in, handler); - break; - } - case '[': { - in.ignore(); - res = parseArray(in, handler); - break; - } - default: { - res = parseNumber(in); - break; - } - } - - return res; - } - - // parse NULL value - bool parseNull(std::istream& in) { - if (tolower(in.get()) == 'u' && tolower(in.get()) == 'l' && - tolower(in.get()) == 'l') { - writer_.writeNull(); - return true; - } - - err_ = FbsonErrType::E_INVALID_VALUE; - return false; - } - - // parse TRUE value - bool parseTrue(std::istream& in) { - if (tolower(in.get()) == 'r' && tolower(in.get()) == 'u' && - tolower(in.get()) == 'e') { - writer_.writeBool(true); - return true; - } - - err_ = FbsonErrType::E_INVALID_VALUE; - return false; - } - - // parse FALSE value - bool parseFalse(std::istream& in) { - if (tolower(in.get()) == 'a' && tolower(in.get()) == 'l' && - tolower(in.get()) == 's' && tolower(in.get()) == 'e') { - writer_.writeBool(false); - return true; - } - - err_ = FbsonErrType::E_INVALID_VALUE; - return false; - } - - // parse a string - bool parseString(std::istream& in) { - if (!writer_.writeStartString()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; 
- } - - bool escaped = false; - char buffer[4096]; // write 4KB at a time - int nread = 0; - while (in.good()) { - char ch = in.get(); - if (ch != '"' || escaped) { - buffer[nread++] = ch; - if (nread == 4096) { - // flush buffer - if (!writer_.writeString(buffer, nread)) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - nread = 0; - } - // set/reset escape - if (ch == '\\' || escaped) { - escaped = !escaped; - } - } else { - // write all remaining bytes in the buffer - if (nread > 0) { - if (!writer_.writeString(buffer, nread)) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - } - // end writing string - if (!writer_.writeEndString()) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - return true; - } - } - - err_ = FbsonErrType::E_INVALID_STR; - return false; - } - - // parse a number - // Number format can be hex, octal, or decimal (including float). - // Only decimal can have (+/-) sign prefix. - bool parseNumber(std::istream& in) { - bool ret = false; - switch (in.peek()) { - case '0': { - in.ignore(); - - if (in.peek() == 'x' || in.peek() == 'X') { - in.ignore(); - ret = parseHex(in); - } else if (in.peek() == '.') { - in.ignore(); - ret = parseDouble(in, 0, 0, 1); - } else { - ret = parseOctal(in); - } - - break; - } - case '-': { - in.ignore(); - ret = parseDecimal(in, -1); - break; - } - case '+': - in.ignore(); - // fall through - default: - ret = parseDecimal(in, 1); - break; - } - - return ret; - } - - // parse a number in hex format - bool parseHex(std::istream& in) { - uint64_t val = 0; - int num_digits = 0; - char ch = tolower(in.peek()); - while (in.good() && !strchr(kJsonDelim, ch) && (++num_digits) <= 16) { - if (ch >= '0' && ch <= '9') { - val = (val << 4) + (ch - '0'); - } else if (ch >= 'a' && ch <= 'f') { - val = (val << 4) + (ch - 'a' + 10); - } else { // unrecognized hex digit - err_ = FbsonErrType::E_INVALID_HEX; - return false; - } - - in.ignore(); - ch = tolower(in.peek()); - } - - int size = 0; - if (num_digits <= 2) { - size = writer_.writeInt8((int8_t)val); - } else if (num_digits <= 4) { - size = writer_.writeInt16((int16_t)val); - } else if (num_digits <= 8) { - size = writer_.writeInt32((int32_t)val); - } else if (num_digits <= 16) { - size = writer_.writeInt64(val); - } else { - err_ = FbsonErrType::E_HEX_OVERFLOW; - return false; - } - - if (size == 0) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - return true; - } - - // parse a number in octal format - bool parseOctal(std::istream& in) { - int64_t val = 0; - char ch = in.peek(); - while (in.good() && !strchr(kJsonDelim, ch)) { - if (ch >= '0' && ch <= '7') { - val = val * 8 + (ch - '0'); - } else { - err_ = FbsonErrType::E_INVALID_OCTAL; - return false; - } - - // check if the number overflows - if (val < 0) { - err_ = FbsonErrType::E_OCTAL_OVERFLOW; - return false; - } - - in.ignore(); - ch = in.peek(); - } - - int size = 0; - if (val <= std::numeric_limits::max()) { - size = writer_.writeInt8((int8_t)val); - } else if (val <= std::numeric_limits::max()) { - size = writer_.writeInt16((int16_t)val); - } else if (val <= std::numeric_limits::max()) { - size = writer_.writeInt32((int32_t)val); - } else { // val <= INT64_MAX - size = writer_.writeInt64(val); - } - - if (size == 0) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - return true; - } - - // parse a number in decimal (including float) - bool parseDecimal(std::istream& in, int sign) { - int64_t val = 0; - int precision = 0; - - char ch = 0; - while (in.good() && (ch = in.peek()) == 
'0') - in.ignore(); - - while (in.good() && !strchr(kJsonDelim, ch)) { - if (ch >= '0' && ch <= '9') { - val = val * 10 + (ch - '0'); - ++precision; - } else if (ch == '.') { - // note we don't pop out '.' - return parseDouble(in, static_cast(val), precision, sign); - } else { - err_ = FbsonErrType::E_INVALID_DECIMAL; - return false; - } - - in.ignore(); - - // if the number overflows int64_t, first parse it as double iff we see a - // decimal point later. Otherwise, will treat it as overflow - if (val < 0 && val > std::numeric_limits::min()) { - return parseDouble(in, static_cast(val), precision, sign); - } - - ch = in.peek(); - } - - if (sign < 0) { - val = -val; - } - - int size = 0; - if (val >= std::numeric_limits::min() && - val <= std::numeric_limits::max()) { - size = writer_.writeInt8((int8_t)val); - } else if (val >= std::numeric_limits::min() && - val <= std::numeric_limits::max()) { - size = writer_.writeInt16((int16_t)val); - } else if (val >= std::numeric_limits::min() && - val <= std::numeric_limits::max()) { - size = writer_.writeInt32((int32_t)val); - } else { // val <= INT64_MAX - size = writer_.writeInt64(val); - } - - if (size == 0) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - return true; - } - - // parse IEEE745 double precision: - // Significand precision length - 15 - // Maximum exponent value - 308 - // - // "If a decimal string with at most 15 significant digits is converted to - // IEEE 754 double precision representation and then converted back to a - // string with the same number of significant digits, then the final string - // should match the original" - bool parseDouble(std::istream& in, double val, int precision, int sign) { - int integ = precision; - int frac = 0; - bool is_frac = false; - - char ch = in.peek(); - if (ch == '.') { - is_frac = true; - in.ignore(); - ch = in.peek(); - } - - int exp = 0; - while (in.good() && !strchr(kJsonDelim, ch)) { - if (ch >= '0' && ch <= '9') { - if (precision < 15) { - val = val * 10 + (ch - '0'); - if (is_frac) { - ++frac; - } else { - ++integ; - } - ++precision; - } else if (!is_frac) { - ++exp; - } - } else if (ch == 'e' || ch == 'E') { - in.ignore(); - int exp2; - if (!parseExponent(in, exp2)) { - return false; - } - - exp += exp2; - // check if exponent overflows - if (exp > 308 || exp < -308) { - err_ = FbsonErrType::E_EXPONENT_OVERFLOW; - return false; - } - - is_frac = true; - break; - } - - in.ignore(); - ch = in.peek(); - } - - if (!is_frac) { - err_ = FbsonErrType::E_DECIMAL_OVERFLOW; - return false; - } - - val *= std::pow(10, exp - frac); - if (std::isnan(val) || std::isinf(val)) { - err_ = FbsonErrType::E_DOUBLE_OVERFLOW; - return false; - } - - if (sign < 0) { - val = -val; - } - - if (writer_.writeDouble(val) == 0) { - err_ = FbsonErrType::E_OUTPUT_FAIL; - return false; - } - - return true; - } - - // parse the exponent part of a double number - bool parseExponent(std::istream& in, int& exp) { - bool neg = false; - - char ch = in.peek(); - if (ch == '+') { - in.ignore(); - ch = in.peek(); - } else if (ch == '-') { - neg = true; - in.ignore(); - ch = in.peek(); - } - - exp = 0; - while (in.good() && !strchr(kJsonDelim, ch)) { - if (ch >= '0' && ch <= '9') { - exp = exp * 10 + (ch - '0'); - } else { - err_ = FbsonErrType::E_INVALID_EXPONENT; - return false; - } - - if (exp > 308) { - err_ = FbsonErrType::E_EXPONENT_OVERFLOW; - return false; - } - - in.ignore(); - ch = in.peek(); - } - - if (neg) { - exp = -exp; - } - - return true; - } - - void trim(std::istream& in) { - while 
(in.good() && strchr(kWhiteSpace, in.peek())) { - in.ignore(); - } - } - - private: - FbsonWriterT<OS_TYPE> writer_; - FbsonErrType err_; -}; - -typedef FbsonJsonParserT<FbsonOutStream> FbsonJsonParser; - -} // namespace fbson - -#endif // FBSON_FBSONPARSER_H diff --git a/thirdparty/rocksdb/third-party/fbson/FbsonStream.h b/thirdparty/rocksdb/third-party/fbson/FbsonStream.h deleted file mode 100644 index 12723ea30e..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/FbsonStream.h +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -/* - * This header file defines FbsonInBuffer and FbsonOutStream classes. - * - * ** Input Buffer ** - * FbsonInBuffer is a custom input buffer to wrap a raw character buffer. Its - * object instances are used to create std::istream objects internally. - * - * ** Output Stream ** - * FbsonOutStream is a custom output stream class, to contain the FBSON - * serialized binary. The class is conveniently used to specialize templates of - * FbsonParser and FbsonWriter. - * - * @author Tian Xia - */ - -#ifndef FBSON_FBSONSTREAM_H -#define FBSON_FBSONSTREAM_H - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#if defined OS_WIN && !defined snprintf -#define snprintf _snprintf -#endif - -#include <inttypes.h> -#include <iostream> - -namespace fbson { - -// lengths includes sign -#define MAX_INT_DIGITS 11 -#define MAX_INT64_DIGITS 20 -#define MAX_DOUBLE_DIGITS 23 // 1(sign)+16(significant)+1(decimal)+5(exponent) - -/* - * FBSON's implementation of input buffer - */ -class FbsonInBuffer : public std::streambuf { - public: - FbsonInBuffer(const char* str, uint32_t len) { - // this is read buffer and the str will not be changed - // so we use const_cast (ugly!) to remove constness - char* pch(const_cast<char*>(str)); - setg(pch, pch, pch + len); - } -}; - -/* - * FBSON's implementation of output stream. - * - * This is a wrapper of a char buffer. By default, the buffer capacity is 1024 - * bytes. We will double the buffer if realloc is needed for writes.
- */ -class FbsonOutStream : public std::ostream { - public: - explicit FbsonOutStream(uint32_t capacity = 1024) - : std::ostream(nullptr), - head_(nullptr), - size_(0), - capacity_(capacity), - alloc_(true) { - if (capacity_ == 0) { - capacity_ = 1024; - } - - head_ = (char*)malloc(capacity_); - } - - FbsonOutStream(char* buffer, uint32_t capacity) - : std::ostream(nullptr), - head_(buffer), - size_(0), - capacity_(capacity), - alloc_(false) { - assert(buffer && capacity_ > 0); - } - - ~FbsonOutStream() { - if (alloc_) { - free(head_); - } - } - - void put(char c) { write(&c, 1); } - - void write(const char* c_str) { write(c_str, (uint32_t)strlen(c_str)); } - - void write(const char* bytes, uint32_t len) { - if (len == 0) - return; - - if (size_ + len > capacity_) { - realloc(len); - } - - memcpy(head_ + size_, bytes, len); - size_ += len; - } - - // write the integer to string - void write(int i) { - // snprintf automatically adds a NULL, so we need one more char - if (size_ + MAX_INT_DIGITS + 1 > capacity_) { - realloc(MAX_INT_DIGITS + 1); - } - - int len = snprintf(head_ + size_, MAX_INT_DIGITS + 1, "%d", i); - assert(len > 0); - size_ += len; - } - - // write the 64bit integer to string - void write(int64_t l) { - // snprintf automatically adds a NULL, so we need one more char - if (size_ + MAX_INT64_DIGITS + 1 > capacity_) { - realloc(MAX_INT64_DIGITS + 1); - } - - int len = snprintf(head_ + size_, MAX_INT64_DIGITS + 1, "%" PRIi64, l); - assert(len > 0); - size_ += len; - } - - // write the double to string - void write(double d) { - // snprintf automatically adds a NULL, so we need one more char - if (size_ + MAX_DOUBLE_DIGITS + 1 > capacity_) { - realloc(MAX_DOUBLE_DIGITS + 1); - } - - int len = snprintf(head_ + size_, MAX_DOUBLE_DIGITS + 1, "%.15g", d); - assert(len > 0); - size_ += len; - } - - pos_type tellp() const { return size_; } - - void seekp(pos_type pos) { size_ = (uint32_t)pos; } - - const char* getBuffer() const { return head_; } - - pos_type getSize() const { return tellp(); } - - private: - void realloc(uint32_t len) { - assert(capacity_ > 0); - - capacity_ *= 2; - while (capacity_ < size_ + len) { - capacity_ *= 2; - } - - if (alloc_) { - char* new_buf = (char*)::realloc(head_, capacity_); - assert(new_buf); - head_ = new_buf; - } else { - char* new_buf = (char*)::malloc(capacity_); - assert(new_buf); - memcpy(new_buf, head_, size_); - head_ = new_buf; - alloc_ = true; - } - } - - private: - char* head_; - uint32_t size_; - uint32_t capacity_; - bool alloc_; -}; - -} // namespace fbson - -#endif // FBSON_FBSONSTREAM_H diff --git a/thirdparty/rocksdb/third-party/fbson/FbsonUtil.h b/thirdparty/rocksdb/third-party/fbson/FbsonUtil.h deleted file mode 100644 index 2b6d6f5c97..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/FbsonUtil.h +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -/* - * This header file defines miscellaneous utility classes. - * - * @author Tian Xia - */ - -#ifndef FBSON_FBSONUTIL_H -#define FBSON_FBSONUTIL_H - -#include -#include "FbsonDocument.h" - -namespace fbson { - -#define OUT_BUF_SIZE 1024 - -/* - * FbsonToJson converts an FbsonValue object to a JSON string. 
- */ -class FbsonToJson { - public: - FbsonToJson() : os_(buffer_, OUT_BUF_SIZE) {} - - // get json string - const char* json(const FbsonValue* pval) { - os_.clear(); - os_.seekp(0); - - if (pval) { - intern_json(pval); - } - - os_.put(0); - return os_.getBuffer(); - } - - private: - // recursively convert FbsonValue - void intern_json(const FbsonValue* val) { - switch (val->type()) { - case FbsonType::T_Null: { - os_.write("null", 4); - break; - } - case FbsonType::T_True: { - os_.write("true", 4); - break; - } - case FbsonType::T_False: { - os_.write("false", 5); - break; - } - case FbsonType::T_Int8: { - os_.write(((Int8Val*)val)->val()); - break; - } - case FbsonType::T_Int16: { - os_.write(((Int16Val*)val)->val()); - break; - } - case FbsonType::T_Int32: { - os_.write(((Int32Val*)val)->val()); - break; - } - case FbsonType::T_Int64: { - os_.write(((Int64Val*)val)->val()); - break; - } - case FbsonType::T_Double: { - os_.write(((DoubleVal*)val)->val()); - break; - } - case FbsonType::T_String: { - os_.put('"'); - os_.write(((StringVal*)val)->getBlob(), ((StringVal*)val)->getBlobLen()); - os_.put('"'); - break; - } - case FbsonType::T_Binary: { - os_.write("\"<BINARY>", 9); - os_.write(((BinaryVal*)val)->getBlob(), ((BinaryVal*)val)->getBlobLen()); - os_.write("<BINARY>\"", 9); - break; - } - case FbsonType::T_Object: { - object_to_json((ObjectVal*)val); - break; - } - case FbsonType::T_Array: { - array_to_json((ArrayVal*)val); - break; - } - default: - break; - } - } - - // convert object - void object_to_json(const ObjectVal* val) { - os_.put('{'); - - auto iter = val->begin(); - auto iter_fence = val->end(); - - while (iter < iter_fence) { - // write key - if (iter->klen()) { - os_.put('"'); - os_.write(iter->getKeyStr(), iter->klen()); - os_.put('"'); - } else { - os_.write(iter->getKeyId()); - } - os_.put(':'); - - // convert value - intern_json(iter->value()); - - ++iter; - if (iter != iter_fence) { - os_.put(','); - } - } - - assert(iter == iter_fence); - - os_.put('}'); - } - - // convert array to json - void array_to_json(const ArrayVal* val) { - os_.put('['); - - auto iter = val->begin(); - auto iter_fence = val->end(); - - while (iter != iter_fence) { - // convert value - intern_json((const FbsonValue*)iter); - ++iter; - if (iter != iter_fence) { - os_.put(','); - } - } - - assert(iter == iter_fence); - - os_.put(']'); - } - - private: - FbsonOutStream os_; - char buffer_[OUT_BUF_SIZE]; -}; - -} // namespace fbson - -#endif // FBSON_FBSONUTIL_H diff --git a/thirdparty/rocksdb/third-party/fbson/FbsonWriter.h b/thirdparty/rocksdb/third-party/fbson/FbsonWriter.h deleted file mode 100644 index a254e9bbf8..0000000000 --- a/thirdparty/rocksdb/third-party/fbson/FbsonWriter.h +++ /dev/null @@ -1,430 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -/* - * This file defines FbsonWriterT (template) and FbsonWriter. - * - * FbsonWriterT is a template class which implements an FBSON serializer. - * Users call various write functions of FbsonWriterT object to write values - * directly to FBSON packed bytes. All write functions of value or key return - * the number of bytes written to FBSON, or 0 if there is an error. To write an - * object, an array, or a string, you must call writeStart[..] before writing - * values or key, and call writeEnd[..] after finishing at the end.
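To make the start/end protocol just described concrete, here is a minimal, hedged sketch of a caller of this deleted writer API. The function name, keys, and values are invented for illustration; the write* calls, the FbsonWriter typedef, and getOutput()/getBuffer() all come from the headers removed in this diff. The rest of the removed header comment continues after the sketch.

```cpp
// Sketch only: serializes {"name":"bob","age":26} with the fbson writer removed above.
#include <cassert>
#include "FbsonWriter.h"  // deleted by this change

void write_example() {
  fbson::FbsonWriter writer;          // FbsonWriterT specialized on FbsonOutStream
  assert(writer.writeStartObject());  // the first writeStart also emits the FBSON header
  writer.writeKey("name", 4);         // a key must precede every value inside an object
  writer.writeStartString();          // strings are bracketed by start/end calls so the
  writer.writeString("bob", 3);       // 4-byte size slot can be back-patched afterwards
  writer.writeEndString();
  writer.writeKey("age", 3);
  writer.writeInt8(26);               // returns bytes written, 0 on a state-machine error
  assert(writer.writeEndObject());    // back-patches the object's total size
  // writer.getOutput()->getBuffer() now holds the packed FBSON bytes.
}
```

The same bracketing pattern applies to nested objects, to arrays (writeStartArray/writeEndArray), and to binary blobs (writeStartBinary/writeEndBinary).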
- * - * By default, an FbsonWriterT object creates an output stream buffer. - * Alternatively, you can also pass any output stream object to a writer, as - * long as the stream object implements some basic functions of std::ostream - * (such as FbsonOutStream, see FbsonStream.h). - * - * FbsonWriter specializes FbsonWriterT with FbsonOutStream type (see - * FbsonStream.h). So unless you want to provide own a different output stream - * type, use FbsonParser object. - * - * @author Tian Xia - */ - -#ifndef FBSON_FBSONWRITER_H -#define FBSON_FBSONWRITER_H - -#include -#include "FbsonDocument.h" -#include "FbsonStream.h" - -namespace fbson { - -template -class FbsonWriterT { - public: - FbsonWriterT() - : alloc_(true), hasHdr_(false), kvState_(WS_Value), str_pos_(0) { - os_ = new OS_TYPE(); - } - - explicit FbsonWriterT(OS_TYPE& os) - : os_(&os), - alloc_(false), - hasHdr_(false), - kvState_(WS_Value), - str_pos_(0) {} - - ~FbsonWriterT() { - if (alloc_) { - delete os_; - } - } - - void reset() { - os_->clear(); - os_->seekp(0); - hasHdr_ = false; - kvState_ = WS_Value; - for (; !stack_.empty(); stack_.pop()) - ; - } - - // write a key string (or key id if an external dict is provided) - uint32_t writeKey(const char* key, - uint8_t len, - hDictInsert handler = nullptr) { - if (len && !stack_.empty() && verifyKeyState()) { - int key_id = -1; - if (handler) { - key_id = handler(key, len); - } - - uint32_t size = sizeof(uint8_t); - if (key_id < 0) { - os_->put(len); - os_->write(key, len); - size += len; - } else if (key_id <= FbsonKeyValue::sMaxKeyId) { - FbsonKeyValue::keyid_type idx = key_id; - os_->put(0); - os_->write((char*)&idx, sizeof(FbsonKeyValue::keyid_type)); - size += sizeof(FbsonKeyValue::keyid_type); - } else { // key id overflow - assert(0); - return 0; - } - - kvState_ = WS_Key; - return size; - } - - return 0; - } - - // write a key id - uint32_t writeKey(FbsonKeyValue::keyid_type idx) { - if (!stack_.empty() && verifyKeyState()) { - os_->put(0); - os_->write((char*)&idx, sizeof(FbsonKeyValue::keyid_type)); - kvState_ = WS_Key; - return sizeof(uint8_t) + sizeof(FbsonKeyValue::keyid_type); - } - - return 0; - } - - uint32_t writeNull() { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Null); - kvState_ = WS_Value; - return sizeof(FbsonValue); - } - - return 0; - } - - uint32_t writeBool(bool b) { - if (!stack_.empty() && verifyValueState()) { - if (b) { - os_->put((FbsonTypeUnder)FbsonType::T_True); - } else { - os_->put((FbsonTypeUnder)FbsonType::T_False); - } - - kvState_ = WS_Value; - return sizeof(FbsonValue); - } - - return 0; - } - - uint32_t writeInt8(int8_t v) { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Int8); - os_->put(v); - kvState_ = WS_Value; - return sizeof(Int8Val); - } - - return 0; - } - - uint32_t writeInt16(int16_t v) { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Int16); - os_->write((char*)&v, sizeof(int16_t)); - kvState_ = WS_Value; - return sizeof(Int16Val); - } - - return 0; - } - - uint32_t writeInt32(int32_t v) { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Int32); - os_->write((char*)&v, sizeof(int32_t)); - kvState_ = WS_Value; - return sizeof(Int32Val); - } - - return 0; - } - - uint32_t writeInt64(int64_t v) { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Int64); - os_->write((char*)&v, sizeof(int64_t)); - kvState_ = WS_Value; - return 
sizeof(Int64Val); - } - - return 0; - } - - uint32_t writeDouble(double v) { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Double); - os_->write((char*)&v, sizeof(double)); - kvState_ = WS_Value; - return sizeof(DoubleVal); - } - - return 0; - } - - // must call writeStartString before writing a string val - bool writeStartString() { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_String); - str_pos_ = os_->tellp(); - - // fill the size bytes with 0 for now - uint32_t size = 0; - os_->write((char*)&size, sizeof(uint32_t)); - - kvState_ = WS_String; - return true; - } - - return false; - } - - // finish writing a string val - bool writeEndString() { - if (kvState_ == WS_String) { - std::streampos cur_pos = os_->tellp(); - int32_t size = (int32_t)(cur_pos - str_pos_ - sizeof(uint32_t)); - assert(size >= 0); - - os_->seekp(str_pos_); - os_->write((char*)&size, sizeof(uint32_t)); - os_->seekp(cur_pos); - - kvState_ = WS_Value; - return true; - } - - return false; - } - - uint32_t writeString(const char* str, uint32_t len) { - if (kvState_ == WS_String) { - os_->write(str, len); - return len; - } - - return 0; - } - - uint32_t writeString(char ch) { - if (kvState_ == WS_String) { - os_->put(ch); - return 1; - } - - return 0; - } - - // must call writeStartBinary before writing a binary val - bool writeStartBinary() { - if (!stack_.empty() && verifyValueState()) { - os_->put((FbsonTypeUnder)FbsonType::T_Binary); - str_pos_ = os_->tellp(); - - // fill the size bytes with 0 for now - uint32_t size = 0; - os_->write((char*)&size, sizeof(uint32_t)); - - kvState_ = WS_Binary; - return true; - } - - return false; - } - - // finish writing a binary val - bool writeEndBinary() { - if (kvState_ == WS_Binary) { - std::streampos cur_pos = os_->tellp(); - int32_t size = (int32_t)(cur_pos - str_pos_ - sizeof(uint32_t)); - assert(size >= 0); - - os_->seekp(str_pos_); - os_->write((char*)&size, sizeof(uint32_t)); - os_->seekp(cur_pos); - - kvState_ = WS_Value; - return true; - } - - return false; - } - - uint32_t writeBinary(const char* bin, uint32_t len) { - if (kvState_ == WS_Binary) { - os_->write(bin, len); - return len; - } - - return 0; - } - - // must call writeStartObject before writing an object val - bool writeStartObject() { - if (stack_.empty() || verifyValueState()) { - if (stack_.empty()) { - // if this is a new FBSON, write the header - if (!hasHdr_) { - writeHeader(); - } else - return false; - } - - os_->put((FbsonTypeUnder)FbsonType::T_Object); - // save the size position - stack_.push(WriteInfo({WS_Object, os_->tellp()})); - - // fill the size bytes with 0 for now - uint32_t size = 0; - os_->write((char*)&size, sizeof(uint32_t)); - - kvState_ = WS_Value; - return true; - } - - return false; - } - - // finish writing an object val - bool writeEndObject() { - if (!stack_.empty() && stack_.top().state == WS_Object && - kvState_ == WS_Value) { - WriteInfo& ci = stack_.top(); - std::streampos cur_pos = os_->tellp(); - int32_t size = (int32_t)(cur_pos - ci.sz_pos - sizeof(uint32_t)); - assert(size >= 0); - - os_->seekp(ci.sz_pos); - os_->write((char*)&size, sizeof(uint32_t)); - os_->seekp(cur_pos); - stack_.pop(); - - return true; - } - - return false; - } - - // must call writeStartArray before writing an array val - bool writeStartArray() { - if (stack_.empty() || verifyValueState()) { - if (stack_.empty()) { - // if this is a new FBSON, write the header - if (!hasHdr_) { - writeHeader(); - } else - return false; - } 
- - os_->put((FbsonTypeUnder)FbsonType::T_Array); - // save the size position - stack_.push(WriteInfo({WS_Array, os_->tellp()})); - - // fill the size bytes with 0 for now - uint32_t size = 0; - os_->write((char*)&size, sizeof(uint32_t)); - - kvState_ = WS_Value; - return true; - } - - return false; - } - - // finish writing an array val - bool writeEndArray() { - if (!stack_.empty() && stack_.top().state == WS_Array && - kvState_ == WS_Value) { - WriteInfo& ci = stack_.top(); - std::streampos cur_pos = os_->tellp(); - int32_t size = (int32_t)(cur_pos - ci.sz_pos - sizeof(uint32_t)); - assert(size >= 0); - - os_->seekp(ci.sz_pos); - os_->write((char*)&size, sizeof(uint32_t)); - os_->seekp(cur_pos); - stack_.pop(); - - return true; - } - - return false; - } - - OS_TYPE* getOutput() { return os_; } - - private: - // verify we are in the right state before writing a value - bool verifyValueState() { - assert(!stack_.empty()); - return (stack_.top().state == WS_Object && kvState_ == WS_Key) || - (stack_.top().state == WS_Array && kvState_ == WS_Value); - } - - // verify we are in the right state before writing a key - bool verifyKeyState() { - assert(!stack_.empty()); - return stack_.top().state == WS_Object && kvState_ == WS_Value; - } - - void writeHeader() { - os_->put(FBSON_VER); - hasHdr_ = true; - } - - private: - enum WriteState { - WS_NONE, - WS_Array, - WS_Object, - WS_Key, - WS_Value, - WS_String, - WS_Binary, - }; - - struct WriteInfo { - WriteState state; - std::streampos sz_pos; - }; - - private: - OS_TYPE* os_; - bool alloc_; - bool hasHdr_; - WriteState kvState_; // key or value state - std::streampos str_pos_; - std::stack stack_; -}; - -typedef FbsonWriterT FbsonWriter; - -} // namespace fbson - -#endif // FBSON_FBSONWRITER_H diff --git a/thirdparty/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h b/thirdparty/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h index e3f0cfb95c..3cec41a9e4 100644 --- a/thirdparty/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h +++ b/thirdparty/rocksdb/third-party/gtest-1.7.0/fused-src/gtest/gtest.h @@ -3410,10 +3410,6 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */) -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - // ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and // StrError() aren't needed on Windows CE at this time and thus not // defined there. 
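Taken together, the fbson headers removed in this diff formed a small, self-contained API. Before moving on to the build-system changes, a hedged round-trip sketch: the JSON literal and variable names are invented, while every call is taken from the removed headers (FbsonDocument::createValue is the factory shown earlier in this deletion).

```cpp
// Sketch only: parse JSON to packed FBSON, query it with findPath, print it back as JSON.
#include <cstdint>
#include <cstdio>
#include "FbsonDocument.h"    // FbsonValue, Int8Val, FbsonDocument::createValue
#include "FbsonJsonParser.h"  // FbsonJsonParser, FbsonOutStream
#include "FbsonUtil.h"        // FbsonToJson

int main() {
  fbson::FbsonJsonParser parser;
  if (!parser.parse("{\"a\": {\"b\": [7, 8, 9]}}")) {
    return 1;  // parser.getErrorCode() reports the FbsonErrType
  }
  fbson::FbsonOutStream* os = parser.getWriter().getOutput();
  fbson::FbsonValue* root =
      fbson::FbsonDocument::createValue(os->getBuffer(), (uint32_t)os->getSize());
  if (root == nullptr) return 1;  // bad version or size in the packed bytes
  // findPath walks "a" (object key), "b" (object key), then "2" (array index).
  fbson::FbsonValue* v = root->findPath("a.b.2");
  if (v != nullptr && v->isInt8()) {
    std::printf("a.b.2 = %d\n", (int)((fbson::Int8Val*)v)->val());  // prints 9
  }
  fbson::FbsonToJson to_json;
  std::printf("%s\n", to_json.json(root));  // serializes the tree back to JSON text
  return 0;
}
```

Small integers land in Int8Val because the parser picks the narrowest integer type that fits, and, as the removed parser header notes, a reader on another machine must match the writer's endianness when interpreting these packed numbers.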
diff --git a/thirdparty/rocksdb/thirdparty.inc b/thirdparty/rocksdb/thirdparty.inc index a364d1d448..f40b81fecc 100644 --- a/thirdparty/rocksdb/thirdparty.inc +++ b/thirdparty/rocksdb/thirdparty.inc @@ -1,14 +1,6 @@ # Edit definitions below to specify paths to include files and libraries of all 3rd party libraries -# -# Edit these lines to set defaults for use of external libraries -# -set(USE_GFLAGS_DEFAULT 0) # GFLAGS is disabled by default, enable with -DGFLAGS=1 cmake command line agrument -set(USE_SNAPPY_DEFAULT 0) # SNAPPY is disabled by default, enable with -DSNAPPY=1 cmake command line agrument -set(USE_LZ4_DEFAULT 0) # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line agrument -set(USE_ZLIB_DEFAULT 0) # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line agrument -set(USE_XPRESS_DEFAULT 0) # XPRESS is disabled by default, enable with -DXPRESS=1 cmake command line agrument - +# TODO: Make this work with find_package and/or get rid of it # # This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable # Set environment variable THIRDPARTY_HOME to point to your third party libraries home (Unix style dir separators) @@ -17,24 +9,20 @@ set(USE_XPRESS_DEFAULT 0) # XPRESS is disabled by default, enable with -D set (THIRDPARTY_LIBS "") # Initialization, don't touch # -# Edit these 4 lines to define paths to GFLAGS +# Defaults # set(GFLAGS_HOME $ENV{THIRDPARTY_HOME}/Gflags.Library) -set(GFLAGS_INCLUDE ${GFLAGS_HOME}/inc/include) -set(GFLAGS_LIB_DEBUG ${GFLAGS_HOME}/bin/debug/amd64/gflags.lib) -set(GFLAGS_LIB_RELEASE ${GFLAGS_HOME}/bin/retail/amd64/gflags.lib) +set(GFLAGS_INCLUDE ${GFLAGS_HOME}/build/native/include) +set(GFLAGS_LIB_DEBUG ${GFLAGS_HOME}/lib/native/debug/amd64/gflags.lib) +set(GFLAGS_LIB_RELEASE ${GFLAGS_HOME}/lib/native/retail/amd64/gflags.lib) # ================================================== GFLAGS ================================================== -# -# Don't touch these lines -# -if (DEFINED GFLAGS) - set(USE_GFLAGS ${GFLAGS}) -else () - set(USE_GFLAGS ${USE_GFLAGS_DEFAULT}) +# For compatibility +if (GFLAGS) + set(WITH_GFLAGS ON) endif () -if (${USE_GFLAGS} EQUAL 1) +if (WITH_GFLAGS) message(STATUS "GFLAGS library is enabled") if(DEFINED ENV{GFLAGS_INCLUDE}) @@ -64,26 +52,22 @@ endif () # Edit these 4 lines to define paths to Snappy # set(SNAPPY_HOME $ENV{THIRDPARTY_HOME}/Snappy.Library) -set(SNAPPY_INCLUDE ${SNAPPY_HOME}/inc/inc) -set(SNAPPY_LIB_DEBUG ${SNAPPY_HOME}/bin/debug/amd64/snappy.lib) -set(SNAPPY_LIB_RELEASE ${SNAPPY_HOME}/bin/retail/amd64/snappy.lib) +set(SNAPPY_INCLUDE ${SNAPPY_HOME}/build/native/inc/inc) +set(SNAPPY_LIB_DEBUG ${SNAPPY_HOME}/lib/native/debug/amd64/snappy.lib) +set(SNAPPY_LIB_RELEASE ${SNAPPY_HOME}/lib/native/retail/amd64/snappy.lib) -# -# Don't touch these lines -# -if (DEFINED SNAPPY) - set(USE_SNAPPY ${SNAPPY}) -else () - set(USE_SNAPPY ${USE_SNAPPY_DEFAULT}) +# For compatibility +if(SNAPPY) + set(WITH_SNAPPY ON) endif () -if (${USE_SNAPPY} EQUAL 1) +if (WITH_SNAPPY) message(STATUS "SNAPPY library is enabled") - + if(DEFINED ENV{SNAPPY_INCLUDE}) set(SNAPPY_INCLUDE $ENV{SNAPPY_INCLUDE}) endif() - + if(DEFINED ENV{SNAPPY_LIB_DEBUG}) set(SNAPPY_LIB_DEBUG $ENV{SNAPPY_LIB_DEBUG}) endif() @@ -91,7 +75,7 @@ if (${USE_SNAPPY} EQUAL 1) if(DEFINED ENV{SNAPPY_LIB_RELEASE}) set(SNAPPY_LIB_RELEASE $ENV{SNAPPY_LIB_RELEASE}) endif() - + set(SNAPPY_CXX_FLAGS -DSNAPPY) set(SNAPPY_LIBS debug ${SNAPPY_LIB_DEBUG} optimized ${SNAPPY_LIB_RELEASE}) @@ -107,20 +91,17 @@ endif () # Edit 
these 4 lines to define paths to LZ4 # set(LZ4_HOME $ENV{THIRDPARTY_HOME}/LZ4.Library) -set(LZ4_INCLUDE ${LZ4_HOME}/inc/include) -set(LZ4_LIB_DEBUG ${LZ4_HOME}/bin/debug/amd64/lz4.lib) -set(LZ4_LIB_RELEASE ${LZ4_HOME}/bin/retail/amd64/lz4.lib) +set(LZ4_INCLUDE ${LZ4_HOME}/build/native/inc/inc) +set(LZ4_LIB_DEBUG ${LZ4_HOME}/lib/native/debug/amd64/lz4.lib) +set(LZ4_LIB_RELEASE ${LZ4_HOME}/lib/native/retail/amd64/lz4.lib) -# -# Don't touch these lines -# -if (DEFINED LZ4) - set(USE_LZ4 ${LZ4}) -else () - set(USE_LZ4 ${USE_LZ4_DEFAULT}) + +# For compatibility +if (LZ4) + set(WITH_LZ4 ON) endif () -if (${USE_LZ4} EQUAL 1) +if (WITH_LZ4) message(STATUS "LZ4 library is enabled") if(DEFINED ENV{LZ4_INCLUDE}) @@ -150,20 +131,16 @@ endif () # Edit these 4 lines to define paths to ZLIB # set(ZLIB_HOME $ENV{THIRDPARTY_HOME}/ZLIB.Library) -set(ZLIB_INCLUDE ${ZLIB_HOME}/inc/include) -set(ZLIB_LIB_DEBUG ${ZLIB_HOME}/bin/debug/amd64/zlib.lib) -set(ZLIB_LIB_RELEASE ${ZLIB_HOME}/bin/retail/amd64/zlib.lib) +set(ZLIB_INCLUDE ${ZLIB_HOME}/build/native/inc/inc) +set(ZLIB_LIB_DEBUG ${ZLIB_HOME}/lib/native/debug/amd64/zlib.lib) +set(ZLIB_LIB_RELEASE ${ZLIB_HOME}/lib/native/retail/amd64/zlib.lib) -# -# Don't touch these lines -# -if (DEFINED ZLIB) - set(USE_ZLIB ${ZLIB}) -else () - set(USE_ZLIB ${USE_ZLIB_DEFAULT}) +# For compatibilty +if (ZLIB) + set(WITH_ZLIB ON) endif () -if (${USE_ZLIB} EQUAL 1) +if (WITH_ZLIB) message(STATUS "ZLIB library is enabled") if(DEFINED ENV{ZLIB_INCLUDE}) @@ -188,13 +165,15 @@ else () message(STATUS "ZLIB library is disabled") endif () -if (DEFINED XPRESS) - set(USE_XPRESS ${XPRESS}) -else () - set(USE_XPRESS ${USE_XPRESS_DEFAULT}) +# ================================================== XPRESS ================================================== +# This makes use of built-in Windows API, no additional includes, links to a system lib + +# For compatibilty +if (XPRESS) + set(WITH_XPRESS ON) endif () -if (${USE_XPRESS} EQUAL 1) +if (WITH_XPRESS) message(STATUS "XPRESS is enabled") add_definitions(-DXPRESS) @@ -205,20 +184,56 @@ else () message(STATUS "XPRESS is disabled") endif () + +# ================================================== ZSTD ================================================== # -# Edit these 4 lines to define paths to Jemalloc +# Edit these 4 lines to define paths to ZSTD # -set(JEMALLOC_HOME $ENV{THIRDPARTY_HOME}/Jemalloc.Library) -set(JEMALLOC_INCLUDE ${JEMALLOC_HOME}/inc/include) -set(JEMALLOC_LIB_DEBUG ${JEMALLOC_HOME}/bin/debug/amd64/jemalloc.lib) -set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/bin/retail/amd64/jemalloc.lib) +set(ZSTD_HOME $ENV{THIRDPARTY_HOME}/ZSTD.Library) +set(ZSTD_INCLUDE ${ZSTD_HOME}/build/native/inc) +set(ZSTD_LIB_DEBUG ${ZSTD_HOME}/lib/native/debug/amd64/libzstd_static.lib) +set(ZSTD_LIB_RELEASE ${ZSTD_HOME}/lib/native/retail/amd64/libzstd_static.lib) + +# For compatibility +if (ZSTD) + set(WITH_ZSTD ON) +endif () + +if (WITH_ZSTD) + message(STATUS "ZSTD library is enabled") + + if(DEFINED ENV{ZSTD_INCLUDE}) + set(ZSTD_INCLUDE $ENV{ZSTD_INCLUDE}) + endif() + + if(DEFINED ENV{ZSTD_LIB_DEBUG}) + set(ZSTD_LIB_DEBUG $ENV{ZSTD_LIB_DEBUG}) + endif() + + if(DEFINED ENV{ZSTD_LIB_RELEASE}) + set(ZSTD_LIB_RELEASE $ENV{ZSTD_LIB_RELEASE}) + endif() + + # ZSTD_STATIC_LINKING_ONLY only allows us to create an allocation functions override + # When jemalloc is in use + set(ZSTD_LIBS debug ${ZSTD_LIB_DEBUG} optimized ${ZSTD_LIB_RELEASE}) + + add_definitions(-DZSTD -DZSTD_STATIC_LINKING_ONLY) + include_directories(${ZSTD_INCLUDE}) + set (THIRDPARTY_LIBS 
${THIRDPARTY_LIBS} ${ZSTD_LIBS}) +else () + message(STATUS "ZSTD library is disabled") +endif () -# ================================================== JEMALLOC ================================================== # -# Don't touch these lines +# Edit these 4 lines to define paths to Jemalloc # +set(JEMALLOC_HOME $ENV{THIRDPARTY_HOME}/Jemalloc.Library) +set(JEMALLOC_INCLUDE ${JEMALLOC_HOME}/build/native/inc) +set(JEMALLOC_LIB_DEBUG ${JEMALLOC_HOME}/lib/native/debug/amd64/jemalloc.lib) +set(JEMALLOC_LIB_RELEASE ${JEMALLOC_HOME}/lib/native/retail/amd64/jemalloc.lib) -# For compatibilty with previous +# ================================================== JEMALLOC ================================================== if(JEMALLOC) set(WITH_JEMALLOC ON) endif() @@ -245,9 +260,7 @@ if (WITH_JEMALLOC) include_directories(${JEMALLOC_INCLUDE}) set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS}) set (ARTIFACT_SUFFIX "_je") - - set(WITH_JEMALLOC ON) - + else () set (ARTIFACT_SUFFIX "") message(STATUS "JEMALLOC library is disabled") diff --git a/thirdparty/rocksdb/tools/advisor/README.md b/thirdparty/rocksdb/tools/advisor/README.md new file mode 100644 index 0000000000..f1e7165e4c --- /dev/null +++ b/thirdparty/rocksdb/tools/advisor/README.md @@ -0,0 +1,96 @@ +# Rocksdb Tuning Advisor + +## Motivation + +The performance of Rocksdb is contingent on its tuning. However, +because of the complexity of its underlying technology and a large number of +configurable parameters, a good configuration is sometimes hard to obtain. The aim of +the python command-line tool, Rocksdb Advisor, is to automate the process of +suggesting improvements in the configuration based on advice from Rocksdb +experts. + +## Overview + +Experts share their wisdom as rules comprising of conditions and suggestions in the INI format (refer +[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). +Users provide the Rocksdb configuration that they want to improve upon (as the +familiar Rocksdb OPTIONS file — +[example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)) +and the path of the file which contains Rocksdb logs and statistics. +The [Advisor](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser_example.py) +creates appropriate DataSource objects (for Rocksdb +[logs](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_log_parser.py), +[options](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_options_parser.py), +[statistics](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/db_stats_fetcher.py) etc.) +and provides them to the [Rules Engine](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rule_parser.py). +The Rules uses rules from experts to parse data-sources and trigger appropriate rules. +The Advisor's output gives information about which rules were triggered, +why they were triggered and what each of them suggests. Each suggestion +provided by a triggered rule advises some action on a Rocksdb +configuration option, for example, increase CFOptions.write_buffer_size, +set bloom_bits to 2 etc. 
+ +## Usage + +### Prerequisites +The tool needs the following to run: +* python3 + +### Running the tool +An example command to run the tool: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --rules_spec=advisor/rules.ini --rocksdb_options=test/input_files/OPTIONS-000005 --log_files_path_prefix=test/input_files/LOG-0 --stats_dump_period_sec=20 +``` + +### Command-line arguments + +Most important amongst all the input that the Advisor needs, are the rules +spec and starting Rocksdb configuration. The configuration is provided as the +familiar Rocksdb Options file (refer [example](https://github.com/facebook/rocksdb/blob/master/examples/rocksdb_option_file_example.ini)). +The Rules spec is written in the INI format (more details in +[rules.ini](https://github.com/facebook/rocksdb/blob/master/tools/advisor/advisor/rules.ini)). + +In brief, a Rule is made of conditions and is triggered when all its +constituent conditions are triggered. When triggered, a Rule suggests changes +(increase/decrease/set to a suggested value) to certain Rocksdb options that +aim to improve Rocksdb performance. Every Condition has a 'source' i.e. +the data source that would be checked for triggering that condition. +For example, a log Condition (with 'source=LOG') is triggered if a particular +'regex' is found in the Rocksdb LOG files. As of now the Rules Engine +supports 3 types of Conditions (and consequently data-sources): +LOG, OPTIONS, TIME_SERIES. The TIME_SERIES data can be sourced from the +Rocksdb [statistics](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/statistics.h) +or [perf context](https://github.com/facebook/rocksdb/blob/master/include/rocksdb/perf_context.h). + +For more information about the remaining command-line arguments, run: + +```shell +cd rocksdb/tools/advisor +python3 -m advisor.rule_parser_example --help +``` + +### Sample output + +Here, a Rocksdb log-based rule has been triggered: + +```shell +Rule: stall-too-many-memtables +LogCondition: stall-too-many-memtables regex: Stopping writes because we have \d+ immutable memtables \(waiting for flush\), max_write_buffer_number is set to \d+ +Suggestion: inc-bg-flush option : DBOptions.max_background_flushes action : increase suggested_values : ['2'] +Suggestion: inc-write-buffer option : CFOptions.max_write_buffer_number action : increase +scope: col_fam: +{'default'} +``` + +## Running the tests + +Tests for the code have been added to the +[test/](https://github.com/facebook/rocksdb/tree/master/tools/advisor/test) +directory. For example, to run the unit tests for db_log_parser.py: + +```shell +cd rocksdb/tools/advisor +python3 -m unittest -v test.test_db_log_parser +``` diff --git a/thirdparty/rocksdb/tools/advisor/advisor/__init__.py b/thirdparty/rocksdb/tools/advisor/advisor/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/thirdparty/rocksdb/tools/advisor/advisor/bench_runner.py b/thirdparty/rocksdb/tools/advisor/advisor/bench_runner.py new file mode 100644 index 0000000000..7c7ee78824 --- /dev/null +++ b/thirdparty/rocksdb/tools/advisor/advisor/bench_runner.py @@ -0,0 +1,39 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). 
+
+from abc import ABC, abstractmethod
+import re
+
+
+class BenchmarkRunner(ABC):
+    @staticmethod
+    @abstractmethod
+    def is_metric_better(new_metric, old_metric):
+        pass
+
+    @abstractmethod
+    def run_experiment(self):
+        # should return a list of DataSource objects
+        pass
+
+    @staticmethod
+    def get_info_log_file_name(log_dir, db_path):
+        # Example: DB Path = /dev/shm and OPTIONS file has option
+        # db_log_dir=/tmp/rocks/, then the name of the log file will be
+        # 'dev_shm_LOG' and its location will be /tmp/rocks. If db_log_dir is
+        # not specified in the OPTIONS file, then the location of the log file
+        # will be /dev/shm and the name of the file will be 'LOG'
+        file_name = ''
+        if log_dir:
+            # refer GetInfoLogPrefix() in rocksdb/util/filename.cc
+            # example db_path: /dev/shm/dbbench
+            file_name = db_path[1:]  # to ignore the leading '/' character
+            to_be_replaced = re.compile(r'[^0-9a-zA-Z\-_\.]')
+            for character in to_be_replaced.findall(db_path):
+                file_name = file_name.replace(character, '_')
+            if not file_name.endswith('_'):
+                file_name += '_'
+        file_name += 'LOG'
+        return file_name
diff --git a/thirdparty/rocksdb/tools/advisor/advisor/config_optimizer_example.py b/thirdparty/rocksdb/tools/advisor/advisor/config_optimizer_example.py
new file mode 100644
index 0000000000..e3736387ea
--- /dev/null
+++ b/thirdparty/rocksdb/tools/advisor/advisor/config_optimizer_example.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+# This source code is licensed under both the GPLv2 (found in the
+# COPYING file in the root directory) and Apache 2.0 License
+# (found in the LICENSE.Apache file in the root directory).
+
+import argparse
+from advisor.db_config_optimizer import ConfigOptimizer
+from advisor.db_log_parser import NO_COL_FAMILY
+from advisor.db_options_parser import DatabaseOptions
+from advisor.rule_parser import RulesSpec
+
+
+CONFIG_OPT_NUM_ITER = 10
+
+
+def main(args):
+    # initialise the RulesSpec parser
+    rule_spec_parser = RulesSpec(args.rules_spec)
+    # initialise the benchmark runner
+    bench_runner_module = __import__(
+        args.benchrunner_module, fromlist=[args.benchrunner_class]
+    )
+    bench_runner_class = getattr(bench_runner_module, args.benchrunner_class)
+    ods_args = {}
+    if args.ods_client and args.ods_entity:
+        ods_args['client_script'] = args.ods_client
+        ods_args['entity'] = args.ods_entity
+        if args.ods_key_prefix:
+            ods_args['key_prefix'] = args.ods_key_prefix
+    db_bench_runner = bench_runner_class(args.benchrunner_pos_args, ods_args)
+    # initialise the database configuration
+    db_options = DatabaseOptions(args.rocksdb_options, args.misc_options)
+    # set the frequency at which stats are dumped in the LOG file and the
+    # location of the LOG file.
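+    # NO_COL_FAMILY (imported from db_log_parser) marks the setting as
+    # database-wide rather than scoped to a single column family.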
+    db_log_dump_settings = {
+        "DBOptions.stats_dump_period_sec": {
+            NO_COL_FAMILY: args.stats_dump_period_sec
+        }
+    }
+    db_options.update_options(db_log_dump_settings)
+    # initialise the configuration optimizer
+    config_optimizer = ConfigOptimizer(
+        db_bench_runner,
+        db_options,
+        rule_spec_parser,
+        args.base_db_path
+    )
+    # run the optimizer to improve the database configuration for the given
+    # benchmarks, with the help of expert-specified rules
+    final_db_options = config_optimizer.run()
+    # generate the final rocksdb options file
+    print(
+        'Final configuration in: ' +
+        final_db_options.generate_options_config('final')
+    )
+    print(
+        'Final miscellaneous options: ' +
+        repr(final_db_options.get_misc_options())
+    )
+
+
+if __name__ == '__main__':
+    '''
+    An example run of this tool from the command-line would look like:
+    python3 -m advisor.config_optimizer_example
+    --base_db_path=/tmp/rocksdbtest-155919/dbbench
+    --rocksdb_options=temp/OPTIONS_boot.tmp --misc_options bloom_bits=2
+    --rules_spec=advisor/rules.ini --stats_dump_period_sec=20
+    --benchrunner_module=advisor.db_bench_runner
+    --benchrunner_class=DBBenchRunner --benchrunner_pos_args ./../../db_bench
+    readwhilewriting use_existing_db=true duration=90
+    '''
+    parser = argparse.ArgumentParser(description='This script is used for\
+        searching for a better database configuration')
+    parser.add_argument(
+        '--rocksdb_options', required=True, type=str,
+        help='path of the starting Rocksdb OPTIONS file'
+    )
+    # these are options that are column-family agnostic and are not yet
+    # supported by the Rocksdb Options file: eg. bloom_bits=2
+    parser.add_argument(
+        '--misc_options', nargs='*',
+        help='whitespace-separated list of options that are not supported ' +
+        'by the Rocksdb OPTIONS file, given in the ' +
+        '<option>=<value> format eg. "bloom_bits=2 ' +
"bloom_bits=2 ' + + 'rate_limiter_bytes_per_sec=128000000"') + parser.add_argument( + '--base_db_path', required=True, type=str, + help='path for the Rocksdb database' + ) + parser.add_argument( + '--rules_spec', required=True, type=str, + help='path of the file containing the expert-specified Rules' + ) + parser.add_argument( + '--stats_dump_period_sec', required=True, type=int, + help='the frequency (in seconds) at which STATISTICS are printed to ' + + 'the Rocksdb LOG file' + ) + # ODS arguments + parser.add_argument( + '--ods_client', type=str, help='the ODS client binary' + ) + parser.add_argument( + '--ods_entity', type=str, + help='the servers for which the ODS stats need to be fetched' + ) + parser.add_argument( + '--ods_key_prefix', type=str, + help='the prefix that needs to be attached to the keys of time ' + + 'series to be fetched from ODS' + ) + # benchrunner_module example: advisor.db_benchmark_client + parser.add_argument( + '--benchrunner_module', required=True, type=str, + help='the module containing the BenchmarkRunner class to be used by ' + + 'the Optimizer, example: advisor.db_bench_runner' + ) + # benchrunner_class example: DBBenchRunner + parser.add_argument( + '--benchrunner_class', required=True, type=str, + help='the name of the BenchmarkRunner class to be used by the ' + + 'Optimizer, should be present in the module provided in the ' + + 'benchrunner_module argument, example: DBBenchRunner' + ) + parser.add_argument( + '--benchrunner_pos_args', nargs='*', + help='whitespace-separated positional arguments that are passed on ' + + 'to the constructor of the BenchmarkRunner class provided in the ' + + 'benchrunner_class argument, example: "use_existing_db=true ' + + 'duration=900"' + ) + args = parser.parse_args() + main(args) diff --git a/thirdparty/rocksdb/tools/advisor/advisor/db_bench_runner.py b/thirdparty/rocksdb/tools/advisor/advisor/db_bench_runner.py new file mode 100644 index 0000000000..54424440b3 --- /dev/null +++ b/thirdparty/rocksdb/tools/advisor/advisor/db_bench_runner.py @@ -0,0 +1,245 @@ +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +# This source code is licensed under both the GPLv2 (found in the +# COPYING file in the root directory) and Apache 2.0 License +# (found in the LICENSE.Apache file in the root directory). + +from advisor.bench_runner import BenchmarkRunner +from advisor.db_log_parser import DataSource, DatabaseLogs, NO_COL_FAMILY +from advisor.db_stats_fetcher import ( + LogStatsParser, OdsStatsFetcher, DatabasePerfContext +) +import shutil +import subprocess +import time + + +''' +NOTE: This is not thread-safe, because the output file is simply overwritten. 
+'''
+
+
+class DBBenchRunner(BenchmarkRunner):
+    OUTPUT_FILE = "temp/dbbench_out.tmp"
+    ERROR_FILE = "temp/dbbench_err.tmp"
+    DB_PATH = "DB path"
+    THROUGHPUT = "ops/sec"
+    PERF_CON = " PERF_CONTEXT:"
+
+    @staticmethod
+    def is_metric_better(new_metric, old_metric):
+        # for db_bench 'throughput' is the metric returned by run_experiment
+        return new_metric >= old_metric
+
+    @staticmethod
+    def get_opt_args_str(misc_options_dict):
+        # given a dictionary of options and their values, return a string
+        # that can be appended as command-line arguments
+        optional_args_str = ""
+        for option_name, option_value in misc_options_dict.items():
+            if option_value:
+                optional_args_str += (
+                    " --" + option_name + "=" + str(option_value)
+                )
+        return optional_args_str
+
+    def __init__(self, positional_args, ods_args=None):
+        # parse positional_args list appropriately
+        self.db_bench_binary = positional_args[0]
+        self.benchmark = positional_args[1]
+        self.db_bench_args = None
+        if len(positional_args) > 2:
+            # options list with each option given as "